diff --git a/Makefile b/Makefile
index c6eb1d15d..6c7f4a51b 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ ifndef CC
     CC = gcc
 endif
 
-INCPATH += -I./src -I./include -I./src/leveldb/include -I./src/leveldb \
+INCPATH += -I./src -I./include -I./src/leveldb/include -I./src/leveldb -I./src/sdk \
            -I./src/sdk/java/native-src $(DEPS_INCPATH) 
 CFLAGS += $(OPT) $(INCPATH) -fPIC -fvisibility=hidden # hide internal symbol of tera
 CXXFLAGS += -std=gnu++11 $(CFLAGS)
@@ -28,28 +28,42 @@ PROTO_OUT_H := $(PROTO_FILES:.proto=.pb.h)
 MASTER_SRC := $(wildcard src/master/*.cc)
 TABLETNODE_SRC := $(wildcard src/tabletnode/*.cc)
 IO_SRC := $(wildcard src/io/*.cc)
-SDK_SRC := $(wildcard src/sdk/*.cc)
+SDK_SRC := $(wildcard src/sdk/*.cc) $(wildcard src/sdk/test/global_txn_testutils.cc) \
+		   src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc
 HTTP_SRC := $(wildcard src/sdk/http/*.cc)
 PROTO_SRC := $(filter-out %.pb.cc, $(wildcard src/proto/*.cc)) $(PROTO_OUT_CC)
 JNI_TERA_SRC := $(wildcard src/sdk/java/native-src/*.cc)
 VERSION_SRC := src/version.cc
 OTHER_SRC := $(wildcard src/zk/*.cc) $(wildcard src/utils/*.cc) $(VERSION_SRC) \
-             src/tera_flags.cc
+             src/tera_flags.cc src/sdk/test/global_txn_testutils.cc
 COMMON_SRC := $(wildcard src/common/base/*.cc) $(wildcard src/common/net/*.cc) \
               $(wildcard src/common/file/*.cc) $(wildcard src/common/file/recordio/*.cc) \
-              $(wildcard src/common/console/*.cc)
+              $(wildcard src/common/console/*.cc) $(wildcard src/common/log/*.cc) \
+			  $(wildcard src/common/metric/*.cc)
 SERVER_WRAPPER_SRC := src/tera_main_wrapper.cc
 SERVER_SRC := src/tera_main.cc src/tera_entry.cc
 CLIENT_SRC := src/teracli_main.cc
+TERAUTIL_SRC := src/terautil.cc
+GTXN_TEST_SRC := src/sdk/test/global_txn_test_tool.cc
 TEST_CLIENT_SRC := src/tera_test_main.cc
 TERA_C_SRC := src/tera_c.cc
 MONITOR_SRC := src/monitor/teramo_main.cc
 MARK_SRC := src/benchmark/mark.cc src/benchmark/mark_main.cc
+COMMON_TEST_SRC := $(wildcard src/common/test/*.cc)
 TEST_SRC := src/utils/test/prop_tree_test.cc src/utils/test/tprinter_test.cc \
             src/io/test/tablet_io_test.cc src/io/test/tablet_scanner_test.cc \
             src/io/test/load_test.cc src/master/test/master_test.cc \
             src/master/test/master_impl_test.cc src/master/test/trackable_gc_test.cc \
-            src/common/test/thread_pool_test.cc
+            src/observer/test/rowlock_test.cc src/observer/test/scanner_test.cc \
+			src/observer/test/observer_test.cc \
+			$(wildcard src/sdk/test/*_test.cc) $(COMMON_TEST_SRC)
+
+TIMEORACLE_SRC := $(wildcard src/timeoracle/*.cc) src/tera_entry.cc
+TIMEORACLE_BENCH_SRC := src/timeoracle/bench/timeoracle_bench.cc
+ROWLOCK_SRC := $(wildcard src/observer/rowlocknode/*.cc) src/sdk/rowlock_client.cc
+ROWLOCK_PROXY_SRC := $(wildcard src/observer/rowlockproxy/*.cc) 
+OBSERVER_SRC := src/observer/executor/scanner_impl.cc src/observer/executor/random_key_selector.cc 
+OBSERVER_DEMO_SRC := $(wildcard src/observer/observer_demo.cc)
 
 TEST_OUTPUT := test_output
 UNITTEST_OUTPUT := $(TEST_OUTPUT)/unittest
@@ -65,39 +79,53 @@ COMMON_OBJ := $(COMMON_SRC:.cc=.o)
 SERVER_WRAPPER_OBJ := $(SERVER_WRAPPER_SRC:.cc=.o)
 SERVER_OBJ := $(SERVER_SRC:.cc=.o)
 CLIENT_OBJ := $(CLIENT_SRC:.cc=.o)
+TERAUTIL_OBJ := $(TERAUTIL_SRC:.cc=.o)
+GTXN_TEST_OBJ := $(GTXN_TEST_SRC:.cc=.o)
 TEST_CLIENT_OBJ := $(TEST_CLIENT_SRC:.cc=.o)
 TERA_C_OBJ := $(TERA_C_SRC:.cc=.o)
 MONITOR_OBJ := $(MONITOR_SRC:.cc=.o)
 MARK_OBJ := $(MARK_SRC:.cc=.o)
 HTTP_OBJ := $(HTTP_SRC:.cc=.o)
+COMMON_TEST_OBJ := $(COMMON_TEST_SRC:.cc=.o)
 TEST_OBJ := $(TEST_SRC:.cc=.o)
+TIMEORACLE_OBJ := $(TIMEORACLE_SRC:.cc=.o)
+TIMEORACLE_BENCH_OBJ := $(TIMEORACLE_BENCH_SRC:.cc=.o)
+ROWLOCK_OBJ := $(ROWLOCK_SRC:.cc=.o)
+ROWLOCK_PROXY_OBJ := $(ROWLOCK_PROXY_SRC:.cc=.o)
+OBSERVER_OBJ := $(OBSERVER_SRC:.cc=.o)
+OBSERVER_DEMO_OBJ := $(OBSERVER_DEMO_SRC:.cc=.o)
 ALL_OBJ := $(MASTER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) $(PROTO_OBJ) \
-           $(JNI_TERA_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(SERVER_OBJ) $(CLIENT_OBJ) \
+           $(JNI_TERA_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(SERVER_OBJ) $(CLIENT_OBJ) $(TERAUTIL_OBJ) \
            $(TEST_CLIENT_OBJ) $(TERA_C_OBJ) $(MONITOR_OBJ) $(MARK_OBJ) \
-           $(SERVER_WRAPPER_OBJ)
+           $(SERVER_WRAPPER_OBJ) $(TIMEORACLE_OBJ) $(ROWLOCK_OBJ) $(ROWLOCK_PROXY_OBJ)  $(OBSERVER_OBJ) $(OBSERVER_DEMO_OBJ)
 LEVELDB_LIB := src/leveldb/libleveldb.a
 LEVELDB_UTIL := src/leveldb/util/histogram.o src/leveldb/port/port_posix.o
 
-PROGRAM = tera_main tera_master tabletserver teracli teramo tera_test
+PROGRAM = tera_main tera_master tabletserver teracli terautil teramo tera_test timeoracle timeoracle_bench rowlock observer_demo rowlock_proxy
+TEST_PROGRAM=gtxn_test_tool
+
 LIBRARY = libtera.a
 SOLIBRARY = libtera.so
 TERA_C_SO = libtera_c.so
 JNILIBRARY = libjni_tera.so
+OBSERVER_LIBRARY = libobserver.a
 BENCHMARK = tera_bench tera_mark
 TESTS = prop_tree_test tprinter_test string_util_test tablet_io_test \
-        tablet_scanner_test fragment_test progress_bar_test master_test load_test \
-        thread_pool_test
+        tablet_scanner_test fragment_test progress_bar_test master_test load_test observer_test \
+        common_test sdk_test 
 
 .PHONY: all clean cleanall test
 
-all: $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK)
+all: $(PROGRAM) $(TEST_PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK) $(OBSERVER_LIBRARY)
 	mkdir -p build/include build/lib build/bin build/log build/benchmark
 	cp $(PROGRAM) build/bin
-	cp $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) build/lib
+	cp $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(OBSERVER_LIBRARY) build/lib
 	cp src/leveldb/tera_bench .
 	cp -r benchmark/*.sh benchmark/ycsb4tera/ $(BENCHMARK) build/benchmark
 	cp -r include build/
 	cp -r conf build
+	mkdir -p test/tools
+	cp $(TEST_PROGRAM) test/tools
 	echo 'Done'
 
 test: $(TESTS)
@@ -115,11 +143,12 @@ check: test
 clean:
-	rm -rf $(ALL_OBJ) $(TEST_OBJ) $(PROTO_OUT_CC) $(PROTO_OUT_H) $(TEST_OUTPUT)
+	rm -rf $(ALL_OBJ) $(TEST_OBJ) $(GTXN_TEST_OBJ) $(TIMEORACLE_BENCH_OBJ) $(PROTO_OUT_CC) $(PROTO_OUT_H) $(TEST_OUTPUT)
 	$(MAKE) clean -C src/leveldb
-	rm -rf $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK) $(TESTS) terahttp
+	rm -rf $(PROGRAM) $(TEST_PROGRAM) $(LIBRARY) $(OBSERVER_LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK) $(TESTS) terahttp
 
 cleanall:
 	$(MAKE) clean
 	rm -rf build
+	rm -rf test/tools
 
 tera_main: src/tera_main_wrapper.o src/version.o src/tera_flags.o
 	$(CXX) -o $@ $^ $(LDFLAGS)
@@ -135,6 +164,13 @@ tabletserver: $(SERVER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) \
 libtera.a: $(SDK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_UTIL)
 	$(AR) -rs $@ $^
 
+observer_demo: $(OBSERVER_DEMO_OBJ) $(OBSERVER_LIBRARY) $(LIBRARY)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+# Observer archive: the same object set as libtera.a plus the observer executor and io objects.
+libobserver.a: $(OBSERVER_OBJ) $(SDK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_UTIL) $(IO_OBJ)
+	$(AR) -rs $@ $^
+
 libtera.so: $(SDK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_UTIL)
 	$(CXX) -o $@ $^ $(SO_LDFLAGS)
 
@@ -144,6 +180,12 @@ libtera_c.so: $(TERA_C_OBJ) $(LIBRARY)
 teracli: $(CLIENT_OBJ) $(LIBRARY)
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
+terautil: $(TERAUTIL_OBJ) $(LIBRARY)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+gtxn_test_tool: $(GTXN_TEST_OBJ) $(LIBRARY)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
 teramo: $(MONITOR_OBJ) $(LIBRARY)
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
@@ -153,6 +195,18 @@ tera_mark: $(MARK_OBJ) $(LIBRARY) $(LEVELDB_LIB)
 tera_test: $(TEST_CLIENT_OBJ) $(LIBRARY)
 	$(CXX) -o $@ $(TEST_CLIENT_OBJ) $(LIBRARY) $(LDFLAGS)
 
+timeoracle: $(TIMEORACLE_OBJ) $(PROTO_OBJ) $(COMMON_OBJ) $(OTHER_OBJ)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+timeoracle_bench : $(TIMEORACLE_BENCH_OBJ) $(LIBRARY)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+rowlock : $(SERVER_OBJ) $(ROWLOCK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+rowlock_proxy : $(SERVER_OBJ) $(ROWLOCK_PROXY_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(OBSERVER_LIBRARY)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
 terahttp: $(HTTP_OBJ) $(PROTO_OBJ) $(LIBRARY)
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
@@ -165,7 +219,7 @@ src/leveldb/libleveldb.a: FORCE
 tera_bench:
 
 # unit test
-thread_pool_test: src/common/test/thread_pool_test.o $(LIBRARY)
+common_test: $(COMMON_TEST_OBJ) $(LIBRARY)
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
 prop_tree_test: src/utils/test/prop_tree_test.o $(LIBRARY)
@@ -200,6 +254,15 @@ master_test: src/master/test/master_test.o  src/master/test/master_impl_test.o \
              $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB)
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
+sdk_test: src/sdk/test/global_txn_internal_test.o src/sdk/test/global_txn_test.o \
+          src/sdk/test/filter_utils_test.o src/sdk/test/scan_impl_test.o \
+          src/sdk/test/sdk_timeout_manager_test.o src/sdk/test/sdk_test.o $(SDK_OBJ) \
+          $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) 
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+observer_test: src/observer/test/rowlock_test.o src/observer/test/scanner_test.o  src/observer/test/observer_test.o src/observer/observer_demo/demo_observer.o $(PROTO_OBJ) $(COMMON_OBJ) $(OTHER_OBJ) $(OBSERVER_OBJ) $(LIBRARY)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
 $(ALL_OBJ): %.o: %.cc $(PROTO_OUT_H)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -222,8 +285,8 @@ proto: $(PROTO_OUT_CC) $(PROTO_OUT_H)
 
 # install output into system directories
 .PHONY: install
-install: $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY)
+install: $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY)
 	mkdir -p $(INSTALL_PREFIX)/bin $(INSTALL_PREFIX)/include $(INSTALL_PREFIX)/lib
 	cp -rf $(PROGRAM) $(INSTALL_PREFIX)/bin
 	cp -rf include/* $(INSTALL_PREFIX)/include
-	cp -rf $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(INSTALL_PREFIX)/lib
+	cp -rf $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(INSTALL_PREFIX)/lib
diff --git a/benchmark/run_test.sh b/benchmark/run_test.sh
index 8f33ce5e6..b1e9e7c34 100755
--- a/benchmark/run_test.sh
+++ b/benchmark/run_test.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 
-if [[ $# != 7 || $6 -lt 0 || $6 -gt 100 ]]; then
-    echo "$0 DIST[zipfian, uniform, latest] ROW_NUM OP_NUM VALUE_SIZE COLUMN_NUM UPDATE_PROPORTION[0~100] TABLE_NAME"
-    exit 0
+if [[ $# != 9 || $6 -lt 0 || $6 -gt 100 ]]; then
+    echo "$0 DIST[zipfian, uniform, latest] ROW_NUM OP_NUM VALUE_SIZE COLUMN_NUM UPDATE_PROPORTION[0~100] OP_SPEED THREAD_NUM TABLE_NAME"
+    exit 1
 fi
 
 DIST=$1
@@ -11,11 +11,12 @@ OP_NUM=$3
 VALUE_SIZE=$4
 COLUMN_NUM=$5
 UPDATE_PROPORTION=$6
-TABLE_NAME=$7
+OP_SPEED=$7
+THREAD_NUM=$8
+TABLE_NAME=$9
 
-UPDATE_PROPORTION=`printf "%02d" $6`
-READ_PROPORTION=`expr 100 - $UPDATE_PROPORTION`
-READ_PROPORTION=`printf "%02d" $READ_PROPORTION`
+UPDATE_PROPORTION=`echo $6 | awk '{printf("%.2f",$1/100)}'`
+READ_PROPORTION=`echo $6 | awk '{printf("%.2f",(100-$1)/100)}'`
 
 echo "$UPDATE_PROPORTION"
 echo "$READ_PROPORTION"
@@ -30,8 +31,12 @@ bin/ycsb run tera -p workload=com.yahoo.ycsb.workloads.CoreWorkload \
 		-p operationcount=$OP_NUM \
 		-p fieldlength=$VALUE_SIZE \
 		-p fieldcount=$COLUMN_NUM \
-		-p updateproportion=0.$UPDATE_PROPORTION \
-		-p readproportion=0.$READ_PROPORTION \
+		-p updateproportion=$UPDATE_PROPORTION \
+		-p readproportion=$READ_PROPORTION \
+		-p target=$OP_SPEED \
+		-p threadcount=$THREAD_NUM \
 		-p exportfile=ycsb.out \
 		| ./tera_mark --mode=m --tablename=$TABLE_NAME --type=async --verify=false
 
+exit $?
+
diff --git a/benchmark/ycsb4tera.md b/benchmark/ycsb4tera.md
index f32ae9e28..d348434ce 100644
--- a/benchmark/ycsb4tera.md
+++ b/benchmark/ycsb4tera.md
@@ -30,6 +30,13 @@
     
     更新（写入）占所有操作的比例
     updateproportion: what proportion of operations should be updates (default: 0.05)
+
+    每秒总共操作的次数
+    target: target ops/sec all threads (default: unthrottled)
+
+    客户端线程数
+    threadcount: number of client threads (default: 1)
+
 ```
 以下参数对于tera的测试意义不大，使用默认值即可：
 
diff --git a/build.conf.template b/build.conf.template
index 1fd914ec6..170383dde 100755
--- a/build.conf.template
+++ b/build.conf.template
@@ -18,6 +18,7 @@ LIBUNWIND_VERSION=0.99
 GPERFTOOLS_VERSION=2.5
 INS_VERSION=0.17
 NOSE_VERSION=1.3.7
+MONGOOSE_VERSION=6.8
 
 if [ $MIRROR == "china" ]; then
     BOOST_URL=http://mirrors.tuna.tsinghua.edu.cn/macports/distfiles/boost/boost_${BOOST_VERSION}.tar.bz2
@@ -32,6 +33,7 @@ if [ $MIRROR == "china" ]; then
     GPERFTOOLS_URL=https://github.com/00k/gperftools/raw/master/gperftools-${GPERFTOOLS_VERSION}.tar.gz
     INS_URL=https://github.com/baidu/ins/archive/${INS_VERSION}.tar.gz
     NOSE_URL=http://mirrors.163.com/gentoo/distfiles/nose-${NOSE_VERSION}.tar.gz
+    MONGOOSE_URL=https://github.com/cesanta/mongoose/archive/${MONGOOSE_VERSION}.tar.gz
 elif [ $MIRROR == "origin" ]; then
     BOOST_URL=http://downloads.sourceforge.net/project/boost/boost/1.58.0/boost_${BOOST_VERSION}.tar.bz2
     PROTOBUF_URL=https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-${PROTOBUF_VERSION}.tar.bz2
@@ -45,19 +47,7 @@ elif [ $MIRROR == "origin" ]; then
     GPERFTOOLS_URL=https://github.com/gperftools/gperftools/releases/download/gperftools-${GPERFTOOLS_VERSION}/gperftools-${GPERFTOOLS_VERSION}.tar.gz
     INS_URL=https://github.com/baidu/ins/archive/${INS_VERSION}.tar.gz
     NOSE_URL=https://pypi.python.org/packages/58/a5/0dc93c3ec33f4e281849523a5a913fa1eea9a3068acfa754d44d88107a44/nose-${NOSE_VERSION}.tar.gz
-elif [ $MIRROR == "baidu" ]; then
-    BOOST_URL=http://gitlab.baidu.com/baidups/third/raw/master/boost_${BOOST_VERSION}.tar.bz2
-    PROTOBUF_URL=http://gitlab.baidu.com/baidups/third/raw/master/protobuf-${PROTOBUF_VERSION}.tar.bz2
-    SNAPPY_URL=http://gitlab.baidu.com/baidups/third/raw/master/snappy-${SNAPPY_VERSION}.tar.gz
-    SOFA_PBRPC_URL=http://gitlab.baidu.com/baidups/third/raw/master/sofa-pbrpc-${SOFA_PBRPC_VERSION}.tar.gz
-    ZOOKEEPER_URL=http://gitlab.baidu.com/baidups/third/raw/master/zookeeper-${ZOOKEEPER_VERSION}.tar.gz
-    GFLAGS_URL=http://gitlab.baidu.com/baidups/third/raw/master/gflags-${GFLAGS_VERSION}.tar.gz
-    GLOG_URL=http://gitlab.baidu.com/baidups/third/raw/master/glog-${GLOG_VERSION}.tar.gz
-    GTEST_URL=http://gitlab.baidu.com/baidups/third/raw/master/googletest-release-${GTEST_VERSION}.tar.gz
-    LIBUNWIND_URL=http://gitlab.baidu.com/baidups/third/raw/master/libunwind-${LIBUNWIND_VERSION}.tar.gz
-    GPERFTOOLS_URL=http://gitlab.baidu.com/baidups/third/raw/master/gperftools-${GPERFTOOLS_VERSION}.tar.gz
-    INS_URL=http://gitlab.baidu.com/baidups/third/raw/master/ins-${INS_VERSION}.tar.gz
-    NOSE_URL=http://gitlab.baidu.com/baidups/third/raw/master/nose-${NOSE_VERSION}.tar.gz
+    MONGOOSE_URL=https://github.com/cesanta/mongoose/archive/${MONGOOSE_VERSION}.tar.gz
 else
     return 1
 fi
diff --git a/build.sh b/build.sh
index 1e1156aa9..f565149ef 100755
--- a/build.sh
+++ b/build.sh
@@ -218,7 +218,7 @@ elif [ ! -f "${FLAG_DIR}/ins_${INS_VERSION}" ] \
     sed -i "s|^PROTOBUF_PATH ?=.*|PROTOBUF_PATH ?=${DEPS_PREFIX}|" Makefile
     sed -i "s|^PBRPC_PATH ?=.*|PBRPC_PATH ?=${DEPS_PREFIX}|" Makefile
     sed -i "s|^GTEST_PATH ?=.*|GTEST_PATH ?=${DEPS_PREFIX}|" Makefile
-    #BOOST_PATH=${DEPS_PREFIX}/boost_${BOOST_VERSION} make install_sdk
+    # BOOST_PATH=${DEPS_PREFIX}/boost_${BOOST_VERSION} make install_sdk
     make -j4 install_sdk
     cd -
     touch "${FLAG_DIR}/ins_${INS_VERSION}"
@@ -239,6 +239,23 @@ elif [ ! -f "${FLAG_DIR}/nose_${NOSE_VERSION}" ] \
     touch "${FLAG_DIR}/nose_${NOSE_VERSION}"
 fi
 
+# mongoose: build a static libmongoose.a and install it with mongoose.h under ${DEPS_PREFIX}
+if [ "${MONGOOSE_VERSION}" == "DISABLE" ]; then
+    echo "Disable mongoose."
+elif [ ! -f "${FLAG_DIR}/mongoose_${MONGOOSE_VERSION}" ] \
+    || [ ! -f "${DEPS_PREFIX}/include/mongoose.h" ] \
+    || [ ! -f "${DEPS_PREFIX}/lib/libmongoose.a" ]; then
+    wget --no-check-certificate -O "mongoose-${MONGOOSE_VERSION}.tar.gz" "${MONGOOSE_URL}"
+    tar zxf "mongoose-${MONGOOSE_VERSION}.tar.gz" --recursive-unlink
+    cd "mongoose-${MONGOOSE_VERSION}"
+    cp -af mongoose.h "${DEPS_PREFIX}/include"
+    # No -Werror: this is downloaded third-party source, so warnings from a newer compiler must not fail the build.
+    gcc -c mongoose.c -o mongoose.o -g2 -pipe -Wall -fPIC && ar -rv libmongoose.a mongoose.o
+    cp -af libmongoose.a "${DEPS_PREFIX}/lib"
+    cd -
+    touch "${FLAG_DIR}/mongoose_${MONGOOSE_VERSION}"
+fi
+
 cd ${WORK_DIR}
 
 ########################################
diff --git a/build_version.sh b/build_version.sh
index 8cac725a6..2534fcb85 100755
--- a/build_version.sh
+++ b/build_version.sh
@@ -56,7 +56,7 @@ GIT_INFO_FILE=git_info.tmp
 VERSION_CPP_FILE=src/version.cc
 
 # generate template file
-git log | head -n 6 | sed 's/$/&\\n\\/g' > $GIT_INFO_FILE
+git log | head -n 6 | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e 's/$/\\n\\/' > $GIT_INFO_FILE
 gen_info_template_header > $TEMPLATE_HEADER_FILE
 gen_info_template_foot > $TEMPLATE_FOOT_FILE
 gen_info_print_template >> $TEMPLATE_FOOT_FILE
diff --git a/depends.mk.template b/depends.mk.template
index 191cd8162..f0dbea180 100644
--- a/depends.mk.template
+++ b/depends.mk.template
@@ -14,17 +14,19 @@ GLOG_PREFIX=./thirdparty
 GTEST_PREFIX=./thirdparty
 GPERFTOOLS_PREFIX=./thirdparty
 INS_PREFIX=./thirdparty
+MONGOOSE_PREFIX=./thirdparty
 BOOST_INCDIR=./thirdparty/boost_1_57_0
 
 SOFA_PBRPC_INCDIR = $(SOFA_PBRPC_PREFIX)/include
 PROTOBUF_INCDIR = $(PROTOBUF_PREFIX)/include
 SNAPPY_INCDIR = $(SNAPPY_PREFIX)/include
-ZOOKEEPER_INCDIR = $(ZOOKEEPER_PREFIX)/include
+ZOOKEEPER_INCDIR = $(ZOOKEEPER_PREFIX)/include/zookeeper
 GFLAGS_INCDIR = $(GFLAGS_PREFIX)/include
 GLOG_INCDIR = $(GLOG_PREFIX)/include
 GTEST_INCDIR = $(GTEST_PREFIX)/include
 GPERFTOOLS_INCDIR = $(GPERFTOOLS_PREFIX)/include
 INS_INCDIR = $(INS_PREFIX)/include
+MONGOOSE_INCDIR = $(MONGOOSE_PREFIX)/include
 
 SOFA_PBRPC_LIBDIR = $(SOFA_PBRPC_PREFIX)/lib
 PROTOBUF_LIBDIR = $(PROTOBUF_PREFIX)/lib
@@ -35,6 +37,7 @@ GLOG_LIBDIR = $(GLOG_PREFIX)/lib
 GTEST_LIBDIR = $(GTEST_PREFIX)/lib
 GPERFTOOLS_LIBDIR = $(GPERFTOOLS_PREFIX)/lib
 INS_LIBDIR = $(INS_PREFIX)/lib
+MONGOOSE_LIBDIR = $(MONGOOSE_PREFIX)/lib
 
 PROTOC = $(PROTOBUF_PREFIX)/bin/protoc
 
@@ -45,13 +48,13 @@ PROTOC = $(PROTOBUF_PREFIX)/bin/protoc
 DEPS_INCPATH = -I$(SOFA_PBRPC_INCDIR) -I$(PROTOBUF_INCDIR) \
                -I$(SNAPPY_INCDIR) -I$(ZOOKEEPER_INCDIR) \
                -I$(GFLAGS_INCDIR) -I$(GLOG_INCDIR) -I$(GTEST_INCDIR) \
-               -I$(GPERFTOOLS_INCDIR) -I$(BOOST_INCDIR) -I$(INS_INCDIR)
+               -I$(GPERFTOOLS_INCDIR) -I$(BOOST_INCDIR) -I$(INS_INCDIR) -I$(MONGOOSE_INCDIR)
 DEPS_LDPATH = -L$(SOFA_PBRPC_LIBDIR) -L$(PROTOBUF_LIBDIR) \
               -L$(SNAPPY_LIBDIR) -L$(ZOOKEEPER_LIBDIR) \
               -L$(GFLAGS_LIBDIR) -L$(GLOG_LIBDIR) -L$(GTEST_LIBDIR) \
-              -L$(GPERFTOOLS_LIBDIR) -L$(INS_LIBDIR)
+              -L$(GPERFTOOLS_LIBDIR) -L$(INS_LIBDIR) -L$(MONGOOSE_LIBDIR)
 SO_DEPS_LDFLAGS = -lins_sdk -lsofa-pbrpc -lprotobuf -lsnappy -lzookeeper_mt \
-                  -lgtest_main -lgtest -lglog -lgflags
+                  -lgtest_main -lgtest -lglog -lgflags -lmongoose
 DEPS_LDFLAGS = $(SO_DEPS_LDFLAGS) -ltcmalloc_minimal -lunwind
 
 ################################################################
diff --git a/doc/README.md b/doc/README.md
new file mode 100644
index 000000000..eb4ed0d0a
--- /dev/null
+++ b/doc/README.md
@@ -0,0 +1,160 @@
+
+# Tera SDK及工具说明
+
+## 目录
+### 1. [主要数据结构](#main-data-structure)
+ 
+* tera::[client](./sdk_reference/client.md)
+* tera::[table](./sdk_reference/table.md)
+* tera::[mutation](./sdk_reference/mutation.md)
+* tera::[reader](./sdk_reference/reader.md)
+* tera::[table_descriptor](./sdk_reference/table_descriptor.md)
+* tera::[transaction](./sdk_reference/transaction.md)
+* tera::[scan](./sdk_reference/scan.md)
+* tera::[utils](./sdk_reference/utils.md)
+
+### 2. [主要工具](#main-tools)
+* [teracli](./tools/teracli.md)
+* [terautil](./tools/terautil.md)
+* [tera_bench & tera_mark](./tools/benchmark.md)
+* [YCSB](./tools/ycsb.md)
+
+
+<a name="main-data-structure"></a> 
+### 1. 主要数据结构
+#### (1) tera::client  访问tera服务主结构，所有对tera的访问或操作全部由此发起。
+一个集群对应一个client即可，如需访问多个集群，需要创建多个client。
+##### 主要功能包括：
+* 表格操作：建、删、加载、卸载、打开、关闭、更新表结构、获取表格信息、快照等
+* 用户管理：建、删、修改密码、组管理等
+* 集群信息获取：获取全部表格列表、状态等
+ 
+#### (2) tera::table  表格主结构，对表格的所有增删查改操作由此发起。
+由tera::Client::OpenTable产生，tera::Client::CloseTable关闭，不可析构。
+ 
+#### (3) tera::error_code 错误码，很多操作会返回，注意检查。
+
+#### (4) tera::mutation
+ 
+#### (5) tera::scan 扫描操作，并获取返回数据。
+ 
+#### (6) tera::reader 读取操作，并获取返回数据。
+ 
+#### (7) tera::table_descriptor 表格描述符主体
+ 
+#### (8) tera::transaction 单行事务
+ 
+#### (9) tera::scan 扫描
+ 
+#### (10) tera::utils 编码解码
+ 
+<a name="main-tools"></a> 
+### 2. 主要工具
+#### (1) teracli  操作tera的工具
+* 实际上封装了对数据的操作等，可用来进行表格创建、schema更新等管理、控制操作。
+* 查看有哪些命令可用 ：./teracli help；
+* 查看某个命令的help：./teracli help [cmd]，例如./teracli help tablet
+ 
+#### (2) terautil  集群间数据迁移的dump工具
+
+* 具体用法./terautil dump help
+* 建表主要用法：./terautil --flagfile=../conf/terautil.flag dump prepare_safe
+* 扫表run起来主要用法：./terautil --flagfile=../conf/terautil.flag dump run
+* flag配置
+<table>
+<tr>
+<th>flag名称</th>
+<th>flag默认值或格式</th>
+<th>flag介绍</th>
+</tr>
+<tr>
+<td>dump_tera_src_conf </td>
+<td>../conf/src_tera.flag（格式）</td>
+<td>tera的源集群</td>
+</tr>
+<tr>
+<td>dump_tera_dest_conf</td>
+<td>../conf/dest_tera.flag（格式）</td>
+<td>tera的目的集群</td>
+</tr>
+<tr>
+<td>dump_tera_src_root_path</td>
+<td>/xxx_（路径格式）</td>
+<td>tera的源路径</td>
+</tr>
+<tr>
+<td>dump_tera_dest_root_path</td>
+<td>/xxx_（路径格式）</td>
+<td>tera的目的路径</td>
+</tr>
+<tr>
+<td>ins_cluster_addr</td>
+<td>terautil_ins（格式）</td>
+<td>锁服务器的地址</td>
+</tr>
+<tr>
+<td>ins_cluster_root_path</td>
+<td>/terautil/dump/xxxx（格式）</td>
+<td>锁服务器路径</td>
+</tr>
+<tr>
+<td>dump_tera_src_meta_addr</td>
+<td>""</td>
+<td>源meta表的地址</td>
+</tr>
+<tr>
+<td>dump_tera_dest_meta_addr</td>
+<td>""</td>
+<td>目的meta表的地址</td>
+</tr>
+<tr>
+<td>dump_manual_split_interval</td>
+<td>1000</td>
+<td>手动分裂时间间隔，单位为ms</td>
+</tr>
+<tr>
+<td>dump_enable_manual_split</td>
+<td>false</td>
+<td>是否允许手动分裂</td>
+</tr>
+</table>
+
+ 
+#### (3) tera_mark   读写数据
+* 支持异步读写scan
+```
+# 示例：
+./tera_mark --mode=w --tablename=test --type=async  --verify=false --entry_limit=1000
+```
+* 参数列表
+
+参数名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明
+---    | ---  | ---      | ---  | ---    | ---
+tablename | 表名 | - | - | "" | -
+mode | 模式 | "w"/"r"/"s"/"m" | - | "w" | -
+type | 类型 | "sync"/"async" | - | "async" | -
+pend_size | 最大pending大小 | - | - | 100 | -
+pend_count | 最大pending数 | - | - | 100000 | -
+start_key | scan的开始key | - | - | "" | -
+end_key | scan的结束key | - | - | "" | -
+cf_list | scan的列簇 | -  | - | "" | -
+print | scan的结果是否需要打印 | true/false | - | false | -
+buf_size | scan的buffer_size | >0  | - | 65536 | -
+verify | md5 verify(writer&read) | true/false  | - | true | -
+max_outflow | max_outflow | -  | - | -1 | -
+max_rate | max_rate | - | - | -1 | -
+scan_streaming | enable streaming scan | true/false  | - | false | -
+batch_count | batch_count(sync) | - | - | 1 | -
+entry_limit | writing/reading speed limit | - | - | 0 | -
+ 
+#### (4) tera_bench 造数据的工具
+```
+./tera_bench --compression_ratio=1 --key_seed=1 --value_seed=20  --value_size=1000 --num=200000
+--benchmarks=random  --key_size=24 --key_step=1
+```
+ 
+#### (5) YCSB 业界通用NoSQL测试的基准测试工具
+ 
+* 全称Yahoo! Cloud Serving Benchmark，Yahoo公司开发的专门用于NoSQL测试的基准测试工具
+* YCSB支持各种不同的数据分布方式，如Uniform（等概率随机选择记录）、Zipfian（随机选择记录，存在热记录）、Latest（近期写入的记录为热记录）
+
diff --git a/doc/cn/README.md b/doc/cn/README.md
index d18e4cf74..12eeb4d98 100644
--- a/doc/cn/README.md
+++ b/doc/cn/README.md
@@ -1,5 +1,5 @@
-# Tera文档专区
 
+# Tera文档专区
 ## 简介
 
 [系统设计](../tera_design.md)
@@ -10,9 +10,15 @@
 
 [体验单机Tera](onebox.md)
 
-[命令行工具teracli使用方法](teracli.md)
+[命令行工具teracli使用方法](../tools/teracli.md)
+
+[集群间数据迁移的dump工具terautil使用方法](../tools/terautil.md)
 
-[主要api使用方法](sdk_guide.md)
+[造数据的工具 & 读写数据使用方法](../tools/benchmark.md)
+
+[性能测试工具ycsb的使用方法](../tools/ycsb.md)
+
+[主要api使用方法](../sdk_reference/readme.md)
 
 [搭建tera集群](cluster_setup.md)
 
@@ -35,3 +41,4 @@
 ## 版本发布
 
 [版本发布及管理](../release_management.md)
+
diff --git a/doc/global_txn.md b/doc/global_txn.md
new file mode 100644
index 000000000..bb62d4c79
--- /dev/null
+++ b/doc/global_txn.md
@@ -0,0 +1,3 @@
+# Tera全局事务的原理及实现
+
+[image-1]:	../resources/images/global_txn.png
diff --git a/doc/sdk_reference/client.md b/doc/sdk_reference/client.md
new file mode 100644
index 000000000..a7f6fe878
--- /dev/null
+++ b/doc/sdk_reference/client.md
@@ -0,0 +1,169 @@
+
+# Client接口说明
+
+## 主要功能
+ 
+#### 1. 表格管理
+##### (1) 新建client  Client::NewClient
+```
+1.1) static Client* NewClient(const std::string& confpath, const std::string& log_prefix, ErrorCode* err = NULL)
+1.2) static Client* NewClient(const std::string& confpath, ErrorCode* err = NULL)
+1.3) static Client* NewClient()
+```
+ 
+##### (2) 打开表格 Client::OpenTable
+```
+Table* OpenTable(const std::string& table_name, ErrorCode* err) = 0
+```
+##### (3) 建表 Client::CreateTable
+```
+1） bool CreateTable(const TableDescriptor& desc, ErrorCode* err) = 0  //新建带有具体描述符的表格
+2） bool CreateTable(const TableDescriptor& desc, const std::vector<std::string>& tablet_delim, ErrorCode* err) = 0 //新建多个前缀为tablet_delim的tablets
+```
+ 
+##### (4) 更新schema Client::UpdateTableSchema
+ 
+```
+bool ClientImpl::UpdateTableSchema(const TableDescriptor& desc, ErrorCode* err) = 0
+```
+调用UpdateTable(desc, err)，分两种情况：
+* 更新lg属性。需要先disable表格
+* 更新cf属性。直接更新
+##### (5) 检查更新状态 Client::UpdateCheck
+ 
+```
+bool UpdateCheck(const std::string& table_name, bool* done, ErrorCode* err) = 0
+```
+ 
+##### (6) disable表 Client::DisableTable
+暂停表，表格不再提供读、写服务。某些属性的更新需要先disable表；使用drop删除表时，需要先执行disable操作。处于disable状态的表可通过EnableTable重新恢复读、写服务。
+ 
+```
+bool DisableTable(const std::string& name, ErrorCode* err) = 0
+```
+ 
+##### (7) drop表 Client::DropTable
+删除处于disable状态的表格，此操作不可回滚。
+ 
+```
+bool DropTable(const std::string& name, ErrorCode* err) = 0
+```
+ 
+##### (8) enable表 Client::EnableTable
+ 
+将处于disable状态的表格重新enable，恢复读、写服务。
+ 
+```
+bool EnableTable(const std::string& name, ErrorCode* err) = 0
+```
+ 
+##### (9) 获取表的描述符 Client::GetTableDescriptor
+```
+TableDescriptor* GetTableDescriptor(const std::string& table_name, ErrorCode* err) = 0
+```
+ 
+##### (10) 列出所有的表 Client::List
+```
+bool List(std::vector<TableInfo>* table_list, ErrorCode* err) = 0;//列出所有的表
+bool List(const std::string& table_name, TableInfo* table_info, std::vector<TabletInfo>* tablet_list, ErrorCode* err) = 0;//获取指定的表
+```
+##### (11) 检查表是否存在 Client::IsTableExist
+```
+bool IsTableExist(const std::string& table_name, ErrorCode* err) = 0
+``` 
+ 
+##### (12) 检查表是否为enable状态 Client::IsTableEnabled
+```
+bool IsTableEnabled(const std::string& table_name, ErrorCode* err) = 0
+```
+ 
+##### (13) 检查表是否为空 Client::IsTableEmpty
+```
+bool IsTableEmpty(const std::string& table_name, ErrorCode* err) = 0
+```
+ 
+##### (14) 发送请求给服务器 Client::CmdCtrl
+```
+bool CmdCtrl(const std::string& command, const std::vector<std::string>& arg_list, bool* bool_result, std::string* str_result, ErrorCode* err) = 0
+```
+ 
+##### (15) 使用glog的用户防止冲突 Client::SetGlogIsInitialized
+```
+void SetGlogIsInitialized()
+```
+ 
+##### (16) 删除表格 Client::DeleteTable
+```
+bool DeleteTable(const std::string& name, ErrorCode* err) = 0
+```
+ 
+##### (17) 更新表格 Client::UpdateTable
+```
+bool UpdateTable(const TableDescriptor& desc, ErrorCode* err) = 0
+```
+ 
+##### (18) 获得表格的位置 Client::GetTabletLocation
+```
+bool GetTabletLocation(const std::string& table_name, std::vector<TabletInfo>* tablets, ErrorCode* err) = 0
+```
+ 
+##### (19) 重命名表格 Client::Rename
+```
+bool Rename(const std::string& old_table_name, const std::string& new_table_name, ErrorCode* err) = 0
+```
+#### 2. 用户管理
+ 
+##### (1) 创建用户 Client::CreateUser
+ 
+```
+bool ClientImpl::CreateUser(const std::string& user,
+                            const std::string& password, ErrorCode* err) = 0
+```
+##### (2) 删除用户 Client::DeleteUser
+ 
+```
+bool ClientImpl::DeleteUser(const std::string& user, ErrorCode* err) = 0
+```
+ 
+##### (3) 修改用户密码 Client::ChangePwd
+ 
+```
+bool ClientImpl::ChangePwd(const std::string& user, const std::string& password, ErrorCode* err) = 0
+```
+ 
+##### (4) 显示指定用户信息 Client::ShowUser
+ 
+```
+bool ClientImpl::ShowUser(const std::string& user, std::vector<std::string>& user_groups, ErrorCode* err) = 0
+```
+ 
+##### (5) 添加用户到用户群 Client::AddUserToGroup
+ 
+```
+bool ClientImpl::AddUserToGroup(const std::string& user_name, const std::string& group_name, ErrorCode* err)= 0
+```
+ 
+##### (6) 从用户群中删除用户 Client::DeleteUserFromGroup
+ 
+```
+bool ClientImpl::DeleteUserFromGroup(const std::string& user_name, const std::string& group_name, ErrorCode* err) = 0
+```   
+<!-- 
+#### 3. 快照管理
+##### (1) 为表格创建快照 Client::GetSnapshot
+```
+bool GetSnapshot(const std::string& name, uint64_t* snapshot, ErrorCode* err) = 0
+```
+ 
+##### (2) 删除快照 Client::DelSnapshot
+```
+bool DelSnapshot(const std::string& name, uint64_t snapshot, ErrorCode* err) = 0
+```
+ 
+##### (3) 为特定的快照执行回滚操作 Client::Rollback
+```
+bool Rollback(const std::string& name, uint64_t snapshot, const std::string& rollback_name, ErrorCode* err) = 0
+```
+-->    
+ 
+
diff --git a/doc/sdk_reference/mutation.md b/doc/sdk_reference/mutation.md
index 54e444607..752891f9b 100644
--- a/doc/sdk_reference/mutation.md
+++ b/doc/sdk_reference/mutation.md
@@ -1,108 +1,154 @@
-# RowMutation
 
+# RowMutation接口说明
 tera sdk中通过RowMutation结构描述一次行更新操作，包含删除操作。
-一个RowMutaion中可以同时对多列进行操作，保证：
- * 服务端生效时序与RowMutation的执行时序相同。比如对某列的删除+更新，服务端生效时不会乱序，导致先更新再删除的情况发生。
- * 同一个RowMutation中的操作保证同时成功或失败。
- * 操作不存在的列族会返回成功，但无法读取。
-
-## 创建与析构
-
-由tera::Table::NewRowMutation创建，不能由用户创建。
-
-用户需要自行析构：
- * 同步模式下Put返回后即可析构
- * 异步模式下需要等待回调返回，并处理完成后析构，建议在回调函数末尾进行析构
  
-## API
-
-### 更新
-
-Key-value模式更新。若设定ttl，数据会在ttl时间超时后被淘汰。
+## 1. 数据结构
+```
+    enum Type {  
+        kPut,
+        kDeleteColumn,
+        kDeleteColumns,
+        kDeleteFamily,
+        kDeleteRow,
+        kAdd,
+        kPutIfAbsent,
+        kAppend,
+        kAddInt64
+    };
+    struct Mutation {
+        Type type;
+        std::string family;
+        std::string qualifier;
+        std::string value;
+        int64_t timestamp;
+        int32_t ttl;
+    };
+```
+
+## 2. 主要接口与用法
+#### 2.1 更新
+<style type="text/css">
+table th:first-of-type {
+    width: 10%;
+}
+table th:nth-of-type(2) {
+    width: 10%;
+}
+table th:nth-of-type(3) {
+    width: 5%;
+}
+table th:nth-of-type(4) {
+    width: 50%;
+}
+table th:nth-of-type(5) {
+    width: 10%;
+}
+table th:nth-of-type(6) {
+    width: 5%;
+}
+</style>
+
+表格类型 | 接口功能 | 接口 | 参数 | 可省参数 | 返回值类型 | 其它说明
+---  | ---    | ---  | --- | ---  | --- | ---
+表格模式 | 修改一个列 | Put | const std::string& family, const std::string& qualifier, const int64_t value, int64_t timestamp | timestamp可省，省略时为-1 | void | Counter场景下使用，设定初始值。
+表格模式 | 修改一个列的特定版本 | Put | const std::string& family, const std::string& qualifier, const std::string& value, int64_t timestamp| timestamp可省，省略时为-1  | void | 若设定timestamp，数据会被更新至指定时间，危险，不建议使用
+表格模式 | 修改一个带TTL列的特定版本 | Put | const std::string& family, const std::string& qualifier, int64_t timestamp, const std::string& value, int32_t ttl | | void |
+表格模式 | 修改一个列的特定版本 | Put | const std::string& family, const std::string& qualifier, int64_t timestamp, const std::string& value | | void |
+表格模式 | 原子操作：如果不存在才能Put成功 | PutIfAbsent | const std::string& family, const std::string& qualifier, const std::string& value | | void | 若不存在，更新生效；否则更新数据不生效。
+表格模式 | 原子加一个Cell | Add | const std::string& family, const std::string& qualifier, const int64_t delta | | void  | Counter场景下使用，累加。若无初始值，会从0开始累加。delta可为负数。
+表格模式 | 原子操作：追加内容到一个Cell | Append | const std::string& family, const std::string& qualifier, const std::string& value | | void | 将value追加至此列原数据末尾；若原数据不存在，则与Put等效。
+k-v模式 |修改带TTL的默认列 | Put | const std::string& value, int32_t ttl | ttl 可省，默认为-1 | void |若设定ttl，数据会在ttl时间超时后被淘汰。
+ 
+#### 2.2 删除
+##### (1) 删除整行  RowMutation::DeleteRow
+删除整行的指定范围版本。
+```
+void DeleteRow(int64_t timestamp = -1) = 0;//若设定timestamp，则删除此时间之前的所有更新。 Key-value模式下timestamp不生效。
+```
+ 
+##### (2) 删除某列族  RowMutation::DeleteFamily
+删除一个列族的所有列的指定范围版本。
 ```
-void Put(const std::string& value, int32_t ttl = -1);
+void DeleteFamily(const std::string& family, int64_t timestamp = -1) = 0;//若设定timestamp，则删除此时间之前的所有更新。
 ```
-表格模式更新。若设定timestamp，数据会被更新至指定时间，危险，不建议使用。
+ 
+##### (3) 删除某列所有版本  RowMutation::DeleteColumns
+删除一个列的指定范围版本。
 ```
-void Put(const std::string& family, const std::string& qualifier, const std::string& value, int64_t timestamp = -1);
+void DeleteColumns(const std::string& family, const std::string& qualifier, int64_t timestamp = -1) = 0;//若设定timestamp，则删除此时间之前的所有更新。
 ```
-表格模式更新。Counter场景下使用，设定初始值。
+ 
+##### (4) 删除一个列的指定版本  RowMutation::DeleteColumn
 ```
-void Put(const std::string& family, const std::string& qualifier, int64_t value, int64_t timestamp = -1);
+void DeleteColumn(const std::string& family, const std::string& qualifier, int64_t timestamp) = 0;//若不存在，则不生效。
 ```
-表格模式更新。Counter场景下使用，累加。若无初始值，会从0开始累加。
+
+
+#### 2.3 错误码
+##### (1) 获取行更新错误码  RowMutation::GetError
 ```
-void Add(const std::string& family, const std::string& qualifier, const int64_t delta);
+const ErrorCode& GetError() = 0; //成功返回kOK
 ```
-表格模式更新。若不存在，更新生效；否则更新数据不生效。
+##### (2) 设置错误码 RowMutation::SetError
 ```
-void PutIfAbsent(const std::string& family, const std::string& qualifier, const std::string& value);
+void SetError(ErrorCode::ErrorCodeType err, const std::string& reason) = 0;
 ```
-表格模式更新。将value追加至此列原数据末尾；若原数据不存在，则与Put等效。
+#### 2.4 异步
+若设定回调，则异步提交；否则同步提交。
+##### (1) 设置回调  RowMutation::SetCallBack
+ 
+设置异步回调, 操作会异步返回。
 ```
-void Append(const std::string& family, const std::string& qualifier, const std::string& value);
+void SetCallBack(Callback callback) = 0;
 ```
 
-### 删除
-
-删除整行。若设定timestamp，则删除此时间之前的所有更新。
-Key-value模式下timestamp不生效。
+##### (2) 获得回调函数  RowMutation::GetCallBack
 ```
-void DeleteRow(int64_t timestamp = -1);
+Callback GetCallBack() = 0;
 ```
-删除某列族。若设定timestamp，则删除此时间之前的所有更新。
+ 
+#### 2.5 上下文设定
+##### (1) 设置上下文  RowMutation::SetContext
+设置用户上下文，可在回调函数中获取。
 ```
-void DeleteFamily(const std::string& family, int64_t timestamp = -1);
+void SetContext(void* context) = 0;
 ```
-删除某列所有版本。若设定timestamp，则删除此时间之前的所有更新。
+ 
+##### (2) 获取用户上下文  RowMutation::GetContext
 ```
-void DeleteColumns(const std::string& family, const std::string& qualifier, int64_t timestamp = -1);
+void* GetContext() = 0;
 ```
-删除某列指定时间更新。若不存在，则不生效。
+#### 2.6 超时设定 
+设定单个mutation的超时时间。 如没有特殊需要，不必单独设定，使用sdk的统一超时即可。
+##### (1) 设置超时时间  RowMutation::SetTimeOut
+ 
+设置超时时间(只影响当前操作,不影响Table::SetWriteTimeout设置的默认写超时)
 ```
-void DeleteColumn(const std::string& family, const std::string& qualifier, int64_t timestamp);
+void SetTimeOut(int64_t timeout_ms) = 0;
 ```
-
-### 异步
-
-若设定回调，则异步提交；否则同步提交。
+ 
+##### (2) 获取超时时间  RowMutation::TimeOut
 ```
-typedef void (*Callback)(RowMutation* param);
-void SetCallBack(Callback callback);
-Callback GetCallBack();
-bool IsAsync(); 
+int64_t TimeOut() = 0
 ```
-
-### 超时设定
-
-设定单个mutation的超时时间。
-如没有特殊需要，不必要单独设定，使用sdk的统一超时即可。
+#### 2.7 其他操作
+##### (1) 获取行更新的操作数  RowMutation::MutationNum
 ```
-void SetTimeOut(int64_t timeout_ms);
-int64_t TimeOut() = 0;
+uint32_t MutationNum() = 0;
 ```
-
-### 上下文设定
-
-用于回调中获取用户自定义上下文信息。
-内存由用户自己管理。
-
+ 
+##### (2) 获取mutation总大小  RowMutation::Size
 ```
-void SetContext(void* context);
-void* GetContext();
+uint32_t Size() = 0;
 ```
-
-### 其它
-
+ 
+##### (3) 返回row_key  RowMutation::RowKey
 ```
-uint32_t MutationNum();
-uint32_t Size();
-const RowMutation::Mutation& GetMutation(uint32_t index);
+const std::string& RowKey() = 0;
 ```
-
-### 预发布
-
-获取所属事务
+ 
+##### (4) 返回mutation  RowMutation::GetMutation
 ```
-Transaction* GetTransaction();
+const RowMutation::Mutation& GetMutation(uint32_t index) = 0;
 ```
+
diff --git a/doc/sdk_reference/reader.md b/doc/sdk_reference/reader.md
index 476d945fb..876e5f8ca 100644
--- a/doc/sdk_reference/reader.md
+++ b/doc/sdk_reference/reader.md
@@ -1,103 +1,61 @@
-# RowReader
 
+# Reader接口说明
 tera sdk中通过RowReader结构描述一次行读取操作，并获取返回数据。
 
-## 创建与析构
-
-由tera::Table::NewRowReader创建，不能由用户创建。
-
-用户需要自行析构：
- * 同步模式下Get返回后即可析构
- * 异步模式下需要等待回调返回，并处理完成后析构，建议在回调函数末尾进行析构
- 
-## API
-
-### 描述过滤条件
-
-通过相关的API可以对列名、更新时间、版本数目等信息描述，从而对返回数据集合进行过滤。
-
-如果不进行任何描述，默认返回此行所有数据。
-
-#### AddColumnFamily
-
+## 1. 主要接口与用法
+#### 1.1 描述过滤条件
+通过相关的API可以对列名、更新时间、版本数目等信息描述，从而对返回数据集合进行过滤。如果不进行任何描述，默认返回此行所有数据。
+##### (1) 可以增加多个列族  RowReader::AddColumnFamily
 ```
-void AddColumnFamily(const std::string& family);
+void AddColumnFamily(const std::string& family) = 0;//如此“family”不存在于表格的schema中，则不进行过滤
 ```
-
-限定返回数据的列族为“family”。
-
-可以增加多个列族。
-
-如此“family”不存在于表格的schema中，则不进行过滤。
-
-#### AddColumn
-
+ 
+##### (2) 可以增加多个列 RowReader::AddColumn
 ```
-void AddColumn(const std::string& family, const std::string& qualifier);
+void AddColumn(const std::string& family, const std::string& qualifier); //除限定返回数据列族为“family”外，其列名必须为“qualifier”。
 ```
-
-与AddColumnFamily类似，除限定返回数据列族为“family”外，其列名必须为“qualifier”。
-
-此操作与AddColumnFamily共同生效，返回数据为二者并集。
-
-#### SetTimeRange
-
+ 
+##### (3) 设定最大版本数  RowReader::SetMaxVersions
 ```
-void SetTimeRange(int64_t ts_start, int64_t ts_end);
+void SetMaxVersions(uint32_t max_version) = 0; //从最新版本开始计数，若实际数据版本数小于此值，全部返回。在最大版本数基础上再进行时间过滤。
 ```
-
-设定返回数据的更新时间范围。
-
-只返回更新时间在[ts_start, ts_end]范围内的数据。
-
-其中ts_start、ts_end均为Unix时间戳，单位为微秒（us）。
-
-#### SetMaxVersions
-
+ 
+##### (4) 设定返回数据的更新时间范围  RowReader::SetTimeRange
 ```
-void SetMaxVersions(uint32_t max_version);
+void SetTimeRange(int64_t ts_start, int64_t ts_end) = 0;//只返回更新时间在[ts_start, ts_end]范围内的数据。其中ts_start、ts_end均为Unix时间戳，单位为微秒（us）。
 ```
-
-设定最大版本数。
-
-从最新版本开始计数，若实际数据版本数小于此值，全部返回。
-
-过滤优先级高于TimeRange，即在最大版本数基础上再进行时间过滤。
-
-### 获取数据
-
+ 
+#### 1.2 获取数据
 在RowReader被提交至服务端并返回后，可以从此结构中获取返回的数据。
-
 支持两种获取方式：
+<ul>
+<li>迭代器方式。依次遍历所有列、所有版本。</li>
+<li>全量输出。返回一个特定结构的std::map，可按列名等信息进行访问。</li>
+</ul>
 
- * 迭代器方式。依次遍历所有列、所有版本。
- * 全量输出。返回一个特定结构的std::Map，可按列名等信息进行访问。
-
-#### 迭代器方式
+##### (1) 访问数据前通过Done进行确认  RowReader::Done
+```
+bool Done() = 0;//若返回false，则数据已遍历完毕。
 
 ```
-bool Done();
-void Next();
+ 
+##### (2) 通过Next移动到下一个单元格  RowReader::Next
 ```
-
-访问数据前通过Done()进行确认。
-
-若返回false，则数据已遍历完毕。
-
+void Next() = 0;
+```
+ 
+##### (3) 当数据存在时，可以通过以下接口访问此单元格的各字段值
+当通过RowReader访问key-value模式的表时，除RowKey和Value外，其它字段值无效。
 ```
 const std::string& RowKey();
 std::string Value();
-std::string Family();
-std::string Qualifier();
-int64_t Timestamp();
+std::string Family() = 0;
+std::string Qualifier() = 0;
+int64_t Timestamp() = 0;
 ```
-
-当数据存在时，可以通过这些接口访问此单元格的各字段值。
-
-当通过RowReader访问key-value模式的表时，除RowKey和Value外，其它字段值无效。
-
-#### 全量输出
-
+ 
+##### (4) 全量输出
+通过多级std::map的形式进行访问。
 ```
 typedef std::map<int64_t, std::string> TColumn;
 typedef std::map<std::string, TColumn> TColumnFamily;
@@ -105,37 +63,50 @@ typedef std::map<std::string, TColumnFamily> TRow;
 virtual void ToMap(TRow* rowmap);
 ```
 
-通过多级std::map的形式进行访问。
-
-### 异步与上下文设定
-
+#### 1.3 错误码
+##### (1) 获取错误码  RowReader::GetError
+```
+const ErrorCode& GetError() = 0; //成功返回kOK
+```
+#### 1.4 异步
 若设定回调，则异步提交；否则同步提交。
+##### (1) 设置回调  RowReader::SetCallBack
 ```
-typedef void (*Callback)(RowMutation* param);
-void SetCallBack(Callback callback);
-Callback GetCallBack();
+void SetCallBack(Callback callback) = 0;
 ```
 
-用于回调中获取用户自定义上下文信息。
-内存由用户自己管理。
-
+##### (2) 获得回调函数  RowReader::GetCallBack
 ```
-void SetContext(void* context);
-void* GetContext();
+Callback GetCallBack() = 0; //Callback原型: void (*Callback)(RowReader* param)
 ```
 
-### 超时设定
+#### 1.5 上下文设定
+用于回调中获取用户自定义上下文信息。 内存由用户自己管理。
 
-设定单个reader的超时时间。
-如没有特殊需要，不必要单独设定，使用sdk的统一超时即可。
+##### (1) 设置上下文  RowReader::SetContext
 ```
-void SetTimeOut(int64_t timeout_ms);
-int64_t TimeOut() = 0;
+void SetContext(void* context) = 0;
 ```
-
-### 预发布
-
-获取所属事务
+ 
+##### (2) 获取上下文  RowReader::GetContext
+```
+void* GetContext() = 0;
+```
+#### 1.6 超时设定 
+设定单个reader的超时时间。如没有特殊需要，不必要单独设定，使用sdk的统一超时即可。
+##### (1) 设置超时时间  RowReader::SetTimeOut
+```
+void SetTimeOut(int64_t timeout_ms) = 0;
+```
+ 
+#### 1.7 其他
+##### (1) 获取表格  RowReader::GetTable
+```
+Table* GetTable() = 0;
+```
+ 
+##### (2) 获取按列过滤的map
 ```
-Transaction* GetTransaction();
+typedef std::map<std::string, std::set<std::string> > ReadColumnList;
+const ReadColumnList& GetReadColumnList() = 0;
 ```
diff --git a/doc/sdk_reference/readme.md b/doc/sdk_reference/readme.md
new file mode 100644
index 000000000..c57f747cf
--- /dev/null
+++ b/doc/sdk_reference/readme.md
@@ -0,0 +1,42 @@
+# Tera SDK主要api接口说明
+
+
+### 主要数据结构
+ 
+* tera::[client](../sdk_reference/client.md)
+* tera::[table](../sdk_reference/table.md)
+* tera::[mutation](../sdk_reference/mutation.md)
+* tera::[reader](../sdk_reference/reader.md)
+* tera::[table_descriptor](../sdk_reference/table_descriptor.md)
+* tera::[transaction](../sdk_reference/transaction.md)
+* tera::[scan](../sdk_reference/scan.md)
+* tera::[utils](../sdk_reference/utils.md)
+
+<a name="main-data-structure"></a> 
+### 介绍
+#### (1) tera::client  访问tera服务主结构，所有对tera的访问或操作全部由此发起。
+一个集群对应一个client即可；如需访问多个集群，需要创建多个client。
+##### 主要功能包括：
+* 表格操作：建、删、加载、卸载、打开、关闭、更新表结构、获取表格信息、快照等
+* 用户管理：建、删、修改密码、组管理等
+* 集群信息获取：获取全部表格列表、状态等
+ 
+#### (2) tera::table  表格主结构，对表格的所有增删查改操作由此发起。
+由tera::Client::OpenTable产生，tera::Client::CloseTable关闭，不可析构。
+ 
+#### (3) tera::error_code 错误码，很多操作会返回，注意检查。
+
+#### (4) tera::mutation 行更新操作（包含删除操作）。
+ 
+#### (5) tera::scan 扫描操作，并获取返回数据。
+ 
+#### (6) tera::reader 读取操作，并获取返回数据。
+ 
+#### (7) tera::table_descriptor 表格描述符主体
+ 
+#### (8) tera::transaction 单行事务
+
+ 
+#### (9) tera::scan 扫描
+ 
+#### (10) tera::utils 编码解码
diff --git a/doc/sdk_reference/scan.md b/doc/sdk_reference/scan.md
new file mode 100644
index 000000000..dadb915fd
--- /dev/null
+++ b/doc/sdk_reference/scan.md
@@ -0,0 +1,98 @@
+
+# scan接口说明
+tera中scan操作由ResultStream和ScanDescriptor两个数据结构进行描述。
+### 1. ResultStream
+
+##### (1) 检查迭代是否结束
+```
+bool Done(ErrorCode* err = NULL) = 0; //如果检查失败则返回error code。
+```
+
+##### (2) 移到下一个cell
+
+```
+void Next() = 0;
+```
+
+##### (3) 获取当前cell的rowkey名字
+```
+std::string RowName() const = 0;
+```
+##### (4) 获取当前cell的簇
+```
+std::string Family() const = 0;
+```
+ 
+##### (5) 获取当前cell的列
+```
+std::string Qualifier() const = 0;
+```
+ 
+##### (6) 返回时间戳
+```
+int64_t Timestamp() const = 0;
+```
+ 
+##### (7) 返回当前cell的值
+```
+std::string Value() const = 0;
+int64_t ValueInt64() const = 0;
+```
+
+### 2. ScanDescriptor
+ 
+##### (1) 设置扫描的结束key
+```
+void SetEnd(const std::string& rowkey);
+```
+
+##### (2) 设置扫描的目标cf
+
+```
+void AddColumnFamily(const std::string& cf);
+```
+
+##### (3) 设置扫描的目标列
+```
+ void AddColumn(const std::string& cf, const std::string& qualifier);
+```  
+##### (4) 设置每列的maxversion
+```
+void SetMaxVersions(int32_t versions);
+```
+ 
+##### (5) 设置每个扫描结果的时间范围
+```
+void SetTimeRange(int64_t ts_end, int64_t ts_start);
+```
+ 
+##### (6) 设置批量扫描模式
+```
+void SetAsync(bool async);
+```
+ 
+##### (7) 检查扫描是否为批量扫描模式
+```
+bool IsAsync() const;
+```
+ 
+##### (8) 设置扫描的超时时间
+```
+void SetPackInterval(int64_t timeout);
+```
+ 
+##### (9) 设置扫描的buffersize
+```
+void SetBufferSize(int64_t buf_size);//默认为64K
+```
+ 
+##### (10) 设置每次扫描的cell数
+```
+void SetNumberLimit(int64_t number_limit);
+```
+ 
+##### (11) 获取每次扫描的cell数
+```
+int64_t GetNumberLimit();
+```
+
diff --git a/doc/sdk_reference/table.md b/doc/sdk_reference/table.md
new file mode 100644
index 000000000..58894a8ed
--- /dev/null
+++ b/doc/sdk_reference/table.md
@@ -0,0 +1,100 @@
+
+# Table接口说明
+
+## 1. 主要数据结构
+#### 1. 表格信息
+```
+struct TableInfo {
+    TableDescriptor* table_desc; //表的描述符
+    std::string status; //表格状态信息
+};
+```
+#### 2. tablet信息
+```
+struct TabletInfo {
+    std::string table_name; //表名
+    std::string path; //路径
+    std::string server_addr; //服务器地址
+    std::string start_key; //起始key
+    std::string end_key;  //结束key
+    int64_t data_size; //数据大小
+    std::string status; //状态
+};
+```
+ 
+## 2. 主要接口
+##### (1) 获取表名  Table::GetName
+```
+const std::string GetName() = 0
+```
+ 
+##### (2) 行mutation操作 Table::NewRowMutation
+```
+RowMutation* NewRowMutation(const std::string& row_key) = 0
+```
+##### (3) 写数据 Table::Put
+```
+1) void Put(RowMutation* row_mutation) = 0
+2) void Put(const std::vector<RowMutation*>& row_mutations) = 0
+3) bool Put(const std::string& row_key, const std::string& family, const std::string& qualifier, const std::string& value, ErrorCode* err) = 0
+4) bool Put(const std::string& row_key, const std::string& family, const std::string& qualifier, const int64_t value, ErrorCode* err) = 0;
+5) bool PutIfAbsent(const std::string& row_key, const std::string& family, const std::string& qualifier, const std::string& value, ErrorCode* err) = 0;
+```
+ 
+##### (4) 检查写数据是否结束 Table::IsPutFinished
+ 
+```
+bool IsPutFinished() = 0
+```
+
+##### (5) 原子累加（Counter场景） Table::Add
+ 
+```
+bool Add(const std::string& row_key, const std::string& family, const std::string& qualifier, int64_t delta, ErrorCode* err) = 0;
+```
+ 
+##### (6) 追加数据 Table::Append
+ 
+```
+bool Append(const std::string& row_key, const std::string& family, const std::string& qualifier, const std::string& value, ErrorCode* err) = 0;
+```
+ 
+##### (7) 按行读数据 Table::NewRowReader
+ 
+```
+RowReader* NewRowReader(const std::string& row_key) = 0
+```
+ 
+##### (8) 读数据 Table::Get
+ 
+```
+1) void Get(RowReader* row_reader) = 0
+2) void Get(const std::vector<RowReader*>& row_readers) = 0;
+3) bool Get(const std::string& row_key, const std::string& family, const std::string& qualifier, std::string* value, ErrorCode* err) = 0;
+4) bool Get(const std::string& row_key, const std::string& family, const std::string& qualifier, int64_t* value, ErrorCode* err) = 0;
+```
+ 
+##### (9) 检查get是否结束 Table::IsGetFinished
+```
+bool IsGetFinished() = 0;
+```
+ 
+##### (10) 扫描 Table::Scan
+```
+ResultStream* Scan(const ScanDescriptor& desc, ErrorCode* err) = 0
+```
+##### (11)  按行事务处理 Table::StartRowTransaction
+```
+Transaction* StartRowTransaction(const std::string& row_key) = 0
+``` 
+ 
+##### (12) 提交行事务 Table::CommitRowTransaction
+```
+void CommitRowTransaction(Transaction* transaction) = 0
+```
+
+##### (13)  执行mutation Table::ApplyMutation
+```c
+void ApplyMutation(RowMutation* row_mu) = 0;
+void ApplyMutation(const std::vector<RowMutation*>& row_mu_list) = 0;
+```
diff --git a/doc/sdk_reference/table_descriptor.md b/doc/sdk_reference/table_descriptor.md
index ccf79a1f1..cbc2e0670 100644
--- a/doc/sdk_reference/table_descriptor.md
+++ b/doc/sdk_reference/table_descriptor.md
@@ -1,350 +1,217 @@
-# 表格描述
 
-tera中的表格由TableDescriptor、LocalityGroupDescriptor、ColumnFamilyDescriptor三个数据结构进行描述，C++接口。
-
-同时也支持更简单的字符串描述，参见本文最后。
-
-## TableDescriptor
-
-表格描述符主体，LocalityGroupDescriptor、ColumnFamilyDescriptor由其管理。
-
-描述表格全局属性，如key拼装方式、分片分裂合并阈值、ACL等信息。
-
-### 创建与析构
-
-此结构由用户自己创建并析构。
+# table_descriptor接口说明
+tera中的表格由ColumnFamilyDescriptor、LocalityGroupDescriptor、TableDescriptor三个数据结构进行描述。
+### 1. ColumnFamilyDescriptor
+描述一个列族的属性。
+属性支持动态更新。更新状态为最终一致，更新过程中可能存在各分片之间属性不一致的情况，使用时需要注意。
+##### (1) TTL 
+设定列族内cell的TTL（time-to-live)，单位秒，默认无穷大。
+当列族内某cell的更新时间超过此值后，读取时被屏蔽，并在垃圾回收时物理删除。
+```
+void SetTimeToLive(int32_t ttl) = 0;
+int32_t TimeToLive() const = 0;
+```
 
-### 使用场景
+##### (2) 最大版本数MaxVersions
+设定列族内cell的最大版本数，默认为1。
+当某cell的版本数超过此限制后，会将最旧的版本进行屏蔽，并在垃圾回收时物理删除。
+此值不做最大值限制，但随着版本数大量增加，相应的随机读、扫描性能会下降，存储使用上升，用户可按实际情况调整。
+```
+void SetMaxVersions(int32_t max_versions) = 0;
+int32_t MaxVersions() const = 0;
+```
 
- * 表格创建，通过`tera::Client::CreateTable`
- * 表格Schema更新，通过`tera::Client::UpdateTable`
- * 获取表格属性，通过`tera::Client::GetTableDescriptor`
- 
-### API
+##### (3) 获取LG的名字
+```
+const std::string& LocalityGroup() const = 0;
+```
+##### (4) 获取Id
+```
+int32_t Id() const = 0;
+```
 
-#### TableDescriptor
+### 2. LocalityGroupDescriptor
+描述一个locality group的属性。
 
+##### (1) 获取此LG名字
 ```
-TableDescriptor(const std::string& name);
+const std::string& Name() const;
 ```
 
-构造表格名为“name”的表格描述符。
+##### (2) 设定、获取存储介质，默认kInDisk
+```
+void SetStore(StoreType type) = 0;
+StoreType Store() const = 0;
+enum StoreType {
+    kInDisk = 0,
+    kInFlash = 1,
+    kInMemory = 2,
+};
+```
 
-其中表格名长度需要小于256字节，字符只支持{[a-z],[A-Z],[0-9],'_','-'}。
+##### (3) 设定、获取物理文件内部block大小
+```
+void SetBlockSize(int block_size) = 0;//设定、获取物理文件内部block大小，单位KB，默认值：4。
+int BlockSize() const = 0; 
+```
+##### (4) 设定、获取物理文件基础大小
+```
+int32_t SstSize() const = 0;//设定、获取物理文件基础大小，单位MB，默认值：8。
+void SetSstSize(int32_t sst_size) = 0;
+```
+##### (5) 设定、获取compress type
+```
+ void SetCompress(CompressType type) = 0;
+ CompressType Compress() const = 0;
+```
+##### (6) 设定、获取是否使用bloom filter
+设定、获取是否使用bloom filter，默认不使用。
+``` 
+void SetUseBloomfilter(bool use_bloomfilter) = 0;
+bool UseBloomfilter() const = 0;
+```
+##### (7) 内存内compact
+是否使用内存内compact。
+``` 
+bool UseMemtableOnLeveldb() const = 0;
+void SetUseMemtableOnLeveldb(bool use_mem_ldb) = 0;
+```
+##### (8) 设定、获取内存compact中写缓存大小
+设定、获取内存compact中写缓存大小，单位KB。
+```
+int32_t MemtableLdbWriteBufferSize() const = 0;
+void SetMemtableLdbWriteBufferSize(int32_t buffer_size) = 0;
+```
+##### (9) 设定、获取内存compact中对应block大小
+设定、获取内存compact中对应block大小，单位KB。
+``` 
+int32_t MemtableLdbBlockSize() const = 0;
+void SetMemtableLdbBlockSize(int32_t block_size) = 0;
+```
+ 
+### 3. TableDescriptor
+表格描述符主体，LocalityGroupDescriptor、ColumnFamilyDescriptor由其管理。
+描述表格全局属性，如key拼装方式、分片分裂合并阈值、ACL等信息。
+使用场景
+<ul>
+<li>表格创建，通过tera::Client::CreateTable</li>
+<li>表格Schema更新，通过tera::Client::UpdateTable</li>
+<li>获取表格属性，通过tera::Client::GetTableDescriptor</li>
+</ul>
 
-#### TableName
+#### 3.1 TableDescriptor
 
+##### (1) 设置、获取表名
+设置、返回表格名。
 ```
 void SetTableName(const std::string& name);
 std::string TableName() const;
 ```
 
-设置、返回表格名。
-
-#### LocalityGroup
-
+##### (2) 新增一个名为‘lg_name’的LG
+其中，LocalityGroup名长度需要小于256字节，字符只支持{[a-z],[A-Z],[0-9],'_','-'}
 ```
 LocalityGroupDescriptor* AddLocalityGroup(const std::string& lg_name);
 ```
 
-新增一个名为‘lg_name’的LG。
-
-其中的LocalityGroup名长度需要小于256字节，字符只支持{[a-z],[A-Z],[0-9],'_','-'}。
-
+##### (3) 删除名为‘lg_name’的LG
 ```
-bool RemoveLocalityGroup(const std::string& lg_name);
+bool RemoveLocalityGroup(const std::string& lg_name);//如果此LG中还有列族存在，删除失败。
 ```
-
-删除名为‘lg_name’的LG。
-
-如果此LG中还有列族存在，删除失败。
-
+##### (4) 通过id/名称访问对应LG
+LG在表格内部以vector形式保存，id为其对应的下标。
 ```
 const LocalityGroupDescriptor* LocalityGroup(int32_t id) const;
 const LocalityGroupDescriptor* LocalityGroup(const std::string& lg_name) const;
 ```
-
-通过id/名称访问对应LG。
-
-LG在表格内部以vector形式保存，id为其对应的下标。
-
+##### (5) 设定、获取compress type
 ```
-int32_t LocalityGroupNum() const;
+ void SetCompress(CompressType type) = 0;
+ CompressType Compress() const = 0;
 ```
-
-返回当前表格中LG数量。
-
-#### ColumnFamily
-
+##### (6) 返回当前表格中LG数量
 ```
-ColumnFamilyDescriptor* AddColumnFamily(const std::string& cf_name,const std::string& lg_name);
+int32_t LocalityGroupNum() const;
 ```
+ 
+#### 3.2 ColumnFamily
 
-在‘lg_name’下新增一个名为‘cf_name’的列族。
-
-若‘lg_name’不存在，返回NULL。
-
-其中列族名长度需要小于256字节，字符只支持{[a-z],[A-Z],[0-9],'_','-'}。
-
+##### (1) 在‘lg_name’下新增一个名为‘cf_name’的列族
+若‘lg_name’不存在，返回NULL。其中列族名长度需要小于256字节，字符只支持{[a-z],[A-Z],[0-9],'_','-'}。
+``` 
+ColumnFamilyDescriptor* AddColumnFamily(const std::string& cf_name, const std::string& lg_name = "lg0");
 ```
+##### (2) 删除名为‘cf_name’的列族
+``` 
 void RemoveColumnFamily(const std::string& cf_name);
 ```
-
-删除名为‘cf_name’的列族。
-
+##### (3) 通过id/名称访问对应列族
+列族在表格内部以vector形式保存，id为其对应的下标。
 ```
 const ColumnFamilyDescriptor* ColumnFamily(int32_t id) const;
 const ColumnFamilyDescriptor* ColumnFamily(const std::string& cf_name) const;
 ```
-
-通过id/名称访问对应列族。
-
-列族在表格内部以vector形式保存，id为其对应的下标。
-
+##### (4) 返回当前表格中列族数量
 ```
 int32_t ColumnFamilyNum() const;
 ```
 
-返回当前表格中列族数量。
+#### 3.3 RawKey
 
-#### RawKey
-
-```
+##### (1) 表格内部key的拼装格式
+决定了表格的存储及访问格式，推荐kBinary。
+``` 
+void SetRawKey(RawKeyType type);
+RawKeyType RawKey() const;
 enum RawKeyType {
     kReadable = 0,
-    kBinary = 1, 
+    kBinary = 1,
     kTTLKv = 2,
     kGeneralKv = 3,
-};                 
-void SetRawKey(RawKeyType type);
-RawKeyType RawKey() const;
-```
-
-表格内部key的拼装格式。
-
-决定了表格的存储及访问格式，推荐kBinary。
-
-#### SplitSize
-
-```
-void SetSplitSize(int64_t size);
-int64_t SplitSize() const;
+};
 ```
-
-分片分裂阈值。
-
+#### 3.4 SplitSize
+##### (1) 分片分裂阈值
 当分片数据量（物理存储）超过此阈值时，会被一分为二，并可能被两个不同服务器加载。
-
 此分裂阈值是一个基础参考值，系统会根据实际动态负载在此值基础上进行调整。
-
-#### MergeSize
-
 ```
-void SetMergeSize(int64_t size);
-int64_t MergeSize() const;
+void SetSplitSize(int64_t size);
+int64_t SplitSize() const;
 ```
 
-分片合并阈值。
-
+#### 3.5 MergeSize
+##### (1) 分片合并阈值
 当分片数据量（物理存储）低于此阈值时，会被合并至相临分片中。
-
 此值是一个基础参考值，系统会根据实际动态负载在此值基础上进行调整。
-
 需要小于分裂阈值的1/3，防止出现合并、分裂的循环出现。
 
-#### Write Ahead Log
-
-```
-void DisableWal();         
-bool IsWalDisabled() const;
+``` 
+void SetMergeSize(int64_t size);
+int64_t MergeSize() const;
 ```
-
-配置日志开关，默认打开。
-
+#### 3.6 Write Ahead Log
+##### (1) 配置日志开关，默认打开
 当此表格数据没有强特久化需求时，可以选择关闭日志。
-
 会大幅提升写性能、降低系统IO消耗。
-
 当有服务器宕机时，内存中数据将丢失，谨慎关闭。
 
-#### Admin
-
-```
-void SetAdmin(const std::string& name);
-std::string Admin() const;
-void SetAdminGroup(const std::string& name);
-std::string AdminGroup() const;
-```
-
-设置表格ACL信息。
-
-## LocalityGroupDescriptor
-
-描述一个locality group的属性。
-
-### 创建与析构
-
-通过`TableDescriptor::AddLocalityGroup`进行创建。
-
-无须用户析构。
- 
-### API
-
-#### Name
-
-```
-const std::string& Name() const;
-```
-
-获取此LG名字。
-
-#### Store
-
-```
-enum StoreType {                                                                    
-    kInDisk = 0,                                                                    
-    kInFlash = 1,                                                                   
-    kInMemory = 2,                                                                  
-};                                                                                  
-void SetStore(StoreType type);
-StoreType Store() const;
-```
-
-设定、获取存储介质，默认kInDisk。
-
-#### BlockSize、SstSize、BloomFilter
-
-```
-void SetBlockSize(int block_size);                                  
-int BlockSize() const;                                              
-```
-
-设定、获取物理文件内部block大小，单位KB，默认值：4。
-
-物理存储基于leveldb开发，此概念与leveldb中的block相似。
-
-```
-void SetSstSize(int sst_size);                                  
-int SstSize() const;                                              
-```
-
-设定、获取物理文件基础大小，单位MB，默认值：8。
-
-物理存储基于leveldb开发，此概念与leveldb中的level1文件大小相同。
-
-```
-void SetUseBloomfilter(bool use_bloomfilter);
-bool UseBloomfilter() const;
-```
-
-设定、获取是否使用bloom filter，默认不使用。
-
-物理存储基于leveldb开发，此概念与leveldb中的bloom filter。
-
-#### 内存内compact
-
-```
-bool UseMemtableOnLeveldb() const;
-void SetUseMemtableOnLeveldb(bool use_mem_ldb);
-```
-
-是否使用内存内compact。
-
-```
-int32_t MemtableLdbWriteBufferSize() const;
-void SetMemtableLdbWriteBufferSize(int32_t buffer_size);
-```
-
-设定、获取内存compact中写缓存大小，单位KB。
-
-```
-int32_t MemtableLdbBlockSize() const;
-void SetMemtableLdbBlockSize(int32_t block_size);
+``` 
+void DisableWal();
+bool IsWalDisabled() const;
 ```
-
-设定、获取内存compact中对应block大小，单位KB。
-
-## ColumnFamilyDescriptor
-
-描述一个列族的属性。
-
-属性支持动态更新。更新状态为最终一致，过程中存在分片之前属性不一致情况，使用时需要注意。
-
-### 创建与析构
-
-通过`TableDescriptor::AddColumnFamily`进行创建。
-
-无须用户析构。
- 
-### API
-
-#### TTL
+#### 3.7 事务
+##### (1) 事务处理
 
 ```
-void SetTimeToLive(int32_t ttl);
-int32_t TimeToLive() const; 
+void EnableTxn();
+bool IsTxnEnabled() const;
 ```
-
-设定列族内cell的TTL（time-to-live)，单位秒，默认无穷大。
-
-当列族内某cell的更新时间超过此值后，读取时被屏蔽，并在垃圾回收时物理删除。
-
-#### MaxVersion
+#### 3.8 Admin
+##### (1) 设置表格的admin
 
 ```
-void SetMaxVersions(int32_t max_versions);
-int32_t MaxVersions() const; 
+void SetAdmin(const std::string& name);
+std::string Admin() const;
+void SetAdminGroup(const std::string& name);
+std::string AdminGroup() const;
 ```
-
-设定列族内cell的最大版本数，默认为1。
-
-当某cell的版本数超过此限制后，会将最旧的版本进行屏蔽，并在垃圾回收时物理删除。
-
-此值不做最大值限制，但随着版本数大量增加，相应的随机读、扫描性能会下降，存储使用上升，用户可按实际情况调整。
-
-## 字符串描述
-
-描述表格的字符串是一个支持描述节点属性的树结构，语法详见[PropTree](https://github.com/BaiduPS/tera/blob/master/doc/prop_tree.md)
-
-### 描述表格存储
-
-表格结构中包含表名、locality groups定义、column families定义，一个典型的表格定义如下（可写入文件）：
-
-    # tablet分裂阈值为4096M，合并阈值为512M
-    # 三个lg，分别配置为flash、flash、磁盘存储
-    table_hello <splitsize=4096, mergesize=512> {
-        lg_index <storage=flash, blocksize=4> {
-            update_flag <maxversions=1>
-        },
-        lg_props <storage=flash, blocksize=32> {
-            level<ttl=1000000>,
-            weight
-        },
-        lg_raw <storage=disk, blocksize=128> {
-            data <maxversions=10>
-        }
-    }
-
-如果无需配置LG，指定表名和所需列名即可（所有的属性可配）：
-
-    table_hello {cf0<ttl=10000>, cf1, cf2}
-
-### 描述key-value存储
-
-只需指定表名即可，若需要指定存储介质等属性，可选择性添加：
-
-    kv_hello                                                # 简单key-value
-    kv_hello <storage=flash, splitsize=2048, mergesize=128> # 配置若干属性
-
-### 属性及含义
-
-span | 属性名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明
----  | ---    | ---  | ---      | ---  | ---    | ---
-table | splitsize | 某个tablet增大到此阈值时分裂为2个子tablets| >=0，等于0时关闭split | MB | 512 |
-table | mergesize | 某个tablet减小到此阈值时和相邻的1个tablet合并 | >=0，等于0时关闭merge | MB | 0 | splitsize至少要为mergesize的5倍
-lg    | storage   | 存储类型 | "disk" / "flash" / "memory" | - | "disk" |
-lg    | blocksize | LevelDB中block的大小       | >0 | KB | 4 |
-lg    | use_memtable_on_leveldb | 是否启用内存compact | "true" / "false" | - | false |
-lg    | sst_size  | 第一层sst文件大小 | >0 | MB | 8 |
-cf    | maxversions | 保存的最大版本数  | >0 | - | 1 |
-cf    | ttl | 数据有效时间 | >=0，等于0时此数据永远有效 | second | 0 |
diff --git a/doc/sdk_reference/transaction.md b/doc/sdk_reference/transaction.md
new file mode 100644
index 000000000..7a9ba1ae1
--- /dev/null
+++ b/doc/sdk_reference/transaction.md
@@ -0,0 +1,60 @@
+
+# 单行事务transaction接口说明
+
+## 主要功能
+ 
+
+##### (1) 提交一个修改操作  Transaction::ApplyMutation
+```
+void ApplyMutation(RowMutation* row_mu) = 0
+```
+ 
+##### (2) 读取操作 Transaction::Get
+```
+ErrorCode Get(RowReader* row_reader) = 0
+```
+##### (3) 回调函数原型 Transaction::Callback
+```
+typedef void (*Callback)(Transaction* transaction)
+```
+ 
+##### (4) 设置提交回调, 提交操作会异步返回 Transaction::SetCommitCallback
+ 
+```
+void SetCommitCallback(Callback callback) = 0;
+```
+
+##### (5) 获取提交回调 Transaction::GetCommitCallback
+ 
+```
+Callback GetCommitCallback() = 0;
+```
+ 
+##### (6) 设置用户上下文，可在回调函数中获取 Transaction::SetContext
+ 
+```
+void SetContext(void* context) = 0;
+```
+ 
+##### (7) 获取用户上下文 Transaction::GetContext
+ 
+```
+void* GetContext() = 0
+```
+ 
+##### (8) 获得结果错误码 Transaction::GetError
+ 
+```
+const ErrorCode& GetError() = 0; // 异步模式下，通过GetError()获取提交结果
+```
+ 
+##### (9) 同步模式下，获得提交的结果 Transaction::Commit
+```
+ErrorCode Commit() = 0 // 同步模式下，Commit()的返回值代表了提交操作的结果(成功 或者 失败及其原因)
+```
+ 
+##### (10) 获取事务开始时间戳 Transaction::GetStartTimestamp
+```
+int64_t GetStartTimestamp() = 0 //仅在全局事务场景下有效
+```
+
diff --git a/doc/sdk_reference/utils.md b/doc/sdk_reference/utils.md
new file mode 100644
index 000000000..0ad5ba27a
--- /dev/null
+++ b/doc/sdk_reference/utils.md
@@ -0,0 +1,14 @@
+
+# utils接口说明
+tera中utils操作主要用来编码和解码counter cell
+##### (1) 编码
+```
+static std::string EncodeCounter(int64_t counter);
+```
+
+##### (2) 解码
+
+```
+static bool DecodeCounter(const std::string& buf, int64_t* counter);
+```
+
diff --git a/doc/tools/benchmark.md b/doc/tools/benchmark.md
new file mode 100644
index 000000000..5f8ce2941
--- /dev/null
+++ b/doc/tools/benchmark.md
@@ -0,0 +1,38 @@
+
+## 1. tera_bench 
+造数据的工具
+### (1) 用法
+```
+./tera_bench --compression_ratio=1 --key_seed=1 --value_seed=20  --value_size=1000 --num=200000 --benchmarks=random  --key_size=24 --key_step=1
+```
+ 
+## 2. tera_mark   
+读写数据,支持异步读写scan
+
+### (1) 用法
+```
+#示例：
+./tera_mark --mode=w --tablename=test --type=async  --verify=false --entry_limit=1000
+```
+
+### (2) 参数列表
+
+参数名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明
+---    | ---  | ---      | ---  | ---    | ---
+tablename | 表名 | - | - | "" | -
+mode | 模式 | "w"/"r"/"s"/"m" | - | "w" | -
+type | 类型 | "sync"/"async" | - | "async" | -
+pend_size | 最大pending大小 | - | - | 100 | -
+pend_count | 最大pending数 | - | - | 100000 | -
+start_key | scan的开始key | - | - | "" | -
+end_key | scan的结束key | - | - | "" | -
+cf_list | scan的列簇 | -  | - | "" | -
+print | scan的结果是否需要打印 | true/false | - | false | -
+buf_size | scan的buffer_size | >0  | - | 65536 | -
+verify | md5 verify(write&read) | true/false  | - | true | -
+max_outflow | max_outflow | -  | - | -1 | -
+max_rate | max_rate | - | - | -1 | -
+scan_streaming | enable streaming scan | true/false  | - | false | -
+batch_count | batch_count(sync) | - | - | 1 | -
+entry_limit | writing/reading speed limit | - | - | 0 | -
+
diff --git a/doc/tools/readme.md b/doc/tools/readme.md
new file mode 100644
index 000000000..401fad9e9
--- /dev/null
+++ b/doc/tools/readme.md
@@ -0,0 +1,10 @@
+
+# Tera 主要工具说明
+
+## 主要工具
+* 操作tera的工具: [teracli](../tools/teracli.md) 
+* 集群间数据迁移的dump工具: [terautil](../tools/terautil.md) 
+* 造数据 & 读写数据的工具: [tera_bench & tera_mark](../tools/benchmark.md)  
+* 业界通用NoSQL测试的基准测试工具: [YCSB](../tools/ycsb.md) 
+
+
diff --git a/doc/tools/teracli.md b/doc/tools/teracli.md
new file mode 100644
index 000000000..1ca78c460
--- /dev/null
+++ b/doc/tools/teracli.md
@@ -0,0 +1,448 @@
+
+# teracli使用说明
+./teracli help即可看到相关的命令和使用方法
+ 
+### 1. create 创建表格
+#### 1.1 基本命令
+
+```c
+./teracli  create        <table-schema>  [<tablet-delimiter-file>]
+./teracli  createbyfile  <schema-file>   [<tablet-delimiter-file>]
+```
+说明：
+* table-schema是一个描述表格结构的字符串。
+* 表名规范：首字符为字母（大小写均可），
+  有效字符包括大小写的英文字母(a-zA-Z)、数字(0-9)、下划线(_)、连字符(-)、点(.)。
+* 表名有效长度：1 <= 长度 <= 512
+* Tera支持在建立表格时预分配若干tablet，tablet分隔的key写在tablet-delimiter-file中，按“\n”分隔。
+* 如果表格schema比较复杂，可以将其写入文件中，通过createbyfile命令进行创建。
+ 
+#### 1.2 创建table模式存储
+表格结构中包含表名、locality groups定义、column families定义，一个典型的表格定义如下（可写入文件）
+```c
+# tablet分裂阈值为4096M，合并阈值为512M
+# 三个lg，分别配置为flash、flash、磁盘存储
+table_hello <splitsize=4096, mergesize=512> {
+    lg_index <storage=flash, blocksize=4> {
+        update_flag <maxversions=1>
+    },
+    lg_props <storage=flash, blocksize=32> {
+        level,
+        weight
+    },
+    lg_raw <storage=disk, blocksize=128> {
+        data <maxversions=10>
+    }
+}
+```
+如果只希望简单的使用tera，对性能没有很高要求，那么schema只需指定表名和所需列名即可（如需要，所有的属性也是可配的）：
+```c
+table_hello {cf0, cf1, cf2}
+```
+ 
+#### 1.3 创建key-value表
+tera支持高性能的key-value存储，其schema只需指定表名即可，若需要指定存储介质等属性，可选择性添加：
+```c
+ # 表名为key-value，默认storage为disk, splitsize为512M, mergesize为0
+./teracli  create kv_hello   
+ # 配置若干属性                                            
+./teracli  create "kv_hello <storage=flash, splitsize=2048, mergesize=128>"
+```
+#### 1.4 表格各级属性
+ 
+span | 属性名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明
+---  | ---    | ---  | ---      | ---  | ---    | ---
+table | splitsize | 某个tablet增大到此阈值时分裂为2个子tablets| >=0，等于0时关闭split | MB | 512 |
+table | mergesize | 某个tablet减小到此阈值时和相邻的1个tablet合并 | >=0，等于0时关闭merge | MB | 0 |
+splitsize至少要为mergesize的3倍,建议为mergesize的10倍，避免merge后又分裂
+lg    | storage   | 存储类型 | "disk" / "flash" / "memory" | - | "disk" |
+lg    | blocksize | LevelDB中block的大小       | >0 | KB | 4 |
+lg    | use_memtable_on_leveldb | 是否启用内存compact | "true" / "false" | - | false |
+lg    | sst_size  | 第一层sst文件大小 | >0 | MB | 8 |
+cf    | maxversions | 保存的最大版本数  | >0 | - | 1 |
+cf    | ttl | 数据有效时间 | >=0，等于0时此数据永远有效 | second | 0 |
+和minversions冲突时以minversions为准
+<!--
+table | rawkey | rawkey的拼装模式 | "binary" / "kv"/ "ttlkv" | - | key的长度必须小于64KB |
+lg    | compress  | 压缩算法 | "snappy" / "none" | - | "snappy" |
+lg    | memtable_ldb_write_buffer_size | 内存compact开启后，写buffer的大小 | >0 | MB | 1 |
+一般不用暴露给用户
+lg    | memtable_ldb_block_size |  内存compact开启后，压缩块的大小 | >0 | KB | 4 | 一般不用暴露给用户
+cf    | diskquota   | 存储限额  | >0 | MB | 0 | 暂未使用
+cf    | minversions | 保存的最小版本数 | >0 | - | 1 |
+-->
+
+### 2 update 更新表格schema
+更新时使用schema语法和建表时的语法基本一致，
+不同主要在于更新时只需指定要更新的属性，不需要改动的属性无需列出。
+#### 2.1 基本语法
+```c
+./teracli update <tableschema>
+```
+#### 2.2 分类
+主要分为两大类更新：
+* 更新table模式schema
+* 更新kv模式schema
+ 
+#### 2.3 更新table模式schema
+ 
+支持表格、cf属性热更新
+##### 2.3.1 更新table的属性（不更新lg、cf属性）
+```c
+./teracli update "table_hello<mergesize=512>" //更新mergesize
+./teracli update "table_hello<splitsize=1024,mergesize=128>" //更新mergesize和splitsize
+```
+##### 2.3.2 更新lg属性时，***需要disable表格***
+```c
+./teracli disable table_hello
+./teracli update "table_hello{lg0<sst_size=9>}"
+./teracli update "table_hello<splitsize=1536>{lg0<sst_size=9>}" //也可以同时修改table属性
+```
+##### 2.3.3 更新cf属性
+```c
+./teracli update "table_hello{lg0{cf0<ttl=999>}}"
+#也可以同时修改table或者lg属性
+./teracli update "table_hello<splitsize=512>{lg0<sst_size=9>{cf0<ttl=999>}}"
+```
+##### 2.3.4 增加、删除cf
+
+```c
+# 在lg0下增加cf1，并设置属性ttl值为123.
+# op意为操作，op=add需要放在cf属性的最前面
+./teracli update "table_hello{lg0{cf1<op=add,ttl=123>}}"
+
+# 从lg0中删除cf1
+./teracli update "table_hello{lg0{cf1<op=del>}}"
+```
+ 
+#### 2.4 更新kv模式schema
+```c
+# 更新部分属性时需要disable表格，程序会在运行时给出提示
+./teracli update "kv_hello<splitsize=1024>"
+```
+ 
+### 3. update-check
+ 
+### 4. enable
+将处于disable状态的表格重新enable，恢复读、写服务。
+```c
+./teracli enable <tablename>
+```
+ 
+### 5. disable
+将表格置于disable状态，不再提供读、写服务。
+```c
+./teracli disable <tablename>
+```
+ 
+### 6. drop
+删除处于disable状态的表格，此操作不可回滚。
+```c
+./teracli drop <tablename>
+```
+### 7. rename 重命名表格
+```c
+#语法：
+./teracli rename <old table_name> <new table_name>
+```
+示例：
+```c
+./teracli rename tb1 tb2
+```
+ 
+### 8. put 向表中写入一个value
+向表中写入以rowkey为key,列为columnfamily:qualifier的值value.对于kv模式的表来说，无需columnfamily:qualifier.
+```c
+#语法：
+./teracli put <tablename> <rowkey> [<columnfamily:qualifier>] <value>
+```
+示例：
+```c
+./teracli put mytable rowkey cf0:qu0 value
+```
+
+### 9. put-ttl 新增的ttl字段表示这个value的有效时间
+```c
+#语法：
+./teracli put-ttl <tablename> <rowkey> [<columnfamily:qualifier>] <value> <ttl(second)>
+```
+示例：
+```c
+#这个value在20秒内有效，超时就读不到了。
+./teracli put-ttl mytable rowkey cf0:qu0 value 20
+```
+
+### 10. putif 原子操作，如果不存在才能put成功
+
+```c
+#语法：
+./teracli putif <tablename> <rowkey> [<columnfamily:qualifier>] <value>
+```
+ 
+### 11. get 读取一个value
+```c
+#语法：
+./teracli get <tablename> <rowkey> [<columnfamily:qualifier>]
+```
+示例：
+```c
+#读取之前写入的value：
+./teracli get mytable rowkey cf0:qu0
+```
+ 
+### 12. scan 扫描一个表
+将表中key从[startkey, endkey)范围的所有数据扫描出来。
+每个value可以有多个版本(versions)，scan命令默认只输出每个value的最新版本，
+想要获取全部版本可以使用scanallv命令。
+```c
+#语法：
+./teracli scan[allv] <tablename> <startkey> <endkey>
+```
+示例：
+```c
+#扫描整个表
+./teracli scan mytable "" ""
+```
+
+ 
+### 13. delete 删除一个value
+如果只想删除某列最新的一个版本可以用delete1v命令。
+```c
+#语法：
+./teracli delete[1v] <tablename> <rowkey> [<columnfamily:qualifier>]
+```
+
+### 14. put_counter 写入一个counter（计数器）
+```c
+#语法：
+./teracli put_counter <tablename> <rowkey> [<columnfamily:qualifier>] <integer(int64_t)>
+```
+示例：
+```c
+#写入一个初始值为3的计数器：
+./teracli put_counter mytable rowkey cf0:qu0 3
+```
+### 15. get_counter 读取一个counter
+```
+#语法：
+./teracli get_counter <tablename> <rowkey> [<columnfamily:qualifier>]
+```
+示例：
+```c
+#读取之前写入的那个counter：
+./teracli get_counter mytable rowkey cf0:qu0
+```
+ 
+### 16. add 给某个counter加上一个delta值
+```
+#语法：
+./teracli add <tablename> <rowkey> <columnfamily:qualifier> delta
+```
+示例：
+```c
+#给之前写入的那个counter加上5：
+./teracli add mytable rowkey cf0:qu0 5
+```
+ 
+### 17. putint64 写入一个int64类型counter（计数器）
+
+```
+#语法：
+./teracli putint64 <tablename> <rowkey> [<columnfamily:qualifier>] <integer(int64_t)>
+```
+示例：
+```c
+#写入一个初始值为67的计数器：
+./teracli putint64 mytable row1 cf0:qu0 67
+```
+ 
+### 18. getint64 读取一个int64类型的counter
+
+```
+#语法：
+./teracli getint64 <tablename> <rowkey> [<columnfamily:qualifier>]
+```
+示例：
+```c
+./teracli getint64 mytable row1 cf0:qu0
+```
+ 
+### 19. addint64 对int64类型的counter执行原子加操作
+```
+#语法：
+./teracli addint64 <tablename> <rowkey> <columnfamily:qualifier>  delta
+```
+示例：
+```c
+#对之前写入的counter执行-3的操作：
+# addint64操作执行完以后，该counter的值为 64
+./teracli addint64 mytable row1 cf0:qu0 -3
+```
+### 20. append 原子操作：追加内容到一个Cell
+```
+#语法：
+./teracli append <tablename> <rowkey> [<columnfamily:qualifier>] <value>
+```
+示例：
+```c
+./teracli put mytable rowkey cf0:qu0 hello
+./teracli append mytable rowkey cf0:qu0 world
+#此时再去get会得到helloworld
+./teracli get mytable rowkey cf0:qu0
+```
+### 20. batchput 批量写数据
+```
+#语法：
+./teracli batchput <tablename> <input file>
+```
+### 21. batchget 批量读数据
+```
+#语法：
+./teracli batchget <tablename> <input file>
+```
+### 22. show 显示表格信息
+```
+#语法：
+./teracli show[x]  [<tablename>]
+```
+示例：
+```c
+#查看某个table的信息：
+./teracli show mytable
+#查看集群内所有table的信息：
+./teracli show
+```
+ 
+### 23. showx 显示表格详细信息
+```
+#语法：
+./teracli show[x]  [<tablename>]
+```
+示例：
+```c
+#查看某个table的信息：
+./teracli showx mytable
+```
+ 
+### 24. showschema 显示表格schema
+表格schema里含有很多属性（例如某个cf保留的最小版本数），创建表格时，没有显式指定的属性都取默认值，
+这些属性在showschema时不会显示出来；想要显示全部属性，可以使用showschemax命令。
+```
+#语法：
+./teracli showschema[x] <tablename>
+```
+
+
+### 25. showts 显示tabletnode的信息
+带上后缀'x'得到的信息会更详细（showtsx）。
+```
+#语法：
+./teracli showts [<tabletnode_addr>]
+```
+示例：
+```c
+#显示某个tabletnode的信息：
+./teracli showts "example.company.com:7770"
+#显示集群内所有tabletnode的信息:
+./teracli showts
+```
+ 
+### 26. range 显示表的范围
+```
+#语法：
+./teracli range <tablename>
+```
+### 27. txn 事务（仅支持单事务行操作）
+```
+#语法：
+./teracli txn <operation> <params>
+operation包括start和commit
+./teracli txn start <tablename> <row_key>
+./teracli txn commit
+```
+ 
+### 28. user用户管理
+```
+#语法：
+./teracli user <operation> <params>
+operation包括create、changepwd、show、delete、addtogroup和deletefromgroup
+user <operation> <params>
+          create          <username> <password>
+          changepwd       <username> <new-password>
+          show            <username>
+          delete          <username>
+          addtogroup      <username> <groupname>
+          deletefromgroup <username> <groupname>
+```
+### 29. tablet
+```
+#语法：
+./teracli tablet <operation> <params>
+operation包括move、reload、compact、split、merge和scan
+tablet <operation> <params>
+            move    <tablet_path> <target_addr>
+            reload  <tablet_path>
+                    force to unload and load on the same ts
+            compact <tablet_path>
+            split   <tablet_path>
+            merge   <tablet_path>
+            scan    <tablet_path>
+```
+ 
+### 30. compact
+```
+#语法：
+./teracli compact <tablename>
+```
+ 
+### 31. safemode
+```
+#语法：
+./teracli safemode [get|enter|leave]
+```
+ 
+### 32. meta
+meta for master memory, meta2 for meta table.
+```
+#语法：
+./teracli meta[2] [backup|check|repair|show]
+```
+### 33. findmaster master的位置
+```
+#语法：
+./teracli findmaster
+```
+### 34. reload
+```
+#语法：
+./teracli reload config hostname:port
+```
+
+### 35. kick
+```
+#语法：
+./teracli kick <tablename>
+```
+ 
+### 36. findtablet
+```
+#语法：
+./teracli findtablet <tablename> <rowkey-prefix>
+./teracli findtablet <tablename> <start-key> <end-key>
+```
+ 
+### 37. cookie
+```
+#语法：
+./teracli  cookie <command> <args>
+cookie <command> <args>
+            dump     cookie-file     -- dump contents of specified files
+            findkey  cookie-file key -- find the info of a key
+```
+ 
+### 38. version版本
+```
+#语法：
+./teracli version
+```
+
diff --git a/doc/tools/terautil.md b/doc/tools/terautil.md
new file mode 100644
index 000000000..842b572eb
--- /dev/null
+++ b/doc/tools/terautil.md
@@ -0,0 +1,78 @@
+
+ 
+# terautil 
+ 
+集群间数据迁移的dump工具
+### 1. 用法
+```
+./terautil dump help
+```
+#### (1)建表
+```
+./terautil --flagfile=../conf/terautil.flag dump prepare_safe
+```
+#### (2) 将扫表操作run起来
+```
+./terautil --flagfile=../conf/terautil.flag dump run
+```
+ 
+### 2. flag配置
+<table>
+<tr>
+<th>flag名称</th>
+<th>flag默认值或格式</th>
+<th>flag介绍</th>
+</tr>
+<tr>
+<td>dump_tera_src_conf </td>
+<td>../conf/src_tera.flag（格式）</td>
+<td>tera的源集群</td>
+</tr>
+<tr>
+<td>dump_tera_dest_conf</td>
+<td>../conf/dest_tera.flag（格式）</td>
+<td>tera的目的集群</td>
+</tr>
+<tr>
+<td>dump_tera_src_root_path</td>
+<td>/xxx_（路径格式）</td>
+<td>tera的源路径</td>
+</tr>
+<tr>
+<td>dump_tera_dest_root_path</td>
+<td>/xxx_（路径格式）</td>
+<td>tera的目的路径</td>
+</tr>
+<tr>
+<td>ins_cluster_addr</td>
+<td>terautil_ins（格式）</td>
+<td>锁服务器的地址</td>
+</tr>
+<tr>
+<td>ins_cluster_root_path</td>
+<td>/terautil/dump/xxxx（格式）</td>
+<td>锁服务器路径</td>
+</tr>
+<tr>
+<td>dump_tera_src_meta_addr</td>
+<td>“”</td>
+<td>源meta表的地址</td>
+</tr>
+<tr>
+<td>dump_tera_dest_meta_addr</td>
+<td>“”</td>
+<td>目的meta表的地址</td>
+</tr>
+<tr>
+<td>dump_manual_split_interval</td>
+<td>1000</td>
+<td>手动分裂时间间隔，单位为ms</td>
+</tr>
+<tr>
+<td>dump_enable_manual_split</td>
+<td>false</td>
+<td>是否允许手动分裂</td>
+</tr>
+</table>
+
+
diff --git a/doc/tools/ycsb.md b/doc/tools/ycsb.md
new file mode 100644
index 000000000..b6f922bc7
--- /dev/null
+++ b/doc/tools/ycsb.md
@@ -0,0 +1,294 @@
+
+# YCSB工具使用说明
+ 
+### 1. 属性
+ 
+#### 1.1 核心YCSB属性
+所有工作量文件可以指定以下属性：
+<table>
+<tr>
+<th>参数名</th>
+<th>意义</th>
+<th>默认值</th>
+</tr> 
+ 
+<tr>
+<td>workload</td>
+<td>要使用的工作量类，如com.yahoo.ycsb.workloads.CoreWorkload</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>db</td>
+<td>要使用的数据库类。可选地，这在命令行可以指定</td>
+<td>com.yahoo.ycsb.BasicDB</td>
+</tr>
+ 
+<tr>
+<td>exporter</td>
+<td>要使用的测量结果的输出类</td>
+<td>com.yahoo.ycsb.measurements.exporter.TextMeasurementsExporter</td>
+</tr>
+ 
+<tr>
+<td>exportfile</td>
+<td>用于替代stdout的输出文件路径</td>
+<td>未定义/输出到stdout</td>
+</tr>
+ 
+<tr>
+<td>threadcount</td>
+<td>YCSB客户端的线程数。可选地，这可以在命令行指定</td>
+<td>1</td>
+</tr>
+ 
+<tr>
+<td>measurementtype</td>
+<td>支持的测量结果类型有直方图和时间序列</td>
+<td>直方图</td>
+</tr>
+</table>
+
+ 
+ 
+ 
+ 
+ 
+#### 1.2 核心工作量包属性
+和核心工作量构造器一起使用的属性文件可以指定以下属性及值 
+##### 1.2.1 重要参数
+<table>
+<tr>
+<th>参数名</th>
+<th>意义</th>
+<th>默认值</th>
+<th>有效取值</th>
+</tr> 
+ 
+<tr>
+<td>recordcount</td>
+<td>数据行数，装载进数据库的初始记录数</td>
+<td>0</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>operationcount</td>
+<td>要进行的操作数数量</td>
+<td>无</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>fieldcount</td>
+<td>每行的qualifier个数</td>
+<td>10</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>fieldlength</td>
+<td>每个字段的大小</td>
+<td>100</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>requestdistribution</td>
+<td>随机读的数据分布</td>
+<td>uniform</td>
+<td>uniform、zipfian、latest</td>
+</tr>
+ 
+<tr>
+<td>insertorder</td>
+<td>写入顺序，ordered是顺序写，hashed是随机写</td>
+<td>hashed</td>
+<td>ordered、hashed</td>
+</tr>
+ 
+<tr>
+<td>readallfields</td>
+<td>读取所有qualifier还是只读一个qualifier</td>
+<td>true</td>
+<td>true、false</td>
+</tr>
+ 
+<tr>
+<td>readproportion</td>
+<td>随机读占所有操作的比例</td>
+<td>0.95</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>updateproportion</td>
+<td>更新（写入）占所有操作的比例</td>
+<td>0.05</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>target</td>
+<td>每秒总共操作的次数</td>
+<td>unthrottled</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>thread</td>
+<td>客户端线程数</td>
+<td>1</td>
+<td></td>
+</tr>
+
+</table>
+ 
+##### 1.2.2 非必需参数（对tera测试意义不大，用默认值即可）
+ <table>
+<tr>
+<th>参数名</th>
+<th>意义</th>
+<th>默认值</th>
+<th>有效取值</th>
+</tr>
+ 
+ 
+<tr>
+<td>insertproportion</td>
+<td>插入（写入）占所有操作的比例</td>
+<td>0</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>scanproportion</td>
+<td>scan占所有操作的比例，tera_mark不支持</td>
+<td>0</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>readmodifywriteproportion</td>
+<td>readmodifywrite占所有操作的比例，tera不支持该操作</td>
+<td>0</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>maxscanlength</td>
+<td>每次scan需要读取的行数，tera不支持指定行数的scan</td>
+<td>1000</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>scanlengthdistribution</td>
+<td>scan的行数选择策略</td>
+<td>uniform</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>maxexecutiontime</td>
+<td>最大执行时间，超过此时间会强行结束测试（单位为秒）</td>
+<td></td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>table</td>
+<td>表名，tera_mark不支持</td>
+<td>usertable</td>
+<td></td>
+</tr>
+</table>
+ 
+#### 1.3 测量结果属性
+每一个测量结果类型可以为如下属性形式：
+ <table>
+<tr>
+<th>类型</th>
+<th>参数名</th>
+<th>意义</th>
+<th>默认值</th>
+<th>有效取值</th>
+</tr> 
+ 
+<tr>
+<td>直方图</td>
+<td>histogram.buckets</td>
+<td>直方图输出的区间数</td>
+<td>1000</td>
+<td></td>
+</tr>
+ 
+<tr>
+<td>时间序列</td>
+<td>timeseries.granularity</td>
+<td>时间序列输出的粒度</td>
+<td>1000</td>
+<td></td>
+</tr>
+ 
+</table>
+ 
+### 2 运行时参数
+即使工作负载类和参数文件定义了一个特定的工作负载，在运行基准测试时你还是想指定一些额外的设置。当你运行YCSB客户端时命令行提供了这些设置。这些设置包括：
+* -threads :客户端的线程。默认地，YCSB客户端使用一个工作者线程，但是额外的线程可以被指定。当需要增加对数据库的装载数量时这是经常使用的。
+* -target：每秒的目标操作数。默认地，YCSB客户端将试图尽可能地执行最多操作。例如，如果每个操作平均使用了100ms，客户端每个工作者线程每秒将执行10个操作。然而，你可以限制每秒的目标操作数。比如，为了生成一条延迟-吞吐量曲线，你可以指定不同的目标吞吐量，以测试每种吞吐量下的延迟。
+* -s：状态。对于一个运行时间长的工作负载，让客户端报告状态是有用的，这可以让你知道它并没有挂掉，并且给你某些对它的执行过程的想法。通过在命令行指定“-s”，客户端将每10秒输出状态到stderr。
+
+
+
+ 
+### 3 用法
+ 
+#### 3.1 相关命令
+* load: 执行加载命令
+* run: 执行工作负载
+* shell: 交互式模式
+```
+#basic参数告诉客户端使用哑BasicDB层。你也可以在你的参数文件中使用“db”属性指定它（例如，“db=com.yahoo.ycsb.BasicDB”）
+./bin/ycsb shell basic           
+> help
+Commands:
+read key [field1 field2 ...]                  // Read a record
+scan key recordcount [field1 field2 ...]     // Scan starting at key
+insert key name1=value1 [name2=value2 ...]  // Insert a new record
+update key name1=value1 [name2=value2 ...] // Update a record
+delete key                                // Delete a record
+table [tablename]                        // Get or [set] the name of the table
+quit                                    // Quit
+```
+ 
+#### 3.2 使用方法
+使用时，先建表，再加载数据，最后执行相关事务。
+
+##### 3.2.1 建表
+ycsb生成的row都是“user”+19位数字的格式，如 user9105318085603802964。 因此，如果需要预分表，必须以“user”+N个数字作为分隔，建议选择2个数字。 例如要预分4个tablet，分隔字符串为：user25、user50、user75
+```
+create 'usertable','f1','f2','f3'
+```
+
+##### 3.2.2 向tera中加载测试数据
+```
+bin/ycsb load tera -p workload=com.yahoo.ycsb.workloads.CoreWorkload \          //load参数告诉客户端执行工作负载的装载阶段。
+                   -p recordcount=$(ROW_NUM) \                                  //-p参数被用于设置参数，-P参数用于装载属性文件。
+                   -p fieldcount=$(QUALIFIER_NUM) \
+                   -p fieldlength=$(VALUE_SIZE)
+```
+ 
+##### 3.2.3 执行测试
+```
+bin/ycsb run tera -p workload=com.yahoo.ycsb.workloads.CoreWorkload \
+                  -p recordcount=$(ROW_NUM) \
+                  -p operationcount=$(ROW_NUM) \
+                  -p requestdistribution=$(DIST) \
+                  -p fieldcount=$(QUALIFIER_NUM) \
+                  -p fieldlength=$(VALUE_SIZE) \
+                  -p updateproportion=$(WRITE_PROP) \
+                  -p readproportion=$(READ_PROP)
+```
+ 
+
diff --git a/example/onebox/conf/tera.flag b/example/onebox/conf/tera.flag
index 99f62b45e..37329893d 100644
--- a/example/onebox/conf/tera.flag
+++ b/example/onebox/conf/tera.flag
@@ -7,10 +7,10 @@
 --tera_leveldb_env_type=local
 
 ## 是否使用zk
-# 指定使用非zk模式, 但只能本机访问tera
---tera_zk_enabled=false
+# 指定使用fake_zk模式, 只能本机访问tera
+--tera_coord_type=fake_zk
 # 指定使用zk, 可以跨服务使用, 配置相应地址和路径即可
-#--tera_zk_enabled=true
+--tera_zk_enabled=false
 #--tera_zk_addr_list=localhost:2181
 #--tera_zk_root_path=/tera
 --tera_master_query_tabletnode_period=1000
@@ -18,3 +18,18 @@
 
 # sdk
 --tera_sdk_timeout=20000
+
+# balancer
+#--tera_info_log_clean_enable=false
+#--logbufsecs=0
+#--v=5
+#--tera_master_load_balance_ts_load_threshold=1000000000
+#--tera_master_load_balance_ts_size_threshold=10000000000000
+#--tera_master_meta_isolate_enabled=true
+#--tera_lb_load_balance_period_s=60
+#--tera_lb_tablet_max_move_num=10
+#--tera_lb_min_cost_need_balance=0.05
+#--tera_lb_move_cost_weight=10
+#--tera_lb_size_cost_weight=90
+#--tera_lb_debug_mode_enabled=false
+--online_schema_update_enabled=true
diff --git a/include/tera/client.h b/include/tera/client.h
index 2ef68638e..80308a911 100644
--- a/include/tera/client.h
+++ b/include/tera/client.h
@@ -12,6 +12,7 @@
 #include "error_code.h"
 #include "table.h"
 #include "table_descriptor.h"
+#include "transaction.h"
 
 #pragma GCC visibility push(default)
 namespace tera {
@@ -101,6 +102,10 @@ class Client {
     // Rename a table.
     virtual bool Rename(const std::string& old_table_name, const std::string& new_table_name,
                         ErrorCode* err) = 0 ;
+
+    /// New a global transaction
+    virtual Transaction* NewGlobalTransaction() = 0;
+
     Client() {}
     virtual ~Client() {}
 
diff --git a/include/tera/error_code.h b/include/tera/error_code.h
index a03df0905..ad6ab2b64 100644
--- a/include/tera/error_code.h
+++ b/include/tera/error_code.h
@@ -26,7 +26,22 @@ class ErrorCode {
         kNoAuth    = 7,
         kUnknown   = 8,
         kNotImpl   = 9,
-        kTxnFail   = 10
+        kTxnFail   = 10,
+
+        // only for global transaction error
+        kGTxnDataTooLarge      = 101,
+        kGTxnNotSupport        = 102,
+        kGTxnSchemaError       = 103,
+        kGTxnOpAfterCommit     = 104,
+        kGTxnPrimaryLost       = 105,
+        kGTxnWriteConflict     = 106,
+        kGTxnLockConflict      = 107,
+        kGTxnOKButAckFailed    = 108,
+        kGTxnOKButNotifyFailed = 109,
+        kGTxnPrewriteTimeout   = 110,
+        kGTxnPrimaryCommitTimeout  = 111,
+        kGTxnTimestampLost     = 112
+        // end of global transaction error
     };
 
 public:
diff --git a/include/tera/reader.h b/include/tera/reader.h
index cc916c14d..08615f4d8 100644
--- a/include/tera/reader.h
+++ b/include/tera/reader.h
@@ -31,6 +31,12 @@ class RowReader {
     virtual void AddColumn(const std::string& family, const std::string& qualifier) = 0;
     // Set the maximum number of versions of each column.
     virtual void SetMaxVersions(uint32_t max_version) = 0;
+
+    // Set the max qualifiers of each column family when reading this row
+    // This is useful when a column family contains too many qualifiers
+    // If this value is not set, the default value is std::numeric_limits<uint64_t>::max()
+    virtual void SetMaxQualifiers(uint64_t max_qualifiers) = 0;
+
     // If set, only returns cells of which update timestamp is within [ts_start, ts_end].
     virtual void SetTimeRange(int64_t ts_start, int64_t ts_end) = 0;
 
diff --git a/include/tera/scan.h b/include/tera/scan.h
index 45646ec9d..c9023f9b6 100644
--- a/include/tera/scan.h
+++ b/include/tera/scan.h
@@ -79,6 +79,11 @@ class ScanDescriptor {
     // Set max version number per column.
     void SetMaxVersions(int32_t versions);
 
+    // Set the max qualifiers of each column family when reading a row
+    // This is useful when a column family contains too many qualifiers
+    // If this value is not set, the default value is std::numeric_limits<uint64_t>::max()
+    void SetMaxQualifiers(uint64_t max_qualifiers);
+
     // Set time range for the scan result,
     // which likes the SQL statement (SELECT * from Table WHERE timestamp in [ts_start, ts_end]).
     // Return the newest value first.
diff --git a/include/tera/table_descriptor.h b/include/tera/table_descriptor.h
index 8865d5a9d..4b464070f 100644
--- a/include/tera/table_descriptor.h
+++ b/include/tera/table_descriptor.h
@@ -54,6 +54,12 @@ class ColumnFamilyDescriptor {
     virtual int64_t DiskQuota() const = 0;
     virtual void SetAcl(ACL acl) = 0;
     virtual ACL Acl() const = 0;
+    virtual void EnableGlobalTransaction() = 0;
+    virtual void DisableGlobalTransaction() = 0;
+    virtual bool GlobalTransaction() const = 0;
+    virtual void EnableNotify() = 0;
+    virtual void DisableNotify() = 0;
+    virtual bool IsNotifyEnabled() const = 0;
 
     ColumnFamilyDescriptor() {}
     virtual ~ColumnFamilyDescriptor() {}
diff --git a/include/tera/transaction.h b/include/tera/transaction.h
index dc63a7842..81722f35b 100644
--- a/include/tera/transaction.h
+++ b/include/tera/transaction.h
@@ -15,9 +15,15 @@
 #pragma GCC visibility push(default)
 namespace tera {
 
-
 class RowReader;
 class RowMutation;
+class Table;
+
+/// 事务隔离级别
+enum class IsolationLevel {
+    kReadCommitedSnapshot = 0,
+    kSnapshot             = 1
+};
 
 /// 事务操作接口
 class Transaction {
@@ -47,9 +53,36 @@ class Transaction {
     /// 异步模式下，通过GetError()获取提交结果
     virtual ErrorCode Commit() = 0;
 
-    /// 获取事务开始时间戳，仅在多行事务场景下有效
+    /// 获取事务开始时间戳
     virtual int64_t GetStartTimestamp() = 0;
 
+    /// 获取事务提交时间戳
+    virtual int64_t GetCommitTimestamp() = 0;
+
+    /// 仅全局事务支持
+    virtual void Ack(Table* t, 
+                     const std::string& row_key, 
+                     const std::string& column_family, 
+                     const std::string& qualifier) = 0;
+
+    /// 仅全局事务支持
+    virtual void Notify(Table* t,
+                        const std::string& row_key, 
+                        const std::string& column_family, 
+                        const std::string& qualifier) = 0;
+
+    /// 设置隔离级别
+    virtual void SetIsolation(const IsolationLevel& isolation_level) = 0;
+
+    /// 获取隔离级别
+    virtual IsolationLevel Isolation() = 0;
+
+    // Set timeout(ms).
+    virtual void SetTimeout(int64_t timeout_ms) = 0;
+
+    // Get timeout(ms).
+    virtual int64_t Timeout() = 0;
+
     Transaction() {}
     virtual ~Transaction() {}
 
@@ -58,10 +91,6 @@ class Transaction {
     void operator=(const Transaction&);
 };
 
-/// cross-row, cross-table transaction
-/// 跨行，跨表事务
-Transaction* NewTransaction();
-
 } // namespace tera
 #pragma GCC visibility pop
 
diff --git a/readme-cn.md b/readme-cn.md
index 7e136a26c..7edc5362f 100644
--- a/readme-cn.md
+++ b/readme-cn.md
@@ -1,63 +1,48 @@
 [高性能、可伸缩的结构化数据库](http://github.com/baidu/tera)
 ====
 Tera是一个高性能、可伸缩的结构化数据存储系统，被设计用来管理搜索引擎万亿量级的超链与网页信息。为实现数据的实时分析与高效访问，我们使用按行键、列名和时间戳全局排序的三维数据模型组织数据，使用多级Cache系统，充分利用新一代服务器硬件大内存、SSD盘和万兆网卡的性能优势，做到模型灵活的同时，实现了高吞吐与水平扩展。([English](README.md))
-
 # 特性
- * 全局有序
- * 热点自动分片
- * 数据强一致
- * 多版本,自动垃圾收集
- * 按列存储,支持内存表
- * 动态schema
- * 支持表格快照
- * 高效随机读写
-
+* 全局有序
+* 热点自动分片
+* 数据强一致
+* 多版本,自动垃圾收集
+* 按列存储,支持内存表
+* 动态schema
+* 支持表格快照
+* 高效随机读写
 # 数据模型
 Tera使用了Bigtable的数据模型，可以将一张表格理解为这样一种数据结构：
 ```
 map<RowKey, map<ColummnFamily:Qualifier, map<Timestamp, Value> > >
 ```
 其中RowKey、ColumnFamily、Qualifier和Value是字符串，Timestamp是一个64位整形。ColumnFamliy需要建表时指定，是访问控制、版本保留等策略的基本单位。
-
 # 系统架构
 系统主要由Tabletserver、Master和ClientSDK三部分构成。其中Tabletserver是核心服务器，承载着所有的数据管理与访问；Master是系统的仲裁者，负责表格的创建、schema更新与负载均衡；ClientSDK包含供管理员使用的命令行工具teracli和给用户使用的SDK。
 表格被按RowKey全局排序，并横向切分成多个Tablet，每个Tablet负责服务RowKey的一个区间，表格又被纵向切分为多个LocalityGroup，一个Tablet的多个Localitygroup在物理上单独存储，可以选择不同的存储介质，以优化访问效率。
-
 ![架构图](resources/images/arch.png)
-
 # 系统依赖
- * 使用分布式文件系统（[BFS](https://github.com/baidu/bfs)、HDFS等）持久化数据与元信息
- * 使用分布式协调服务（[Nexus](https://github.com/baidu/ins/)或者Zookeeper）选主与协调
- * 使用[Sofa-pbrpc](https://github.com/baidu/sofa-pbrpc/)实现跨进程通信
-
+* 使用分布式文件系统（[BFS](https://github.com/baidu/bfs)、HDFS等）持久化数据与元信息
+* 使用分布式协调服务（[Nexus](https://github.com/baidu/ins/)或者Zookeeper）选主与协调
+* 使用[Sofa-pbrpc](https://github.com/baidu/sofa-pbrpc/)实现跨进程通信
 # 系统构建
-sh ./build.sh  
+sh ./build.sh 
 参考[BUILD](BUILD-cn)
-
 # 使用示例
-
 [体验单机Tera](doc/cn/onebox.md)
-
 [通过docker体验Tera](example/docker)
-
-[主要api使用方法](doc/cn/sdk_guide.md)
-
-[客户端teracli使用方法](doc/cn/teracli.md)
-
+[主要api使用方法](doc/sdk_reference/README.md)
+[客户端teracli使用方法](doc/tools/teracli.md)
+[集群间数据迁移的dump工具terautil使用方法](doc/tools/terautil.md)
+[造数据 & 读写数据的工具使用方法](doc/tools/benchmark.md)
+[性能测试工具ycsb使用方法](doc/tools/ycsb.md)
 [其它文档](doc/cn/README.md)
-
 #反馈与技术支持
 tera_dev at baidu.com
-
 # 成为贡献者
 阅读[RoadMap](doc/cn/roadmap.md)文件或者源代码，了解我们当前的开发方向。
-
 完成[5个小任务](doc/to_be_a_contributor.md),帮你一步步成为tera贡献者。
-
 # Become a Committer
-
 成为tera的committer，你需要知道的一些[规则](doc/cn/to_be_a_committer.md)。
-
 # 欢迎加入
 如果你热爱开源，热爱分布式技术，请将简历发送至： 
 opensearch at baidu.com
diff --git a/resources/images/global_txn.png b/resources/images/global_txn.png
new file mode 100644
index 000000000..0e6e8f950
Binary files /dev/null and b/resources/images/global_txn.png differ
diff --git a/src/benchmark/mark.cc b/src/benchmark/mark.cc
index a0081e2e4..a53d1f8b7 100644
--- a/src/benchmark/mark.cc
+++ b/src/benchmark/mark.cc
@@ -49,7 +49,7 @@ void sdk_write_callback(tera::RowMutation* row_mu) {
     adapter->WriteCallback(row_mu, req_size, req_time);
 }
 
-void Adapter::Write(const std::string& row,
+void Adapter::Write(int opt, const std::string& row,
                     std::map<std::string, std::set<std::string> >& column,
                     uint64_t timestamp,
                     std::string& value) {
@@ -74,7 +74,13 @@ void Adapter::Write(const std::string& row,
             if (FLAGS_verify) {
                 add_checksum(row, family, qualifier, &value);
             }
-            row_mu->Put(family, qualifier, value, (int64_t)timestamp);
+            if (opt == PUT) {
+                row_mu->Put(family, qualifier, value, (int64_t)timestamp);
+            } else if (opt == PIF) {
+                row_mu->PutIfAbsent(family, qualifier, value);
+            } else {
+                abort();
+            }
             if (FLAGS_verify) {
                 remove_checksum(&value);
             }
@@ -122,6 +128,8 @@ void Adapter::WriteCallback(tera::RowMutation* row_mu, size_t req_size,
     tera::ErrorCode err = row_mu->GetError();
     if (err.GetType() == tera::ErrorCode::kOK) {
         write_marker_.OnSuccess(req_size, latency);
+    } else if (err.GetType() == tera::ErrorCode::kTxnFail) {
+        write_marker_.OnConflict(req_size, latency); 
     } else {
         /*std::cerr << "fail to write: row=[" << row << "], column=["
             << family << ":" << qualifier << "], timestamp=["
diff --git a/src/benchmark/mark.h b/src/benchmark/mark.h
index c510de42c..ec5099eb5 100644
--- a/src/benchmark/mark.h
+++ b/src/benchmark/mark.h
@@ -18,7 +18,7 @@
 
 #include "common/mutex.h"
 #include "tera.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 
 DECLARE_int64(pend_size);
 DECLARE_int64(pend_count);
@@ -46,7 +46,8 @@ enum OP {
     PUT = 1,
     GET = 2,
     SCN = 3,
-    DEL = 4
+    DEL = 4,
+    PIF = 5
 };
 
 int64_t Now();
@@ -201,8 +202,11 @@ class Statistic {
           last_finish_size_(0),
           last_success_count_(0),
           last_success_size_(0),
+          last_conflict_count_(0),
+          last_conflict_size_(0),
           finish_marker_(1000000),
-          success_marker_(1000000) {}
+          success_marker_(1000000),
+          conflict_marker_(1000000) {}
 
     int GetOpt() {
         return opt_;
@@ -210,24 +214,30 @@ class Statistic {
 
     void GetStatistic(int64_t* total_count, int64_t* total_size,
                       int64_t* finish_count, int64_t* finish_size,
-                      int64_t* success_count, int64_t* success_size) {
+                      int64_t* success_count, int64_t* success_size,
+                      int64_t* conflict_count, int64_t* conflict_size) {
         *total_count = last_total_count_ = total_count_.Get();
         *total_size = last_total_size_ = total_size_.Get();
         *finish_count = last_finish_count_ = finish_count_.Get();
         *finish_size = last_finish_size_ = finish_size_.Get();
         *success_count = last_success_count_ = success_count_.Get();
         *success_size = last_success_size_ = success_size_.Get();
+        *conflict_count = last_conflict_count_ = conflict_count_.Get();
+        *conflict_size = last_conflict_size_ = conflict_size_.Get();
     }
 
     void GetLastStatistic(int64_t* total_count, int64_t* total_size,
                           int64_t* finish_count, int64_t* finish_size,
-                          int64_t* success_count, int64_t* success_size) {
+                          int64_t* success_count, int64_t* success_size,
+                          int64_t* conflict_count, int64_t* conflict_size) {
         *total_count = last_total_count_;
         *total_size = last_total_size_;
         *finish_count = last_finish_count_;
         *finish_size = last_finish_size_;
         *success_count = last_success_count_;
         *success_size = last_success_size_;
+        *conflict_count = last_conflict_count_;
+        *conflict_size = last_conflict_size_;
     }
 
     Marker* GetFinishMarker() {
@@ -238,6 +248,10 @@ class Statistic {
         return &success_marker_;
     }
 
+    Marker* GetConflictMarker() {
+        return &conflict_marker_;
+    }
+
     void OnReceive(size_t size) {
         last_send_time_ = Now();
         last_send_size_ = size;
@@ -257,6 +271,12 @@ class Statistic {
         success_marker_.AddLatency(latency);
     }
 
+    void OnConflict(size_t size, uint32_t latency) {
+        conflict_count_.Inc();
+        conflict_size_.Add(size);
+        conflict_marker_.AddLatency(latency);
+    }
+
     void CheckPending() {
         int64_t max_pend_count = FLAGS_pend_count;
         int64_t max_pend_size = FLAGS_pend_size << 20;
@@ -297,6 +317,8 @@ class Statistic {
     tera::Counter finish_size_;
     tera::Counter success_count_;
     tera::Counter success_size_;
+    tera::Counter conflict_count_;
+    tera::Counter conflict_size_;
 
     size_t last_send_size_;
     int64_t last_send_time_;
@@ -307,9 +329,12 @@ class Statistic {
     int64_t last_finish_size_;
     int64_t last_success_count_;
     int64_t last_success_size_;
+    int64_t last_conflict_count_;
+    int64_t last_conflict_size_;
 
     Marker finish_marker_;
     Marker success_marker_;
+    Marker conflict_marker_;
 };
 
 class Adapter {
@@ -317,7 +342,7 @@ class Adapter {
     Adapter(tera::Table* table);
     ~Adapter();
 
-    void Write(const std::string& row,
+    void Write(int opt, const std::string& row,
                std::map<std::string, std::set<std::string> >& column,
                uint64_t timestamp,
                std::string& value);
diff --git a/src/benchmark/mark_main.cc b/src/benchmark/mark_main.cc
index 36ae66c4b..dd57af93a 100644
--- a/src/benchmark/mark_main.cc
+++ b/src/benchmark/mark_main.cc
@@ -6,6 +6,7 @@
 
 #include <iomanip>
 #include <iostream>
+#include <sstream>
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -61,6 +62,8 @@ bool parse_row(const char* buffer, ssize_t size,
             *op = GET;
         } else if (strncmp(buffer, "PUT", 3) == 0) {
             *op = PUT;
+        } else if (strncmp(buffer, "PIF", 3) == 0) {
+            *op = PIF;
         } else {
             return false;
         }
@@ -76,13 +79,14 @@ bool parse_row(const char* buffer, ssize_t size,
         delim = end;
     }
     row->assign(buffer, delim - buffer);
-    if ((delim == end && mode != WRITE && (mode != MIX || *op != PUT)) ||
-        (delim == end && mode == DELETE)) {
+    if ((delim == end && mode != WRITE && 
+        (mode != MIX || (*op != PUT && *op != PIF)))
+        ||(delim == end && mode == DELETE)) {
         return true;
     }
 
     // parse value
-    if (mode == WRITE || (mode == MIX && *op == PUT)) {
+    if (mode == WRITE || (mode == MIX && (*op == PUT || *op == PIF))) {
         if (delim == end) {
             return false;
         }
@@ -170,7 +174,7 @@ bool parse_row(const char* buffer, ssize_t size,
     }
     if (comma == end) {
         return true;
-    } else if (mode == WRITE || (mode == MIX && *op == PUT)) {
+    } else if (mode == WRITE || (mode == MIX && (*op == PUT || *op == PIF))) {
         return false;
     }
 
@@ -217,10 +221,11 @@ bool get_next_row(int* op, std::string* row,
 void print_header() {
     std::cout << "HH:MM:SS OPT\t";
     if (mode != SCAN && type == ASYNC) {
-        std::cout << "SENT [speed/total]\t\t";
+        std::cout << "SENT [total/speed]\t\t";
     }
-    std::cout << "FINISH [speed/total]\t\t";
-    std::cout << "SUCCESS [speed/total]\t\t";
+    std::cout << "FINISH [total/speed]\t\t";
+    std::cout << "SUCCESS [total/speed]\t\t";
+    std::cout << "CONFLICT [total/speed]\t\t";
     if (mode != SCAN && type == ASYNC) {
         std::cout << "PENDING [count]";
     }
@@ -271,24 +276,28 @@ void print_size_and_count(int64_t size, int64_t count) {
 }
 
 void print_statistic(Statistic* statistic) {
-    int64_t old_total_count, old_finish_count, old_success_count;
-    int64_t old_total_size, old_finish_size, old_success_size;
+    int64_t old_total_count, old_finish_count, old_success_count, old_conflict_count;
+    int64_t old_total_size, old_finish_size, old_success_size, old_conflict_size;
     statistic->GetLastStatistic(&old_total_count, &old_total_size,
                                 &old_finish_count, &old_finish_size,
-                                &old_success_count, &old_success_size);
+                                &old_success_count, &old_success_size,
+                                &old_conflict_count, &old_conflict_size);
 
-    int64_t new_total_count, new_finish_count, new_success_count;
-    int64_t new_total_size, new_finish_size, new_success_size;
+    int64_t new_total_count, new_finish_count, new_success_count, new_conflict_count;
+    int64_t new_total_size, new_finish_size, new_success_size, new_conflict_size;
     statistic->GetStatistic(&new_total_count, &new_total_size,
                             &new_finish_count, &new_finish_size,
-                            &new_success_count, &new_success_size);
+                            &new_success_count, &new_success_size,
+                            &new_conflict_count, &new_conflict_size);
 
     int64_t total_count = new_total_count - old_total_count;
     int64_t finish_count = new_finish_count - old_finish_count;
     int64_t success_count = new_success_count - old_success_count;
+    int64_t conflict_count = new_conflict_count - old_conflict_count;
     int64_t total_size = new_total_size - old_total_size;
     int64_t finish_size = new_finish_size - old_finish_size;
     int64_t success_size = new_success_size - old_success_size;
+    int64_t conflict_size = new_conflict_size - old_conflict_size;
 
     int64_t total_pending_count = new_total_count - new_finish_count;
     // scan
@@ -317,6 +326,11 @@ void print_statistic(Statistic* statistic) {
     std::cout << "/";
     print_size_and_count(success_size, success_count);
     std::cout << "\t\t";
+    
+    print_size_and_count(new_conflict_size, new_conflict_count);
+    std::cout << "/";
+    print_size_and_count(conflict_size, conflict_count);
+    std::cout << "\t\t";
 
     if (mode != SCAN && type == ASYNC) {
         std::cout << total_pending_count;
@@ -341,6 +355,11 @@ void print_marker(Statistic* statistic) {
     std::cout << " [SUCCESS]" << std::endl;
     Marker* success_marker = statistic->GetSuccessMarker();
     print_marker(success_marker);
+    if (statistic->GetOpt() == PUT) {
+        std::cout << " [CONFLICT]" << std::endl;
+        Marker* conflict_marker = statistic->GetConflictMarker();
+        print_marker(conflict_marker);
+    }
 }
 
 void* print_proc(void* param) {
@@ -416,11 +435,12 @@ void* print_proc(void* param) {
 }
 
 void print_summary(Statistic* marker, double duration) {
-    int64_t total_count, finish_count, success_count;
-    int64_t total_size, finish_size, success_size;
+    int64_t total_count, finish_count, success_count, conflict_count;
+    int64_t total_size, finish_size, success_size, conflict_size;
     marker->GetStatistic(&total_count, &total_size,
                          &finish_count, &finish_size,
-                         &success_count, &success_size);
+                         &success_count, &success_size,
+                         &conflict_count, &conflict_size);
 
     print_opt(marker);
     std::streamsize precision = std::cout.precision();
@@ -432,7 +452,10 @@ void print_summary(Statistic* marker, double duration) {
                          << (double)finish_size / 1048576 / duration << " MB/s\n"
         << "     succ: " << success_size << " bytes "
                          << success_count << " records "
-                         << (double)success_size / 1048576 / duration << " MB/s"
+                         << (double)success_size / 1048576 / duration << " MB/s\n"
+        << " conflict: " << conflict_size << " bytes "
+                         << conflict_count << " records "
+                         << (double)conflict_size / 1048576 / duration << " MB/s"
         << std::endl;
     std::cout.precision(precision);
     std::cout.flags(flag);
@@ -616,10 +639,11 @@ int main(int argc, char** argv) {
 
         switch (opt) {
         case PUT:
+        case PIF:
             if (type == SYNC && mode == MIX && last_opt == GET) {
                 adapter->CommitSyncRead();
             }
-            adapter->Write(row, column, largest_ts, value);
+            adapter->Write(opt, row, column, largest_ts, value);
             break;
         case GET:
             if (type == SYNC && mode == MIX && last_opt == PUT) {
diff --git a/src/benchmark/tpcc/data_generator.cc b/src/benchmark/tpcc/data_generator.cc
new file mode 100644
index 000000000..8fd76cbe6
--- /dev/null
+++ b/src/benchmark/tpcc/data_generator.cc
@@ -0,0 +1,182 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/data_generator.h"
+#include "benchmark/tpcc/tpccdb.h"
+#include "common/thread_pool.h"
+#include "common/timer.h"
+
+DECLARE_int32(warehouses_count);
+DECLARE_int32(tpcc_thread_pool_size);
+DECLARE_int32(generate_data_wait_times);
+
+namespace tera {
+namespace tpcc {
+
+DataGenerator::DataGenerator(RandomGenerator* rand_gen, TpccDb* db)
+    : event_(),
+      rand_gen_(rand_gen), 
+      db_(db), 
+      now_datatime_(get_curtime_str()), 
+      thread_pool_(FLAGS_tpcc_thread_pool_size) {
+    for (int i = 0; i < kTpccTableCnt; ++i) {
+        states_.push_back(std::make_pair(Counter(), Counter()));
+    }
+}
+
+// Logs [need/succ/fail] for one table when fewer than need_cnt inserts have finished.
+void DataGenerator::PrintJoinTimeoutInfo(int need_cnt, int table_enum_num) {
+    // states_[i] is (succ, fail): the Gen* methods bump .first on success, .second on failure.
+    std::pair<Counter, Counter>& st = states_[table_enum_num];
+    if (need_cnt > st.first.Get() + st.second.Get()) {
+        LOG(ERROR) << "table:" << kTpccTables[table_enum_num] << "[need/succ/fail]:["
+                   << need_cnt << "/" << st.first.Get() << "/" << st.second.Get() << "]";
+    }
+}
+
+// Triggers event_ and TimeWait()s on it; a false result is reported as per-table progress.
+void DataGenerator::Join() {
+    event_.Trigger();
+    if (event_.TimeWait(FLAGS_generate_data_wait_times)) return;
+    const int total_stocks = FLAGS_warehouses_count * kItemCount;
+    const int total_districts = FLAGS_warehouses_count * kDistrictCountPerWarehouse;
+    const int total_customers = total_districts * kCustomerCountPerDistrict;
+    PrintJoinTimeoutInfo(kItemCount, kItemTable);
+    PrintJoinTimeoutInfo(total_stocks, kStockTable);
+    PrintJoinTimeoutInfo(FLAGS_warehouses_count, kWarehouseTable);
+    PrintJoinTimeoutInfo(total_districts, kDistrictTable);
+    PrintJoinTimeoutInfo(total_customers, kCustomerTable);
+    PrintJoinTimeoutInfo(total_customers, kCustomerLastIndex);
+    PrintJoinTimeoutInfo(total_customers, kHistoryTable);
+}
+
+void DataGenerator::GenStocks(int32_t warehouse_id) {
+    IdSet original_ids = PickUniqueIdSet(rand_gen_, kItemCount / 10, 1, kItemCount);
+    event_.AddEventSources(kItemCount);
+    for (int id = 1; id <= kItemCount; ++id) {
+        bool is_original = original_ids.find(id) != original_ids.end();
+        PushToInsertQueue(std::bind(&DataGenerator::GenStock, this, id, warehouse_id, is_original));
+    }
+}
+
+void DataGenerator::GenStock(int32_t id, int32_t warehouse_id, bool is_original) {
+    Stock s(id, warehouse_id, is_original, rand_gen_);
+    VLOG(12) << s.ToString();
+    db_->InsertStock(s) ? states_[kStockTable].first.Inc() : states_[kStockTable].second.Inc();
+    event_.Complete();
+}
+
+void DataGenerator::GenCustomers(int32_t district_id, int32_t warehouse_id) {
+    IdSet bad_credit_ids = PickUniqueIdSet(rand_gen_, 
+            kCustomerCountPerDistrict / 10, 1, kCustomerCountPerDistrict);
+    event_.AddEventSources(kCustomerCountPerDistrict);
+    for (int c_id = 1; c_id <= kCustomerCountPerDistrict; ++c_id) {
+        bool is_bad_credit = bad_credit_ids.find(c_id) != bad_credit_ids.end();
+        Customer c(c_id, district_id, warehouse_id, now_datatime_, is_bad_credit, rand_gen_);
+        VLOG(12) << c.ToString();
+        db_->InsertCustomer(c) ? states_[kCustomerTable].first.Inc() : states_[kCustomerTable].second.Inc();
+    }
+    event_.Complete(kCustomerCountPerDistrict);
+}
+
+void DataGenerator::GenHistorys(int32_t district_id, int32_t warehouse_id) {
+    event_.AddEventSources(kCustomerCountPerDistrict);
+    for (int h_id = 1; h_id <= kCustomerCountPerDistrict; ++h_id) {
+        History h(h_id, district_id, warehouse_id, now_datatime_, rand_gen_);
+        VLOG(12) << h.ToString();
+        db_->InsertHistory(h) ? states_[kHistoryTable].first.Inc() : states_[kHistoryTable].second.Inc();
+    }
+    event_.Complete(kCustomerCountPerDistrict);
+}
+
+void DataGenerator::GenOrderLines(int cnt, int32_t order_id, int32_t district_id, 
+                                  int32_t warehouse_id, bool new_order) {
+    event_.AddEventSources(cnt);
+    for (int i = 1; i <= cnt; ++i) {
+        OrderLine ol(order_id, district_id, warehouse_id, i, new_order, now_datatime_, rand_gen_);
+        VLOG(12) << ol.ToString();
+        db_->InsertOrderLine(ol) ? states_[kOrderLineTable].first.Inc() : states_[kOrderLineTable].second.Inc();
+    }
+    event_.Complete(cnt);
+}
+
+// Generates every order of one district together with its order lines and new-order rows.
+void DataGenerator::GenOrders(int32_t d_id, int32_t w_id) {
+    std::vector<int> disorder_ids = rand_gen_->MakeDisOrderList(1, kCustomerCountPerDistrict);
+    event_.AddEventSources(kCustomerCountPerDistrict);
+    for (int o_id = 1; o_id <= kCustomerCountPerDistrict; ++o_id) {
+        bool new_order = (kCustomerCountPerDistrict - kInitNewOrderCountPerDistrict) < o_id;
+        // o_id is 1-based, the shuffled vector 0-based: [o_id] overran it on the last pass.
+        int32_t c_id = disorder_ids[o_id - 1];
+        Order o(o_id, c_id, d_id, w_id, new_order, now_datatime_, rand_gen_);
+        // insert the order lines and the new-order row before the order itself
+        GenOrderLines(o.o_ol_cnt, o_id, d_id, w_id, new_order);
+        if (new_order) {
+            event_.AddEventSources(1);
+            NewOrder no(o_id, d_id, w_id);
+            VLOG(12) << no.ToString();
+            db_->InsertNewOrder(no) ? states_[kNewOrderTable].first.Inc() : states_[kNewOrderTable].second.Inc();
+            event_.Complete(1);
+        }
+        VLOG(12) << o.ToString();
+        db_->InsertOrder(o) ? states_[kOrderTable].first.Inc() : states_[kOrderTable].second.Inc();
+    }
+    event_.Complete(kCustomerCountPerDistrict);
+}
+
+void DataGenerator::GenDistricts(int32_t warehouse_id) {
+    event_.AddEventSources(kDistrictCountPerWarehouse);
+    for (int d_id = 1; d_id <= kDistrictCountPerWarehouse; ++d_id) {
+        District d(d_id, warehouse_id, rand_gen_);
+        VLOG(12) << d.ToString();
+        db_->InsertDistrict(d) ? states_[kDistrictTable].first.Inc() : states_[kDistrictTable].second.Inc();
+        GenCustomers(d_id, warehouse_id);
+        GenHistorys(d_id, warehouse_id);
+
+        GenOrders(d_id, warehouse_id);
+    } 
+    event_.Complete(kDistrictCountPerWarehouse);
+}
+
+void DataGenerator::GenWarehouses() {
+    event_.AddEventSources(FLAGS_warehouses_count);
+    for (int32_t w_id = 1; w_id <= FLAGS_warehouses_count; ++w_id) {
+        GenStocks(w_id);
+        Warehouse w(w_id, rand_gen_);
+        VLOG(12) << w.ToString();
+        db_->InsertWarehouse(w) ? states_[kWarehouseTable].first.Inc() : states_[kWarehouseTable].second.Inc();
+
+        GenDistricts(w_id);
+    }
+    event_.Complete(FLAGS_warehouses_count);
+}
+
+void DataGenerator::GenItems() {
+    IdSet original_ids = PickUniqueIdSet(rand_gen_, kItemCount / 10, 1, kItemCount);
+    event_.AddEventSources(kItemCount);
+    for (int i_id = 1; i_id <= kItemCount; ++i_id) {
+        bool is_original = original_ids.find(i_id) != original_ids.end();
+        PushToInsertQueue(std::bind(&DataGenerator::GenItem, this, i_id, is_original));
+    }
+}
+
+void DataGenerator::GenItem(int32_t item_id, bool is_original) {
+    Item item(item_id, is_original, rand_gen_);
+    VLOG(12) << item.ToString();
+    db_->InsertItem(item) ? states_[kItemTable].first.Inc() : states_[kItemTable].second.Inc();
+    event_.Complete();
+}
+
+void DataGenerator::PushToInsertQueue(const ThreadPool::Task& task) {
+    while(thread_pool_.PendingNum() > FLAGS_tpcc_thread_pool_size / 2) {
+        usleep(100);
+    }
+    thread_pool_.AddTask(task);
+    VLOG(12) << "thread_pool pending num = " << thread_pool_.PendingNum();
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/data_generator.h b/src/benchmark/tpcc/data_generator.h
new file mode 100644
index 000000000..f5593b64c
--- /dev/null
+++ b/src/benchmark/tpcc/data_generator.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_DATA_GENERATOR_H
+#define TERA_BENCHMARK_TPCC_DATA_GENERATOR_H
+
+#include <stdint.h>
+#include <string>
+
+#include "benchmark/tpcc/random_generator.h"
+#include "benchmark/tpcc/tpccdb.h"
+#include "common/counter.h"
+#include "common/event.h"
+#include "common/thread_pool.h"
+
+namespace tera {
+namespace tpcc {
+
+
+class DataGenerator {
+public:
+    DataGenerator(RandomGenerator* random_gen, TpccDb* db);
+    ~DataGenerator(){}
+    void GenWarehouses();
+    void GenItems();
+    void Join();
+
+private:
+    void PrintJoinTimeoutInfo(int need_cnt, int table_enum_num);
+
+    // for generate data
+    void GenStocks(int32_t warehouse_id);
+    void GenCustomers(int32_t district_id, int32_t warehouse_id);
+    void GenHistorys(int32_t district_id, int32_t warehouse_id);
+    void GenOrderLines(int cnt, int32_t order_id, int32_t district_id, 
+                       int32_t warehouse_id, bool new_order);
+    void GenOrders(int32_t district_id, int32_t warehouse_id);
+    void GenDistricts(int32_t warehouse_id);
+    
+    void GenItem(int32_t item_id, bool is_original);
+    void GenStock(int32_t id, int32_t warehouse_id, bool is_original);
+
+    // for async insert
+    void PushToInsertQueue(const ThreadPool::Task& task);
+private:
+    typedef std::vector<std::pair<Counter, Counter>> InsertStates; 
+    CompletedEvent event_;
+    RandomGenerator* rand_gen_;
+    TpccDb* db_;
+    InsertStates states_;
+    std::string now_datatime_;
+    common::ThreadPool thread_pool_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_DATA_GENERATOR_H */
diff --git a/src/benchmark/tpcc/driver.cc b/src/benchmark/tpcc/driver.cc
new file mode 100644
index 000000000..aed2e6235
--- /dev/null
+++ b/src/benchmark/tpcc/driver.cc
@@ -0,0 +1,190 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/driver.h"
+#include "benchmark/tpcc/tpccdb.h"
+#include "common/thread_pool.h"
+#include "common/timer.h"
+
+DECLARE_int32(driver_wait_times);
+DECLARE_int32(warehouses_count);
+DECLARE_int32(tpcc_run_gtxn_thread_pool_size);
+DECLARE_int64(transactions_count);
+
+namespace tera {
+namespace tpcc {
+
+Driver::Driver(RandomGenerator* rand_gen, TpccDb* db)
+    : event_(),
+      rand_gen_(rand_gen), 
+      db_(db), 
+      now_datatime_(get_curtime_str()),
+      thread_pool_(FLAGS_tpcc_run_gtxn_thread_pool_size) {
+}
+
+// Logs [need/succ/fail] for one table when fewer than need_cnt operations have finished.
+void Driver::PrintJoinTimeoutInfo(int need_cnt, int table_enum_num) {
+    // Nothing in this file adds entries to states_, so reject out-of-range indexes.
+    if (table_enum_num < 0 || table_enum_num >= static_cast<int>(states_.size())) return;
+    std::pair<Counter, Counter>& st = states_[table_enum_num];  // (succ, fail)
+    if (need_cnt <= st.first.Get() + st.second.Get()) return;   // nothing outstanding
+    LOG(ERROR) << "table:" << kTpccTables[table_enum_num] << "[need/succ/fail]:["
+               << need_cnt << "/" << st.first.Get() << "/" << st.second.Get() << "]";
+}
+
+void Driver::RunTransactions() {
+    for (int64_t i = 0; i < FLAGS_transactions_count; ++i) {
+        RunOneTransaction();
+    }
+}
+
+void Driver::Join() {
+    event_.Trigger();
+    if (!event_.TimeWait(FLAGS_driver_wait_times)) {
+        // TODO
+    }
+}
+
+void Driver::RunOneTransaction() {
+    int rand_num = rand_gen_->GetRandom(1, 100);        // draw in [1, 100]; percentages below are nominal, the real split is set by kTpccTransactionRatios
+    if (rand_num <= kTpccTransactionRatios[0]) {        //  4% stock_level
+        RunStockLevelTxn();
+    } else if (rand_num <= kTpccTransactionRatios[1]) { //  4% order_status
+        RunOrderStatusTxn();
+    } else if (rand_num <= kTpccTransactionRatios[2]) { //  4% delivery
+        RunDeliveryTxn();
+    } else if (rand_num <= kTpccTransactionRatios[3]) { // 43% payment
+        RunPaymentTxn();
+    } else {                                            // 45% new_order
+        RunNewOrderTxn();
+    }
+} 
+
+void Driver::RunStockLevelTxn() {
+    int32_t threshold = rand_gen_->GetRandom(kMinStockLevelThreshold, kMaxStockLevelThreshold); 
+    StockLevelResult ret;
+    db_->StockLevelTxn(FindWareHouse(), FindDistrict(), threshold, &ret);
+}
+
+void Driver::RunOrderStatusTxn() {
+    int x = rand_gen_->GetRandom(1, 100);
+    OrderStatusResult ret;
+    if (x <= 60) {
+        // 60% order_status by lastname
+        std::string last_name = GenLastName(rand_gen_, kCustomerCountPerDistrict);
+        db_->OrderStatusTxn(true, FindWareHouse(), FindDistrict(), 
+                -1, last_name, &ret);
+    } else {
+        // 40% order_status by customer_id
+        db_->OrderStatusTxn(false, FindWareHouse(), FindDistrict(), 
+                FindCustomerId(), "", &ret);
+    }
+}
+
+void Driver::RunDeliveryTxn() {  // one delivery txn for a random warehouse and carrier
+    int32_t carrier_id = rand_gen_->GetRandom(kMinCarrierId, kMaxCarrierId);
+    DeliveryResult ret;  // out-parameter handed to db_; not read afterwards
+    db_->DeliveryTxn(FindWareHouse(), carrier_id, get_curtime_str(), &ret);
+}
+
+// Runs one payment: picks warehouse/district/amount, then the customer's home (local or remote).
+void Driver::RunPaymentTxn() {
+    int32_t warehouse_id = FindWareHouse();
+    int32_t district_id = FindDistrict();
+
+    float h_amount = rand_gen_->MakeFloat(kRuntimeMinAmount, kRuntimeMaxAmount,
+            kRuntimeAmountDigits);
+
+    int32_t customer_warehouse_id = -1;
+    int32_t customer_district_id = -1;
+
+    int x = rand_gen_->GetRandom(1, 100);
+    // Local when there is only one warehouse OR on the 85% draw ("||", not "&&").
+    if (FLAGS_warehouses_count == 1 || x <= 85) {
+        // 85% payment through local warehouse (or only one warehouse)
+        customer_warehouse_id = warehouse_id;
+        customer_district_id = district_id;
+    } else {
+        // 15% payment through remote warehouse; needs >= 2 warehouses to exclude the local one
+        customer_warehouse_id =
+            rand_gen_->GetRandom(1, FLAGS_warehouses_count, warehouse_id);
+        customer_district_id = FindDistrict();
+    }
+
+    x = rand_gen_->GetRandom(1, 100);
+    PaymentResult ret;
+    if (x <= 60) {
+        // 60% payment by lastname
+        std::string last_name = GenLastName(rand_gen_, kCustomerCountPerDistrict);
+        db_->PaymentTxn(true, warehouse_id, district_id,
+                customer_warehouse_id, customer_district_id, -1,
+                last_name, h_amount, &ret);
+    } else {
+        // 40% payment by customer_id
+        db_->PaymentTxn(false, warehouse_id, district_id,
+                customer_warehouse_id, customer_district_id, FindCustomerId(),
+                "", h_amount, &ret);
+    }
+}
+
+// Builds a random NewOrderInfo for a randomly chosen warehouse and submits it through db_.
+// The sequence of random draws is significant and is kept in the order shown.
+void Driver::RunNewOrderTxn() {
+    const int32_t home_w_id = FindWareHouse();
+
+    NewOrderInfo info;
+    // roughly 1 in 100 new_order transactions is flagged to fail
+    info.need_failed = (rand_gen_->GetRandom(1, 100) == 1);
+    info.o_ol_cnt = rand_gen_->GetRandom(kMinOrderLineCnt, kMaxOrderLineCnt);
+    info.o_all_local = 1;
+    info.ol_supply_w_ids.reserve(info.o_ol_cnt);
+    info.ol_i_ids.reserve(info.o_ol_cnt);
+    info.ol_quantities.reserve(info.o_ol_cnt);
+
+    for (int32_t line = 0; line < info.o_ol_cnt; ++line) {
+        // roughly 1 in 100 order lines is supplied by a remote warehouse
+        const bool wants_remote = (rand_gen_->GetRandom(1, 100) == 1);
+        int32_t supply_w_id = home_w_id;
+        if (FLAGS_warehouses_count > 1 && wants_remote) {
+            supply_w_id = rand_gen_->GetRandom(1, FLAGS_warehouses_count, home_w_id);
+            info.o_all_local = 0;
+        }
+        info.ol_supply_w_ids.emplace_back(supply_w_id);
+        info.ol_i_ids.emplace_back(FindItemId());
+        const int32_t quantity = rand_gen_->GetRandom(1, kMaxOrderLineQuantity);
+        info.ol_quantities.emplace_back(quantity);
+    }
+
+    NewOrderResult ret;
+    db_->NewOrderTxn(home_w_id, FindDistrict(), FindCustomerId(), info, &ret);
+}
+
+void Driver::PushToInsertQueue(const ThreadPool::Task& task) {
+    while(thread_pool_.PendingNum() > FLAGS_tpcc_run_gtxn_thread_pool_size / 2) {
+        usleep(100);
+    }
+    thread_pool_.AddTask(task);
+    VLOG(12) << "thread_pool pending num = " << thread_pool_.PendingNum();
+}
+
+int32_t Driver::FindWareHouse() {
+    return rand_gen_->GetRandom(1, FLAGS_warehouses_count);
+}
+
+int32_t Driver::FindDistrict() {
+    return rand_gen_->GetRandom(1, kDistrictCountPerWarehouse);
+}
+
+int32_t Driver::FindCustomerId() {
+    return rand_gen_->NURand(1023, 1, kCustomerCountPerDistrict);
+}
+
+int32_t Driver::FindItemId() {
+    return rand_gen_->NURand(8191, 1, kItemCount); 
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/driver.h b/src/benchmark/tpcc/driver.h
new file mode 100644
index 000000000..56bf5a66f
--- /dev/null
+++ b/src/benchmark/tpcc/driver.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_DRIVER_H
+#define TERA_BENCHMARK_TPCC_DRIVER_H
+
+#include <stdint.h>
+#include <string>
+
+#include "benchmark/tpcc/random_generator.h"
+#include "benchmark/tpcc/tpccdb.h"
+#include "common/counter.h"
+#include "common/event.h"
+#include "common/thread_pool.h"
+
+namespace tera {
+namespace tpcc {
+
+class Driver {
+public:
+    Driver(RandomGenerator* random_gen, TpccDb* db);
+    ~Driver(){}
+    void RunTransactions();
+    void Join();
+
+private:
+    void PrintJoinTimeoutInfo(int need_cnt, int table_enum_num);
+
+    // for run transaction
+    void RunOneTransaction();
+    //
+    void RunStockLevelTxn();
+
+    void RunOrderStatusTxn();
+
+    void RunDeliveryTxn();
+
+    void RunPaymentTxn();
+
+    void RunNewOrderTxn();
+
+    // for async run txn
+    void PushToInsertQueue(const ThreadPool::Task& task);
+
+    int32_t FindWareHouse();
+
+    int32_t FindDistrict();
+    
+    int32_t FindCustomerId();
+    
+    int32_t FindItemId();
+private:
+    typedef std::vector<std::pair<Counter, Counter>> TxnStates; 
+    CompletedEvent event_;
+    RandomGenerator* rand_gen_;
+    TpccDb* db_;
+    TxnStates states_;
+    std::string now_datatime_;
+    common::ThreadPool thread_pool_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_DRIVER_H */
diff --git a/src/benchmark/tpcc/mock_tpccdb.cc b/src/benchmark/tpcc/mock_tpccdb.cc
new file mode 100644
index 000000000..ee8cce0d0
--- /dev/null
+++ b/src/benchmark/tpcc/mock_tpccdb.cc
@@ -0,0 +1,18 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/mock_tpccdb.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+namespace tera {
+namespace tpcc {
+
+MockTpccDb::MockTpccDb() : flag_(true) {}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/mock_tpccdb.h b/src/benchmark/tpcc/mock_tpccdb.h
new file mode 100644
index 000000000..0f29f0320
--- /dev/null
+++ b/src/benchmark/tpcc/mock_tpccdb.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_MOCK_TPCCDB_H
+#define TERA_BENCHMARK_TPCC_MOCK_TPCCDB_H
+
+#include "benchmark/tpcc/tpccdb.h"
+
+namespace tera {
+namespace tpcc {
+
+class TpccDb;
+class TxnResult;
+
+class MockTpccDb : public TpccDb {
+public:
+    MockTpccDb();
+    virtual ~MockTpccDb() {}
+
+    virtual bool CreateTables() { return true; }
+    virtual bool CleanTables() { return true; }
+
+    // init db 
+    virtual bool InsertItem(const Item& i) { 
+        return flag_; 
+    }
+
+    virtual bool InsertWarehouse(const Warehouse& w) { 
+        return flag_; 
+    }
+
+    virtual bool InsertDistrict(const District& d) { 
+        return flag_; 
+    }
+
+    virtual bool InsertCustomer(const Customer& c) { 
+        return flag_; 
+    }
+    
+    virtual bool InsertHistory(const History& h) { 
+        return flag_; 
+    }
+
+    virtual bool InsertStock(const Stock& s) { 
+        return flag_; 
+    }
+    
+    virtual bool InsertOrder(const Order& o) { 
+        return flag_; 
+    }
+    
+    virtual bool InsertOrderLine(const OrderLine& ol) { 
+        return flag_; 
+    }
+
+    virtual bool InsertNewOrder(const NewOrder& no) { 
+        return flag_; 
+    }
+
+    virtual void StockLevelTxn(int32_t warehouse_id, int32_t district_id, 
+                               int32_t threshold, 
+                               StockLevelResult* ret) {}
+
+    virtual void DeliveryTxn(int32_t warehouse_id, 
+                             int32_t carrier_id, 
+                             const std::string& delivery_datetime,
+                             DeliveryResult* ret) {}
+
+    virtual void OrderStatusTxn(bool by_last_name,
+                                int32_t warehouse_id, int32_t district_id, 
+                                int32_t c_customer_id, 
+                                const std::string& last_name,
+                                OrderStatusResult* ret) {}
+
+    virtual void PaymentTxn(bool by_last_name,
+                            int32_t warehouse_id, int32_t district_id, 
+                            int32_t c_warehouse_id, int32_t c_district_id, 
+                            int32_t c_customer_id, 
+                            const std::string& last_name,
+                            int32_t h_amount,
+                            PaymentResult* ret) {}
+
+    virtual void NewOrderTxn(int32_t warehouse_id, 
+                             int32_t district_id, 
+                             int32_t customer_id, const NewOrderInfo& info,
+                             NewOrderResult* ret) {}
+
+private:
+    bool flag_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_MOCK_TPCCDB_H */
diff --git a/src/benchmark/tpcc/random_generator.cc b/src/benchmark/tpcc/random_generator.cc
new file mode 100644
index 000000000..9308ec6e9
--- /dev/null
+++ b/src/benchmark/tpcc/random_generator.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/random_generator.h"
+
+#include <assert.h>
+
+namespace tera {
+namespace tpcc {
+
+RandomGenerator::RandomGenerator():c_({0,0,0}) {
+    InitRandomState();
+}
+
+void RandomGenerator::InitRandomState() {
+    memset(&rand_state_, 0, sizeof(rand_state_));
+    int ret = initstate_r(static_cast<unsigned int>(time(NULL)),
+                          rand_state_buf_,
+                          sizeof(rand_state_buf_),
+                          &rand_state_);
+    assert(ret == 0);
+}
+
+NURandConstant RandomGenerator::GetRandomConstant() const {
+    return c_;
+}
+
+void RandomGenerator::SetRandomConstant() {
+    c_.c_last = GetRandom(0, 255);
+    c_.c_id = GetRandom(0, 1023);
+    c_.ol_i_id = GetRandom(0, 8191);
+}
+
+inline bool VarfiyConstantAvailableForRun(int run_last, int load_last) {
+    int delta = run_last - load_last;
+    delta = delta > 0 ? delta : -1 * delta;
+    return 65 <=delta && delta <= 119 && delta != 96 && delta != 112;
+}
+
+void RandomGenerator::SetRandomConstant(const NURandConstant& constant_for_load) {
+    c_.c_last = GetRandom(0, 255);
+    c_.c_id = GetRandom(0, 1023);
+    c_.ol_i_id = GetRandom(0, 8191);
+    while (!VarfiyConstantAvailableForRun(c_.c_last, constant_for_load.c_last)) {
+        c_.c_last = GetRandom(0, 255);
+    }
+}
+
+int RandomGenerator::GetRandom(int lower, int upper) {
+    int ret = 0;
+    int err = random_r(&rand_state_, &ret);
+    assert(err == 0);
+    return lower <= upper ? (ret % (upper - lower + 1) + lower) : (ret % (lower - upper + 1) + upper);
+}
+
+int RandomGenerator::GetRandom(int lower, int upper, int exclude) {  // random int in [lower, upper] skipping exclude
+    if (exclude > upper || exclude < lower) {  // exclude is outside the range: nothing to skip
+        return GetRandom(lower, upper);
+    } else {
+        int rand = GetRandom(lower, upper - 1);  // draw from a range one value shorter...
+        if (rand >= exclude) {
+            ++rand;  // ...and shift results at or above exclude up by one to step over it
+        }
+        return rand;  // NOTE: if lower == upper == exclude no valid value exists; the result then lies outside [lower, upper]
+    }
+}
+
+std::string RandomGenerator::MakeAString(int lower_len, int upper_len) {
+    int len = GetRandom(lower_len, upper_len); 
+    std::string ret;
+    for (int i = 0; i < len; ++i) {
+        ret += (char)('a' + GetRandom(0, 25));
+    }
+    return ret;
+}
+
+std::string RandomGenerator::MakeNString(int lower_len, int upper_len) {
+    int len = GetRandom(lower_len, upper_len); 
+    std::string ret;
+    for (int i = 0; i < len; ++i) {
+        ret += (char)('0' + GetRandom(0, 9));
+    }
+    return ret;
+}
+
+float RandomGenerator::MakeFloat(float lower, float upper, int digits) {
+    float scale = 1.0;  // becomes 10^digits through repeated multiplication
+    for (int remaining = digits; remaining > 0; --remaining) {
+        scale *= 10;
+    }
+    return GetRandom(int(lower * scale + 0.5), int(upper * scale + 0.5)) / scale;
+}
+
+std::vector<int> RandomGenerator::MakeDisOrderList(int lower, int upper) {  // values lower..upper placed at random positions
+    std::vector<int> ret(upper - lower + 1, -1);  // -1 marks an empty slot, so the range is assumed not to contain -1
+    for (int i = 0; i < upper - lower + 1; ++i) {
+        int rand_pos = GetRandom(0, upper - lower);  // candidate 0-based slot for value lower + i
+        while (true) {  // redraw until an unoccupied slot is hit
+            if (ret[rand_pos] == -1) {
+                ret[rand_pos] = lower + i;
+                break;
+            }
+            rand_pos = GetRandom(0, upper - lower);
+        }
+    }
+    return ret;  // size is upper - lower + 1; valid indexes are 0 .. upper - lower
+}
+
+int RandomGenerator::NURand(int A, int x, int y) {
+    int C = 0;
+    switch(A) {
+        case 255:
+            C = c_.c_last;
+            break;
+        case 1023:
+            C = c_.c_id;
+            break;
+        case 8191:
+            C = c_.ol_i_id;
+            break;
+        default:
+            LOG(ERROR) << "NURand: A = " << A << " not available";
+            abort();
+    }
+    return (((GetRandom(0, A) | GetRandom(x, y)) + C) % (y - x + 1)) + x;
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/random_generator.h b/src/benchmark/tpcc/random_generator.h
new file mode 100644
index 000000000..c39070294
--- /dev/null
+++ b/src/benchmark/tpcc/random_generator.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_RANDOM_GENERATOR_H
+#define TERA_BENCHMARK_TPCC_RANDOM_GENERATOR_H
+
+#include <stdlib.h>
+#include <string>
+#include <vector>
+
+#include "benchmark/tpcc/tpcc_types.h"
+
+namespace tera {
+namespace tpcc {
+
+// The three run-time constants "C" consumed by RandomGenerator::NURand().
+// NURand selects one of them by its A argument.
+struct NURandConstant {
+    int c_last;    // used when A == 255
+    int c_id;      // used when A == 1023
+    int ol_i_id;   // used when A == 8191
+};
+
+// Source of the random values used by the TPC-C data loader and driver.
+// Keeps its own random_r()/initstate_r() state plus the NURand constants.
+class RandomGenerator {
+public:
+    RandomGenerator();
+    virtual ~RandomGenerator(){}
+
+    // Accessors for the NURand constants held in c_.
+    // NOTE(review): the no-argument setter presumably picks fresh constants
+    // and the one-argument form derives them from the load-time constants;
+    // confirm against the implementation.
+    NURandConstant GetRandomConstant() const;
+    void SetRandomConstant();
+    void SetRandomConstant(const NURandConstant& constant_for_load);
+
+    // String of letters a..z, length = rand[lower_len, upper_len].
+    std::string MakeAString(int lower_len, int upper_len);
+
+    // String of digits 0..9, length = rand[lower_len, upper_len].
+    std::string MakeNString(int lower_len, int upper_len);
+
+    // Random float between lower and upper with `digits` decimal places.
+    float MakeFloat(float lower, float upper, int digits);
+
+    // The integers lower..upper in random order.
+    std::vector<int> MakeDisOrderList(int lower, int upper);
+
+    // Non-uniform random int in [lower, upper]; A must be 255, 1023 or 8191.
+    int NURand(int A, int lower, int upper);
+
+    // Random int from [lower, upper].
+    int GetRandom(int lower, int upper);
+
+    // NOTE(review): presumably a random int from [lower, upper] that is never
+    // `exclude`; confirm against the implementation.
+    int GetRandom(int lower, int upper, int exclude);
+
+private:
+    void InitRandomState();
+
+    // Buffer and state for the system calls random_r and initstate_r.
+    char rand_state_buf_[kRandomStateSize];
+    struct random_data rand_state_;
+
+    // Constants required by NURand.
+    NURandConstant c_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_RANDOM_GENERATOR_H */
diff --git a/src/benchmark/tpcc/tera_tpccdb.cc b/src/benchmark/tpcc/tera_tpccdb.cc
new file mode 100644
index 000000000..f35f4ed2a
--- /dev/null
+++ b/src/benchmark/tpcc/tera_tpccdb.cc
@@ -0,0 +1,538 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+// gflags declared here and defined in another translation unit.
+// tera_client_flagfile is passed to Client::NewClient() in the constructor;
+// tera_table_schema_dir is prepended to each table name in CreateTables() to
+// locate its schema file.
+DECLARE_string(tera_client_flagfile);
+DECLARE_string(tera_table_schema_dir);
+
+namespace tera {
+namespace tpcc {
+
+// Connects to Tera using FLAGS_tera_client_flagfile under the client name
+// "tera_tpcc". If no client can be created the error is logged and the
+// process exits immediately with EXIT_FAILURE.
+TeraTpccDb::TeraTpccDb() : client_(NULL) {
+    ErrorCode error_code;
+    client_ = Client::NewClient(FLAGS_tera_client_flagfile, "tera_tpcc", &error_code);
+    if (client_ != NULL) {
+        return;
+    }
+    LOG(ERROR) << "new client failed. err:" << error_code.ToString();
+    _Exit(EXIT_FAILURE);
+}
+
+// Releases the table handles stored in table_map_ and then the client.
+// Tables are deleted first because they were obtained from client_.
+// NOTE(review): assumes the Table* returned by Client::OpenTable() is owned by
+// the caller, as in the Tera SDK samples; confirm.
+TeraTpccDb::~TeraTpccDb() {
+    for (auto& entry : table_map_) {
+        // Lookups through operator[] may have inserted NULL entries;
+        // deleting NULL is a no-op.
+        delete entry.second;
+        entry.second = NULL;
+    }
+    table_map_.clear();
+    delete client_;
+}
+
+// Creates and opens every table named in kTpccTables.
+//
+// For each name the schema file FLAGS_tera_table_schema_dir + name is parsed,
+// the table is created and then opened, and the handle is stored in
+// table_map_. Stops at the first failure and returns false; returns true once
+// all tables are open.
+bool TeraTpccDb::CreateTables() {
+    ErrorCode err;
+    for (const auto& table : kTpccTables) {
+        const std::string schema_file = FLAGS_tera_table_schema_dir + table;
+        // Scoped descriptor: released at the end of the iteration and on
+        // every early return.
+        TableDescriptor desc;
+        if (!ParseTableSchemaFile(schema_file, &desc, &err)) {
+            LOG(ERROR) << "load schema failed, schema_file:" << schema_file << "err:" << err.ToString();
+            return false;
+        }
+
+        const bool created = client_->CreateTable(desc, &err) && err.GetType() == ErrorCode::kOK;
+        if (!created) {
+            LOG(ERROR) << "create table " << table << " failed";
+            return false;
+        }
+        LOG(INFO) << "create table " << table << " ok";
+
+        Table* opened = client_->OpenTable(table, &err);
+        if (opened == NULL) {
+            LOG(ERROR) << "open table " << table << " failed";
+            return false;
+        }
+        table_map_[table] = opened;
+        LOG(INFO) << "open table " << table << " ok";
+    }
+    return true;
+}
+
+// Disables and then drops every table listed in kTpccTables.
+//
+// After a successful DisableTable() the tablet list is polled once per second
+// until every tablet reports kTabletDisable or kTableOffLine, giving up after
+// 20 polls. DeleteTable() is then attempted, except when the table cannot be
+// listed right after being disabled (that table is skipped). Failures are only
+// logged: the function always returns true.
+bool TeraTpccDb::CleanTables() {
+    ErrorCode err;
+    for (auto table : kTpccTables) {
+        if (!client_->DisableTable(table, &err)) {
+            LOG(ERROR) << "fail to disable table : " << table << " err: " <<err.ToString();
+        } else {
+            // make sure clean tables
+            TableMeta table_meta;
+            TabletMetaList tablet_list;
+            tera::ClientImpl* client_impl = static_cast<tera::ClientImpl*>(client_);
+            if (!client_impl->ShowTablesInfo(table, &table_meta, &tablet_list, &err)) {
+                LOG(ERROR) << "table not exist: " << table;
+                continue;
+            }
+            // Number of tablets that must reach a disabled/offline state.
+            uint64_t tablet_num = tablet_list.meta_size();
+            VLOG(11) << tablet_num;
+            int wait_times = 0;
+            while (true) {
+                if (!client_impl->ShowTablesInfo(table, &table_meta, &tablet_list, &err)) {
+                    LOG(ERROR) << "table not exist: " << table;
+                    break;
+                }
+                uint64_t tablet_cnt = 0;
+                for (int32_t i = 0; i < tablet_list.meta_size(); ++i) {
+                    const TabletMeta& tablet = tablet_list.meta(i);
+                    if (tablet.status() == kTabletDisable || tablet.status() == kTableOffLine) {
+                        tablet_cnt++;
+                    }
+                }
+                if (tablet_cnt == tablet_num) {
+                    break;
+                }
+                if (wait_times < 20) {
+                    sleep(1);
+                    // Count the attempt; without this the retry limit is never
+                    // reached and a stuck tablet blocks the loop forever.
+                    ++wait_times;
+                } else {
+                    LOG(ERROR) << "disable  table : " << table << " failed, try " << wait_times << " time(s)";
+                    break;
+                }
+            }
+        }
+        if (!client_->DeleteTable(table, &err)) {
+            LOG(ERROR) << "drop table: " << table << " failed. " << err.ToString();
+        } else {
+            LOG(INFO) << "drop table: "<< table << " done.";
+        }
+    }
+    return true;
+}
+
+// init db 
+// Writes one ITEM row into "t_item" inside its own global transaction.
+// Returns false when the table is not in table_map_ or the transaction
+// reports an error after Commit(); the error is logged in the latter case.
+bool TeraTpccDb::InsertItem(const Item& i) {
+    const std::string tablename = "t_item";
+    auto table_it = table_map_.find(tablename);
+    if (table_it == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    RowMutation* row = table->NewRowMutation(i.PrimaryKey());
+    row->Put("cf0", "i_id", std::to_string(i.i_id));
+    row->Put("cf0", "i_im_id", std::to_string(i.i_im_id));
+    row->Put("cf0", "i_price", std::to_string(i.i_price));
+    row->Put("cf0", "i_name", i.i_name);
+    row->Put("cf0", "i_data", i.i_data);
+    gtxn->ApplyMutation(row);
+    gtxn->Commit();
+    delete row;
+
+    const bool committed = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!committed) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+    }
+    delete gtxn;
+    return committed;
+}
+
+// Writes one WAREHOUSE row into "t_warehouse" inside its own global
+// transaction. Returns false when the table is not in table_map_ or the
+// transaction reports an error after Commit(); the error is logged.
+bool TeraTpccDb::InsertWarehouse(const Warehouse& w) {
+    const std::string tablename = "t_warehouse";
+    auto table_it = table_map_.find(tablename);
+    if (table_it == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    RowMutation* row = table->NewRowMutation(w.PrimaryKey());
+    row->Put("cf0", "w_id", std::to_string(w.w_id));
+    row->Put("cf0", "w_tax", std::to_string(w.w_tax));
+    row->Put("cf0", "w_ytd", std::to_string(w.w_ytd));
+    row->Put("cf0", "w_name", w.w_name);
+    row->Put("cf0", "w_street_1", w.w_street_1);
+    row->Put("cf0", "w_street_2", w.w_street_2);
+    row->Put("cf0", "w_city", w.w_city);
+    row->Put("cf0", "w_state", w.w_state);
+    row->Put("cf0", "w_zip", w.w_zip);
+    gtxn->ApplyMutation(row);
+    gtxn->Commit();
+    delete row;
+
+    const bool committed = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!committed) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+    }
+    delete gtxn;
+    return committed;
+}
+
+// Writes one DISTRICT row into "t_district" inside its own global
+// transaction. Returns false when the table is not in table_map_ or the
+// transaction reports an error after Commit(); the error is logged.
+bool TeraTpccDb::InsertDistrict(const District& d) {
+    const std::string tablename = "t_district";
+    auto table_it = table_map_.find(tablename);
+    if (table_it == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    RowMutation* row = table->NewRowMutation(d.PrimaryKey());
+    row->Put("cf0", "d_id", std::to_string(d.d_id));
+    row->Put("cf0", "d_w_id", std::to_string(d.d_w_id));
+    row->Put("cf0", "d_tax", std::to_string(d.d_tax));
+    row->Put("cf0", "d_ytd", std::to_string(d.d_ytd));
+    row->Put("cf0", "d_next_o_id", std::to_string(d.d_next_o_id));
+    row->Put("cf0", "d_name", d.d_name);
+    row->Put("cf0", "d_street_1", d.d_street_1);
+    row->Put("cf0", "d_street_2", d.d_street_2);
+    row->Put("cf0", "d_city", d.d_city);
+    row->Put("cf0", "d_state", d.d_state);
+    row->Put("cf0", "d_zip", d.d_zip);
+    gtxn->ApplyMutation(row);
+    gtxn->Commit();
+    delete row;
+
+    const bool committed = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!committed) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+    }
+    delete gtxn;
+    return committed;
+}
+
+// Writes one CUSTOMER row plus its last-name index entry in a single global
+// transaction.
+//
+// The index row goes to "t_customer_last_index" under the key
+// <c_w_id>_<c_d_id>_<c_last>_<c_id>; GetCustomer() scans that table by the
+// <c_w_id>_<c_d_id>_<c_last>_ prefix. The data row goes to "t_customer" under
+// c.PrimaryKey().
+// Returns false when either table is not in table_map_ or the transaction
+// reports an error after Commit(); the error is logged.
+bool TeraTpccDb::InsertCustomer(const Customer& c) {
+    std::string tablename = "t_customer";
+    std::string c_last_index_name = "t_customer_last_index";
+    if ( table_map_.find(tablename) == table_map_.end()
+            || table_map_.find(c_last_index_name) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    // The index mutation must target the index table, not the customer table;
+    // otherwise lookups by last name never find anything.
+    Table* t_index = table_map_[c_last_index_name];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    std::string key = std::to_string(c.c_w_id) + "_" + std::to_string(c.c_d_id)
+        + "_" + c.c_last + "_" + std::to_string(c.c_id);
+    RowMutation* index_mu = t_index->NewRowMutation(key);
+    index_mu->Put("cf0", "c_id", std::to_string(c.c_id));
+    index_mu->Put("cf0", "c_d_id", std::to_string(c.c_d_id));
+    index_mu->Put("cf0", "c_w_id", std::to_string(c.c_w_id));
+    index_mu->Put("cf0", "c_last", c.c_last);
+    gtxn->ApplyMutation(index_mu);
+    delete index_mu;
+
+    RowMutation* mu = table->NewRowMutation(c.PrimaryKey());
+    mu->Put("cf0", "c_id", std::to_string(c.c_id));
+    mu->Put("cf0", "c_d_id", std::to_string(c.c_d_id));
+    mu->Put("cf0", "c_w_id", std::to_string(c.c_w_id));
+    mu->Put("cf0", "c_credit_lim", std::to_string(c.c_credit_lim));
+    mu->Put("cf0", "c_discount", std::to_string(c.c_discount));
+    mu->Put("cf0", "c_balance", std::to_string(c.c_balance));
+    mu->Put("cf0", "c_ytd_payment", std::to_string(c.c_ytd_payment));
+    mu->Put("cf0", "c_payment_cnt", std::to_string(c.c_payment_cnt));
+    mu->Put("cf0", "c_delivery_cnt", std::to_string(c.c_delivery_cnt));
+    mu->Put("cf0", "c_first", c.c_first);
+    mu->Put("cf0", "c_middle", c.c_middle);
+    mu->Put("cf0", "c_last", c.c_last);
+    mu->Put("cf0", "c_street_1", c.c_street_1);
+    mu->Put("cf0", "c_street_2", c.c_street_2);
+    mu->Put("cf0", "c_city", c.c_city);
+    mu->Put("cf0", "c_state", c.c_state);
+    mu->Put("cf0", "c_zip", c.c_zip);
+    mu->Put("cf0", "c_phone", c.c_phone);
+    mu->Put("cf0", "c_since", c.c_since);
+    mu->Put("cf0", "c_credit", c.c_credit);
+    mu->Put("cf0", "c_data", c.c_data);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Appends one HISTORY row and bumps the row counter kept in "t_history_index".
+//
+// The counter lives in row "count", column cf0:count. It is read inside the
+// transaction, incremented and written back, and the new value becomes the row
+// key of the history record in "t_history". A missing counter is treated as 0,
+// so the first record gets key "1".
+// Returns false when either table is not in table_map_, when the counter read
+// fails for a reason other than kNotFound, or when the transaction reports an
+// error after Commit(). The transaction object is released on every path.
+bool TeraTpccDb::InsertHistory(const History& h) {
+    std::string tablename = "t_history";
+    std::string history_index_name = "t_history_index";
+
+    if (table_map_.find(tablename) == table_map_.end() || 
+            table_map_.find(history_index_name) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Table* t_history_index = table_map_[history_index_name];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    // GetValues() deletes the reader on every path, so hindex_reader must not
+    // be touched after the call; the outcome is classified via gtxn instead.
+    RowReader* hindex_reader = t_history_index->NewRowReader("count");
+    RetTuples hindex_ret;
+    TxnResult ret;
+    int cnt = 0;
+    if (!GetValues(&ret, gtxn, hindex_reader,
+                   {"count"},
+                   &hindex_ret,
+                   "@insert_history|hindex_reader|count")) {
+        // NOTE(review): assumes a missing counter row surfaces as kNotFound on
+        // the transaction; confirm against the global transaction Get().
+        if (gtxn->GetError().GetType() != ErrorCode::kNotFound) {
+            LOG(ERROR) << "read table:" << history_index_name << " failed. err:"
+                       << gtxn->GetError().ToString();
+            delete gtxn;
+            return false;
+        }
+    } else {
+        // The read can succeed without returning the column; keep cnt at 0
+        // rather than feeding an empty string to std::stoi.
+        auto count_it = hindex_ret.find("count");
+        if (count_it != hindex_ret.end()) {
+            cnt = std::stoi(count_it->second);
+        }
+    }
+
+    RowMutation* hindex_mu = t_history_index->NewRowMutation("count");
+    hindex_mu->Put("cf0", "count", std::to_string(++cnt));
+    gtxn->ApplyMutation(hindex_mu);
+    delete hindex_mu;
+
+    RowMutation* mu = table->NewRowMutation(std::to_string(cnt));
+    mu->Put("cf0", "h_c_id", std::to_string(h.h_c_id));
+    mu->Put("cf0", "h_c_d_id", std::to_string(h.h_c_d_id));
+    mu->Put("cf0", "h_c_w_id", std::to_string(h.h_c_w_id));
+    mu->Put("cf0", "h_d_id", std::to_string(h.h_d_id));
+    mu->Put("cf0", "h_w_id", std::to_string(h.h_w_id));
+    mu->Put("cf0", "h_amount", std::to_string(h.h_amount));
+    mu->Put("cf0", "h_date", h.h_date);
+    mu->Put("cf0", "h_data", h.h_data);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one STOCK row into "t_stock" inside its own global transaction.
+// The entries of s.s_dist are stored as columns s_dist_1, s_dist_2, ... in
+// iteration order. Returns false when the table is not in table_map_ or the
+// transaction reports an error after Commit(); the error is logged.
+bool TeraTpccDb::InsertStock(const Stock& s) {
+    const std::string tablename = "t_stock";
+    auto table_it = table_map_.find(tablename);
+    if (table_it == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    RowMutation* row = table->NewRowMutation(s.PrimaryKey());
+    row->Put("cf0", "s_i_id", std::to_string(s.s_i_id));
+    row->Put("cf0", "s_w_id", std::to_string(s.s_w_id));
+    row->Put("cf0", "s_quantity", std::to_string(s.s_quantity));
+    row->Put("cf0", "s_ytd", std::to_string(s.s_ytd));
+    row->Put("cf0", "s_order_cnt", std::to_string(s.s_order_cnt));
+    row->Put("cf0", "s_remote_cnt", std::to_string(s.s_remote_cnt));
+    int dist_no = 1;  // column suffixes are 1-based
+    for (const auto& dist : s.s_dist) {
+        row->Put("cf0", "s_dist_" + std::to_string(dist_no), dist);
+        ++dist_no;
+    }
+    row->Put("cf0", "s_data", s.s_data);
+    gtxn->ApplyMutation(row);
+    gtxn->Commit();
+    delete row;
+
+    const bool committed = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!committed) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+    }
+    delete gtxn;
+    return committed;
+}
+
+// Writes one ORDER row into "t_order" and its index entry into
+// "t_order_index" in a single global transaction. The index key is
+// o.ForeignKey() + "_" + o_id. Returns false when either table is not in
+// table_map_ or the transaction reports an error after Commit(); the error
+// is logged.
+bool TeraTpccDb::InsertOrder(const Order& o) {
+    const std::string tablename = "t_order";
+    const std::string indexname = "t_order_index";
+    auto table_it = table_map_.find(tablename);
+    if (table_it == table_map_.end()) {
+        return false;
+    }
+    auto index_it = table_map_.find(indexname);
+    if (index_it == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    Table* index = index_it->second;
+
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* order_row = table->NewRowMutation(o.PrimaryKey());
+    const std::string index_key = o.ForeignKey() + "_" + std::to_string(o.o_id);
+    RowMutation* index_row = index->NewRowMutation(index_key);
+
+    index_row->Put("cf0", "o_id", std::to_string(o.o_id));
+    index_row->Put("cf0", "o_c_id", std::to_string(o.o_c_id));
+    index_row->Put("cf0", "o_d_id", std::to_string(o.o_d_id));
+    index_row->Put("cf0", "o_w_id", std::to_string(o.o_w_id));
+
+    order_row->Put("cf0", "o_id", std::to_string(o.o_id));
+    order_row->Put("cf0", "o_c_id", std::to_string(o.o_c_id));
+    order_row->Put("cf0", "o_d_id", std::to_string(o.o_d_id));
+    order_row->Put("cf0", "o_w_id", std::to_string(o.o_w_id));
+    order_row->Put("cf0", "o_carrier_id", std::to_string(o.o_carrier_id));
+    order_row->Put("cf0", "o_ol_cnt", std::to_string(o.o_ol_cnt));
+    order_row->Put("cf0", "o_all_local", std::to_string(o.o_all_local));
+    order_row->Put("cf0", "o_entry_d", o.o_entry_d);
+
+    gtxn->ApplyMutation(order_row);
+    gtxn->ApplyMutation(index_row);
+    delete order_row;
+    delete index_row;
+    gtxn->Commit();
+
+    const bool committed = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!committed) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+    }
+    delete gtxn;
+    return committed;
+}
+
+// Writes one ORDER-LINE row into "t_orderline" inside its own global
+// transaction. Returns false when the table is not in table_map_ or the
+// transaction reports an error after Commit(); the error is logged.
+bool TeraTpccDb::InsertOrderLine(const OrderLine& ol) {
+    const std::string tablename = "t_orderline";
+    auto table_it = table_map_.find(tablename);
+    if (table_it == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    RowMutation* row = table->NewRowMutation(ol.PrimaryKey());
+    row->Put("cf0", "ol_o_id", std::to_string(ol.ol_o_id));
+    row->Put("cf0", "ol_d_id", std::to_string(ol.ol_d_id));
+    row->Put("cf0", "ol_w_id", std::to_string(ol.ol_w_id));
+    row->Put("cf0", "ol_number", std::to_string(ol.ol_number));
+    row->Put("cf0", "ol_i_id", std::to_string(ol.ol_i_id));
+    row->Put("cf0", "ol_supply_w_id", std::to_string(ol.ol_supply_w_id));
+    row->Put("cf0", "ol_quantity", std::to_string(ol.ol_quantity));
+    row->Put("cf0", "ol_amount", std::to_string(ol.ol_amount));
+    row->Put("cf0", "ol_delivery_d", ol.ol_delivery_d);
+    row->Put("cf0", "ol_dist_info", ol.ol_dist_info);
+    gtxn->ApplyMutation(row);
+    gtxn->Commit();
+    delete row;
+
+    const bool committed = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!committed) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+    }
+    delete gtxn;
+    return committed;
+}
+
+// Writes one NEW-ORDER row into "t_neworder" inside its own global
+// transaction. Returns false when the table is not in table_map_ or the
+// transaction reports an error after Commit(); the error is logged.
+bool TeraTpccDb::InsertNewOrder(const NewOrder& no) {
+    const std::string tablename = "t_neworder";
+    auto table_it = table_map_.find(tablename);
+    if (table_it == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    RowMutation* row = table->NewRowMutation(no.PrimaryKey());
+    row->Put("cf0", "no_o_id", std::to_string(no.no_o_id));
+    row->Put("cf0", "no_d_id", std::to_string(no.no_d_id));
+    row->Put("cf0", "no_w_id", std::to_string(no.no_w_id));
+    gtxn->ApplyMutation(row);
+    gtxn->Commit();
+    delete row;
+
+    const bool committed = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!committed) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:" 
+                   << gtxn->GetError().ToString();
+    }
+    delete gtxn;
+    return committed;
+}
+
+// Copies the outcome of a transaction into *ret: the given state, and as the
+// reason the transaction's error reason, followed by " msg:" + msg when msg
+// is not empty.
+void TeraTpccDb::SetTxnResult(TxnResult* ret, Transaction* gtxn, bool state, 
+                              const std::string& msg) {
+    ret->SetState(state);
+    if (msg == "") {
+        ret->SetReason(gtxn->GetError().GetReason());
+        return;
+    }
+    ret->SetReason(gtxn->GetError().GetReason() + " msg:" + msg);
+}
+
+// Reads the listed cf0 columns of one row through the transaction.
+//
+// Every name in qu_names_initlist is added to the reader as a cf0 column and
+// the reader is passed to gtxn->Get(). On error *ret is marked failed with
+// if_error_msg and false is returned. On success, for each requested column
+// present in the result, the first value yielded by iterating its versions
+// is inserted into *ret_tuples under the column name.
+// The reader is deleted before returning on both paths, so callers must not
+// use it afterwards.
+bool TeraTpccDb::GetValues(TxnResult* ret, Transaction* gtxn, RowReader* reader,
+                           std::initializer_list<std::string> qu_names_initlist,
+                           RetTuples* ret_tuples,
+                           const std::string& if_error_msg) {
+    for (const std::string& column : qu_names_initlist) {
+        reader->AddColumn("cf0", column);
+    }
+    gtxn->Get(reader);
+
+    const bool read_ok = (gtxn->GetError().GetType() == ErrorCode::kOK);
+    if (!read_ok) {
+        SetTxnResult(ret, gtxn, false, if_error_msg);
+        delete reader;
+        return false;
+    }
+
+    RowReader::TRow row;
+    reader->ToMap(&row);
+    auto& cf0_columns = row["cf0"];
+    for (const std::string& column : qu_names_initlist) {
+        auto found = cf0_columns.find(column);
+        if (found == cf0_columns.end()) {
+            continue;
+        }
+        auto first_version = found->second.begin();
+        if (first_version != found->second.end()) {
+            ret_tuples->insert({{column, first_version->second}});
+        }
+    }
+    delete reader;
+    return true;
+}
+
+// Resolves a customer and reads its row through the transaction.
+//
+// *customer_key is always rebuilt as <warehouse_id>_<district_id>_<c_id>.
+// When by_last_name is false, c_id is customer_id. When it is true, the
+// last-name index is scanned for keys starting with
+// <warehouse_id>_<district_id>_<last_name>_, the c_id of every hit is
+// collected in scan order, and the middle one is chosen (index n/2 - 1 for an
+// even count n, n/2 for an odd count).
+// On success the customer columns are stored in *customer_ret and true is
+// returned. On any failure (scan could not start, no customer with that last
+// name, or a read error) *ret is marked failed and false is returned.
+bool TeraTpccDb::GetCustomer(TxnResult* ret, Transaction* gtxn, bool by_last_name, 
+                             const std::string& last_name, int32_t customer_id,
+                             int32_t warehouse_id, int32_t district_id,
+                             std::string* customer_key, RetTuples* customer_ret) {
+    // open table
+    Table* t_customer_last_index = table_map_[kTpccTables[kCustomerLastIndex]];
+    Table* t_customer = table_map_[kTpccTables[kCustomerTable]];
+    *customer_key = std::to_string(warehouse_id) + "_" + std::to_string(district_id) + "_";
+
+    if (by_last_name) {
+        ErrorCode error_code;
+        std::string start_key = *customer_key + last_name + "_";
+        ScanDescriptor scan_desc(start_key);
+        scan_desc.SetEnd(start_key + "~");
+        scan_desc.AddColumnFamily("cf0");
+        ResultStream* scanner = t_customer_last_index->Scan(scan_desc, &error_code);
+        if (scanner == NULL) {
+            // Without a stream there is nothing to iterate; report instead of
+            // dereferencing NULL.
+            SetTxnResult(ret, gtxn, false,
+                         "@get_customer|scan|" + start_key + "|" + error_code.ToString());
+            return false;
+        }
+        std::vector<std::string> keys;
+        for (scanner->LookUp(start_key); !scanner->Done(); scanner->Next()) {
+            std::string row_key = scanner->RowName();
+            if (row_key.find(start_key) == std::string::npos) {
+                break;
+            }
+
+            // GetValues() deletes index_reader on both outcomes.
+            RowReader* index_reader = t_customer_last_index->NewRowReader(row_key);
+            RetTuples index_ret;
+            if (!GetValues(ret, gtxn, index_reader, 
+                           {"c_id"},
+                           &index_ret, 
+                           "@get_customer|index_reader|" + row_key)) {
+                delete scanner;
+                return false;
+            }
+            keys.push_back(index_ret["c_id"]);
+        }
+        delete scanner;
+        if (keys.empty()) {
+            // The position arithmetic below would underflow for zero hits and
+            // keys.at() would throw; fail the lookup cleanly instead.
+            SetTxnResult(ret, gtxn, false,
+                         "@get_customer|no customer with last name|" + start_key);
+            return false;
+        }
+        size_t pos = keys.size();
+        pos = pos % 2 == 0 ? (pos / 2 - 1) : (pos / 2);
+        *customer_key += keys.at(pos);
+    } else {
+        *customer_key += std::to_string(customer_id); 
+    }
+    // GetValues() deletes customer_reader on both outcomes.
+    RowReader* customer_reader = t_customer->NewRowReader(*customer_key);
+    if (!GetValues(ret, gtxn, customer_reader,
+                   {"c_id", "c_d_id", "c_w_id", "c_first", "c_middle", "c_last",
+                    "c_balance", "c_ytd_payment", "c_payment_cnt", "c_credit", 
+                    "c_data", "c_street_1", "c_street_2", "c_city", "c_state",
+                    "c_zip", "c_phone", "c_since", "c_credit_lim", "c_discount"},
+                   customer_ret,
+                   "@get_customer|customer_reader" + *customer_key)) {
+        return false;
+    }
+    return true;
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/tera_tpccdb.h b/src/benchmark/tpcc/tera_tpccdb.h
new file mode 100644
index 000000000..a300166b0
--- /dev/null
+++ b/src/benchmark/tpcc/tera_tpccdb.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_TERA_TPCCDB_H
+#define TERA_BENCHMARK_TPCC_TERA_TPCCDB_H
+
+#include "tera.h"
+#include "benchmark/tpcc/tpccdb.h"
+
+namespace tera {
+namespace tpcc {
+
+class TpccDb;
+class TxnResult;
+
+// TpccDb implementation backed by a Tera cluster. Owns one Client and keeps
+// the opened tables in table_map_, keyed by table name.
+class TeraTpccDb : public TpccDb {
+public:
+    TeraTpccDb();
+    virtual ~TeraTpccDb();
+
+    // Table lifecycle.
+    virtual bool CreateTables();
+    virtual bool CleanTables();
+
+    // Initial data load: one row per call.
+    virtual bool InsertItem(const Item& i);
+    virtual bool InsertWarehouse(const Warehouse& w);
+    virtual bool InsertDistrict(const District& d);
+    virtual bool InsertCustomer(const Customer& c);
+    virtual bool InsertHistory(const History& h);
+    virtual bool InsertStock(const Stock& s);
+    virtual bool InsertOrder(const Order& o);
+    virtual bool InsertOrderLine(const OrderLine& ol);
+    virtual bool InsertNewOrder(const NewOrder& no);
+
+    // The five TPC-C transactions; each reports its outcome through *ret.
+    virtual void StockLevelTxn(int32_t warehouse_id,
+                               int32_t district_id,
+                               int32_t threshold,
+                               StockLevelResult* ret);
+
+    virtual void DeliveryTxn(int32_t warehouse_id,
+                             int32_t carrier_id,
+                             const std::string& delivery_datetime,
+                             DeliveryResult* ret);
+
+    virtual void OrderStatusTxn(bool by_last_name,
+                                int32_t warehouse_id,
+                                int32_t district_id,
+                                int32_t c_customer_id,
+                                const std::string& last_name,
+                                OrderStatusResult* ret);
+
+    virtual void PaymentTxn(bool by_last_name,
+                            int32_t warehouse_id,
+                            int32_t district_id,
+                            int32_t c_warehouse_id,
+                            int32_t c_district_id,
+                            int32_t c_customer_id,
+                            const std::string& last_name,
+                            int32_t h_amount,
+                            PaymentResult* ret);
+
+    virtual void NewOrderTxn(int32_t warehouse_id,
+                             int32_t district_id,
+                             int32_t customer_id,
+                             const NewOrderInfo& info,
+                             NewOrderResult* ret);
+
+private:
+    // Stores `state` and the transaction's error reason (plus msg) in *ret.
+    void SetTxnResult(TxnResult* ret,
+                      Transaction* gtxn,
+                      bool state = true,
+                      const std::string& msg = "");
+
+    // Reads the named cf0 columns of reader's row via gtxn into *ret_tuples.
+    // Deletes reader before returning.
+    bool GetValues(TxnResult* ret,
+                   Transaction* gtxn,
+                   RowReader* reader,
+                   std::initializer_list<std::string> qu_names_initlist,
+                   RetTuples* ret_tuples,
+                   const std::string& if_error_msg);
+
+    // Finds a customer by id or by last name and reads its row.
+    bool GetCustomer(TxnResult* ret,
+                     Transaction* gtxn,
+                     bool by_last_name,
+                     const std::string& last_name,
+                     int32_t customer_id,
+                     int32_t warehouse_id,
+                     int32_t district_id,
+                     std::string* customer_key,
+                     RetTuples* customer_ret);
+
+    // NOTE(review): presumably merges the four inputs into *payment_ret for
+    // PaymentTxn; the definition is not visible here, confirm.
+    void SetPaymentSingleLineRet(const RetTuples& warehouse_ret,
+                                 const RetTuples& district_ret,
+                                 const RetTuples& customer_ret,
+                                 const RetTuples& other_ret,
+                                 RetTuples* payment_ret);
+
+    Client* client_;
+    std::unordered_map<std::string, Table*> table_map_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_TERA_TPCCDB_H */
diff --git a/src/benchmark/tpcc/tera_txn/delivery_txn.cc b/src/benchmark/tpcc/tera_txn/delivery_txn.cc
new file mode 100644
index 000000000..d1a7a3e18
--- /dev/null
+++ b/src/benchmark/tpcc/tera_txn/delivery_txn.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+#include <memory>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+namespace tera {
+namespace tpcc {
+
+// TPC-C Delivery transaction for one warehouse.
+//
+// For each district 1..kDistrictCountPerWarehouse the oldest (lowest id)
+// NEW-ORDER row is located by scanning the <warehouse>_<district>_ key
+// prefix; districts without one are skipped. For the selected order, inside
+// one global transaction:
+//   - the NEW-ORDER row's columns are deleted,
+//   - O_CARRIER_ID of the ORDER row is set to carrier_id,
+//   - OL_DELIVERY_D of every ORDER-LINE row is set to delivery_datetime and
+//     the OL_AMOUNT values are summed,
+//   - the customer's C_BALANCE grows by that sum and C_DELIVERY_CNT by one.
+// Everything is committed once after the last district. *ret receives the
+// final state; on a failed read or scan it is marked failed and the function
+// returns without committing.
+// The transaction object is owned by a unique_ptr so that it is released on
+// every exit path.
+void TeraTpccDb::DeliveryTxn(int32_t warehouse_id, 
+                             int32_t carrier_id, 
+                             const std::string& delivery_datetime,
+                             DeliveryResult* ret) {
+    // open table
+    Table* t_neworder = table_map_[kTpccTables[kNewOrderTable]];
+    Table* t_order = table_map_[kTpccTables[kOrderTable]];
+    Table* t_orderline = table_map_[kTpccTables[kOrderLineTable]];
+    Table* t_customer = table_map_[kTpccTables[kCustomerTable]];
+    // begin transaction
+    std::unique_ptr<Transaction> gtxn(client_->NewGlobalTransaction());
+    for (int32_t district_id = 1; district_id <= kDistrictCountPerWarehouse; ++district_id) {
+        // The row in the NEW-ORDER table with matching NO_W_ID (equals W_ID) 
+        // and NO_D_ID (equals D_ID) and with the lowest NO_O_ID value is selected.
+        ErrorCode error_code;
+        std::string start_key = std::to_string(warehouse_id) + "_" + std::to_string(district_id) + "_";
+        ScanDescriptor scan_desc(start_key);
+        scan_desc.SetEnd(start_key + "~");
+        scan_desc.AddColumnFamily("cf0");
+        tera::ResultStream* scanner = t_neworder->Scan(scan_desc, &error_code);
+        if (scanner == NULL) {
+            // No stream to iterate: report the failure instead of
+            // dereferencing NULL.
+            SetTxnResult(ret, gtxn.get(), false,
+                         "@delivery|scan|" + start_key + "|" + error_code.ToString());
+            return;
+        }
+        bool not_new_order = false;
+        int32_t order_id = INT32_MAX;
+        for (scanner->LookUp(start_key); !scanner->Done(); scanner->Next()) {
+            std::string row_key = scanner->RowName();
+            if (row_key.find(start_key) == std::string::npos) {
+                not_new_order = true;
+                break;
+            }
+            // Row keys end in "_<order id>"; keep the numerically smallest id.
+            std::size_t found = row_key.find_last_of("_");
+            int32_t found_order_id = std::stoi(row_key.substr(found + 1));
+            if (order_id > found_order_id) {
+                order_id = found_order_id;
+            }
+        }
+        delete scanner;
+        // If no matching row is found, then the delivery of an order 
+        // for this district is skipped.
+        if (not_new_order || order_id == INT32_MAX) {
+            continue;
+        }
+
+        // The selected row in the NEW-ORDER table is deleted
+        // (GetValues() deletes each reader passed to it.)
+        std::string no_primary_key = start_key + std::to_string(order_id);
+        RowReader* no_reader = t_neworder->NewRowReader(no_primary_key);
+        RetTuples no_ret;
+        if (!GetValues(ret, gtxn.get(), no_reader, 
+                       {"no_o_id"},
+                       &no_ret, 
+                       "@delivery|no_reader|" + no_primary_key)) {
+            return;
+        }
+
+        RowMutation* no_mu = t_neworder->NewRowMutation(no_primary_key);
+        no_mu->DeleteColumns("cf0", "no_o_id", gtxn->GetStartTimestamp());
+        no_mu->DeleteColumns("cf0", "no_d_id", gtxn->GetStartTimestamp());
+        no_mu->DeleteColumns("cf0", "no_w_id", gtxn->GetStartTimestamp());
+        gtxn->ApplyMutation(no_mu);
+        delete no_mu;
+
+        // The row in the ORDER table with matching 
+        // O_W_ID (equals W_ID), O_D_ID (equals D_ID), and O_ID (equals NO_O_ID) 
+        // is selected, O_C_ID, the customer number, is retrieved, 
+        // and O_CARRIER_ID is updated.
+        std::string order_primary_key = no_primary_key;
+        RowReader* order_reader = t_order->NewRowReader(order_primary_key);
+        RetTuples order_ret;
+        if (!GetValues(ret, gtxn.get(), order_reader, 
+                       {"o_carrier_id", "o_ol_cnt", "o_c_id"},
+                       &order_ret, 
+                       "@delivery|order_reader|" + order_primary_key)) {
+            return;
+        }
+        RowMutation* order_mu = t_order->NewRowMutation(order_primary_key);
+        order_mu->Put("cf0", "o_carrier_id", std::to_string(carrier_id));
+        gtxn->ApplyMutation(order_mu);
+        delete order_mu;
+
+        int32_t o_ol_cnt = std::stoi(order_ret["o_ol_cnt"]);
+        // the sum of all OL_AMOUNT.
+        float amount = 0.0f;
+        // All rows in the ORDER-LINE table with matching 
+        // OL_W_ID (= O_W_ID), OL_D_ID (= O_D_ID), and OL_O_ID (= O_ID) are selected. 
+        for (int32_t ol_number = 1; ol_number <= o_ol_cnt; ++ ol_number) {
+            std::string ol_key = order_primary_key + "_" + std::to_string(ol_number);
+            RowReader* ol_reader = t_orderline->NewRowReader(ol_key);
+            RetTuples ol_ret;
+            if (!GetValues(ret, gtxn.get(), ol_reader, 
+                           {"ol_amount", "ol_delivery_d"},
+                           &ol_ret, 
+                           "@delivery|ol_reader|" + ol_key)) {
+                return;
+            }
+            amount += std::stof(ol_ret["ol_amount"]);
+            RowMutation* ol_mu = t_orderline->NewRowMutation(ol_key);
+            // All OL_DELIVERY_D, the delivery dates, 
+            // are updated to the current system time as returned by the OS 
+            ol_mu->Put("cf0","ol_delivery_d",delivery_datetime);
+            gtxn->ApplyMutation(ol_mu);
+            delete ol_mu;
+        }
+
+        // The row in the CUSTOMER table with matching 
+        // C_W_ID (= W_ID), C_D_ID (= D_ID), and C_ID (= O_C_ID) is selected 
+        std::string customer_key = start_key + order_ret["o_c_id"];
+        RowReader* customer_reader = t_customer->NewRowReader(customer_key);
+        RetTuples customer_ret;
+        if (!GetValues(ret, gtxn.get(), customer_reader,
+                       {"c_balance", "c_delivery_cnt"}, 
+                       &customer_ret,
+                       "@delivery|customer_reader" + customer_key)) {
+            return;
+        }
+        // and C_BALANCE + sum(OL_AMOUNT) previously retrieved. C_DELIVERY_CNT + 1.
+        RowMutation* customer_mu = t_customer->NewRowMutation(customer_key);
+        customer_mu->Put("cf0", "c_balance", 
+                std::to_string(std::stof(customer_ret["c_balance"]) + amount));
+        customer_mu->Put("cf0", "c_delivery_cnt", 
+                std::to_string(std::stoi(customer_ret["c_delivery_cnt"]) + 1));
+        gtxn->ApplyMutation(customer_mu);
+        delete customer_mu;
+    }
+    gtxn->Commit();
+    SetTxnResult(ret, gtxn.get(), gtxn->GetError().GetType() == ErrorCode::kOK);
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/tera_txn/new_order_txn.cc b/src/benchmark/tpcc/tera_txn/new_order_txn.cc
new file mode 100644
index 000000000..df4100824
--- /dev/null
+++ b/src/benchmark/tpcc/tera_txn/new_order_txn.cc
@@ -0,0 +1,214 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+namespace tera {
+namespace tpcc {
+
+// TPC-C New-Order: reads warehouse/district/customer, allocates the next order id and writes the
+// order, order-index, new-order, order-line and stock rows; commits unless info.need_failed.
+void TeraTpccDb::NewOrderTxn(int32_t warehouse_id, int32_t district_id, int32_t customer_id,
+                             const NewOrderInfo& info, NewOrderResult* ret) {
+    // open table
+    Table* t_warehouse = table_map_[kTpccTables[kWarehouseTable]];
+    Table* t_district = table_map_[kTpccTables[kDistrictTable]];
+    Table* t_customer = table_map_[kTpccTables[kCustomerTable]];
+    Table* t_order = table_map_[kTpccTables[kOrderTable]];
+    Table* t_order_index = table_map_[kTpccTables[kOrderIndex]];
+    Table* t_neworder = table_map_[kTpccTables[kNewOrderTable]];
+    Table* t_orderline = table_map_[kTpccTables[kOrderLineTable]];
+    Table* t_item = table_map_[kTpccTables[kItemTable]];
+    Table* t_stock = table_map_[kTpccTables[kStockTable]];
+    // begin transaction
+    std::unique_ptr<Transaction> gtxn(client_->NewGlobalTransaction());
+    std::string datetime = get_curtime_str();
+    std::string warehouse_key = std::to_string(warehouse_id);
+    std::string district_key = warehouse_key + "_" + std::to_string(district_id);
+    std::string customer_key = district_key + "_" + std::to_string(customer_id);
+    // read the warehouse tax rate
+    RowReader* warehouse_reader = t_warehouse->NewRowReader(warehouse_key);
+    RetTuples warehouse_ret;
+    if (!GetValues(ret, gtxn.get(), warehouse_reader,
+                   {"w_tax"},
+                   &warehouse_ret,
+                   "@new_order|warehouse_reader|" + warehouse_key)) {
+        return;
+    }
+    // read the district; the new order takes the stored d_next_o_id + 1 as its id
+    RowReader* district_reader = t_district->NewRowReader(district_key);
+    RetTuples district_ret;
+    if (!GetValues(ret, gtxn.get(), district_reader,
+                   {"d_next_o_id", "d_tax"},
+                   &district_ret,
+                   "@new_order|district_reader|" + district_key)) {
+        return;
+    }
+    std::string d_next_o_id_str = std::to_string(std::stoi(district_ret["d_next_o_id"]) + 1);
+
+    RowReader* customer_reader = t_customer->NewRowReader(customer_key);
+    RetTuples customer_ret;
+    if (!GetValues(ret, gtxn.get(), customer_reader,
+                   {"c_discount", "c_credit", "c_last"},
+                   &customer_ret,
+                   "@new_order|customer_reader|" + customer_key)) {
+        return;
+    }
+    // store the advanced order counter back into the district row
+    RowMutation* district_mu = t_district->NewRowMutation(district_key);
+    district_mu->Put("cf0", "d_next_o_id", d_next_o_id_str);
+    gtxn->ApplyMutation(district_mu);
+    delete district_mu;
+    // write the order row and the order-index row (the index is keyed by customer + order id)
+    std::string order_key = district_key + "_" + d_next_o_id_str;
+    RowMutation* order_mu = t_order->NewRowMutation(order_key);
+    std::string order_index_key = customer_key + "_" + d_next_o_id_str;
+    RowMutation* order_index_mu = t_order_index->NewRowMutation(order_index_key);
+    order_index_mu->Put("cf0", "o_id", d_next_o_id_str);
+    order_index_mu->Put("cf0", "o_c_id", std::to_string(customer_id));
+    order_index_mu->Put("cf0", "o_d_id", std::to_string(district_id));
+    order_index_mu->Put("cf0", "o_w_id", warehouse_key);
+    order_mu->Put("cf0", "o_id", d_next_o_id_str);
+    order_mu->Put("cf0", "o_c_id", std::to_string(customer_id));
+    order_mu->Put("cf0", "o_d_id", std::to_string(district_id));
+    order_mu->Put("cf0", "o_w_id", warehouse_key);
+    order_mu->Put("cf0", "o_carrier_id", std::to_string(0));
+    order_mu->Put("cf0", "o_ol_cnt", std::to_string(info.o_ol_cnt));
+    order_mu->Put("cf0", "o_all_local", std::to_string(info.o_all_local));
+    order_mu->Put("cf0", "o_entry_d", datetime);
+    gtxn->ApplyMutation(order_mu);
+    gtxn->ApplyMutation(order_index_mu);
+    delete order_mu;
+    delete order_index_mu;
+
+    RowMutation* no_mu = t_neworder->NewRowMutation(order_key);
+    no_mu->Put("cf0", "no_o_id", d_next_o_id_str);
+    no_mu->Put("cf0", "no_d_id", std::to_string(district_id));
+    no_mu->Put("cf0", "no_w_id", warehouse_key);
+    gtxn->ApplyMutation(no_mu);
+    delete no_mu;
+    // name of the stock column carrying this district's dist info (s_dist_0<d> or s_dist_10)
+    std::string ol_dist_info_key;
+    if (district_id == kDistrictCountPerWarehouse) {
+        ol_dist_info_key = "s_dist_10";
+    } else {
+        ol_dist_info_key = "s_dist_0" + std::to_string(district_id);
+    }
+
+    float ol_amount_sum = 0;
+    for (int32_t i = 0; i < info.o_ol_cnt; ++i) {
+        int32_t i_id = info.ol_i_ids[i];
+        std::string item_key = std::to_string(i_id);
+        RowReader* item_reader = t_item->NewRowReader(item_key);
+        RetTuples item_ret;
+        if (!GetValues(ret, gtxn.get(), item_reader,
+                       {"i_price", "i_name", "i_data"},
+                       &item_ret,
+                       "@new_order|item_reader|" + item_key)) {
+            return;
+        }
+        // the stock row lives in the stock table, keyed by "<supply warehouse>_<item>"
+        std::string ol_supply_w_id_str = std::to_string(info.ol_supply_w_ids[i]);
+        std::string stock_key = ol_supply_w_id_str + "_" + item_key;
+        RowReader* stock_reader = t_stock->NewRowReader(stock_key);
+        RetTuples stock_ret;
+        if (!GetValues(ret, gtxn.get(), stock_reader,
+                       {"s_quantity", "s_ytd", "s_order_cnt", "s_remote_cnt", "s_data", ol_dist_info_key},
+                       &stock_ret,
+                       "@new_order|stock_reader|" + stock_key)) {
+            return;
+        }
+
+        int32_t ol_quantity = info.ol_quantities[i];
+        float ol_amount = std::stof(item_ret["i_price"]) * ol_quantity;
+        ol_amount_sum += ol_amount;
+        std::string ol_number_str = std::to_string(i + 1);
+        std::string ol_key = order_key + "_" + ol_number_str;
+        RowMutation* ol_mu = t_orderline->NewRowMutation(ol_key);
+        ol_mu->Put("cf0", "ol_o_id", d_next_o_id_str);
+        ol_mu->Put("cf0", "ol_d_id", std::to_string(district_id));
+        ol_mu->Put("cf0", "ol_w_id", warehouse_key);
+        ol_mu->Put("cf0", "ol_number", ol_number_str);
+        ol_mu->Put("cf0", "ol_i_id", item_key);
+        ol_mu->Put("cf0", "ol_supply_w_id", ol_supply_w_id_str);
+        ol_mu->Put("cf0", "ol_delivery_d", "");
+        ol_mu->Put("cf0", "ol_quantity", std::to_string(ol_quantity));
+        ol_mu->Put("cf0", "ol_amount", std::to_string(ol_amount));
+        ol_mu->Put("cf0", "ol_dist_info", stock_ret[ol_dist_info_key]);
+        gtxn->ApplyMutation(ol_mu);
+        delete ol_mu;
+        // update stock (TPC-C 2.4.2.2: subtract when S_QUANTITY exceeds OL_QUANTITY by 10 or more)
+        int32_t s_quantity = std::stoi(stock_ret["s_quantity"]);
+        if (s_quantity >= ol_quantity + 10) {
+            s_quantity -= ol_quantity;
+        } else {
+            s_quantity = (s_quantity - ol_quantity) + 91;
+        }
+        float s_ytd = std::stof(stock_ret["s_ytd"]) + ol_quantity;  // accumulate onto the stored s_ytd
+        int32_t s_order_cnt = std::stoi(stock_ret["s_order_cnt"]) + 1;
+        int32_t s_remote_cnt = std::stoi(stock_ret["s_remote_cnt"]);
+        if (info.ol_supply_w_ids[i] != warehouse_id) {
+            ++s_remote_cnt;
+        }
+        RowMutation* stock_mu = t_stock->NewRowMutation(stock_key);
+        stock_mu->Put("cf0", "s_quantity", std::to_string(s_quantity));
+        stock_mu->Put("cf0", "s_ytd", std::to_string(s_ytd));
+        stock_mu->Put("cf0", "s_order_cnt", std::to_string(s_order_cnt));
+        stock_mu->Put("cf0", "s_remote_cnt", std::to_string(s_remote_cnt));
+        gtxn->ApplyMutation(stock_mu);
+        delete stock_mu;
+
+        // set result
+        RetTuples line;
+        line["ol_supply_w_id"] = ol_supply_w_id_str;
+        line["ol_i_id"] = item_key;
+        line["i_name"] = item_ret["i_name"];
+        line["ol_quantity"] = std::to_string(ol_quantity);
+        line["s_quantity"] = std::to_string(s_quantity);
+        line["i_price"] = item_ret["i_price"];
+        line["ol_amount"] = std::to_string(ol_amount);
+        std::string i_data = item_ret["i_data"];
+        std::string s_data = stock_ret["s_data"];  // s_data is a stock column, not an item column
+        if (i_data.find("ORIGINAL") != std::string::npos &&
+                s_data.find("ORIGINAL") != std::string::npos) {
+            line["brand_generic"] = "B";
+        } else {
+            line["brand_generic"] = "G";
+        }
+        ret->AddLine(line);
+    }
+    if (!info.need_failed) {
+        RetTuples single_line;
+        single_line["o_id"] = d_next_o_id_str;
+        single_line["o_ol_cnt"] = std::to_string(info.o_ol_cnt);
+        single_line["c_last"] = customer_ret["c_last"];
+        single_line["c_credit"] = customer_ret["c_credit"];
+        single_line["c_discount"] = customer_ret["c_discount"];
+        single_line["w_tax"] = warehouse_ret["w_tax"];
+        single_line["d_tax"] = district_ret["d_tax"];
+        single_line["o_entry_d"] = datetime;
+        float c_discount = std::stof(customer_ret["c_discount"]);
+        float w_tax = std::stof(warehouse_ret["w_tax"]);
+        float d_tax = std::stof(district_ret["d_tax"]);
+        float total_amount = ol_amount_sum * (1 - c_discount) * (1 + w_tax + d_tax);
+        single_line["total_amount"] = std::to_string(total_amount);
+        ret->SetSingleLine(single_line);
+        gtxn->Commit();
+        SetTxnResult(ret, gtxn.get());
+    } else {
+        // set commit failed
+        SetTxnResult(ret, gtxn.get(), false, "@new_order|rowback simulation");
+    }
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/tera_txn/order_status_txn.cc b/src/benchmark/tpcc/tera_txn/order_status_txn.cc
new file mode 100644
index 000000000..a88fe7e0c
--- /dev/null
+++ b/src/benchmark/tpcc/tera_txn/order_status_txn.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+namespace tera {
+namespace tpcc {
+
+// TPC-C Order-Status: resolves the customer, finds the highest order id among that customer's
+// order-index rows, then reads that order and each of its order lines. Nothing is written.
+void TeraTpccDb::OrderStatusTxn(bool by_last_name, int32_t warehouse_id, int32_t district_id,
+                                int32_t c_customer_id, const std::string& last_name,
+                                OrderStatusResult* ret) {
+    // open table
+    Table* t_order_index = table_map_[kTpccTables[kOrderIndex]];
+    Table* t_orderline = table_map_[kTpccTables[kOrderLineTable]];
+    Table* t_order = table_map_[kTpccTables[kOrderTable]];
+    // begin transaction
+    std::unique_ptr<Transaction> gtxn(client_->NewGlobalTransaction());
+    std::string customer_key = "";
+    RetTuples customer_ret;
+    if (!GetCustomer(ret, gtxn.get(), by_last_name, last_name, c_customer_id,
+                warehouse_id, district_id, &customer_key, &customer_ret)) {
+        return;
+    }
+
+    // find newest order from order index
+    ErrorCode error_code;
+    std::string prefix_key = std::to_string(warehouse_id) + "_" + std::to_string(district_id) + "_";
+    std::string start_key = prefix_key + customer_ret["c_id"] + "_";
+    ScanDescriptor scan_desc(start_key);
+    scan_desc.SetEnd(start_key + "~");
+    scan_desc.AddColumnFamily("cf0");
+    // held in a unique_ptr so that every exit path below releases the scanner
+    std::unique_ptr<ResultStream> scanner(t_order_index->Scan(scan_desc, &error_code));
+    if (!scanner) {
+        SetTxnResult(ret, gtxn.get(), false, "@order_status|scan order index failed|" + start_key);
+        return;
+    }
+    int32_t max_order_id = -1;
+    for (scanner->LookUp(start_key); !scanner->Done(); scanner->Next()) {
+        std::string row_key = scanner->RowName();
+        RowReader* index_reader = t_order_index->NewRowReader(row_key);
+        RetTuples index_ret;
+        if (!GetValues(ret, gtxn.get(), index_reader, {"o_id"}, &index_ret,
+                       "@order_status|order_index_reader|" + row_key)) {
+            return;  // a failed read ends the transaction instead of continuing half-scanned
+        }
+        const int32_t o_id = std::stoi(index_ret["o_id"]);
+        if (max_order_id < o_id) {
+            max_order_id = o_id;
+        }
+    }
+    scanner.reset();
+    if (max_order_id == -1) {
+        SetTxnResult(ret, gtxn.get(), false, "not found order|" + start_key);
+        return;
+    }
+    std::string order_key = prefix_key + std::to_string(max_order_id);
+    RowReader* order_reader = t_order->NewRowReader(order_key);
+    RetTuples order_ret;
+    if (!GetValues(ret, gtxn.get(), order_reader, {"o_ol_cnt", "o_id"}, &order_ret,
+                   "@order_status|order_reader|" + order_key)) {
+        return;
+    }
+    const int32_t o_ol_cnt = std::stoi(order_ret["o_ol_cnt"]);  // parse the line count once
+    for (int32_t i = 1; i <= o_ol_cnt; ++i) {
+        std::string ol_key = prefix_key + order_ret["o_id"] + "_" + std::to_string(i);
+        RowReader* ol_reader = t_orderline->NewRowReader(ol_key);
+        RetTuples ol_ret;
+        if (!GetValues(ret, gtxn.get(), ol_reader, {} /* TODO */, &ol_ret,
+                       "@order_status|ol_reader|" + ol_key)) {
+            return;
+        }
+    }
+    SetTxnResult(ret, gtxn.get());
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/tera_txn/payment_txn.cc b/src/benchmark/tpcc/tera_txn/payment_txn.cc
new file mode 100644
index 000000000..c45d371bd
--- /dev/null
+++ b/src/benchmark/tpcc/tera_txn/payment_txn.cc
@@ -0,0 +1,194 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+namespace tera {
+namespace tpcc {
+
+// TPC-C Payment: adds h_amount to the warehouse and district w_ytd/d_ytd values, updates the
+// customer's balance and payment counters, and writes one row into the history table.
+void TeraTpccDb::PaymentTxn(bool by_last_name,
+                            int32_t warehouse_id, int32_t district_id,
+                            int32_t customer_warehouse_id, int32_t customer_district_id,
+                            int32_t c_customer_id, const std::string& last_name,
+                            int32_t h_amount, PaymentResult* ret) {
+    // open table
+    Table* t_warehouse = table_map_[kTpccTables[kWarehouseTable]];
+    Table* t_district = table_map_[kTpccTables[kDistrictTable]];
+    Table* t_customer = table_map_[kTpccTables[kCustomerTable]];
+    Table* t_history = table_map_[kTpccTables[kHistoryTable]];
+    Table* t_history_index = table_map_[kTpccTables[kHistoryIndex]];
+
+    // begin transaction (owned here so that every return path below releases it)
+    std::unique_ptr<Transaction> gtxn(client_->NewGlobalTransaction());
+
+    // read customer
+    std::string customer_key = "";
+    RetTuples customer_ret;
+    if (!GetCustomer(ret, gtxn.get(), by_last_name, last_name, c_customer_id,
+                customer_warehouse_id, customer_district_id, &customer_key, &customer_ret)) {
+        return;
+    }
+
+    // read warehouse
+    std::string warehouse_key = std::to_string(warehouse_id);
+    RowReader* warehouse_reader = t_warehouse->NewRowReader(warehouse_key);
+    RetTuples warehouse_ret;
+    if (!GetValues(ret, gtxn.get(), warehouse_reader,
+                   {"w_ytd", "w_name", "w_street_1", "w_street_2", "w_city", "w_state", "w_zip"},
+                   &warehouse_ret,
+                   "@payment|warehouse_reader|" + warehouse_key)) {
+        return;
+    }
+
+    // update warehouse
+    RowMutation* warehouse_mu = t_warehouse->NewRowMutation(warehouse_key);
+    // add amount of this payment to the ytd balance of current warehouse.
+    float w_ytd = std::stof(warehouse_ret["w_ytd"]) + h_amount;
+    warehouse_mu->Put("cf0", "w_ytd", std::to_string(w_ytd));
+    gtxn->ApplyMutation(warehouse_mu);
+    delete warehouse_mu;
+
+    // read district
+    std::string district_id_str = std::to_string(district_id);
+    std::string district_key = warehouse_key + "_" + district_id_str;
+    RowReader* district_reader = t_district->NewRowReader(district_key);
+    RetTuples district_ret;
+    if (!GetValues(ret, gtxn.get(), district_reader,
+                   {"d_ytd", "d_name", "d_street_1", "d_street_2", "d_city", "d_state", "d_zip"},
+                   &district_ret,
+                   "@payment|district_reader|" + district_key)) {
+        return;
+    }
+
+    // update district
+    RowMutation* district_mu = t_district->NewRowMutation(district_key);
+    // add amount of this payment to the ytd balance of current district.
+    float d_ytd = std::stof(district_ret["d_ytd"]) + h_amount;
+    district_mu->Put("cf0", "d_ytd", std::to_string(d_ytd));
+    gtxn->ApplyMutation(district_mu);
+    delete district_mu;
+
+    // update customer
+    // [Revision 5.11 - Page 34] see Clause 2.5.2.2
+    // C_BALANCE is decreased by H_AMOUNT.
+    // C_YTD_PAYMENT is increased by H_AMOUNT.
+    // C_PAYMENT_CNT is incremented by 1.
+    RowMutation* customer_mu = t_customer->NewRowMutation(customer_key);
+    std::string c_balance_str = std::to_string(std::stof(customer_ret["c_balance"]) - h_amount);
+    customer_mu->Put("cf0", "c_balance", c_balance_str);
+    customer_mu->Put("cf0", "c_ytd_payment",
+            std::to_string(std::stof(customer_ret["c_ytd_payment"]) + h_amount));
+    customer_mu->Put("cf0", "c_payment_cnt",
+            std::to_string(std::stoi(customer_ret["c_payment_cnt"]) + 1));
+    // for C_CREDIT == "BC", prepend this payment's info to c_data, capped at kCustomerDataUpperLen
+    if (customer_ret["c_credit"] == "BC") {
+        std::string data_info = customer_key + "_" + district_key + "_" + std::to_string(h_amount);
+        customer_ret["c_data"].insert(0, data_info);
+        if (customer_ret["c_data"].size() > kCustomerDataUpperLen) {
+            customer_ret["c_data"].resize(kCustomerDataUpperLen);  // truncate in place
+        }
+        customer_mu->Put("cf0", "c_data", customer_ret["c_data"]);
+    }
+    gtxn->ApplyMutation(customer_mu);
+    delete customer_mu;
+
+    // read history_index (find newest history)
+    std::string history_data = warehouse_ret["w_name"] + "    " + district_ret["d_name"];
+    RowReader* hindex_reader = t_history_index->NewRowReader("count");
+    RetTuples hindex_ret;
+    if (!GetValues(ret, gtxn.get(), hindex_reader,
+                   {"count"},
+                   &hindex_ret,
+                   "@payment|hindex_reader|count")) {
+        return;
+    }
+    int cnt = std::stoi(hindex_ret["count"]);
+
+    // update history_index
+    RowMutation* hindex_mu = t_history_index->NewRowMutation("count");
+    hindex_mu->Put("cf0", "count", std::to_string(++cnt));
+    gtxn->ApplyMutation(hindex_mu);
+    delete hindex_mu;
+
+    // the incremented count serves as the row key of the new history row,
+    // because the TPC-C history table has no primary key of its own
+    std::string history_key = std::to_string(cnt);
+    RowMutation* mu = t_history->NewRowMutation(history_key);
+    mu->Put("cf0", "h_c_id", customer_ret["c_id"]);
+    mu->Put("cf0", "h_c_d_id", customer_ret["c_d_id"]);
+    mu->Put("cf0", "h_c_w_id", customer_ret["c_w_id"]);
+    mu->Put("cf0", "h_d_id", district_id_str);
+    mu->Put("cf0", "h_w_id", warehouse_key);
+    mu->Put("cf0", "h_amount", std::to_string(h_amount));
+    // The payment date (H_DATE) is generated within the SUT
+    // by using the current system date and time
+    std::string datetime = get_curtime_str();
+    mu->Put("cf0", "h_date", datetime);
+    mu->Put("cf0", "h_data", history_data);
+    gtxn->ApplyMutation(mu);
+    delete mu;
+
+    gtxn->Commit();
+    RetTuples single_line;
+    RetTuples other_ret = {
+                {"w_id", warehouse_key},
+                {"d_id", district_id_str},
+                {"h_amount", std::to_string(h_amount)},
+                {"h_date", datetime},
+                {"c_balance", c_balance_str},
+                {"c_data", customer_ret["c_data"].substr(0,200)}
+            };
+    SetPaymentSingleLineRet(warehouse_ret, district_ret, customer_ret, other_ret,
+            &single_line);
+    // NOTE(review): single_line is built but never handed to ret - confirm whether that is intended
+    SetTxnResult(ret, gtxn.get());
+}
+
+// Assembles the Payment output row: copies other_ret, the warehouse and district rows minus
+// their *_ytd / *_name columns, and the listed customer columns into *payment_ret.
+void TeraTpccDb::SetPaymentSingleLineRet(const RetTuples& warehouse_ret, const RetTuples& district_ret,
+                                         const RetTuples& customer_ret, const RetTuples& other_ret,
+                                         RetTuples* payment_ret) {
+    // The following fields are displayed:
+    // W_ID, D_ID, C_ID, C_D_ID, C_W_ID,
+    // W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP,
+    // D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP,
+    // C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE,
+    // C_ZIP, C_PHONE, C_SINCE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE,
+    // the first 200 characters of C_DATA (only if C_CREDIT = "BC"),
+    // H_AMOUNT, and H_DATE.
+    payment_ret->insert(other_ret.begin(), other_ret.end());
+    for (const auto& t : warehouse_ret) {
+        if (t.first != "w_ytd" && t.first != "w_name") {
+            payment_ret->insert(t);
+        }
+    }
+    for (const auto& t : district_ret) {
+        if (t.first != "d_ytd" && t.first != "d_name") {  // district columns carry the d_ prefix
+            payment_ret->insert(t);
+        }
+    }
+    std::unordered_set<std::string> c_names = {"c_id", "c_d_id", "c_w_id",
+        "c_first", "c_middle", "c_last", "c_street_1", "c_street_2", "c_city",
+        "c_state", "c_zip", "c_phone", "c_since", "c_credit", "c_credit_lim",
+        "c_discount"};
+    for (const auto& t : customer_ret) {
+        if (c_names.find(t.first) != c_names.end()) {
+            payment_ret->insert(t);
+        }
+    }
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/tera_txn/stocklevel_txn.cc b/src/benchmark/tpcc/tera_txn/stocklevel_txn.cc
new file mode 100644
index 000000000..eeb7bb06d
--- /dev/null
+++ b/src/benchmark/tpcc/tera_txn/stocklevel_txn.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+namespace tera {
+namespace tpcc {
+
+// TPC-C Stock-Level: for the orders numbered d_next_o_id-20 .. d_next_o_id of one district,
+// reads every order line and the stock row of its item in this warehouse, and reports how many
+// of those lines hit a stock quantity below `threshold`. Read-only, so nothing is committed.
+void TeraTpccDb::StockLevelTxn(int32_t warehouse_id, int32_t district_id, int32_t threshold,
+                               StockLevelResult* ret) {
+    // tables this transaction reads
+    Table* district_table = table_map_[kTpccTables[kDistrictTable]];
+    Table* order_table = table_map_[kTpccTables[kOrderTable]];
+    Table* orderline_table = table_map_[kTpccTables[kOrderLineTable]];
+    Table* stock_table = table_map_[kTpccTables[kStockTable]];
+    // one global transaction carries all reads
+    std::unique_ptr<Transaction> gtxn(client_->NewGlobalTransaction());
+    const std::string w_id_str = std::to_string(warehouse_id);
+    const std::string district_key = w_id_str + "_" + std::to_string(district_id);
+    RetTuples district_row;
+    RowReader* district_reader = district_table->NewRowReader(district_key);
+    if (!GetValues(ret, gtxn.get(), district_reader, {"d_next_o_id"}, &district_row,
+                   "@stock_level|district_reader|" + district_key)) {
+        return;
+    }
+    const int32_t newest_o_id = std::stoi(district_row["d_next_o_id"]);
+
+    int32_t low_stock = 0;
+    for (int32_t o_id = newest_o_id - 20; o_id <= newest_o_id; ++o_id) {
+        const std::string order_key = district_key + "_" + std::to_string(o_id);
+        RetTuples order_row;
+        RowReader* order_reader = order_table->NewRowReader(order_key);
+        if (!GetValues(ret, gtxn.get(), order_reader, {"o_ol_cnt"}, &order_row,
+                       "@stock_level|order_reader|" + order_key)) {
+            return;
+        }
+        const int32_t line_cnt = std::stoi(order_row["o_ol_cnt"]);
+        for (int32_t line_no = 1; line_no <= line_cnt; ++line_no) {
+            const std::string ol_key = order_key + "_" + std::to_string(line_no);
+            RetTuples ol_row;
+            RowReader* ol_reader = orderline_table->NewRowReader(ol_key);
+            ol_reader->AddColumn("cf0", "ol_i_id");
+            if (!GetValues(ret, gtxn.get(), ol_reader, {"ol_i_id"}, &ol_row,
+                           "@stock_level|ol_reader|" + ol_key)) {
+                return;
+            }
+            // stock rows are looked up as "<warehouse>_<item>"
+            const int32_t item_id = std::stoi(ol_row["ol_i_id"]);
+            const std::string stock_key = w_id_str + "_" + std::to_string(item_id);
+            RetTuples stock_row;
+            RowReader* stock_reader = stock_table->NewRowReader(stock_key);
+            if (!GetValues(ret, gtxn.get(), stock_reader, {"s_quantity"}, &stock_row,
+                           "@stock_level|stock_reader|" + stock_key)) {
+                return;
+            }
+            const int32_t on_hand = std::stoi(stock_row["s_quantity"]);
+            // counted once per order line, so a repeated item is counted again
+            low_stock += (on_hand < threshold) ? 1 : 0;
+        }
+    }
+    // read-only transaction: publish the count, no Commit() call is made
+    ret->SetLowStock(low_stock);
+    SetTxnResult(ret, gtxn.get());
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/test/data_generator_test.cc b/src/benchmark/tpcc/test/data_generator_test.cc
new file mode 100644
index 000000000..6c5b71fe7
--- /dev/null
+++ b/src/benchmark/tpcc/test/data_generator_test.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include <iostream>
+
+#include "benchmark/tpcc/data_generator.h"
+#include "benchmark/tpcc/mock_tpccdb.h"
+#include "benchmark/tpcc/random_generator.h"
+#include "benchmark/tpcc/tpccdb.h"
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DECLARE_int32(warehouses_count);
+
+namespace tera {
+namespace tpcc {
+
+// Test fixture that builds a DataGenerator on top of a MockTpccDb and a RandomGenerator, so the
+// tests can drive the generator and then read its per-table counters in states_.
+class DataGeneratorTest : public ::testing::Test {
+public:
+    DataGeneratorTest() {
+        random_gen_.SetRandomConstant();
+        db_ = static_cast<TpccDb*>(&mdb_);  // set the member; a local db_ used to shadow it
+        data_gen_ = new DataGenerator(&random_gen_, db_);
+    }
+
+    // Zeroes both counters of every table (table_enum_num == -1) or of one valid table index.
+    void CleanStateCounter(int table_enum_num = -1) {
+        if (table_enum_num == -1) {
+            for (int i = 0; i < kTpccTableCnt; ++i) {
+                data_gen_->states_[i].first.Set(0);
+                data_gen_->states_[i].second.Set(0);
+            }
+        } else if (table_enum_num > -1 && table_enum_num < kTpccTableCnt) {
+            data_gen_->states_[table_enum_num].first.Set(0);
+            data_gen_->states_[table_enum_num].second.Set(0);
+        }
+    }
+
+    ~DataGeneratorTest() { delete data_gen_; }
+private:
+    RandomGenerator random_gen_;
+    TpccDb* db_;
+    MockTpccDb mdb_;
+    DataGenerator* data_gen_;
+};
+
+TEST_F(DataGeneratorTest, GenItem) {
+    CleanStateCounter();  // start from zeroed counters
+    mdb_.flag_ = true;  // NOTE(review): presumably makes the mock accept inserts - confirm in MockTpccDb
+    data_gen_->GenItem(1, false);
+    EXPECT_TRUE(data_gen_->states_[kItemTable].first.Get() == 1);
+    data_gen_->GenItem(1, false);
+    EXPECT_TRUE(data_gen_->states_[kItemTable].first.Get() == 2);  // first counter grows per call
+    mdb_.flag_ = false;  // with the flag cleared, the second counter is the one expected to move
+    data_gen_->GenItem(1, false);
+    EXPECT_TRUE(data_gen_->states_[kItemTable].second.Get() == 1);
+}
+
+TEST_F(DataGeneratorTest, GenStock) {
+    CleanStateCounter();  // start from zeroed counters
+    mdb_.flag_ = true;  // NOTE(review): presumably makes the mock accept inserts - confirm in MockTpccDb
+    data_gen_->GenStock(1, 2, false);
+    EXPECT_TRUE(data_gen_->states_[kStockTable].first.Get() == 1);
+    data_gen_->GenStock(1, 2, false);  // same arguments again still bump the first counter
+    EXPECT_TRUE(data_gen_->states_[kStockTable].first.Get() == 2);
+    mdb_.flag_ = false;  // with the flag cleared, the second counter is the one expected to move
+    data_gen_->GenStock(1, 3, false);
+    EXPECT_TRUE(data_gen_->states_[kStockTable].second.Get() == 1);
+}
+
+TEST_F(DataGeneratorTest, GenStocks) {
+    CleanStateCounter();  // start from zeroed counters
+    mdb_.flag_ = true;
+    for (int i = 1; i <=FLAGS_warehouses_count; ++i) {  // one GenStocks call per warehouse id
+        data_gen_->GenStocks(i);
+    }
+    data_gen_->Join();  // NOTE(review): presumably waits for work queued by GenStocks - confirm
+    EXPECT_TRUE(data_gen_->states_[kStockTable].first.Get() == FLAGS_warehouses_count * kItemCount);
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/test/random_generator_test.cc b/src/benchmark/tpcc/test/random_generator_test.cc
new file mode 100644
index 000000000..978521739
--- /dev/null
+++ b/src/benchmark/tpcc/test/random_generator_test.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/random_generator.h"
+
+#include "gtest/gtest.h"
+
+namespace tera {
+namespace tpcc {
+
+class RandomGenerator;
+
+// Fixture that is itself a RandomGenerator, so test bodies can call its methods unqualified.
+// SetRandomConstant() is invoked while the fixture is being constructed.
+class RandomGeneratorTest : public ::testing::Test, public RandomGenerator {
+public:
+    RandomGeneratorTest() : RandomGenerator() { SetRandomConstant(); }
+
+    ~RandomGeneratorTest() {}
+};
+
+TEST_F(RandomGeneratorTest, MakeFloat) {  // checks a degenerate range and the range [0, 1]
+    EXPECT_EQ(MakeFloat(1.0, 1.0, 1), 1.0);  // equal bounds are expected to give back that bound
+    float f = MakeFloat(0, 1.0, 2);
+    std::cout << std::to_string(f) << std::endl;  // printed for manual inspection only
+    EXPECT_TRUE(f >= 0 && f <= 1);
+}
+
+// Checks MakeAString length bounds and that a 26..27 character result repeats some character.
+TEST_F(RandomGeneratorTest, MakeAString) {
+    EXPECT_TRUE(MakeAString(0, 0) == "");
+    EXPECT_TRUE((MakeAString(1, 1)).length() == 1);
+    std::string a_str = MakeAString(1,10);
+    EXPECT_TRUE(a_str.length() <= 10 && a_str.length() >= 1);
+    // count the pairs of positions that hold the same character
+    const std::string a_str1 = MakeAString(26,27);
+    int cnt = 0;
+    for (size_t lhs = 0; lhs < a_str1.length(); ++lhs) {
+        for (size_t rhs = lhs + 1; rhs < a_str1.length(); ++rhs) {
+            cnt += (a_str1[lhs] == a_str1[rhs]) ? 1 : 0;
+        }
+    }
+    EXPECT_TRUE(cnt > 0);
+}
+
+TEST_F(RandomGeneratorTest, MakeNString) {  // length checks only; the characters are not inspected
+    EXPECT_TRUE(MakeNString(0, 0) == "");  // a (0, 0) request must give the empty string
+    EXPECT_TRUE((MakeNString(1, 1)).length() == 1);
+    std::string n_str = MakeNString(1,10);
+    EXPECT_TRUE(n_str.length() <= 10 && n_str.length() >= 1);  // length stays within the two arguments
+}
+
+TEST_F(RandomGeneratorTest, MakeDisOrderList) {  // sorted result must read 10, 11, ..., 20
+    std::vector<int> dis_order_list = MakeDisOrderList(10, 20);
+    // qualified call: the bare sort() only resolved through argument-dependent lookup
+    std::sort(dis_order_list.begin(), dis_order_list.end());
+    for (int i = 10; i <= 20; ++i) { EXPECT_EQ(dis_order_list[i - 10], i); }
+}
+
+
+TEST_F(RandomGeneratorTest, SetRandomConstant) {
+    SetRandomConstant();
+    NURandConstant c = GetRandomConstant();
+    EXPECT_TRUE(c.c_last >= 0 && c.c_last <= 255);  // the only bound that actually constrains c_last
+    EXPECT_TRUE(c.c_last >= 0 && c.c_last <= 1023);  // NOTE(review): re-checks c_last; another field was probably meant
+    EXPECT_TRUE(c.c_last >= 0 && c.c_last <= 8191);  // NOTE(review): same - confirm against NURandConstant's members
+}
+
+TEST_F(RandomGeneratorTest, GetRandom) {  // checks a one-value range and both argument orders
+    EXPECT_EQ(GetRandom(1, 1) , 1);
+    int rand_num = GetRandom(0, 1);
+    int rand_num1 = GetRandom(1, 0);  // bounds passed in reverse order
+    EXPECT_TRUE(rand_num == 0 || rand_num == 1);
+    EXPECT_TRUE(rand_num1 == 0 || rand_num1 == 1);  // was a second check of rand_num
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/test/tpcc_test.cc b/src/benchmark/tpcc/test/tpcc_test.cc
new file mode 100644
index 000000000..04d5b4890
--- /dev/null
+++ b/src/benchmark/tpcc/test/tpcc_test.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//  
+// Author: baorenyi@baidu.com
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+// Entry point of the tpcc unit-test binary.
+//
+// It is deliberately defined in the global namespace: while it was wrapped in
+// namespace tera::tpcc it was only an ordinary function named tera::tpcc::main,
+// so the binary had an entry point only if something else on the link line supplied one.
+int main(int argc, char* argv[]) {
+    // gtest parses and strips its own --gtest_* flags from argv here
+    ::testing::InitGoogleTest(&argc, argv);
+    // RUN_ALL_TESTS() returns 0 when every test passed, which becomes the exit status
+    return RUN_ALL_TESTS();
+}
diff --git a/src/benchmark/tpcc/tpcc_flags.cc b/src/benchmark/tpcc/tpcc_flags.cc
new file mode 100644
index 000000000..4de8b300e
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_flags.cc
@@ -0,0 +1,17 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "gflags/gflags.h"
+
+// Workload size.
+DEFINE_int64(transactions_count, 200, "the count of transactions");
+DEFINE_int32(warehouses_count, 2, "the count of warehouses");
+
+// Thread pools.
+DEFINE_int32(tpcc_thread_pool_size, 20, "size of tpcc thread pool");
+DEFINE_int32(tpcc_run_gtxn_thread_pool_size, 20, "size of tpcc run global transactions thread pool");
+
+// Backend selection and its configuration files.
+DEFINE_string(db_type, "tera", "test db type");
+DEFINE_string(tera_client_flagfile, "./tera.flag", "the flag file path of tera client");
+DEFINE_string(tera_table_schema_dir, "./tpcc_schemas/", "table schema directory");
+
+// Wait limits.
+// NOTE(review): 36000000 equals one hour only for a 100us tick; it would be
+// ten hours in milliseconds. Confirm the unit the consumers apply.
+DEFINE_int32(generate_data_wait_times, 36000000, "generate data wait times, default 1h");
+DEFINE_int32(driver_wait_times, 36000000, "driver wait times, default 1h");
diff --git a/src/benchmark/tpcc/tpcc_main.cc b/src/benchmark/tpcc/tpcc_main.cc
new file mode 100644
index 000000000..2e2df8e26
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_main.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include <iostream>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "benchmark/tpcc/data_generator.h"
+#include "benchmark/tpcc/driver.h"
+#include "benchmark/tpcc/random_generator.h"
+#include "benchmark/tpcc/tpccdb.h"
+#include "benchmark/tpcc/tpcc_types.h"
+#include "types.h"
+#include "common/timer.h"
+#include "version.h"
+
+DECLARE_int64(transactions_count);
+DECLARE_int32(warehouses_count);
+DECLARE_string(db_type);
+
+// Entry point of the TPC-C benchmark tool.
+//
+//   <prog> version   print the system version and exit
+//   <prog> clean     call CleanTables() on the selected db and exit
+//   <prog>           create the tables, generate the data, run the transactions
+//
+// Returns 0 on success and -1 for an out-of-range --warehouses_count or an
+// unsupported --db_type. A failed CreateTables()/CleanTables() ends the process
+// through _Exit(EXIT_FAILURE).
+int main(int argc, char *argv[]) {
+    // load conf from flags
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+
+    if (argc > 1 && strcmp(argv[1], "version") == 0) {
+        PrintSystemVersion();
+        return 0;
+    }
+    // The valid range is [1, kMaxWarehouseId]. The two bounds have to be OR-ed:
+    // no value is both above the maximum and non-positive, so an AND never fires.
+    if (FLAGS_warehouses_count > tera::tpcc::kMaxWarehouseId
+        || FLAGS_warehouses_count <= 0) {
+        LOG(ERROR) << "--warehouses_count=" << FLAGS_warehouses_count << " is not availability";
+        return -1;
+    }
+
+    tera::tpcc::RandomGenerator random_gen;
+    random_gen.SetRandomConstant();
+
+    // NewTpccDb returns NULL (after logging) for a db type it does not know.
+    tera::tpcc::TpccDb* db = tera::tpcc::TpccDb::NewTpccDb(FLAGS_db_type);
+    if (db == NULL) {
+        LOG(ERROR) << "--db_type=" << FLAGS_db_type << " is not supported, exit";
+        return -1;
+    }
+
+    // do clean tables
+    if (argc == 2 && strcmp(argv[1], "clean") == 0) {
+        if (!db->CleanTables()) {
+            LOG(ERROR) << "clean tables failed, exit";
+            _Exit(EXIT_FAILURE);
+        }
+        delete db;
+        return 0;
+    }
+
+    if (!db->CreateTables()) {
+        LOG(ERROR) << "create tables failed, exit";
+        _Exit(EXIT_FAILURE);
+    }
+
+    // Populate the tables and report how long it took.
+    tera::tpcc::DataGenerator data_gen(&random_gen, db);
+    int64_t beg_ts = tera::get_micros();
+    data_gen.GenItems();
+    data_gen.GenWarehouses();
+    data_gen.Join();
+    int64_t cost_t = tera::get_micros() - beg_ts;
+    LOG(INFO) << "Generate Tables Cost:" << cost_t << "us";
+
+    // init driver
+    tera::tpcc::NURandConstant constant = random_gen.GetRandomConstant();
+    random_gen.SetRandomConstant(constant);
+    tera::tpcc::Driver driver(&random_gen, db);
+    // run test
+    int64_t beg_txn_ts = tera::get_micros();
+    driver.RunTransactions();
+    driver.Join();
+    int64_t cost_txn_t = tera::get_micros() - beg_txn_ts;
+    LOG(INFO) << "RunTransactions Cost:" << cost_txn_t << "us";
+    delete db;
+    return 0;
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_customer b/src/benchmark/tpcc/tpcc_schemas/t_customer
new file mode 100644
index 000000000..7b8c7ddfd
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_customer
@@ -0,0 +1,5 @@
+t_customer <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_customer_last_index b/src/benchmark/tpcc/tpcc_schemas/t_customer_last_index
new file mode 100644
index 000000000..e7990ca13
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_customer_last_index
@@ -0,0 +1,5 @@
+t_customer_last_index <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_district b/src/benchmark/tpcc/tpcc_schemas/t_district
new file mode 100644
index 000000000..2a6cbe3a3
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_district
@@ -0,0 +1,5 @@
+t_district <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_history b/src/benchmark/tpcc/tpcc_schemas/t_history
new file mode 100644
index 000000000..a21f40001
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_history
@@ -0,0 +1,5 @@
+t_history <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_history_index b/src/benchmark/tpcc/tpcc_schemas/t_history_index
new file mode 100644
index 000000000..205b3aa23
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_history_index
@@ -0,0 +1,5 @@
+t_history_index <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_item b/src/benchmark/tpcc/tpcc_schemas/t_item
new file mode 100644
index 000000000..02bf1ff5a
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_item
@@ -0,0 +1,5 @@
+t_item <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_neworder b/src/benchmark/tpcc/tpcc_schemas/t_neworder
new file mode 100644
index 000000000..e7ef005e0
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_neworder
@@ -0,0 +1,5 @@
+t_neworder <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_order b/src/benchmark/tpcc/tpcc_schemas/t_order
new file mode 100644
index 000000000..4e7d0139f
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_order
@@ -0,0 +1,5 @@
+t_order <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_order_index b/src/benchmark/tpcc/tpcc_schemas/t_order_index
new file mode 100644
index 000000000..6d2a47528
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_order_index
@@ -0,0 +1,5 @@
+t_order_index <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_orderline b/src/benchmark/tpcc/tpcc_schemas/t_orderline
new file mode 100644
index 000000000..d075e7918
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_orderline
@@ -0,0 +1,5 @@
+t_orderline <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_stock b/src/benchmark/tpcc/tpcc_schemas/t_stock
new file mode 100644
index 000000000..a35115aa0
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_stock
@@ -0,0 +1,5 @@
+t_stock <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_schemas/t_warehouse b/src/benchmark/tpcc/tpcc_schemas/t_warehouse
new file mode 100644
index 000000000..9102544ff
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_schemas/t_warehouse
@@ -0,0 +1,5 @@
+t_warehouse <txn=on> {
+    lg0 {
+        cf0 <maxversions=10000000,gtxn=on>
+    }
+}
diff --git a/src/benchmark/tpcc/tpcc_types.h b/src/benchmark/tpcc/tpcc_types.h
new file mode 100644
index 000000000..c73e9f489
--- /dev/null
+++ b/src/benchmark/tpcc/tpcc_types.h
@@ -0,0 +1,139 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_TPCC_TYPES_H
+#define TERA_BENCHMARK_TPCC_TPCC_TYPES_H
+
+#include <string>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+namespace tera {
+namespace tpcc {
+
+// Number of entries in kTpccTables.
+const int kTpccTableCnt = 12;
+
+// Table names, listed in the same order as the TpccTables enum in tpccdb.h
+// (kItemTable = 0 ... kHistoryIndex = 11). The last three entries are index
+// tables; t_customer_last_index is the index of t_customer.
+const char* const kTpccTables[] = {"t_item", "t_warehouse", "t_district", 
+                                   "t_customer", "t_history", "t_stock", 
+                                   "t_order", "t_orderline", "t_neworder", 
+                                   "t_customer_last_index", "t_order_index",
+                                   "t_history_index"};
+
+// Transaction mix. Columns: transaction, share, cumulative share.
+// StockLevel  4%   4
+// OrderStatus 4%   8
+// Delivery    4%  12
+// Payment    43%  55
+// NewOrder   45% 100
+// The array stores the cumulative column.
+const int kTpccTransactionRatios[] = {4, 8, 12, 55, 100};
+
+// http://www.man7.org/linux/man-pages/man3/initstate.3.html
+// Current "optimal" values for the size of the state array n 
+// are 8, 32, 64, 128, and 256 bytes;
+const int kRandomStateSize = 64;
+
+// Initial YTD value given to every warehouse (w_ytd) and district (d_ytd).
+const float kInitYTD = 300000.00f;
+
+// tax: bounds and decimal digits used by GenTax() in tpccdb.h
+const float kTaxMax = 0.20f;
+const float kTaxMin = 0.10f;
+const int kTaxDigits = 2;
+
+// address: length bounds of the street/city/state/zip strings
+const int kStreetLowerLen = 10;
+const int kStreetUpperLen = 20;
+const int kCityLowerLen = 10;
+const int kCityUpperLen = 20;
+const int kStateLen = 2;
+const int kZipLen = 9;
+
+// warehouse
+const int kMaxWarehouseId = 100;
+const int kWareHouseNameLowerLen = 6;
+const int kWareHouseNameUpperLen = 10;
+
+// stock 
+const int kMaxQuantity = 100;
+const int kMinQuantity = 10;
+// Length of each s_dist entry and of ol_dist_info.
+const int kDistLen = 24;
+const int kStockDataLowerLen = 26;
+const int kStockDataUpperLen = 50;
+const int kMinStockLevelThreshold = 10;
+const int kMaxStockLevelThreshold = 20;
+
+// item
+const int kItemCount = 100000;
+const int kItemMaxIm = 10000;
+const int kItemMinIm = 1;
+const float kItemMaxPrice = 100.00;
+const float kItemMinPrice = 1.00;
+const int kItemPriceDigits = 2;
+const int kItemMaxNameLen = 24;
+const int kItemMinNameLen = 14;
+const int kItemMaxDataLen = 50;
+const int kItemMinDataLen = 26;
+
+// district
+const int kDistrictCountPerWarehouse = 10;
+const int kDistrictNameLowerLen = 6;
+const int kDistrictNameUpperLen = 10;
+
+// customer
+const int kCustomerCountPerDistrict = 3000;
+const float kInitCreditLimit = 5000.00;
+// Discount bounds, passed as (min, max) to MakeFloat by the Customer
+// constructor. The values used to be swapped (max 0.0, min 0.5), which handed
+// MakeFloat an inverted range.
+const float kMaxDisCount = 0.5;
+const float kMinDisCount = 0.0;
+const int kDisCountDigits = 2;
+const float kInitBalance = -10.00;
+const float kInitYTDPayment = 10.00;
+const int kInitPaymentCnt = 1;
+const int kInitDeliveryCnt = 0;
+// Length bounds of the customer's name, phone, credit and data strings.
+const int kFirstLowerLen = 6;
+const int kFirstUpperLen = 10;
+const int kMiddleLen = 2;
+const int kLastLen = 16;
+const int kPhoneLen = 16;
+const int kCreditLen = 2;
+const int kCustomerDataUpperLen = 500;
+const int kCustomerDataLowerLen = 300;
+
+// order
+const int kInitOrdersPerDistrict = 3000;
+// Initial o_all_local value set by the Order constructor.
+const int kInitAllLocal = 1;
+const int kMaxCarrierId = 10;
+const int kMinCarrierId = 1;
+const int kMaxOrderLineCnt = 15;
+const int kMinOrderLineCnt = 5;
+
+// new order
+const int kInitNewOrderCountPerDistrict = 900;
+
+// order line
+// Item id range; kMaxItemId has the same value as kItemCount.
+const int kMaxItemId = 100000;
+const int kMinItemId = 1;
+// Initial ol_quantity set by the OrderLine constructor.
+const int kInitQuantity = 5;
+const int kMaxOrderLineQuantity = 10;
+const float kOrderLineMinAmount = 0.01f;
+const float kOrderLineMaxAmount = 9999.99f;
+const int kOrderLineAmountDigits = 2;
+
+// history
+const float kInitHistoryAmount = 10.00f;
+const int kHistoryDataLowerLen = 12;
+const int kHistoryDataUpperLen = 24;
+
+// runtime h_amount
+const float kRuntimeMaxAmount = 5000.00f;
+const float kRuntimeMinAmount = 1.00f;
+const int kRuntimeAmountDigits = 2;
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_TPCC_TYPES_H */
diff --git a/src/benchmark/tpcc/tpccdb.cc b/src/benchmark/tpcc/tpccdb.cc
new file mode 100644
index 000000000..bb7e0cfb5
--- /dev/null
+++ b/src/benchmark/tpcc/tpccdb.cc
@@ -0,0 +1,360 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include <string>
+
+#include "benchmark/tpcc/mock_tpccdb.h"
+#include "benchmark/tpcc/tera_tpccdb.h"
+#include "benchmark/tpcc/tpccdb.h"
+
+namespace tera {
+namespace tpcc {
+
+class TeraTpccDb;
+class MockTpccDb;
+
+/// ------------------------- [begin item table] -------------------------- ///
+// Renders every Item column as comma-separated "name = value" pairs.
+std::string Item::ToString() const {
+    std::ostringstream out;
+    out << "i_id = " << i_id;
+    out << ",i_im_id = " << i_im_id;
+    out << ",i_price = " << i_price;
+    out << ",i_name = " << i_name;
+    out << ",i_data = " << i_data;
+    return out.str();
+}
+
+/// ------------------------- [begin warehouse table] --------------------- ///
+// Renders every Warehouse column as comma-separated "name = value" pairs.
+std::string Warehouse::ToString() const {
+    std::ostringstream out;
+    out << "w_id = " << w_id;
+    out << ",w_tax = " << w_tax;
+    out << ",w_ytd = " << w_ytd;
+    out << ",w_name = " << w_name;
+    out << ",w_street_1 = " << w_street_1;
+    out << ",w_street_2 = " << w_street_2;
+    out << ",w_city = " << w_city;
+    out << ",w_state = " << w_state;
+    out << ",w_zip = " << w_zip;
+    return out.str();
+}
+
+/// ------------------------- [begin district table] ---------------------- ///
+
+// Builds district `id` of warehouse `w_id`. YTD and the next order id start
+// from fixed values; every other column is drawn from rand_gen.
+District::District(int32_t id, int32_t w_id, RandomGenerator* rand_gen) {
+    d_id = id;
+    d_w_id = w_id;
+    d_ytd = kInitYTD;
+    d_next_o_id = kCustomerCountPerDistrict + 1;
+
+    // The draws stay in this order so the sequence consumed from rand_gen is
+    // unchanged.
+    RandomGenerator& rng = *rand_gen;
+    d_tax = GenTax(rand_gen);
+    d_name = rng.MakeAString(kDistrictNameLowerLen, kDistrictNameUpperLen);
+    d_street_1 = rng.MakeAString(kStreetLowerLen, kStreetUpperLen);
+    d_street_2 = rng.MakeAString(kStreetLowerLen, kStreetUpperLen);
+    d_city = rng.MakeAString(kCityLowerLen, kCityUpperLen);
+    d_state = rng.MakeAString(kStateLen, kStateLen);
+    d_zip = GenZip(rand_gen);
+}
+
+// Primary key: "<d_w_id>_<d_id>".
+std::string District::PrimaryKey() const {
+    std::string key = std::to_string(d_w_id);
+    key += "_";
+    key += std::to_string(d_id);
+    return key;
+}
+
+// Foreign key: the owning warehouse id, "<d_w_id>".
+std::string District::ForeignKey() const {
+    const std::string warehouse_key = std::to_string(d_w_id);
+    return warehouse_key;
+}
+
+// Renders every District column as comma-separated "name = value" pairs.
+std::string District::ToString() const {
+    std::ostringstream out;
+    out << "d_id  = " << d_id;
+    out << ",d_w_id = " << d_w_id;
+    out << ",d_tax = " << d_tax;
+    out << ",d_ytd = " << d_ytd;
+    out << ",d_next_o_id = " << d_next_o_id;
+    out << ",d_name = " << d_name;
+    out << ",d_street_1 = " << d_street_1;
+    out << ",d_street_2 = " << d_street_2;
+    out << ",d_city = " << d_city;
+    out << ",d_state = " << d_state;
+    out << ",d_zip = " << d_zip;
+    return out.str();
+}
+
+/// ------------------------- [begin stock table] ------------------------- ///
+
+// Builds the stock row of item `id` in warehouse `w_id`. The counters start at
+// zero; quantity, the per-district strings and the data column come from
+// rand_gen, in that order.
+Stock::Stock(int32_t id, int32_t w_id, bool is_original, RandomGenerator* rand_gen)
+    : s_i_id(id),
+      s_w_id(w_id),
+      s_ytd(0),
+      s_order_cnt(0),
+      s_remote_cnt(0) {
+    s_quantity = rand_gen->GetRandom(kMinQuantity, kMaxQuantity);
+    // One s_dist entry per district of a warehouse.
+    for (int remaining = kDistrictCountPerWarehouse; remaining > 0; --remaining) {
+        s_dist.push_back(rand_gen->MakeAString(kDistLen, kDistLen));
+    }
+    s_data = GenData(rand_gen, kStockDataLowerLen, kStockDataUpperLen, is_original);
+}
+
+// Primary key: "<s_w_id>_<s_i_id>".
+std::string Stock::PrimaryKey() const {
+    std::string key = std::to_string(s_w_id);
+    key += "_";
+    key += std::to_string(s_i_id);
+    return key;
+}
+
+// Foreign key: the item id, "<s_i_id>".
+std::string Stock::ForeignKey() const {
+    const std::string item_key = std::to_string(s_i_id);
+    return item_key;
+}
+
+// Renders every Stock column as comma-separated "name = value" pairs; the
+// s_dist entries are listed inside brackets, each followed by a comma.
+// s_i_id is part of the dump so a logged row can be matched to its key.
+std::string Stock::ToString() const {
+    std::stringstream ss;
+    ss << "s_i_id = " << s_i_id
+       << ",s_w_id = " << s_w_id
+       << ",s_quantity = " << s_quantity
+       << ",s_ytd = " << s_ytd
+       << ",s_order_cnt = " << s_order_cnt
+       << ",s_remote_cnt = " << s_remote_cnt
+       << ",s_data = " << s_data
+       << ",s_dist = [";
+    // Iterate by reference: the entries only need to be read, not copied.
+    for (const auto& d : s_dist) {
+        ss << d << ",";
+    }
+    ss << "]";
+    return ss.str();
+}
+
+/// ------------------------- [begin order table] ------------------------- ///
+
+// Builds order `id` placed by customer `c_id` in district `d_id` of warehouse
+// `w_id`, entered at `datetime`. A new order keeps carrier id 0; any other
+// order gets a random carrier. The order-line count is always random.
+Order::Order(int32_t id, int32_t c_id, int32_t d_id, int32_t w_id,
+             bool new_order, const std::string& datetime,
+             RandomGenerator* rand_gen) {
+    o_id = id;
+    o_c_id = c_id;
+    o_d_id = d_id;
+    o_w_id = w_id;
+    o_all_local = kInitAllLocal;
+    o_entry_d = datetime;
+
+    // The carrier draw (when it happens) precedes the line-count draw.
+    o_carrier_id = new_order ? 0
+                             : rand_gen->GetRandom(kMinCarrierId, kMaxCarrierId);
+    o_ol_cnt = rand_gen->GetRandom(kMinOrderLineCnt, kMaxOrderLineCnt);
+}
+
+// Primary key: "<o_w_id>_<o_d_id>_<o_id>".
+std::string Order::PrimaryKey() const {
+    std::string key = std::to_string(o_w_id);
+    key += "_";
+    key += std::to_string(o_d_id);
+    key += "_";
+    key += std::to_string(o_id);
+    return key;
+}
+
+// Foreign key: "<o_w_id>_<o_d_id>_<o_c_id>", the same layout as
+// Customer::PrimaryKey().
+std::string Order::ForeignKey() const {
+    std::string key = std::to_string(o_w_id);
+    key += "_";
+    key += std::to_string(o_d_id);
+    key += "_";
+    key += std::to_string(o_c_id);
+    return key;
+}
+
+// Renders every Order column as comma-separated "name = value" pairs.
+std::string Order::ToString() const {
+    std::ostringstream out;
+    out << "o_id = " << o_id;
+    out << ",o_c_id = " << o_c_id;
+    out << ",o_d_id = " << o_d_id;
+    out << ",o_w_id = " << o_w_id;
+    out << ",o_carrier_id = " << o_carrier_id;
+    out << ",o_ol_cnt = " << o_ol_cnt;
+    out << ",o_all_local = " << o_all_local;
+    out << ",o_entry_d = " << o_entry_d;
+    return out.str();
+}
+
+/// ------------------------- [begin neworder table] ---------------------- ///
+
+
+// Records that order `o_id` of district `d_id` in warehouse `w_id` is new.
+NewOrder::NewOrder(int32_t o_id, int32_t d_id, int32_t w_id) {
+    no_o_id = o_id;
+    no_d_id = d_id;
+    no_w_id = w_id;
+}
+
+// Renders the three NewOrder columns as comma-separated "name = value" pairs.
+std::string NewOrder::ToString() const {
+    std::ostringstream out;
+    out << "no_o_id = " << no_o_id;
+    out << ",no_d_id = " << no_d_id;
+    out << ",no_w_id = " << no_w_id;
+    return out.str();
+}
+
+// Primary key: "<no_w_id>_<no_d_id>_<no_o_id>".
+std::string NewOrder::PrimaryKey() const {
+    std::string key = std::to_string(no_w_id);
+    key += "_";
+    key += std::to_string(no_d_id);
+    key += "_";
+    key += std::to_string(no_o_id);
+    return key;
+}
+
+// Foreign key: identical to the primary key, which has the same layout as
+// Order::PrimaryKey().
+std::string NewOrder::ForeignKey() const {
+    return PrimaryKey();
+}
+
+/// ------------------------- [begin orderline table] --------------------- ///
+
+// Builds line `number` of order `o_id` (district `d_id`, warehouse `w_id`).
+// The line is supplied by its own warehouse and starts with kInitQuantity.
+// For a new order the amount is random and the delivery date is empty;
+// otherwise the amount is 0 and the delivery date is `datetime`.
+OrderLine::OrderLine(int32_t o_id, int32_t d_id, int32_t w_id, int32_t number,
+                     bool new_order, const std::string& datetime,
+                     RandomGenerator* rand_gen) {
+    ol_o_id = o_id;
+    ol_d_id = d_id;
+    ol_w_id = w_id;
+    ol_number = number;
+    ol_supply_w_id = w_id;
+    ol_quantity = kInitQuantity;
+
+    // Draw order: item id, then (new orders only) amount, then dist info.
+    ol_i_id = rand_gen->GetRandom(kMinItemId, kMaxItemId);
+    if (new_order) {
+        ol_amount = rand_gen->MakeFloat(kOrderLineMinAmount,
+                                        kOrderLineMaxAmount,
+                                        kOrderLineAmountDigits);
+        ol_delivery_d = "";
+    } else {
+        ol_amount = 0.00f;
+        ol_delivery_d = datetime;
+    }
+    ol_dist_info = rand_gen->MakeAString(kDistLen, kDistLen);
+}
+
+// Primary key: "<ol_w_id>_<ol_d_id>_<ol_o_id>_<ol_number>".
+std::string OrderLine::PrimaryKey() const {
+    std::string key = std::to_string(ol_w_id);
+    key += "_";
+    key += std::to_string(ol_d_id);
+    key += "_";
+    key += std::to_string(ol_o_id);
+    key += "_";
+    key += std::to_string(ol_number);
+    return key;
+}
+
+// Foreign keys, by name:
+//   "order_index": "<ol_w_id>_<ol_d_id>_<ol_o_id>"  (layout of Order::PrimaryKey())
+//   "item_index":  "<ol_supply_w_id>_<ol_i_id>"     (layout of Stock::PrimaryKey())
+ForeignKeyMap OrderLine::ForeignKeys() const {
+    const std::string order_key = std::to_string(ol_w_id) + "_"
+                                  + std::to_string(ol_d_id) + "_"
+                                  + std::to_string(ol_o_id);
+    const std::string item_key = std::to_string(ol_supply_w_id) + "_"
+                                 + std::to_string(ol_i_id);
+
+    ForeignKeyMap keys;
+    keys["order_index"] = order_key;
+    keys["item_index"] = item_key;
+    return keys;
+}
+
+// Renders every OrderLine column as comma-separated "name = value" pairs.
+std::string OrderLine::ToString() const {
+    std::ostringstream out;
+    out << "ol_o_id = " << ol_o_id;
+    out << ",ol_d_id = " << ol_d_id;
+    out << ",ol_w_id = " << ol_w_id;
+    out << ",ol_number = " << ol_number;
+    out << ",ol_i_id = " << ol_i_id;
+    out << ",ol_supply_w_id = " << ol_supply_w_id;
+    out << ",ol_quantity = " << ol_quantity;
+    out << ",ol_amount = " << ol_amount;
+    out << ",ol_delivery_d = " << ol_delivery_d;
+    out << ",ol_dist_info = " << ol_dist_info;
+    return out.str();
+}
+
+/// ------------------------- [begin customer table] ---------------------- ///
+
+// Builds customer `id` of district `d_id` in warehouse `w_id`, registered at
+// `datetime`. Credit limit, balance, payment and delivery counters start from
+// fixed values, the middle name is always "OE", and credit is "BC" when
+// bad_credit is set and "GC" otherwise. The remaining columns are drawn from
+// rand_gen in the order of the statements below.
+Customer::Customer(int32_t id, int32_t d_id, int32_t w_id, const std::string& datetime,
+         bool bad_credit, RandomGenerator* rand_gen)
+    : c_id(id),
+      c_d_id(d_id),
+      c_w_id(w_id),
+      c_credit_lim(kInitCreditLimit),
+      c_balance(kInitBalance),
+      c_ytd_payment(kInitYTDPayment),
+      c_payment_cnt(kInitPaymentCnt),
+      c_delivery_cnt(kInitDeliveryCnt),
+      c_middle("OE"),
+      c_since(datetime) {
+    c_discount = rand_gen->MakeFloat(kMinDisCount, kMaxDisCount, kDisCountDigits);
+    c_first = rand_gen->MakeAString(kFirstLowerLen, kFirstUpperLen);
+    // Customers 1..1000 take last-name indices 0..999 in sequence, so every
+    // index GenLastName can produce is assigned at least once. Later customers
+    // pass a value above 999, which makes GenLastName draw the index instead.
+    // (Passing `id` itself skipped index 0 and sent customer 1000 down the
+    // random path.)
+    c_last = GenLastName(rand_gen, (id <= 1000 ? id - 1 : kCustomerCountPerDistrict));
+    c_street_1 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen);
+    c_street_2 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen);
+    c_city = rand_gen->MakeAString(kCityLowerLen, kCityUpperLen);
+    c_state = rand_gen->MakeAString(kStateLen, kStateLen);
+    c_zip = GenZip(rand_gen);
+    c_phone = rand_gen->MakeNString(kPhoneLen, kPhoneLen);
+    c_credit = bad_credit ? "BC" : "GC";
+    c_data = GenData(rand_gen, kCustomerDataLowerLen, kCustomerDataUpperLen, false);
+}
+
+// Primary key: "<c_w_id>_<c_d_id>_<c_id>".
+std::string Customer::PrimaryKey() const {
+    std::string key = std::to_string(c_w_id);
+    key += "_";
+    key += std::to_string(c_d_id);
+    key += "_";
+    key += std::to_string(c_id);
+    return key;
+}
+
+// Foreign key: "<c_w_id>_<c_d_id>", the same layout as District::PrimaryKey().
+std::string Customer::ForeignKey() const {
+    std::string key = std::to_string(c_w_id);
+    key += "_";
+    key += std::to_string(c_d_id);
+    return key;
+}
+
+// Renders every Customer column as comma-separated "name = value" pairs; the
+// three name parts are grouped as c_name = [first,middle,last].
+std::string Customer::ToString() const {
+    std::ostringstream out;
+    out << "c_id = " << c_id;
+    out << ",c_d_id = " << c_d_id;
+    out << ",c_w_id = " << c_w_id;
+    out << ",c_credit_lim = " << c_credit_lim;
+    out << ",c_discount = " << c_discount;
+    out << ",c_balance = " << c_balance;
+    out << ",c_ytd_payment = " << c_ytd_payment;
+    out << ",c_payment_cnt = " << c_payment_cnt;
+    out << ",c_delivery_cnt = " << c_delivery_cnt;
+    out << ",c_name = [" << c_first << "," << c_middle << "," << c_last << "]";
+    out << ",c_street_1 = " << c_street_1;
+    out << ",c_street_2 = " << c_street_2;
+    out << ",c_city = " << c_city;
+    out << ",c_state = " << c_state;
+    out << ",c_zip = " << c_zip;
+    out << ",c_phone = " << c_phone;
+    out << ",c_since = " << c_since;
+    out << ",c_credit = " << c_credit;
+    out << ",c_data = " << c_data;
+    return out.str();
+}
+
+/// ------------------------- [begin history table] ----------------------- ///
+// Renders every History column as comma-separated "name = value" pairs.
+std::string History::ToString() const {
+    std::ostringstream out;
+    out << "h_c_id = " << h_c_id;
+    out << ",h_c_d_id = " << h_c_d_id;
+    out << ",h_c_w_id = " << h_c_w_id;
+    out << ",h_d_id = " << h_d_id;
+    out << ",h_w_id = " << h_w_id;
+    out << ",h_amount = " << h_amount;
+    out << ",h_date = " << h_date;
+    out << ",h_data = " << h_data;
+    return out.str();
+}
+
+/// ------------------------- [end tables] -------------------------------- ///
+
+// Returns the flag last stored by SetState().
+bool TxnResult::State() const {
+    return status_;
+}
+
+// Stores the outcome flag of the transaction.
+void TxnResult::SetState(bool status) {
+    status_ = status;
+}
+
+// Stores a text describing the outcome.
+// NOTE(review): tpccdb.h also declares TxnResult::Reason(), but this file does
+// not define it -- confirm it is defined elsewhere before anything calls it.
+void TxnResult::SetReason(const std::string& reason) {
+    reason_ = reason;
+}
+
+// Stores the low-stock count of a stock-level transaction.
+void StockLevelResult::SetLowStock(int low_stock) {
+    low_stock_ = low_stock;
+}
+
+// Returns the count last stored by SetLowStock().
+int StockLevelResult::LowStock() const {
+    return low_stock_;
+}
+
+// Replaces the single result line of a payment transaction.
+void PaymentResult::SetSingleLine(const RetTuples& single_line) {
+    single_line_ = single_line;
+}
+
+// Appends one line to the lines collected for a new-order transaction.
+void NewOrderResult::AddLine(const RetTuples& line) {
+    lines_.push_back(line);
+} 
+
+// Replaces the single result line of a new-order transaction.
+void NewOrderResult::SetSingleLine(const RetTuples& single_line) {
+    single_line_ = single_line;
+}
+
+// Factory for the benchmark backend. Only "tera" is supported; any other
+// db_type is logged and yields NULL. The caller owns the returned object.
+TpccDb* TpccDb::NewTpccDb(const std::string& db_type) {
+    if (db_type != "tera") {
+        LOG(ERROR) << "not support db:" << db_type;
+        return NULL;
+    }
+    return new TeraTpccDb();
+}
+
+} // namespace tpcc
+} // namespace tera
+
diff --git a/src/benchmark/tpcc/tpccdb.h b/src/benchmark/tpcc/tpccdb.h
new file mode 100644
index 000000000..93b3c32f3
--- /dev/null
+++ b/src/benchmark/tpcc/tpccdb.h
@@ -0,0 +1,471 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_TPCCDB_H
+#define TERA_BENCHMARK_TPCC_TPCCDB_H
+
+#include <iostream>
+#include <stdint.h>
+#include <string>
+#include <unordered_set>
+#include <unordered_map>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "benchmark/tpcc/random_generator.h"
+#include "benchmark/tpcc/tpcc_types.h"
+
+namespace tera {
+namespace tpcc {
+
+// Set of distinct integer ids.
+using IdSet = std::unordered_set<int>;
+// Foreign keys of a row, looked up by key name.
+using ForeignKeyMap = std::unordered_map<std::string, std::string>;
+// One result line of a transaction, as string-to-string pairs.
+using RetTuples = std::unordered_map<std::string, std::string>;
+
+
+// Draws a tax rate between kTaxMin and kTaxMax with kTaxDigits decimals.
+// The bounds go in as (min, max), like every other MakeFloat call in this
+// code; they used to be passed the other way round.
+inline float GenTax(RandomGenerator* rand_gen) {
+    return rand_gen->MakeFloat(kTaxMin, kTaxMax, kTaxDigits);
+}
+
+// Draws a zip string of exactly kZipLen characters from MakeNString.
+inline std::string GenZip(RandomGenerator* rand_gen) {
+    const int zip_len = kZipLen;
+    return rand_gen->MakeNString(zip_len, zip_len);
+}
+
+// Returns a string from MakeAString(lower_len, upper_len). When is_original is
+// set, the text "ORIGINAL" overwrites eight characters at a random position.
+// A string too short to hold the marker is returned unmarked.
+inline std::string GenData(RandomGenerator* rand_gen,
+                           int lower_len,
+                           int upper_len,
+                           bool is_original) {
+    std::string ret = rand_gen->MakeAString(lower_len, upper_len);
+    const std::string marker("ORIGINAL");
+    // ret.size() - marker.size() is unsigned and would wrap around for a
+    // shorter string, so only stamp the marker when it fits.
+    if (is_original && ret.size() >= marker.size()) {
+        int pos = rand_gen->GetRandom(0, ret.size() - marker.size());
+        ret.replace(pos, marker.size(), marker);
+    }
+    return ret;
+}
+
+// Builds a last name from three syllables chosen by the hundreds, tens and
+// units digits of an index in [0, 999].
+//   id in [0, 999]: used as the index directly.
+//   id > 999:       the index is drawn as NURand(255, 0, min(999, id - 1)).
+// id must not be negative.
+inline std::string GenLastName(RandomGenerator* rand_gen, int id) {
+    // Built once instead of allocating ten strings on every call.
+    static const char* const kLabels[] = {"BAR", "OUGHT", "ABLE", "PRI", "PRES",
+                                          "ESE", "ANTI", "CALLY", "ATION", "EING"};
+    if (id > 999) {
+        id = rand_gen->NURand(255, 0, std::min(999, id - 1));
+    }
+    return std::string(kLabels[id / 100]) + kLabels[(id / 10) % 10] + kLabels[id % 10];
+}
+
+// Returns `cnt` distinct ids drawn with GetRandom(lower_id, upper_id).
+// The closed range [lower_id, upper_id] only holds upper_id - lower_id + 1
+// distinct values; a larger `cnt` is reduced to that number, because asking
+// for more would keep the loop spinning forever.
+inline IdSet PickUniqueIdSet(RandomGenerator* rand_gen, size_t cnt, int lower_id, int upper_id) {
+    if (upper_id >= lower_id) {
+        // Widen before subtracting so the full int range cannot overflow.
+        const size_t distinct =
+            static_cast<size_t>(static_cast<int64_t>(upper_id) - lower_id) + 1;
+        if (cnt > distinct) {
+            cnt = distinct;
+        }
+    }
+    IdSet ids;
+    while (ids.size() < cnt) {
+        // insert() ignores a value that is already present.
+        ids.insert(rand_gen->GetRandom(lower_id, upper_id));
+    }
+    return ids;
+}
+
+// One row of the item table. Every column except the id is drawn from
+// rand_gen by the constructor.
+struct Item {
+    int32_t i_id;
+    int32_t i_im_id;
+    float i_price;
+    std::string i_name;
+    std::string i_data;
+
+    // Members are initialised in declaration order, which is also the order
+    // in which rand_gen is consumed: im id, price, name, data.
+    Item(int32_t id, bool is_original, RandomGenerator* rand_gen)
+        : i_id(id),
+          i_im_id(rand_gen->GetRandom(kItemMinIm, kItemMaxIm)),
+          i_price(rand_gen->MakeFloat(kItemMinPrice, kItemMaxPrice, kItemPriceDigits)),
+          i_name(rand_gen->MakeAString(kItemMinNameLen, kItemMaxNameLen)),
+          i_data(GenData(rand_gen, kItemMinDataLen, kItemMaxDataLen, is_original)) {
+    }
+
+    // Primary key: "<i_id>".
+    std::string PrimaryKey() const { return std::to_string(i_id); }
+    std::string ToString() const;
+};
+
+// One row of the warehouse table. YTD starts at kInitYTD; every other column
+// except the id is drawn from rand_gen by the constructor.
+struct Warehouse {
+    int32_t w_id;
+    float w_tax;
+    float w_ytd;
+    std::string w_name;
+    std::string w_street_1;
+    std::string w_street_2;
+    std::string w_city;
+    std::string w_state;
+    std::string w_zip;
+
+    // Members are initialised in declaration order, which is also the order
+    // in which rand_gen is consumed.
+    Warehouse(int32_t id, RandomGenerator* rand_gen)
+        : w_id(id),
+          w_tax(GenTax(rand_gen)),
+          w_ytd(kInitYTD),
+          w_name(rand_gen->MakeAString(kWareHouseNameLowerLen, kWareHouseNameUpperLen)),
+          w_street_1(rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen)),
+          w_street_2(rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen)),
+          w_city(rand_gen->MakeAString(kCityLowerLen, kCityUpperLen)),
+          w_state(rand_gen->MakeAString(kStateLen, kStateLen)),
+          w_zip(GenZip(rand_gen)) {
+    }
+
+    // Primary key: "<w_id>".
+    std::string PrimaryKey() const { return std::to_string(w_id); }
+    std::string ToString() const;
+};
+
+// One row of the district table; the member functions are defined in
+// tpccdb.cc.
+struct District {
+    int32_t d_id;
+    int32_t d_w_id;       // id of the owning warehouse
+    float d_tax;
+    float d_ytd;
+    int32_t d_next_o_id;  // starts at kCustomerCountPerDistrict + 1
+    std::string d_name;
+    std::string d_street_1;
+    std::string d_street_2;
+    std::string d_city;
+    std::string d_state;
+    std::string d_zip; 
+	
+    // Fills the row for district `id` of warehouse `w_id` using rand_gen.
+    District(int32_t id, int32_t w_id, RandomGenerator* rand_gen);
+    // "<d_w_id>_<d_id>"
+    std::string PrimaryKey() const;
+    // "<d_w_id>"
+    std::string ForeignKey() const;
+    std::string ToString() const;
+};
+
+// One row of the stock table; the member functions are defined in tpccdb.cc.
+struct Stock {
+    int32_t s_i_id;                   // item id
+    int32_t s_w_id;                   // warehouse id
+    int32_t s_quantity;
+    int32_t s_ytd;
+    int32_t s_order_cnt;
+    int32_t s_remote_cnt;
+    std::vector<std::string> s_dist;  // one entry per district of a warehouse
+    std::string s_data;
+
+    // Fills the row for item `id` in warehouse `w_id` using rand_gen;
+    // is_original is forwarded to GenData for s_data.
+    Stock(int32_t id, int32_t w_id, bool is_original, RandomGenerator* rand_gen);
+    // "<s_w_id>_<s_i_id>"
+    std::string PrimaryKey() const;
+    // "<s_i_id>"
+    std::string ForeignKey() const;
+    std::string ToString() const;
+};
+
+// One row of the customer table; the member functions are defined in
+// tpccdb.cc.
+struct Customer {
+    int32_t c_id;
+    int32_t c_d_id;        // district id
+    int32_t c_w_id;        // warehouse id
+    float c_credit_lim;
+    float c_discount;
+    float c_balance;
+    float c_ytd_payment;
+    int32_t c_payment_cnt;
+    int32_t c_delivery_cnt;
+    std::string c_first;
+    std::string c_middle;  // always "OE"
+    std::string c_last;    // built by GenLastName()
+    std::string c_street_1;
+    std::string c_street_2;
+    std::string c_city;
+    std::string c_state;
+    std::string c_zip;
+    std::string c_phone;
+    std::string c_since;   // the datetime passed to the constructor
+    std::string c_credit;  // "BC" for bad credit, "GC" otherwise
+    std::string c_data;	
+    // Fills the row for customer `id` of district `d_id` in warehouse `w_id`.
+    Customer(int32_t id, int32_t d_id, int32_t w_id, const std::string& datetime,
+             bool bad_credit, RandomGenerator* rand_gen);
+    // "<c_w_id>_<c_d_id>_<c_id>"
+    std::string PrimaryKey() const;
+    // "<c_w_id>_<c_d_id>"
+    std::string ForeignKey() const;
+    std::string ToString() const;
+};
+
+// One row of the order table; the member functions are defined in tpccdb.cc.
+struct Order {
+    int32_t o_id;
+    int32_t o_c_id;        // customer id
+    int32_t o_d_id;        // district id
+    int32_t o_w_id;        // warehouse id
+    int32_t o_carrier_id;  // 0 for a new order, random otherwise
+    int32_t o_ol_cnt;      // number of order lines
+
+    // If the order includes only home order-lines, 
+    // then O_ALL_LOCAL is set to 1, otherwise O_ALL_LOCAL is set to 0.
+    int32_t o_all_local; 
+    std::string o_entry_d;
+
+    // Fills the row; `new_order` decides whether a carrier is assigned.
+    Order(int32_t id, int32_t c_id, int32_t d_id, int32_t w_id, bool new_order, 
+          const std::string& datetime, RandomGenerator* rand_gen);
+    // "<o_w_id>_<o_d_id>_<o_id>"
+    std::string PrimaryKey() const;
+    // "<o_w_id>_<o_d_id>_<o_c_id>"
+    std::string ForeignKey() const;
+    std::string ToString() const;
+};
+
+// An order-line is said to be 'home' if it is supplied by the home warehouse 
+// (i.e., when OL_SUPPLY_W_ID equals O_W_ID).
+// 
+// An order-line is said to be remote when it is supplied by a remote warehouse 
+// (i.e., when OL_SUPPLY_W_ID does not equal O_W_ID).
+//
+// One row of the order-line table; the member functions are defined in
+// tpccdb.cc.
+struct OrderLine {
+    int32_t ol_o_id;         // order id
+    int32_t ol_d_id;         // district id
+    int32_t ol_w_id;         // warehouse id
+    int32_t ol_number;       // line number within the order
+    int32_t ol_i_id;         // item id
+    int32_t ol_supply_w_id;  // the constructor sets this to ol_w_id
+    int32_t ol_quantity;
+    float ol_amount;
+    std::string ol_delivery_d;  // empty for a line of a new order
+    std::string ol_dist_info;
+
+    // Fills the row; `new_order` selects a random amount with an empty
+    // delivery date instead of amount 0 delivered at `datetime`.
+    OrderLine(int32_t o_id, int32_t d_id, int32_t w_id, int32_t number, 
+              bool new_order, const std::string& datetime, 
+              RandomGenerator* rand_gen);
+    // "<ol_w_id>_<ol_d_id>_<ol_o_id>_<ol_number>"
+    std::string PrimaryKey() const;
+    // Keys "order_index" and "item_index".
+    ForeignKeyMap ForeignKeys() const;
+    std::string ToString() const;
+};
+
+// One row of the new-order table; the member functions are defined in
+// tpccdb.cc.
+struct NewOrder {
+    int32_t no_o_id;  // order id
+    int32_t no_d_id;  // district id
+    int32_t no_w_id;  // warehouse id
+
+    NewOrder(int32_t o_id, int32_t d_id, int32_t w_id);
+    // "<no_w_id>_<no_d_id>_<no_o_id>"
+    std::string PrimaryKey() const;
+    // Same string as PrimaryKey().
+    std::string ForeignKey() const;
+    std::string ToString() const;
+};
+
+// One row of the history table. The constructor records the customer's own
+// district and warehouse as the district and warehouse of the entry.
+struct History {
+    int32_t h_c_id;
+    int32_t h_c_d_id;
+    int32_t h_c_w_id;
+    int32_t h_d_id;
+    int32_t h_w_id;
+    float h_amount;
+    std::string h_date;
+    std::string h_data;
+
+    // Amount starts at kInitHistoryAmount; h_data is the only column drawn
+    // from rand_gen.
+    History(int32_t c_id, int32_t d_id, int32_t w_id, const std::string& datetime,
+            RandomGenerator* rand_gen)
+        : h_c_id(c_id),
+          h_c_d_id(d_id),
+          h_c_w_id(w_id),
+          h_d_id(d_id),
+          h_w_id(w_id),
+          h_amount(kInitHistoryAmount),
+          h_date(datetime),
+          h_data(rand_gen->MakeAString(kHistoryDataLowerLen, kHistoryDataUpperLen)) {
+    }
+
+    // Primary key: "<h_c_id>".
+    std::string PrimaryKey() const { return std::to_string(h_c_id); }
+    std::string ToString() const;
+};
+
+// Input of a new-order transaction.
+// NOTE(review): the three vectors look like parallel per-line arrays with
+// o_ol_cnt entries each -- confirm against the code that fills them.
+struct NewOrderInfo {
+    bool need_failed;
+    int32_t o_all_local;
+    int32_t o_ol_cnt;
+    std::vector<int32_t> ol_supply_w_ids;
+    std::vector<int32_t> ol_i_ids;
+    std::vector<int32_t> ol_quantities;
+};
+
+// Table identifiers. The values follow the order of the names in kTpccTables
+// (tpcc_types.h).
+enum TpccTables {
+    kItemTable         = 0,
+    kWarehouseTable    = 1,
+    kDistrictTable     = 2,
+    kCustomerTable     = 3,
+    kHistoryTable      = 4,
+    kStockTable        = 5,
+    kOrderTable        = 6,
+    kOrderLineTable    = 7,
+    kNewOrderTable     = 8,
+
+    // the index of table
+    kCustomerLastIndex = 9,
+    kOrderIndex        = 10,
+    kHistoryIndex      = 11
+};
+
+/// ------------------------- transaction result ---------------------------///
+
+// Common part of every transaction result: a boolean state plus a free-form
+// reason string. The accessors are defined out of line.
+// NOTE(review): `true` presumably means the transaction succeeded -- confirm.
+class TxnResult {
+public:
+    void SetState(bool status);
+    bool State() const;
+    void SetReason(const std::string& reason);
+    const std::string& Reason() const;
+private:
+    // Initialized so that State() is well defined even when it is read before
+    // SetState() has been called (the class declares no constructor).
+    bool status_ = false;
+    std::string reason_;
+};
+
+// Result of the Stock-Level transaction: the generic state/reason plus a
+// "low stock" count. The accessors are defined out of line.
+class StockLevelResult : public TxnResult {
+public:
+    void SetLowStock(int low_stock);
+    int LowStock() const;
+private:
+    // Initialized so that LowStock() never returns an indeterminate value when
+    // the transaction fails before SetLowStock() is called.
+    int low_stock_ = 0;
+};
+
+// Result of the Payment transaction: the generic state/reason plus one tuple
+// set. No getter for single_line_ is declared in this class.
+class PaymentResult : public TxnResult {
+public:
+    void SetSingleLine(const RetTuples& single_line);
+private:
+    RetTuples single_line_;
+};
+
+// Result of the New-Order transaction: the generic state/reason, one tuple set
+// stored through SetSingleLine() and a list of tuple sets appended through
+// AddLine(). No getters are declared in this class.
+class NewOrderResult : public TxnResult {
+public:
+   void AddLine(const RetTuples& line);
+   void SetSingleLine(const RetTuples& single_line);
+private:
+   std::vector<RetTuples> lines_;
+   RetTuples single_line_;
+};
+
+// Result of the Order-Status transaction. Adds nothing to TxnResult yet; it
+// exists so that TpccDb::OrderStatusTxn has its own result type.
+class OrderStatusResult : public TxnResult {
+
+};
+
+// Result of the Delivery transaction. Adds nothing to TxnResult yet; it exists
+// so that TpccDb::DeliveryTxn has its own result type.
+class DeliveryResult : public TxnResult {
+    
+};
+
+// Abstract storage backend for the TPC-C benchmark.
+//
+// A backend implements table setup/cleanup, one insert per table row type and
+// the five TPC-C transactions. Inserts and table management report success
+// through their bool return value; transactions report through the result
+// object passed as the last argument. Instances are obtained via NewTpccDb().
+class TpccDb {
+public:
+    TpccDb(){}
+    virtual ~TpccDb(){}
+
+    // init db
+    virtual bool CreateTables() = 0;
+    virtual bool CleanTables() = 0;
+
+    // for insert table
+    virtual bool InsertItem(const Item& i) = 0;
+
+    virtual bool InsertWarehouse(const Warehouse& w) = 0;
+
+    virtual bool InsertDistrict(const District& d) = 0;
+
+    virtual bool InsertCustomer(const Customer& c) = 0;
+
+    virtual bool InsertHistory(const History& h) = 0;
+
+    virtual bool InsertStock(const Stock& s) = 0;
+    
+    virtual bool InsertOrder(const Order& o) = 0;
+
+    virtual bool InsertOrderLine(const OrderLine& ol) = 0;
+
+    virtual bool InsertNewOrder(const NewOrder& no) = 0;
+
+    //  for transaction
+    
+    //  The Stock-Level Transaction [Revision 5.11 - Page 44]
+    //
+    //  (warehouse_id, district_id)
+    //      is the primary key of t_district.
+    //      Each terminal must use a unique value of (W_ID, D_ID) that is constant
+    //      over the whole measurement, i.e., D_IDs cannot be re-used within a warehouse
+    //
+    //  threshold
+    //      The threshold of minimum quantity in stock (threshold) is selected
+    //      at random within [10 .. 20].
+    //
+    virtual void StockLevelTxn(int32_t warehouse_id, int32_t district_id, 
+                               int32_t threshold, 
+                               StockLevelResult* ret) = 0;
+
+    //  The Delivery Transaction [Revision 5.11 - Page 40]
+    //
+    //  warehouse_id
+    //      For any given terminal, the home warehouse number (W_ID) is constant
+    //      over the whole measurement interval
+    //
+    //  carrier_id
+    //      The carrier number (O_CARRIER_ID) is randomly selected within [1 .. 10].
+    //
+    //  delivery_datetime
+    //      The delivery date (OL_DELIVERY_D) is generated within the
+    //      SUT by using the current system date and time.
+    //
+    virtual void DeliveryTxn(int32_t warehouse_id, 
+                             int32_t carrier_id, 
+                             const std::string& delivery_datetime,
+                             DeliveryResult* ret) = 0;
+
+    //  The Order-Status Transaction [Revision 5.11 - Page 37]
+    //
+    //  warehouse_id
+    //      For any given terminal, the home warehouse number (W_ID) is constant
+    //      over the whole measurement interval
+    //
+    //  district_id
+    //      The district number (D_ID) is randomly selected within [1 .. 10]
+    //      from the home warehouse (D_W_ID = W_ID).
+    //
+    //  by_last_name == true: last_name
+    //      The customer is randomly selected
+    //      60% of the time by last name (C_W_ID, C_D_ID, C_LAST)
+    //      from the selected district (C_D_ID = D_ID)
+    //      and the home warehouse number (C_W_ID = W_ID).
+    //
+    //  by_last_name == false: c_customer_id
+    //      40% of the time by number (C_W_ID, C_D_ID, C_ID)
+    //      from the selected district (C_D_ID = D_ID)
+    //      and the home warehouse number (C_W_ID = W_ID).
+    //
+    virtual void OrderStatusTxn(bool by_last_name,
+                                int32_t warehouse_id, int32_t district_id, 
+                                int32_t c_customer_id, 
+                                const std::string& last_name,
+                                OrderStatusResult* ret) = 0;
+
+    //  The Payment Transaction [Revision 5.11 - Page 33]
+    //
+    //  warehouse_id
+    //      For any given terminal, the home warehouse number (W_ID) is constant
+    //      over the whole measurement interval
+    //
+    //  district_id
+    //      The district number (D_ID) is randomly selected within [1 .. 10]
+    //      from the home warehouse (D_W_ID = W_ID).
+    //
+    //  c_warehouse_id, c_district_id, last_name
+    //      The customer is randomly selected
+    //      1) 60% of the time by last name (C_W_ID , C_D_ID, C_LAST)
+    //  c_warehouse_id, c_district_id, c_customer_id
+    //      The customer is randomly selected
+    //      2) 40% of the time by number (C_W_ID , C_D_ID , C_ID).
+    //
+    //  h_amount
+    //      The payment amount (H_AMOUNT) is randomly selected within
+    //      [1.00 .. 5,000.00].
+    //      NOTE(review): the parameter is an int32_t while the range above has
+    //      two decimals and History::h_amount is a float -- confirm whether
+    //      callers pass whole units or a scaled value.
+    //
+    virtual void PaymentTxn(bool by_last_name,
+                            int32_t warehouse_id, int32_t district_id, 
+                            int32_t c_warehouse_id, int32_t c_district_id, 
+                            int32_t c_customer_id, 
+                            const std::string& last_name,
+                            int32_t h_amount,
+                            PaymentResult* ret) = 0;
+
+
+    //  The New-Order Transaction [Revision 5.11 - Page 28]
+    //  warehouse_id
+    //      For any given terminal, the home warehouse number (W_ID) is constant
+    //      over the whole measurement interval
+    //
+    //  district_id
+    //      The district number (D_ID) is randomly selected within [1 .. 10]
+    //      from the home warehouse (D_W_ID = W_ID).
+    //
+    //  customer_id
+    //      The non-uniform random customer number (C_ID) is selected using
+    //      the NURand(1023,1,3000) function from the selected district
+    //      number (C_D_ID = D_ID) and the home warehouse number (C_W_ID = W_ID).
+    //
+    //  info
+    //      Per-order input (order-line items, supply warehouses, quantities);
+    //      see NewOrderInfo.
+    //
+    virtual void NewOrderTxn(int32_t warehouse_id, 
+                             int32_t district_id, 
+                             int32_t customer_id, const NewOrderInfo& info, 
+                             NewOrderResult* ret) = 0;
+    
+    // Factory for a concrete backend selected by name. Defined out of line.
+    // NOTE(review): ownership of the returned pointer and the behaviour for an
+    // unknown db_type are not visible here -- confirm in the implementation.
+    static TpccDb* NewTpccDb(const std::string& db_type);
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_TPCCDB_H */
diff --git a/src/common/atomic.h b/src/common/atomic.h
index 6837cb302..195a7b0da 100644
--- a/src/common/atomic.h
+++ b/src/common/atomic.h
@@ -1,11 +1,10 @@
+#pragma once
 // Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
+#include<cstdint>
 
-#ifndef  TERA_COUNTER_ATOMIC_H_
-#define  TERA_COUNTER_ATOMIC_H_
-
-namespace common {
+namespace tera {
 
 static inline int atomic_add(volatile int *mem, int add)
 {
@@ -106,5 +105,4 @@ static inline int64_t atomic_comp_swap64(volatile void *mem, int64_t xchg, int64
     return cmp;
 }
 
-} // namespace common
-#endif  // TERA_COMMON_ATOMIC_H_
+} 
diff --git a/src/common/counter.h b/src/common/counter.h
index c9869f633..d4687bfd8 100644
--- a/src/common/counter.h
+++ b/src/common/counter.h
@@ -7,10 +7,10 @@
 
 #include <stdio.h>
 
-#include "atomic.h"
-#include "timer.h"
+#include "common/atomic.h"
+#include "common/timer.h"
 
-namespace common {
+namespace tera {
 
 class Counter {
 public:
@@ -47,19 +47,19 @@ class AutoCounter {
         : counter_(counter),
           msg1_(msg1),
           msg2_(msg2) {
-        start_ = timer::get_micros();
+        start_ = get_micros();
         counter_->Inc();
     }
     ~AutoCounter() {
-        int64_t end = timer::get_micros();
+        int64_t end = get_micros();
         if (end - start_ > 5000000) {
             int64_t t = (end - start_) / 1000000;
             if (!msg2_) {
                 fprintf(stderr, "%s [AutoCounter] %s hang for %ld s\n",
-                    timer::get_curtime_str().data(), msg1_, t);
+                    get_curtime_str().data(), msg1_, t);
             } else {
                 fprintf(stderr, "%s [AutoCounter] %s %s hang for %ld s\n",
-                    timer::get_curtime_str().data(), msg1_, msg2_, t);
+                    get_curtime_str().data(), msg1_, msg2_, t);
             }
         }
         counter_->Dec();
diff --git a/src/common/cpu_profiler.cc b/src/common/cpu_profiler.cc
new file mode 100644
index 000000000..758ed674d
--- /dev/null
+++ b/src/common/cpu_profiler.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <chrono>
+#include <gperftools/profiler.h>
+
+#include "common/cpu_profiler.h"
+
+namespace tera {
+
+// Starts the background thread that runs CpuProfiler::run().
+// NOTE(review): members are initialized in declaration order, not in the order
+// written here. thread_ starts executing run() as soon as it is constructed,
+// so every member run() touches (lock_, cv_, ...) must be declared before
+// thread_ in the class -- confirm the declaration order in cpu_profiler.h.
+CpuProfiler::CpuProfiler() 
+    : exit_(false), 
+    thread_(&CpuProfiler::run, this) {}
+
+// Asks the background thread to exit, joins it, and stops gperftools CPU
+// profiling if it is still active.
+CpuProfiler::~CpuProfiler() {
+    {
+        // Set exit_ while holding lock_: the worker re-checks exit_ under the
+        // same lock right before it waits, so the notification below cannot
+        // slip into the gap between that check and the wait (which would
+        // leave join() blocked for a full interval_).
+        std::lock_guard<std::mutex> guard(lock_);
+        exit_ = true;
+    }
+    cv_.notify_one();
+    thread_.join();
+    ProfilerState ps;
+    ProfilerGetCurrentState(&ps);
+    if (ps.enabled) {
+        ProfilerStop();
+    }
+}
+
+// Background loop. Each round either makes sure profiling is running and
+// flushes the collected samples (enable_ set), or stops profiling (enable_
+// clear); then it sleeps for interval_ or until one of the setters or the
+// destructor notifies cv_.
+void CpuProfiler::run() {
+    while (!exit_.load()) {
+        if (enable_) {
+            ProfilerState ps;
+            ProfilerGetCurrentState(&ps);
+            if (ps.enabled == 0) {
+                // Copy the path under lock_ so a concurrent update of
+                // profiler_file_ cannot be observed half-written.
+                std::string profile_path;
+                {
+                    std::lock_guard<std::mutex> guard(lock_);
+                    profile_path = profiler_file_;
+                }
+                ProfilerStart(profile_path.c_str());
+            }
+
+            ProfilerFlush();
+            LOG(INFO) << "[Cpu Profiler] Cpu Profiler Dumped";
+        } else {
+            ProfilerState ps;
+            ProfilerGetCurrentState(&ps);
+            if (ps.enabled) {
+                ProfilerStop();
+            }
+        }
+        std::unique_lock<std::mutex> lock(lock_);
+        // Re-check under the lock; see the destructor for why this is needed.
+        if (exit_.load()) {
+            break;
+        }
+        cv_.wait_for(lock, interval_);
+    }
+}
+
+} // namespace tera
\ No newline at end of file
diff --git a/src/common/cpu_profiler.h b/src/common/cpu_profiler.h
new file mode 100644
index 000000000..ccf0686ab
--- /dev/null
+++ b/src/common/cpu_profiler.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_CPU_PROFILER_H
+#define TERA_CPU_PROFILER_H
+
+#include <atomic>
+#include <thread>
+#include <string>
+#include <mutex>
+#include <condition_variable>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+namespace tera {
+
+// Periodically drives the gperftools CPU profiler from a background thread.
+// The setters may be called from any thread; each one wakes the background
+// thread so the new setting is picked up without waiting for the interval.
+class CpuProfiler {
+public:
+    /**
+     * @brief Init CpuProfiler and the detect thread will start
+    **/
+    CpuProfiler();
+
+    /**
+     * @brief Stop the detect thread and wait for it to finish
+    **/
+    ~CpuProfiler();
+
+    /**
+     * @brief Turn periodic profiling on or off
+     * @return *this, so that setters can be chained
+    **/
+    CpuProfiler& SetEnable(bool enable) {
+        enable_ = enable;
+        if (enable) {
+            LOG(INFO) << "[Cpu Profiler] Cpu Profiler Enabled";
+        } else {
+            LOG(INFO) << "[Cpu Profiler] Cpu Profiler Disabled";
+        }
+        cv_.notify_one();
+        return *this;
+    }
+
+    /**
+     * @brief Set the sleep time between two rounds, in seconds
+     * @return *this, so that setters can be chained
+    **/
+    CpuProfiler& SetInterval(int second) {
+        {
+            // interval_ is read by the detect thread while it holds lock_
+            std::lock_guard<std::mutex> guard(lock_);
+            interval_ = std::chrono::seconds(second);
+        }
+        cv_.notify_one();
+        return *this;
+    }
+
+    /**
+     * @brief Set the file name handed to ProfilerStart()
+     * @return *this, so that setters can be chained
+    **/
+    CpuProfiler& SetProfilerFile(const std::string& file) {
+        {
+            std::lock_guard<std::mutex> guard(lock_);
+            profiler_file_ = file;
+        }
+        cv_.notify_one();
+        return *this;
+    }
+
+private:
+    void run();
+
+private:
+    std::atomic<bool> exit_;
+    // Written by SetEnable() and read by the detect thread, hence atomic.
+    std::atomic<bool> enable_{false};
+    std::chrono::seconds interval_{10};
+    std::string profiler_file_;
+    std::mutex lock_;
+    std::condition_variable cv_;
+    // Keep thread_ as the LAST data member: members are constructed in
+    // declaration order and the thread starts running run() immediately, so
+    // everything run() uses must already be constructed at that point.
+    std::thread thread_;
+};
+
+} // namespace tera
+
+#endif  //TERA_CPU_PROFILER_H
+
+/* vim: set ts=4 sw=4 sts=4 tw=100 */
diff --git a/src/common/event.h b/src/common/event.h
index a289d0278..9a6770ece 100644
--- a/src/common/event.h
+++ b/src/common/event.h
@@ -46,8 +46,75 @@ class AutoResetEvent {
     bool signaled_;
 };
 
+// Completion event for a set of tasks whose size is not known up front.
+//
+// Producers register work with AddEventSources(), call Trigger() once nothing
+// more will be registered, and every finished task calls Complete(). Wait()
+// returns once Trigger() has been called and the outstanding count is <= 0.
+// NOTE(review): wake-ups use CondVar::Signal(); if Signal() wakes only one
+// waiter, only a single thread should block in Wait()/TimeWait() -- confirm.
+class CompletedEvent {
+public:
+    CompletedEvent() 
+        : cv_(&mutex_), cnt_(0), triggered_(false) {}
+
+    CompletedEvent(int64_t task_cnt) 
+        : cv_(&mutex_), cnt_(task_cnt), triggered_(false) {}
+
+    // Registers task_cnt more outstanding tasks. Tasks may be added while
+    // earlier ones are running or already finished (task-queue usage).
+    // Calls made after Trigger() are ignored.
+    void AddEventSources(int64_t task_cnt) {
+        MutexLock lock(&mutex_);
+        if (triggered_) {
+            return;
+        }
+        cnt_ += task_cnt;
+    }
+
+    // Declares that all tasks have been registered. Wakes a waiter right away
+    // when nothing is outstanding any more.
+    void Trigger() {
+        MutexLock lock(&mutex_);
+        triggered_ = true;
+        const bool all_done = (cnt_ <= 0);
+        if (all_done) {
+            cv_.Signal();
+        }
+    }
+
+    // Blocks until Trigger() has been called and cnt_ <= 0.
+    void Wait() {
+        MutexLock lock(&mutex_);
+        while (!triggered_ || cnt_ > 0) {
+            cv_.Wait();
+        }
+    }
+
+    // Waits at most once, for up to 'timeout' (ms according to the original
+    // comment), and only if the event is not complete yet. It returns earlier
+    // when the last task completes. The return value looks at cnt_ only:
+    // true when cnt_ <= 0, regardless of triggered_.
+    bool TimeWait(int64_t timeout) {
+        MutexLock lock(&mutex_);
+        const bool pending = (cnt_ > 0) || !triggered_;
+        if (pending) {
+            cv_.TimeWait(timeout);
+        }
+        return cnt_ <= 0;
+    }
+
+    // Marks task_cnt tasks as finished. A waiter is notified only when the
+    // count has dropped to <= 0 AND Trigger() was called; requiring
+    // triggered_ makes sure all tasks were registered before completion is
+    // announced.
+    void Complete(int64_t task_cnt = 1) {
+        MutexLock lock(&mutex_);
+        cnt_ -= task_cnt;
+        if (triggered_ && cnt_ <= 0) {
+            cv_.Signal();
+        }
+    }
+
+private:
+    // not copyable
+    CompletedEvent(const CompletedEvent&) = delete;
+    CompletedEvent &operator=(const CompletedEvent&) = delete;
+    Mutex mutex_;
+    CondVar cv_;
+    int64_t cnt_;
+    bool triggered_;
+};
+
 } // namespace common
 
 using common::AutoResetEvent;
+using common::CompletedEvent;
 
 #endif  // TERA_COMMON_EVENT_H_
diff --git a/src/common/file/file_path.cc b/src/common/file/file_path.cc
index 44738117f..ea3a8ef08 100644
--- a/src/common/file/file_path.cc
+++ b/src/common/file/file_path.cc
@@ -146,6 +146,33 @@ bool ListCurrentDir(const std::string& dir_path,
     return true;
 }
 
+// Lists the entries of dir_path (without "." and "..") together with their
+// lstat() result. Each entry is appended to *file_list as
+// (dir_path + "/" + name, stat).
+//
+// Returns false when the directory cannot be opened, or as soon as lstat()
+// fails for one entry; in the latter case the entries collected so far stay
+// in *file_list. Returns true when every entry was stat'ed successfully.
+bool ListCurrentDirWithStat(const std::string& dir_path,
+                            std::vector<FileStateInfo>* file_list) {
+    DIR* dir_handle = opendir(dir_path.c_str());
+    if (dir_handle == NULL) {
+        return false;
+    }
+
+    bool all_stat_ok = true;
+    for (struct dirent* entry = readdir(dir_handle); entry != NULL;
+         entry = readdir(dir_handle)) {
+        if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0) {
+            continue;
+        }
+        const std::string full_path = dir_path + "/" + entry->d_name;
+        struct stat st;
+        if (lstat(full_path.c_str(), &st) != 0) {
+            // give up on the first failure; the caller gets false
+            all_stat_ok = false;
+            break;
+        }
+        file_list->push_back(std::make_pair(full_path, st));
+    }
+
+    closedir(dir_handle);
+    return all_stat_ok;
+}
+
 bool IsExist(const std::string& path) {
     return access(path.c_str(), R_OK) == 0;
 }
diff --git a/src/common/file/file_path.h b/src/common/file/file_path.h
index e0ab5d002..d5e04ea99 100644
--- a/src/common/file/file_path.h
+++ b/src/common/file/file_path.h
@@ -8,6 +8,8 @@
 #include<unistd.h>
 #include<string>
 #include<vector>
+#include <sys/types.h> 
+#include <sys/stat.h>
 
 void SplitStringPath(const std::string& full_path,
                      std::string* dir_part,
@@ -28,6 +30,11 @@ std::string UidToName(uid_t uid);
 bool ListCurrentDir(const std::string& dir_path,
                     std::vector<std::string>* file_list);
 
+// A directory entry and its lstat() result: first is the entry's path
+// (dir_path + "/" + name), second is the stat structure.
+typedef std::pair<std::string, struct stat> FileStateInfo;
+
+// Like ListCurrentDir(), but appends each entry (except "." and "..") together
+// with its lstat() result. Returns false if the directory cannot be opened or
+// if lstat() fails for any entry.
+bool ListCurrentDirWithStat(const std::string& dir_path,
+                            std::vector<FileStateInfo>* file_list);
+
 bool IsExist(const std::string& path);
 
 bool IsDir(const std::string& path);
diff --git a/src/common/heap_profiler.cc b/src/common/heap_profiler.cc
new file mode 100644
index 000000000..386e314e9
--- /dev/null
+++ b/src/common/heap_profiler.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <chrono>
+#include <ctime>
+#include <string.h>
+#include <gperftools/heap-profiler.h>
+
+#include "common/heap_profiler.h"
+
+namespace tera {
+
+// Starts the background thread that runs HeapProfiler::run().
+// NOTE(review): members are initialized in declaration order, not in the order
+// written here. thread_ starts executing run() as soon as it is constructed,
+// so every member run() touches (lock_, cv_, ...) must be declared before
+// thread_ in the class -- confirm the declaration order in heap_profiler.h.
+HeapProfiler::HeapProfiler() 
+    : exit_(false),
+    thread_(&HeapProfiler::run, this) {}
+
+// Asks the background thread to exit, joins it, and stops the gperftools heap
+// profiler if it is still running.
+HeapProfiler::~HeapProfiler() {
+    {
+        // Set exit_ while holding lock_: the worker re-checks exit_ under the
+        // same lock right before it waits, so the notification below cannot
+        // slip into the gap between that check and the wait (which would
+        // leave join() blocked for a full interval_).
+        std::lock_guard<std::mutex> guard(lock_);
+        exit_ = true;
+    }
+    cv_.notify_one();
+    thread_.join();
+    if (IsHeapProfilerRunning()) {
+        HeapProfilerStop();
+    }
+}
+
+// Background loop. Each round either makes sure the heap profiler is running
+// and dumps a profile tagged with the current time (enable_ set), or stops the
+// profiler (enable_ clear); then it sleeps for interval_ or until one of the
+// setters or the destructor notifies cv_.
+void HeapProfiler::run() {
+    while (!exit_.load()) {
+        if (enable_) {
+            // "reason" is time
+            std::time_t t = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+            char ts[128];
+            if (ctime_r(&t, ts) == NULL) {
+                // conversion failed: dump with an empty reason instead of
+                // reading an uninitialized buffer
+                ts[0] = '\0';
+            }
+            size_t ts_len = strlen(ts);
+            if (ts_len > 0 && ts[ts_len - 1] == '\n') {
+                ts[ts_len - 1] = '\0'; // erase \n
+            }
+
+            if (IsHeapProfilerRunning() == 0) {
+                // Copy the prefix under lock_ so a concurrent update of
+                // profiler_file_ cannot be observed half-written.
+                std::string profile_path;
+                {
+                    std::lock_guard<std::mutex> guard(lock_);
+                    profile_path = profiler_file_;
+                }
+                HeapProfilerStart(profile_path.c_str());
+            }
+            HeapProfilerDump(ts);
+            LOG(INFO) << "[Heap Profiler] Heap Profiler Dumped";
+        } else {
+            if (IsHeapProfilerRunning()) {
+                HeapProfilerStop();
+            }
+        }
+        std::unique_lock<std::mutex> lock(lock_);
+        // Re-check under the lock; see the destructor for why this is needed.
+        if (exit_.load()) {
+            break;
+        }
+        cv_.wait_for(lock, interval_);
+    }
+}
+
+} // namespace tera
\ No newline at end of file
diff --git a/src/common/heap_profiler.h b/src/common/heap_profiler.h
new file mode 100644
index 000000000..f5ffa9c6b
--- /dev/null
+++ b/src/common/heap_profiler.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_HEAP_PROFILER_H
+#define TERA_HEAP_PROFILER_H
+
+#include <atomic>
+#include <thread>
+#include <mutex>
+#include <string>
+#include <condition_variable>
+#include <cstdlib>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+DECLARE_int64(heap_profile_allocation_interval);
+DECLARE_int64(heap_profile_inuse_interval);
+
+namespace tera {
+
+// Periodically drives the gperftools heap profiler from a background thread.
+// The setters may be called from any thread; each one wakes the background
+// thread so the new setting is picked up without waiting for the interval.
+class HeapProfiler {
+public:
+
+    /**
+     * @brief Init HeapProfiler and the detect thread will start
+    **/
+    HeapProfiler();
+    /**
+     * @brief: the heap profiler will stop after the destructor is called
+     *
+    **/
+    ~HeapProfiler();
+
+    /**
+     * @brief Turn periodic heap profiling on or off. Enabling exports the two
+     *        interval flags as HEAP_PROFILE_* environment variables; disabling
+     *        removes them again.
+     * @return *this, so that setters can be chained
+    **/
+    HeapProfiler& SetEnable(bool enable) {
+        enable_ = enable;
+
+        if (enable) {
+            setenv("HEAP_PROFILE_ALLOCATION_INTERVAL",
+                   std::to_string(FLAGS_heap_profile_allocation_interval).c_str(),
+                   1);
+
+            setenv("HEAP_PROFILE_INUSE_INTERVAL",
+                   std::to_string(FLAGS_heap_profile_inuse_interval).c_str(),
+                   1);
+
+            LOG(INFO) << "[Heap Profiler] HEAP_PROFILE_ALLOCATION_INTERVAL: "
+                      << getenv("HEAP_PROFILE_ALLOCATION_INTERVAL");
+            LOG(INFO) << "[Heap Profiler] HEAP_PROFILE_INUSE_INTERVAL: "
+                      << getenv("HEAP_PROFILE_INUSE_INTERVAL");
+            LOG(INFO) << "[Heap Profiler] Heap Profiler Enabled";
+        } else {
+            unsetenv("HEAP_PROFILE_ALLOCATION_INTERVAL");
+            unsetenv("HEAP_PROFILE_INUSE_INTERVAL");
+            LOG(INFO) << "[Heap Profiler] Heap Profiler Disabled";
+        }
+        cv_.notify_one();
+        return *this;
+    }
+    
+    /**
+     * @brief Set the sleep time between two rounds, in seconds
+     * @return *this, so that setters can be chained
+    **/
+    HeapProfiler& SetInterval(int second) {
+        {
+            // interval_ is read by the detect thread while it holds lock_
+            std::lock_guard<std::mutex> guard(lock_);
+            interval_ = std::chrono::seconds(second);
+        }
+        cv_.notify_one();
+        return *this;
+    }
+
+    /**
+     * @brief Set the name handed to HeapProfilerStart()
+     * @return *this, so that setters can be chained
+    **/
+    HeapProfiler& SetProfilerFile(const std::string& file) {
+        {
+            std::lock_guard<std::mutex> guard(lock_);
+            profiler_file_ = file;
+        }
+        cv_.notify_one();
+        return *this;
+    }
+
+private:
+    void run();
+private:
+    std::atomic<bool> exit_;
+    // Written by SetEnable() and read by the detect thread, hence atomic.
+    std::atomic<bool> enable_{false};
+    std::chrono::seconds interval_{10};
+    std::string profiler_file_;
+    std::mutex lock_;
+    std::condition_variable cv_;
+    // Keep thread_ as the LAST data member: members are constructed in
+    // declaration order and the thread starts running run() immediately, so
+    // everything run() uses must already be constructed at that point.
+    std::thread thread_;
+};
+
+} // namespace tera
+
+#endif  //TERA_HEAP_PROFILER_H
+
+/* vim: set ts=4 sw=4 sts=4 tw=100 */
\ No newline at end of file
diff --git a/src/common/log/log_cleaner.cc b/src/common/log/log_cleaner.cc
new file mode 100644
index 000000000..6b5474a1d
--- /dev/null
+++ b/src/common/log/log_cleaner.cc
@@ -0,0 +1,322 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ 
+#include "common/log/log_cleaner.h"
+
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <unistd.h>
+
+#include <glog/logging.h>
+
+#include "common/file/file_path.h"
+#include "common/timer.h"
+
+DECLARE_string(log_dir);
+DECLARE_string(tera_log_prefix);
+DECLARE_string(tera_leveldb_log_path);
+DECLARE_int64(tera_info_log_clean_period_second);
+DECLARE_int64(tera_info_log_expire_second);
+DECLARE_string(ins_log_file);
+ 
+namespace common {
+
+static const int64_t kMinCleanPeriodMs = 1000; // 1s
+static const int64_t kMinInfoLogExpireSec = 1; // 1s
+static const size_t kPathMaxLen = 64;
+
+Mutex LogCleaner::inst_init_mutex_;
+LogCleaner* LogCleaner::singleton_instance_ = NULL;
+
+// Returns "/proc/<pid>/fd" for the calling process.
+static std::string GetProcFdPath() {
+    char fd_dir[kPathMaxLen];
+    snprintf(fd_dir, sizeof(fd_dir), "/proc/%d/fd", getpid());
+    return fd_dir;
+}
+
+// Returns the part of `path` after the last '/', or the whole string when it
+// contains no '/'. A path that ends with '/' yields an empty string.
+static std::string GetFileNameFromPath(const std::string& path) {
+    const std::string::size_type last_slash = path.rfind('/');
+    return last_slash == std::string::npos ? path : path.substr(last_slash + 1);
+}
+
+
+// Returns the process-wide cleaner, creating it on first use from the log
+// flags. thread_pool is only used when the instance is created by this call;
+// it is ignored when the instance already exists.
+//
+// Creation is serialized with StopCleaner() through inst_init_mutex_, so two
+// concurrent callers cannot both construct an instance. The returned pointer
+// becomes invalid once StopCleaner() destroys the instance.
+LogCleaner* LogCleaner::GetInstance(ThreadPool *thread_pool) {
+    MutexLock l(&inst_init_mutex_, "Get log cleaner instance");
+    if (singleton_instance_ == NULL) {
+        singleton_instance_ = new LogCleaner(FLAGS_log_dir,
+            FLAGS_tera_info_log_clean_period_second, 
+            FLAGS_tera_info_log_expire_second, 
+            thread_pool);
+        // Files whose names start with one of these prefixes are candidates
+        // for cleaning; AddPrefix() ignores empty prefixes.
+        singleton_instance_->AddPrefix(FLAGS_tera_log_prefix);
+        singleton_instance_->AddPrefix(GetFileNameFromPath(FLAGS_tera_leveldb_log_path));
+        singleton_instance_->AddPrefix(GetFileNameFromPath(FLAGS_ins_log_file));
+    }
+    return singleton_instance_;
+}
+
+// Creates the singleton if necessary and starts its periodic clean task.
+// thread_pool is forwarded to GetInstance() so that a pool supplied by the
+// caller is actually used when the instance is created here; with NULL the
+// cleaner creates its own pool in Start().
+// Returns the result of Start().
+bool LogCleaner::StartCleaner(ThreadPool *thread_pool) {
+    return GetInstance(thread_pool)->Start();
+}
+
+// Stops and destroys the singleton, if there is one. Holds inst_init_mutex_
+// for the whole operation.
+void LogCleaner::StopCleaner() {
+    MutexLock l(&inst_init_mutex_, "Destroy log cleaner");
+    if (NULL == singleton_instance_) {
+        return;
+    }
+    singleton_instance_->Stop();
+    delete singleton_instance_;
+    singleton_instance_ = NULL;
+}
+ 
+// Builds a stopped cleaner for log_dir.
+//   period_second  time between two clean rounds; stored in milliseconds and
+//                  raised to kMinCleanPeriodMs if smaller
+//   expire_second  age after which a log is removed; raised to
+//                  kMinInfoLogExpireSec if smaller
+//   thread_pool    pool that runs the clean task; may be NULL, in which case
+//                  Start() creates a private one (thread_pool_own_ starts false)
+// The /proc/<pid>/fd path of this process is computed once here.
+LogCleaner::LogCleaner(const std::string& log_dir, 
+                       int64_t period_second,
+                       int64_t expire_second, 
+                       ThreadPool *thread_pool)
+    : thread_pool_(thread_pool),
+      thread_pool_own_(false),
+      mutex_(),
+      info_log_dir_(log_dir),
+      log_prefix_list_(),
+      info_log_clean_period_ms_(std::max(period_second * 1000, kMinCleanPeriodMs)),
+      info_log_expire_sec_(std::max(expire_second, kMinInfoLogExpireSec)),
+      stop_(false),
+      bg_exit_(false),
+      bg_cond_(&mutex_),
+      bg_func_(std::bind(&LogCleaner::CleanTaskWrap, this)),
+      bg_task_id_(-1), 
+      proc_fd_path_(GetProcFdPath()) {}
+ 
+// Releases the thread pool if this cleaner created it. It does not call
+// Stop(); StopCleaner() stops the cleaner before deleting it.
+LogCleaner::~LogCleaner() {
+    DestroyOwnThreadPool();
+}
+
+// True when dir_path is non-empty and IsDir() accepts it.
+static bool CheckDirPath(const std::string &dir_path) {
+    if (dir_path.empty()) {
+        return false;
+    }
+    return IsDir(dir_path);
+}
+
+// Validates the configuration: the log directory must exist and both the
+// clean period and the expire time must be positive.
+bool LogCleaner::CheckOptions() const {
+    if (!CheckDirPath(info_log_dir_)) {
+        return false;
+    }
+    const bool period_ok = info_log_clean_period_ms_ > 0;
+    const bool expire_ok = info_log_expire_sec_ > 0;
+    return period_ok && expire_ok;
+}
+
+// Starts the periodic clean task.
+// Returns false when the options are invalid, true when the task was
+// scheduled by this call or was already scheduled.
+bool LogCleaner::Start() {
+    const bool options_ok = CheckOptions();
+    if (!options_ok) {
+        return false;
+    }
+
+    MutexLock l(&mutex_, "Start info log cleaner");
+
+    if (!IsRunning()) {
+        stop_ = false;
+        bg_exit_ = false;
+        // no pool was supplied by the creator: make a private one
+        if (thread_pool_ == nullptr) {
+            NewThreadPool();
+        }
+
+        if (bg_task_id_ <= 0) {
+            // delay 0: run the first round immediately
+            bg_task_id_ = thread_pool_->DelayTask(0, bg_func_);
+        }
+    }
+    return true;
+}
+
+// Stops the periodic clean task and returns only after the background task
+// has exited (or was never scheduled).
+void LogCleaner::Stop() {
+    MutexLock l(&mutex_, "Stop info log cleaner");
+    // CleanTaskWrap() checks stop_ to decide not to reschedule itself
+    stop_ = true;
+    bool is_running = false;
+    if (bg_task_id_ > 0) {
+        // NOTE(review): presumably CancelTask() returns true when the pending
+        // task was removed, and sets is_running when the task is executing
+        // and cannot be removed -- confirm against ThreadPool.
+        bg_exit_ = thread_pool_->CancelTask(bg_task_id_, true, &is_running);
+    } else {
+        // nothing scheduled
+        bg_exit_ = true;
+    }
+    
+    // either the task was cancelled, or it must be running right now
+    CHECK(is_running || bg_exit_);
+    // bg_cond_ is bound to mutex_; CleanTaskWrap() sets bg_exit_ and signals
+    // when it sees stop_
+    while(!bg_exit_) {
+        bg_cond_.Wait();
+    }
+    bg_task_id_ = -1;
+}
+
+// Body of the background task: runs one clean round while holding mutex_, then
+// either reschedules itself after info_log_clean_period_ms_ or, when Stop()
+// was requested, marks the task as exited. The result of DoCleanLocalLogs()
+// is ignored.
+void LogCleaner::CleanTaskWrap() {
+    MutexLock l(&mutex_);
+    DoCleanLocalLogs();
+    if (stop_) {
+        bg_task_id_ = -1;
+        bg_exit_ = true;
+    } else {
+        bg_task_id_ = thread_pool_->DelayTask(info_log_clean_period_ms_, bg_func_);
+    }
+    // wakes Stop() if it is waiting for bg_exit_
+    bg_cond_.Signal();
+}
+
+// Returns true when filename starts with at least one of the registered
+// prefixes in log_prefix_list_.
+bool LogCleaner::CheckLogPrefix(const std::string& filename) const {
+    for (const std::string& prefix : log_prefix_list_) {
+        // compare() clamps the range to the end of filename, so a filename
+        // shorter than the prefix simply compares unequal.
+        if (filename.compare(0, prefix.size(), prefix) == 0) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Runs one clean round over info_log_dir_. Called from CleanTaskWrap() with
+// mutex_ held.
+//
+// A file is removed only if ALL of the following hold:
+//   - its name starts with one of the registered prefixes,
+//   - it is a regular file (symbolic links and their targets are kept),
+//   - it is not currently opened by this process (see GetCurrentOpendLogs),
+//   - its mtime is older than now - info_log_expire_sec_.
+//
+// Returns false when the round was cancelled before removing anything (no
+// prefix registered, directory missing or empty, listing failed, open-file
+// scan failed, or lstat() failed on a candidate). Returns true otherwise,
+// even if removing individual files failed.
+bool LogCleaner::DoCleanLocalLogs() {
+    if (log_prefix_list_.empty()) {
+        LOG(WARNING) << "[LogCleaner] Log prefix is not set yet.";
+        return false;
+    }
+    if (!CheckDirPath(info_log_dir_) || IsEmpty(info_log_dir_)) {
+        LOG(WARNING) << "[LogCleaner] Log dir " << info_log_dir_ << " not exsit logs.";
+        return false;
+    }
+    // NOTE(review): clean_time is compared with st_mtime below, which assumes
+    // tera::get_millis() is wall-clock time since the epoch -- confirm.
+    int64_t now_time = tera::get_millis() / 1000;
+    int64_t clean_time = now_time - info_log_expire_sec_;
+    LOG(INFO) << "[LogCleaner] Start clean log dir: " << info_log_dir_ 
+              << ", now_time = " << now_time 
+              << ", clean_time = " << clean_time;
+
+    // pathconf() returns -1 on error and also when the limit is indeterminate.
+    // Fall back to a fixed size instead of sizing the link buffer with it.
+    static const long kFallbackPathMaxLen = 4096;
+    long path_maxlen = pathconf(info_log_dir_.c_str(), _PC_PATH_MAX);
+    if (path_maxlen <= 0) {
+        path_maxlen = kFallbackPathMaxLen;
+    }
+    // one heap buffer for all readlink() calls of this round
+    std::vector<char> path_buf(static_cast<size_t>(path_maxlen));
+
+    std::vector<std::string> log_file_list;
+    if (!ListCurrentDir(info_log_dir_, &log_file_list)) {
+        // list failed
+        LOG(WARNING) << "[LogCleaner] List log dir " << info_log_dir_
+                     << " failed. Cancel clean.";
+        return false;
+    }
+
+    // reserved_set: filenames that must not be cleaned
+    std::set<std::string> reserved_set;
+    if (!GetCurrentOpendLogs(&reserved_set)) {
+        LOG(WARNING) << "[LogCleaner] GetCurrentOpendLogs failed. Cancel clean.";
+        return false;
+    }
+
+    // pass 1: decide which files are reserved
+    std::vector<std::string>::const_iterator it = log_file_list.begin();
+    for (; it != log_file_list.end(); ++it) {
+        const std::string& file_name = *it;
+        if (reserved_set.find(file_name) != reserved_set.end()) {
+            // already reserved
+            continue;
+        }
+
+        // only files that start with a registered prefix may be cleaned
+        if (!CheckLogPrefix(file_name)) {
+            VLOG(16) << "[LogCleaner] Reserve log file: " << file_name
+                     << ", which not match prefix.";
+            reserved_set.insert(file_name);
+            continue;
+        }
+
+        // get file stat
+        std::string file_path = info_log_dir_ + "/" + file_name;
+        struct stat file_st;
+        if (lstat(file_path.c_str(), &file_st) != 0) {
+            // cancel clean if any file stat failed
+            LOG(WARNING) << "[LogCleaner] Stat log file: " << file_path << " fail. Cancel log clean.";
+            return false;
+        }
+
+        if (S_ISLNK(file_st.st_mode)) {
+            // handle symbolic link: keep the link and, if readable, its target
+            VLOG(16) << "[LogCleaner] Reserve symbolic link log: " << file_name;
+            reserved_set.insert(file_name);
+            ssize_t ret = readlink(file_path.c_str(), &path_buf[0], path_buf.size());
+            if (ret < 0 || ret >= path_maxlen) {
+                // unreadable or (possibly) truncated target: nothing to add
+                continue;
+            } else {
+                // reserve link target; readlink() does not NUL-terminate
+                path_buf[ret] = '\0';
+                std::string target_filename = GetFileNameFromPath(&path_buf[0]);
+                VLOG(16) << "[LogCleaner] Reserve link target: " << target_filename
+                         << " for link: " << file_path;
+                reserved_set.insert(target_filename);
+            }
+        } else if (!S_ISREG(file_st.st_mode)) {
+            VLOG(16)  << "[LogCleaner] Reserve not regular file: " << file_name;
+            reserved_set.insert(file_name);
+        } else if (file_st.st_mtime >= clean_time) {
+            VLOG(16)  << "[LogCleaner] Reserve not expire log: " << file_name
+                      << ", mtime: " << file_st.st_mtime << ", clean_time: " << clean_time;
+            reserved_set.insert(file_name);
+        }
+        VLOG(16) << "stat filename: " << file_name
+                 << ", is_symbolic_link: " << S_ISLNK(file_st.st_mode)
+                 << ", is_dir: " << S_ISDIR(file_st.st_mode)
+                 << ", is_regular_file: " << S_ISREG(file_st.st_mode)
+                 << ", last mod time: " << file_st.st_mtime
+                 << ", link number: " << file_st.st_nlink
+                 << ", reserve: " << (reserved_set.find(file_name) != reserved_set.end());
+    }
+
+    // pass 2: remove everything that was not reserved. This runs after pass 1
+    // has finished so that a link target reserved late in pass 1 is still kept.
+    size_t clean_cnt = 0;
+    it = log_file_list.begin();
+    for (; it != log_file_list.end(); ++it) {
+        const std::string &file_name = *it;
+        std::string file_path = info_log_dir_ + "/" + file_name;
+        if (reserved_set.find(file_name) == reserved_set.end()) {
+            LOG(INFO) << "[LogCleaner] log: " << file_path << " will be clean";
+            if (!RemoveLocalFile(file_path)){
+                LOG(WARNING) << "[LogCleaner] log clean fail: " << file_path;
+            } else {
+                ++clean_cnt;
+            }
+        }
+    }
+    LOG(INFO) << "[LogCleaner] Found log: " << log_file_list.size()
+              << ", clean: " << clean_cnt;
+    return true;
+}
+
+// Collects the base names of the files this process currently has open, by
+// resolving the symbolic links found under proc_fd_path_. Only link targets
+// that start with '/' are recorded. Names are added to *opend_logs.
+// Returns false when the path length limit cannot be determined or the fd
+// directory cannot be listed.
+bool LogCleaner::GetCurrentOpendLogs(std::set<std::string>* opend_logs) {
+    const long max_path_len = pathconf(proc_fd_path_.c_str(), _PC_PATH_MAX);
+    if (max_path_len < 0) {
+        LOG(ERROR) << "[LogCleaner] Get Path Max Len Failed";
+        return false;
+    }
+
+    VLOG(16) << "[LogCleaner] Search fd_path: " << proc_fd_path_;
+    std::vector<FileStateInfo> fd_entries;
+    if (!ListCurrentDirWithStat(proc_fd_path_, &fd_entries)) {
+        VLOG(16) << "[LogCleaner] list fd_path: " << proc_fd_path_ << " failed.";
+        return false;
+    }
+
+    // one buffer shared by all readlink() calls
+    std::vector<char> link_target(static_cast<size_t>(max_path_len));
+    for (const FileStateInfo& fd_entry : fd_entries) {
+        if (!S_ISLNK(fd_entry.second.st_mode)) {
+            continue;
+        }
+        const ssize_t len = readlink(fd_entry.first.c_str(),
+                                     link_target.data(), link_target.size());
+        // skip failures, possibly truncated results and non-absolute targets
+        if (len <= 0 || len >= max_path_len || link_target[0] != '/') {
+            continue;
+        }
+        // readlink() does not NUL-terminate
+        link_target[len] = '\0';
+        std::string target_filename = GetFileNameFromPath(link_target.data());
+        VLOG(16) << "[LogCleaner] Reserve log in use: " << target_filename;
+        opend_logs->insert(target_filename);
+    }
+    return true;
+}
+ 
+} // end namespace common
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/common/log/log_cleaner.h b/src/common/log/log_cleaner.h
new file mode 100644
index 000000000..53830a733
--- /dev/null
+++ b/src/common/log/log_cleaner.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ 
+#ifndef TERA_COMMON_LOG_CLEANER_H_
+#define TERA_COMMON_LOG_CLEANER_H_
+
+#include <pthread.h>
+#include <set>
+#include <string>
+
+#include "common/mutex.h"
+#include "common/thread_pool.h"
+
+namespace common {
+ 
+// Periodic cleaner for local log files, used as a process-wide singleton
+// that is started and stopped through StartCleaner() / StopCleaner().
+class LogCleaner {
+private:
+    // Construction and destruction are private because instances are only
+    // created through the singleton accessors declared further down.
+    LogCleaner(const std::string& log_dir,
+               int64_t period_second,
+               int64_t expire_second,
+               ThreadPool* thread_pool);
+    ~LogCleaner();
+    // Non-copyable.
+    LogCleaner(const LogCleaner& other) = delete;
+    LogCleaner& operator=(const LogCleaner& other) = delete;
+
+public:
+    bool CheckOptions() const;
+    bool Start();
+    void Stop();
+    // True while bg_task_id_ holds a positive value.
+    bool IsRunning() const { return bg_task_id_ > 0; }
+
+    // Adds a log-file prefix to log_prefix_list_ while holding mutex_.
+    // An empty prefix is rejected: returns false in that case, true otherwise.
+    bool AddPrefix(const std::string& prefix) {
+        if (prefix.empty()) {
+            return false;
+        }
+        MutexLock l(&mutex_);
+        log_prefix_list_.insert(prefix);
+        return true;
+    }
+
+    // Removes a prefix from log_prefix_list_ while holding mutex_.
+    void RemovePrefix(const std::string& prefix) {
+        MutexLock l(&mutex_);
+        log_prefix_list_.erase(prefix);
+    }
+
+private:
+    // Singleton state.
+    static Mutex inst_init_mutex_;
+    static LogCleaner* singleton_instance_;
+
+    // Returns the singleton instance without starting it; kept for unit tests.
+    static LogCleaner* GetInstance(ThreadPool *thread_pool = NULL);
+
+public:
+    static bool StartCleaner(ThreadPool *thread_pool = NULL);
+    static void StopCleaner();
+
+private:
+    // Creates a private one-thread pool when none was supplied and records
+    // ownership of it. The caller is expected to hold the lock.
+    void NewThreadPool() {
+        if (NULL != thread_pool_) {
+            return;
+        }
+        thread_pool_ = new ThreadPool(1);
+        thread_pool_own_ = true;
+    }
+    // Stops and deletes the pool, but only if this object created it.
+    void DestroyOwnThreadPool() {
+        if (!thread_pool_own_ || NULL == thread_pool_) {
+            return;
+        }
+        thread_pool_->Stop(true);
+        delete thread_pool_;
+        thread_pool_ = NULL;
+        thread_pool_own_ = false;
+    }
+
+    void CleanTaskWrap();
+
+    bool CheckLogPrefix(const std::string& filename) const;
+
+    bool DoCleanLocalLogs();
+
+    bool GetCurrentOpendLogs(std::set<std::string>* opend_logs);
+
+private:
+    ThreadPool* thread_pool_;
+    bool thread_pool_own_;      // true when thread_pool_ was created by NewThreadPool()
+    mutable Mutex mutex_;
+
+    // Options.
+    std::string info_log_dir_;
+    std::set<std::string> log_prefix_list_;
+    int64_t info_log_clean_period_ms_;   // milliseconds
+    int64_t info_log_expire_sec_;        // seconds
+
+    bool stop_;
+    bool bg_exit_;
+    CondVar bg_cond_;
+    const ThreadPool::Task bg_func_;
+    int64_t bg_task_id_;
+
+    std::string proc_fd_path_;
+};
+ 
+} // end namespace common
+ 
+#endif // TERA_COMMON_LOG_CLEANER_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/cache_collector.h b/src/common/metric/cache_collector.h
new file mode 100644
index 000000000..ae415b0d8
--- /dev/null
+++ b/src/common/metric/cache_collector.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMOM_METRIC_CACHE_COLLECTOR_H_
+#define TERA_COMMOM_METRIC_CACHE_COLLECTOR_H_
+ 
+#include <cmath> 
+#include <string>
+ 
+#include "common/metric/collector_report_publisher.h" 
+#include "common/metric/collector.h"
+#include "db/table_cache.h"
+#include "leveldb/cache.h" 
+ 
+namespace tera { 
+ 
+// Selects which statistic a BaseCacheCollector reports from Collect().
+enum class CacheCollectType {
+    kHitRate,   // Collect() returns HitRate()
+    kEntries,   // Collect() returns Entries()
+    kCharge,    // Collect() returns TotalCharge()
+};
+
+// Collector that reports exactly one cache statistic, chosen at construction
+// time. Subclasses provide the three statistics; Collect() dispatches on the
+// configured CacheCollectType.
+class BaseCacheCollector : public Collector {
+public:
+    explicit BaseCacheCollector(CacheCollectType cache_type)
+        : cache_type_(cache_type) {}
+    virtual ~BaseCacheCollector() {}
+
+    // Returns the statistic selected by cache_type_, or 0 when cache_type_
+    // is not one of the known enumerators.
+    int64_t Collect() override {
+        if (cache_type_ == CacheCollectType::kHitRate) {
+            return HitRate();
+        }
+        if (cache_type_ == CacheCollectType::kEntries) {
+            return Entries();
+        }
+        if (cache_type_ == CacheCollectType::kCharge) {
+            return TotalCharge();
+        }
+        return 0;
+    }
+
+protected:
+    // Statistics supplied by the concrete cache collector.
+    virtual int64_t HitRate() = 0;
+    virtual int64_t Entries() = 0;
+    virtual int64_t TotalCharge() = 0;
+
+protected:
+    CacheCollectType cache_type_;
+};
+
+// Reports statistics of a leveldb::Cache. The cache pointer is not owned and
+// may be NULL, in which case every statistic reads as 0.
+class LRUCacheCollector : public BaseCacheCollector {
+public:
+    LRUCacheCollector(leveldb::Cache* cache,
+                      CacheCollectType cache_type):
+        BaseCacheCollector(cache_type),
+        cache_(cache) {}
+
+    virtual ~LRUCacheCollector() {}
+
+protected:
+    // Returns the hit rate as a truncated percentage: 0 when there is no
+    // cache, -1 when the cache reports NaN.
+    // NOTE(review): the meaning of the `true` argument to HitRate() is defined
+    // by leveldb::Cache and is not visible here; confirm before changing it.
+    int64_t HitRate() override {
+        if (cache_ == NULL) {
+            return 0;
+        }
+
+        const double hit_rate = cache_->HitRate(true);
+        // std::isnan is the portable spelling after <cmath>; the original
+        // unqualified isnan and the "100.0d" literal suffix only compile as
+        // library/compiler extensions.
+        return std::isnan(hit_rate) ? -1 : static_cast<int64_t>(hit_rate * 100.0);
+    }
+
+    // Number of cache entries, or 0 when there is no cache.
+    int64_t Entries() override { return cache_ == NULL ? 0 : static_cast<int64_t>(cache_->Entries()); }
+
+    // Total charge reported by the cache, or 0 when there is no cache.
+    int64_t TotalCharge() override { return cache_ == NULL ? 0 : static_cast<int64_t>(cache_->TotalCharge()); }
+private:
+    leveldb::Cache* cache_;
+};
+
+// Reports statistics of a leveldb::TableCache. The cache pointer is not owned
+// and may be NULL, in which case every statistic reads as 0.
+class TableCacheCollector : public BaseCacheCollector {
+public:
+    TableCacheCollector(leveldb::TableCache* cache,
+                        CacheCollectType cache_type):
+        BaseCacheCollector(cache_type),
+        cache_(cache) {}
+
+    virtual ~TableCacheCollector() {}
+
+protected:
+    // Returns the hit rate as a truncated percentage: 0 when there is no
+    // cache, -1 when the cache reports NaN.
+    // NOTE(review): the meaning of the `true` argument to HitRate() is defined
+    // by leveldb::TableCache and is not visible here; confirm before changing it.
+    int64_t HitRate() override {
+        if (cache_ == NULL) {
+            return 0;
+        }
+
+        const double hit_rate = cache_->HitRate(true);
+        // std::isnan is the portable spelling after <cmath>; the original
+        // unqualified isnan and the "100.0d" literal suffix only compile as
+        // library/compiler extensions.
+        return std::isnan(hit_rate) ? -1 : static_cast<int64_t>(hit_rate * 100.0);
+    }
+
+    // Number of table entries, or 0 when there is no cache.
+    int64_t Entries() override { return cache_ == NULL ? 0 : static_cast<int64_t>(cache_->TableEntries()); }
+
+    // Byte size reported by the table cache, or 0 when there is no cache.
+    int64_t TotalCharge() override { return cache_ == NULL ? 0 : static_cast<int64_t>(cache_->ByteSize()); }
+private:
+    leveldb::TableCache* cache_;
+};
+ 
+} // end namespace tera 
+ 
+#endif // TERA_COMMOM_METRIC_CACHE_COLLECTOR_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/collector.h b/src/common/metric/collector.h
new file mode 100644
index 000000000..0b31bb446
--- /dev/null
+++ b/src/common/metric/collector.h
@@ -0,0 +1,15 @@
+#pragma once
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <cstdint>
+#include <memory>
+
+namespace tera{
+// Interface for a source of one metric value.
+class Collector {
+public:
+    virtual ~Collector() {}
+    // Returns an instantaneous value of the metric, which tera uses for log
+    // dumps and other reporting.
+    virtual int64_t Collect() = 0;
+};
+}
diff --git a/src/common/metric/collector_report.h b/src/common/metric/collector_report.h
new file mode 100644
index 000000000..8c453dcaa
--- /dev/null
+++ b/src/common/metric/collector_report.h
@@ -0,0 +1,49 @@
+#pragma once
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <memory>
+#include <string> 
+#include <unordered_map> 
+ 
+#include "common/metric/metric_id.h"
+#include "common/mutex.h" 
+#include "common/metric/collector.h"
+#include "common/metric/subscriber.h"
+ 
+namespace tera { 
+ 
+using CollectorReportMap = std::unordered_map<MetricId, int64_t>;
+
+// Snapshot of collected metric values together with the time it was taken.
+struct CollectorReport {
+    int64_t timestamp_ms;    // timestamp of the report
+    int64_t interval_ms;     // time interval since last report
+
+    // Stamps the report with the current time. interval_ms starts at 0 so a
+    // report that nobody filled in never exposes an indeterminate value.
+    CollectorReport() : timestamp_ms(get_millis()), interval_ms(0) {}
+
+    // Lookup helpers; each returns 0 when the metric is not in the report.
+    int64_t FindMetricValue(const MetricId& metric_id) const {
+        auto iter = report.find(metric_id);
+        return iter == report.end() ? 0 : iter->second;
+    }
+
+    // Looks up a metric identified by name only.
+    int64_t FindMetricValue(const std::string& metric_name) const {
+        return FindMetricValue(MetricId(metric_name));
+    }
+
+    // Looks up a metric identified by name and label string. Also returns 0
+    // when MetricId::ParseFromString() rejects the name/label pair.
+    int64_t FindMetricValue(const std::string& metric_name, const std::string& label_str) const {
+        MetricId metric_id;
+        if (!MetricId::ParseFromString(metric_name, label_str, &metric_id)) {
+            return 0;
+        } else {
+            return FindMetricValue(metric_id);
+        }
+    }
+
+    // metric_id to metric snapshot
+    CollectorReportMap report;
+};
+} // end namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/collector_report_publisher.cc b/src/common/metric/collector_report_publisher.cc
new file mode 100644
index 000000000..620cc4107
--- /dev/null
+++ b/src/common/metric/collector_report_publisher.cc
@@ -0,0 +1,150 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include "common/metric/collector_report_publisher.h" 
+
+#include "glog/logging.h"
+
+#include "common/metric/hardware_collectors.h"
+#include "common/timer.h"
+#include "common/metric/collector.h"
+#include "common/metric/prometheus_subscriber.h"
+ 
+namespace tera {
+
+// Returns the process-wide publisher. The instance is a function-local
+// static, so it is constructed on the first call.
+CollectorReportPublisher& CollectorReportPublisher::GetInstance() {
+    static CollectorReportPublisher instance;
+    return instance;
+}
+
+// Starts with an empty report stamped with the current time and registers the
+// built-in hardware collectors.
+CollectorReportPublisher::CollectorReportPublisher():
+    last_report_timestamp_(get_millis()),
+    last_collector_report_(new CollectorReport) {
+    AddHardwareCollectors();
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+CollectorReportPublisher::~CollectorReportPublisher() {}
+
+// Builds a fresh report holding the current Collect() result of every
+// registered subscriber, keyed by the subscriber's metric id. Runs under
+// mutex_ and logs the time spent at verbosity 12.
+std::shared_ptr<SubscriberReport> CollectorReportPublisher::GetSubscriberReport() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+
+    std::shared_ptr<SubscriberReport> summary = std::make_shared<SubscriberReport>();
+    const int64_t begin_ms = get_millis();
+    for (auto it = subscribers_.begin(); it != subscribers_.end(); ++it) {
+        summary->insert(std::make_pair(it->first, it->second->Collect()));
+    }
+    const int64_t finish_ms = get_millis();
+
+    VLOG(12) << "[Metric] Get Subscriber Summary Cost: " << (finish_ms - begin_ms) << " ms.";
+    return summary;
+}
+
+// Returns the report stored by the most recent Refresh(), or the empty report
+// created by the constructor if Refresh() has not run yet. Does not collect.
+std::shared_ptr<CollectorReport> CollectorReportPublisher::GetCollectorReport() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    return last_collector_report_;
+}
+
+// Polls every registered collector, stores the results as the new "last"
+// report and then hands that report to all subscribers. The report's
+// interval_ms is the distance between this report's timestamp and the
+// previous one. Runs under mutex_.
+void CollectorReportPublisher::Refresh() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+
+    std::shared_ptr<CollectorReport> snapshot = std::make_shared<CollectorReport>();
+    const int64_t begin_ms = snapshot->timestamp_ms;
+    snapshot->interval_ms = begin_ms - last_report_timestamp_;
+
+    for (auto it = collectors_.begin(); it != collectors_.end(); ++it) {
+        const int64_t sample = it->second->Collect();
+        snapshot->report[it->first] = sample;
+    }
+
+    last_report_timestamp_ = begin_ms;
+    const int64_t finish_ms = get_millis();
+    VLOG(12) << "[Metric] Refresh Collectors Cost: " << (finish_ms - begin_ms) << " ms.";
+    last_collector_report_ = snapshot;
+    NotifySubscribers();
+}
+
+// Registers a collector under metric_id and creates one PrometheusSubscriber
+// per entry of type_list for it.
+//
+// Returns false when metric_id is invalid, metric_collector is empty, or a
+// collector is already registered for metric_id. A subscriber that cannot be
+// added is only logged; the call still returns true.
+bool CollectorReportPublisher::AddCollector(const MetricId& metric_id,
+                                          std::unique_ptr<Collector>&& metric_collector,
+                                          SubscriberTypeList type_list) {
+    if (!(metric_id.IsValid() && metric_collector)) {
+        return false;
+    }
+
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    const bool inserted =
+        collectors_.insert(std::make_pair(metric_id, std::move(metric_collector))).second;
+    if (!inserted) {
+        return false;
+    }
+
+    for (auto type_it = type_list.begin(); type_it != type_list.end(); ++type_it) {
+        std::unique_ptr<Subscriber> subscriber(new PrometheusSubscriber(metric_id, *type_it));
+        if (AddSubscriber(std::move(subscriber))) {
+            continue;
+        }
+        LOG(ERROR) << "[METRIC] Add Subscriber For " << metric_id.ToString() << " Failed!";
+    }
+
+    return true;
+}
+
+// Takes ownership of a subscriber and files it under its own metric id.
+// Several subscribers may share one metric id. Returns false, without taking
+// ownership, when the pointer is empty or its metric id is invalid.
+bool CollectorReportPublisher::AddSubscriber(std::unique_ptr<Subscriber>&& prometheus_subscriber_ptr) {
+    if (!prometheus_subscriber_ptr) {
+        return false;
+    }
+    if (!prometheus_subscriber_ptr->GetMetricId().IsValid()) {
+        return false;
+    }
+
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    // Read the key into a local before the pointer is moved from.
+    MetricId key = prometheus_subscriber_ptr->GetMetricId();
+    subscribers_.insert(std::make_pair(std::move(key), std::move(prometheus_subscriber_ptr)));
+
+    return true;
+}
+
+// Passes the current last_collector_report_ to OnUpdate() of every
+// subscriber. Takes mutex_ itself; the mutex is recursive, so calling this
+// from Refresh(), which already holds it, is fine.
+void CollectorReportPublisher::NotifySubscribers() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    for (auto& subscriber_pair : subscribers_) {
+        subscriber_pair.second->OnUpdate(last_collector_report_);
+    }
+}
+
+// Tells whether a collector is currently registered under metric_id.
+bool CollectorReportPublisher::HasCollector(const MetricId& metric_id) const {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    return collectors_.count(metric_id) > 0;
+}
+
+// Removes every subscriber filed under metric_id and then the collector
+// itself. Returns true only if a collector was actually erased.
+bool CollectorReportPublisher::DeleteCollector(const MetricId& metric_id) {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    // Re-locks the recursive mutex_; result intentionally ignored.
+    DeleteSubscriber(metric_id);
+    return collectors_.erase(metric_id) > 0;
+}
+
+// Erases all subscribers filed under metric_id (the container is a multimap,
+// so there may be several). Returns true if at least one was removed.
+bool CollectorReportPublisher::DeleteSubscriber(const MetricId& metric_id) {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    return subscribers_.erase(metric_id) > 0;
+}
+
+// Removes every registered subscriber; collectors are left untouched.
+void CollectorReportPublisher::DeleteSubscribers() {
+    // Every other accessor of subscribers_ holds mutex_; clearing the map
+    // without it would race with Refresh() and GetSubscriberReport().
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    subscribers_.clear();
+}
+
+// Registers the built-in CPU, memory and network collectors. CPU and memory
+// use AddCollector()'s default subscriber type list; both network collectors
+// are registered with SubscriberType::MAX. AddCollector()'s return values
+// are not checked.
+void CollectorReportPublisher::AddHardwareCollectors() {
+    // register hardware metrics
+    AddCollector(MetricId(kInstCpuMetricName), std::unique_ptr<Collector>(new CpuUsageCollector()));
+    AddCollector(MetricId(kInstMemMetricName), std::unique_ptr<Collector>(new MemUsageCollector()));
+    
+    AddCollector(MetricId(kInstNetRXMetricName), 
+                      std::unique_ptr<Collector>(new NetUsageCollector(RECEIVE)), 
+                      {SubscriberType::MAX});
+
+    AddCollector(MetricId(kInstNetTXMetricName), 
+                      std::unique_ptr<Collector>(new NetUsageCollector(TRANSMIT)), 
+                      {SubscriberType::MAX});
+}
+} // end namespace tera
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/collector_report_publisher.h b/src/common/metric/collector_report_publisher.h
new file mode 100644
index 000000000..1290f2000
--- /dev/null
+++ b/src/common/metric/collector_report_publisher.h
@@ -0,0 +1,162 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_METRICS_H_
+#define TERA_COMMON_METRIC_METRICS_H_
+
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <mutex>
+#include <initializer_list>
+ 
+#include "common/metric/metric_id.h" 
+#include "common/metric/collector_report.h"
+#include "common/metric/collector.h"
+#include "common/metric/subscriber.h"
+
+namespace tera {
+// Base class for metric value collector 
+
+using SubscriberTypeList = std::initializer_list<SubscriberType>;
+
+// Singleton registry of metric collectors and subscribers. Refresh() polls
+// the collectors into a CollectorReport and forwards it to the subscribers.
+class CollectorReportPublisher {
+private:
+    // Private because the class is a singleton; use GetInstance().
+    CollectorReportPublisher();
+    ~CollectorReportPublisher();
+    
+    // Non-copyable.
+    CollectorReportPublisher(const CollectorReportPublisher&) = delete;
+    CollectorReportPublisher& operator = (const CollectorReportPublisher&) = delete;
+
+public:
+    static CollectorReportPublisher& GetInstance();
+    
+    /// Collect a value from every collector, store the result as the latest
+    /// report and notify all subscribers with it.
+    void Refresh();
+    /// Return the report stored by the latest Refresh(); does not collect.
+    std::shared_ptr<CollectorReport> GetCollectorReport();
+    /// Build a new report from the current Collect() value of every subscriber.
+    std::shared_ptr<SubscriberReport> GetSubscriberReport();
+    
+    /// Add a collector under the given metric_id and create one subscriber
+    /// per entry of type_list for it.
+    /// The collector must be passed as an rvalue std::unique_ptr<Collector>.
+    /// Returns true on success;
+    /// returns false if an argument is invalid or metric_id is already registered.
+    bool AddCollector(const MetricId& metric_id, 
+                            std::unique_ptr<Collector>&& metric_collector,
+                            SubscriberTypeList type_list = {SubscriberType::LATEST});
+
+
+    /// Whether a collector has been added for metric_id.
+    bool HasCollector(const MetricId& metric_id) const;
+    /// Delete a collector together with all subscribers of the same metric_id.
+    /// Returns true if a collector was removed.
+    bool DeleteCollector(const MetricId& metric_id);
+
+
+    /// Add a subscriber; it is filed under its own metric id.
+    /// Several subscribers of different types can be registered for the same metric id.
+    /// Returns false if the pointer is empty or its metric id is invalid.
+    bool AddSubscriber(std::unique_ptr<Subscriber>&& subscriber);
+    /// Delete all subscribers of the given metric id; true if any was removed.
+    bool DeleteSubscriber(const MetricId& metric_id);
+    /// Delete every subscriber.
+    void DeleteSubscribers();
+
+private:                    
+    void NotifySubscribers();
+    void AddHardwareCollectors();
+    
+private:
+    // Recursive because locked members call other locked members.
+    mutable std::recursive_mutex mutex_;
+    
+    // At most one collector per metric id.
+    using CollectorMap = std::unordered_map<MetricId, std::unique_ptr<Collector>>;
+    
+    // Any number of subscribers per metric id.
+    using SubscriberMap = std::unordered_multimap<MetricId, std::unique_ptr<Subscriber>>;
+    CollectorMap collectors_;
+    SubscriberMap subscribers_;
+
+    // Timestamp (ms) of the report produced by the previous Refresh().
+    int64_t last_report_timestamp_;
+
+    // Report produced by the latest Refresh().
+    std::shared_ptr<CollectorReport> last_collector_report_;
+};
+
+// Scoped registration of a collector: registers it with the publisher on
+// construction and, if that succeeded, removes it again on destruction.
+class AutoCollectorRegister {
+public:
+    // Registers `collector` under an already-built metric id.
+    AutoCollectorRegister(const MetricId& id,
+                          std::unique_ptr<Collector>&& collector,
+                          SubscriberTypeList type_list = {SubscriberType::LATEST})
+        : registered_(false),
+          id_(id) {
+        registered_ = Register(std::move(collector), type_list);
+    }
+
+    // Registers `collector` under a metric that has a name and no labels.
+    // Throws std::invalid_argument when the name is empty.
+    AutoCollectorRegister(const std::string& name,
+                          std::unique_ptr<Collector>&& collector,
+                          SubscriberTypeList type_list = {SubscriberType::LATEST})
+        : registered_(false),
+          id_(name) {
+        if (name.empty()) {
+            throw std::invalid_argument("name");
+        }
+        registered_ = Register(std::move(collector), type_list);
+    }
+
+    // Registers `collector` under a metric with a name and labels.
+    // label_str format: k1:v1,k2:v2,...
+    // It can be built with LabelStringBuilder().Append("k1", "v1").Append("k2","v2").ToString();
+    // The id is parsed by MetricId::ParseFromStringWithThrow(), which by its
+    // name is expected to throw on malformed input.
+    AutoCollectorRegister(const std::string& name,
+                          const std::string& label_str,
+                          std::unique_ptr<Collector>&& collector,
+                          SubscriberTypeList type_list = {SubscriberType::LATEST})
+        : registered_(false) {
+        MetricId::ParseFromStringWithThrow(name, label_str, &id_);
+        registered_ = Register(std::move(collector), type_list);
+    }
+
+    // Unregisters the collector, but only if this object registered it.
+    ~AutoCollectorRegister() {
+        if (!registered_) {
+            return;
+        }
+        CollectorReportPublisher::GetInstance().DeleteCollector(id_);
+    }
+
+    const MetricId& GetId() const { return id_; }
+
+    bool IsRegistered() const { return registered_; }
+
+private:
+    // Hands the collector to the singleton publisher under id_.
+    bool Register(std::unique_ptr<Collector>&& collector, SubscriberTypeList type_list) {
+        return CollectorReportPublisher::GetInstance().AddCollector(id_, std::move(collector), type_list);
+    }
+
+private:
+    bool registered_;
+    MetricId id_;
+};
+
+
+// Scoped registration of a subscriber: adds it to the publisher on
+// construction and, if that succeeded, deletes the subscribers of its metric
+// id on destruction.
+class AutoSubscriberRegister {
+public:
+    // An empty pointer is accepted and simply leaves the object unregistered.
+    AutoSubscriberRegister(std::unique_ptr<Subscriber>&& subscriber_ptr)
+        : registered_(false) {
+        if (!subscriber_ptr) {
+            return;
+        }
+        // Remember the id first; the pointer is moved away by AddSubscriber().
+        metric_id_ = subscriber_ptr->GetMetricId();
+        registered_ = CollectorReportPublisher::GetInstance().AddSubscriber(std::move(subscriber_ptr));
+    }
+    ~AutoSubscriberRegister() {
+        if (!registered_) {
+            return;
+        }
+        CollectorReportPublisher::GetInstance().DeleteSubscriber(metric_id_);
+    }
+private:
+    bool registered_;
+    MetricId metric_id_;
+};
+} // end namespace tera
+ 
+#endif // TERA_COMMON_METRIC_METRICS_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/counter_collector.h b/src/common/metric/counter_collector.h
new file mode 100644
index 000000000..1a5ea981b
--- /dev/null
+++ b/src/common/metric/counter_collector.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_COUNTER_COLLECTOR_H_
+#define TERA_COMMON_METRIC_COUNTER_COLLECTOR_H_
+
+#include "common/metric/collector.h"
+#include "common/counter.h"
+ 
+namespace tera { 
+
+// Collector backed by a Counter that it does not own.
+class CounterCollector : public Collector {
+public:
+    /// When is_periodic is true the counter is cleared by every Collect().
+    /// That is the usual mode; pass false for instantaneous values such as
+    /// read_pending_count or scan_pending_count, which must not be reset
+    /// by collecting them.
+    explicit CounterCollector(Counter* counter,
+                              bool is_periodic = true)
+        : counter_(counter),
+          is_periodic_(is_periodic) {}
+
+    ~CounterCollector() override {}
+
+    /// Returns -1 without a counter; otherwise the result of Clear() in
+    /// periodic mode or of Get() in instantaneous mode.
+    int64_t Collect() override {
+        if (counter_ == NULL) {
+            return -1;
+        }
+        if (is_periodic_) {
+            return counter_->Clear();
+        }
+        return counter_->Get();
+    }
+private:
+    Counter* const counter_;
+    const bool is_periodic_;
+};
+} // end namespace tera
+ 
+#endif // TERA_COMMON_METRIC_COUNTER_COLLECTOR_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/hardware_collectors.cc b/src/common/metric/hardware_collectors.cc
new file mode 100644
index 000000000..cddfd6ee6
--- /dev/null
+++ b/src/common/metric/hardware_collectors.cc
@@ -0,0 +1,250 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+#include <algorithm>
+#include <iostream>
+#include <sstream>
+#include <stdio.h>  
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+ 
+#include "common/timer.h"
+
+#include "common/metric/hardware_collectors.h"
+
+DECLARE_int64(tera_hardware_collect_period_second);
+ 
+namespace tera { 
+
+// Returns the number of CPU cores, never less than 1.
+//
+// Uses sysconf(_SC_NPROCESSORS_ONLN) when that macro is available; otherwise
+// counts the per-core "cpuN" lines of /proc/stat.
+static uint32_t GetCpuCount() {
+#if defined(_SC_NPROCESSORS_ONLN)
+    // sysconf() reports errors as -1; do not let that wrap to a huge count.
+    const long online = sysconf(_SC_NPROCESSORS_ONLN);
+    return online > 0 ? static_cast<uint32_t>(online) : 1;
+#else
+    FILE *fp = fopen("/proc/stat", "r");
+    if (fp == NULL) {
+        LOG(ERROR) << "[HardWare Metric] open /proc/stat failed.";
+        return 1;
+    }
+    static const size_t kLineMaxLen = 256; // enough in here
+    char line[kLineMaxLen];
+    uint32_t cores = 0;
+    // fgets() stops cleanly at EOF and never writes past the buffer.
+    while (fgets(line, sizeof(line), fp) != NULL) {
+        // Per-core lines start with "cpu<digit>"; the aggregate line starts
+        // with "cpu " and is not counted.
+        if (line[0] == 'c' && line[1] == 'p' && line[2] == 'u' &&
+            line[3] >= '0' && line[3] <= '9') {
+            ++cores;
+        }
+    }
+    fclose(fp);
+    return std::max<uint32_t>(cores, 1);
+#endif
+}
+
+// Reads the 14th and 15th whitespace-separated fields of the stat file at
+// stat_path and stores their sum in *tick. Per the original note, these are
+// the ticks (jiffies) the process has been scheduled in user and kernel mode.
+//
+// Returns false, leaving *tick untouched, when tick is NULL, the file cannot
+// be opened, or the two fields cannot be parsed.
+// NOTE(review): fields are split on whitespace, so a command name containing
+// spaces would shift the columns; confirm this cannot happen for tera.
+static bool ProcessCpuTick(const std::string& stat_path, int64_t* tick) {
+    if (tick == NULL) {
+        return false;
+    }
+    FILE *fp = fopen(stat_path.c_str(), "r");
+    if (fp == NULL) {
+        LOG(ERROR) << "[HardWare Metric] open " << stat_path << " failed.";
+        return false;
+    }
+    long long user_ticks = 0;
+    long long kernel_ticks = 0;
+    const int matched = fscanf(fp, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %lld %lld",
+                               &user_ticks, &kernel_ticks);
+    fclose(fp);
+    if (matched < 2) {
+        LOG(ERROR) << "[HardWare Metric] get cpu tick from " << stat_path << " failed.";
+        return false;
+    }
+    *tick = user_ticks + kernel_ticks;
+    return true;
+}
+
+// Captures the process id, core count, clock tick rate and the path of this
+// process's /proc stat file, and records the current time as the start of the
+// first measuring interval.
+CpuUsageCollector::CpuUsageCollector():
+    pid_(getpid()),
+    cpu_core_num_(GetCpuCount()),
+    cpu_hertz_(sysconf(_SC_CLK_TCK)),
+    stat_path_(std::string("/proc/") + std::to_string(pid_) + "/stat"),
+    last_check_time_ms_(get_millis()),
+    last_tick_total_(0),
+    cpu_usage_(0) {
+    // Seed the tick baseline, as NetInfoChecker does for its byte totals.
+    // Starting from 0 would charge every tick since process start to the
+    // first interval. On failure the baseline simply stays 0.
+    ProcessCpuTick(stat_path_, &last_tick_total_);
+}
+
+CpuUsageCollector::~CpuUsageCollector() {}
+
+// Returns the CPU usage percentage. The cached value is served while the last
+// measurement is younger than FLAGS_tera_hardware_collect_period_second;
+// a non-positive period disables the cache and measures on every call.
+int64_t CpuUsageCollector::Collect() {
+    const int64_t now_ms = get_millis();
+    const int64_t period_ms = FLAGS_tera_hardware_collect_period_second * 1000;
+    const bool cache_is_fresh = period_ms > 0 && now_ms < last_check_time_ms_ + period_ms;
+    if (cache_is_fresh) {
+        return cpu_usage_;
+    }
+    return CheckCpuUsage(now_ms, false);
+}
+
+// Measures CPU usage since the previous check and caches it.
+//
+// cur_ts      current time in milliseconds.
+// is_irix_on  when true the result is relative to a single core; when false
+//             the available ticks are multiplied by the core count, so the
+//             result is relative to the whole machine.
+//
+// Returns the usage percentage truncated to an integer and limited to the
+// range [0, 99.9]. Returns 0 if the process tick count cannot be read, and
+// the previously cached value if no measurable time has elapsed.
+int64_t CpuUsageCollector::CheckCpuUsage(int64_t cur_ts, bool is_irix_on) {
+    int64_t new_tick_total = 0;
+    if (!ProcessCpuTick(stat_path_, &new_tick_total)) {
+        // read proc file failed.
+        return 0;
+    }
+
+    const float interval_sec = static_cast<float>(cur_ts - last_check_time_ms_) / 1000.0f;
+    // number of ticks that were available during the interval
+    float interval_total_ticks = static_cast<float>(cpu_hertz_) * interval_sec;
+    if (!is_irix_on) {
+        interval_total_ticks *= cpu_core_num_;
+    }
+    if (!(interval_total_ticks > 0.0f)) {
+        // Zero or negative interval (same millisecond, clock step, invalid
+        // tick rate): dividing would give inf/NaN, and converting that to an
+        // integer is undefined. Keep the baseline and the cached value.
+        return cpu_usage_;
+    }
+
+    float usage_percentage =
+        static_cast<float>(new_tick_total - last_tick_total_) * 100.0f / interval_total_ticks;
+    usage_percentage = std::max(0.0f, std::min(usage_percentage, 99.9f));
+
+    // update
+    last_tick_total_ = new_tick_total;
+    cpu_usage_ = static_cast<int64_t>(usage_percentage);
+    last_check_time_ms_ = cur_ts;
+    VLOG(15) << "[Hardware Metric] %CPU: " << usage_percentage;
+    return cpu_usage_;
+}
+
+// Captures the process id and the path of this process's /proc statm file,
+// and records the current time as the last check time. The cached memory
+// usage starts at 0.
+MemUsageCollector::MemUsageCollector(): 
+    pid_(getpid()),
+    stat_path_(std::string("/proc/") + std::to_string(pid_) + "/statm"),
+    last_check_time_ms_(get_millis()),
+    mem_usage_(0) {}
+
+
+MemUsageCollector::~MemUsageCollector() {}
+
+// Returns the memory usage in bytes. The cached value is served while the
+// last measurement is younger than FLAGS_tera_hardware_collect_period_second;
+// a non-positive period disables the cache and measures on every call.
+int64_t MemUsageCollector::Collect() {
+    const int64_t now_ms = get_millis();
+    const int64_t period_ms = FLAGS_tera_hardware_collect_period_second * 1000;
+    const bool cache_is_fresh = period_ms > 0 && now_ms < last_check_time_ms_ + period_ms;
+    if (cache_is_fresh) {
+        return mem_usage_;
+    }
+    return CheckMemUsage(now_ms);
+}
+
+// Reads the second field of the statm file (a page count; the resident set
+// size according to proc(5)), converts it to bytes and caches the result.
+//
+// cur_ts  current time in milliseconds, stored as the new last check time.
+//
+// Returns the memory usage in bytes, or 0 without touching the cache when
+// the file cannot be opened or parsed.
+int64_t MemUsageCollector::CheckMemUsage(int64_t cur_ts) {
+    FILE* stat_file = fopen(stat_path_.c_str(), "r");
+    if (stat_file == NULL) {
+        LOG(ERROR) << "[Hardware Metric] open " << stat_path_ << " failed.";
+        return 0;
+    }
+
+    // long long with %lld is correct wherever int64_t is not plain long;
+    // %*s skips the first field whatever its magnitude.
+    long long mem_pages = 0;
+    const int matched = fscanf(stat_file, "%*s %lld", &mem_pages);
+    fclose(stat_file);
+    if (matched < 1) {
+        LOG(ERROR) << "[Hardware Metric] parse " << stat_path_ << " failed.";
+        return 0;
+    }
+
+    // Ask the system for the page size; fall back to the 4 KiB that used to
+    // be hard-coded if it cannot be determined.
+    long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size <= 0) {
+        page_size = 4 * 1024;
+    }
+
+    mem_usage_ = static_cast<int64_t>(mem_pages) * page_size;
+    last_check_time_ms_ = cur_ts;
+    VLOG(15) << "[Hardware Metric] Memory: " << mem_usage_;
+    return mem_usage_;
+}
+
+NetUsageCollector::NetInfoChecker NetUsageCollector::net_info_checker_;
+
+// Remembers which direction (receive or transmit) this collector reports.
+// The measurement state itself lives in the shared static net_info_checker_.
+NetUsageCollector::NetUsageCollector(NetUsageType n_type): 
+    net_usage_type_(n_type) {}
+    
+NetUsageCollector::~NetUsageCollector() {}
+
+// Returns the receive or transmit rate, depending on net_usage_type_.
+//
+// The shared checker's cached rates are served while its last measurement is
+// younger than FLAGS_tera_hardware_collect_period_second. Otherwise a new
+// measurement is taken and the requested direction is returned, or 0 if the
+// measurement fails.
+int64_t NetUsageCollector::Collect() {
+    const int64_t now_ms = get_millis();
+    const int64_t period_ms = FLAGS_tera_hardware_collect_period_second * 1000;
+    const bool want_rx = (net_usage_type_ == RECEIVE);
+
+    if (period_ms > 0 && now_ms < net_info_checker_.last_check_time_ms_ + period_ms) {
+        return want_rx ? net_info_checker_.net_rx_usage_ : net_info_checker_.net_tx_usage_;
+    }
+
+    // Only the requested direction gets an output slot; the other one is NULL.
+    int64_t value = 0;
+    net_info_checker_.CheckNetUsage(now_ms,
+                                    want_rx ? &value : NULL,
+                                    want_rx ? NULL : &value);
+    return value;
+}
+
+// Captures the process id and the path of this process's /proc net/dev file,
+// records the current time as the last check time, and seeds the rx/tx byte
+// totals with one initial read. The result of that read is not checked; the
+// totals stay 0 if it fails.
+NetUsageCollector::NetInfoChecker::NetInfoChecker()
+    : pid_(getpid()), 
+      stat_path_(std::string("/proc/") + std::to_string(pid_) + "/net/dev"),
+      last_check_time_ms_(get_millis()),
+      last_rx_total_(0),
+      last_tx_total_(0),
+      net_rx_usage_(0),
+      net_tx_usage_(0) {
+    GetCurrentTotal(&last_rx_total_, &last_tx_total_);
+}
+
+// Reads the cumulative received/transmitted byte counters of one network
+// interface from the net/dev file at stat_path_.
+//
+// The file is parsed line by line. Lines without ':' (the two header lines)
+// and the loopback interface "lo" are skipped. The first remaining interface
+// whose rx and tx counters are both non-zero is reported; if there is none,
+// the first remaining interface that could be parsed is reported instead.
+//
+// Unlike the previous fixed-offset seek, this works for any header or line
+// width, cannot spin at end of file, and compares the counter values rather
+// than the output pointers.
+//
+// Returns false, leaving the outputs untouched, when an output pointer is
+// NULL, the file cannot be opened, or no interface line could be parsed.
+bool NetUsageCollector::NetInfoChecker::GetCurrentTotal(int64_t *rx_total, int64_t *tx_total) {
+    if (rx_total == NULL || tx_total == NULL) {
+        return false;
+    }
+    FILE* stat_file = fopen(stat_path_.c_str(), "r");
+    if (stat_file == NULL) {
+        LOG(ERROR) << "[Hardware Metric] open " << stat_path_ << " failed.";
+        return false;
+    }
+
+    static const size_t kLineMaxLen = 512;
+    char line[kLineMaxLen];
+    bool parsed = false;
+    while (fgets(line, sizeof(line), stat_file) != NULL) {
+        // interface name: leading blanks, then everything up to ':'
+        const char* name = line;
+        while (*name == ' ' || *name == '\t') {
+            ++name;
+        }
+        const char* colon = name;
+        while (*colon != '\0' && *colon != ':') {
+            ++colon;
+        }
+        if (*colon != ':') {
+            continue;  // header line
+        }
+        if (colon - name == 2 && name[0] == 'l' && name[1] == 'o') {
+            continue;  // loopback
+        }
+
+        // After the colon: rx bytes, seven more rx columns, then tx bytes.
+        long long rx = 0;
+        long long tx = 0;
+        if (sscanf(colon + 1, "%lld %*s %*s %*s %*s %*s %*s %*s %lld", &rx, &tx) < 2) {
+            continue;
+        }
+
+        const bool active = rx > 0 && tx > 0;
+        if (!parsed || active) {
+            *rx_total = rx;
+            *tx_total = tx;
+            parsed = true;
+        }
+        if (active) {
+            break;
+        }
+    }
+    fclose(stat_file);
+
+    return parsed;
+}
+
+// Measures the receive and transmit rates (bytes per second) since the
+// previous check, caches both, and copies them to whichever output pointers
+// are non-NULL.
+//
+// cur_ts    current time in milliseconds.
+// rx_usage  optional output for the receive rate.
+// tx_usage  optional output for the transmit rate.
+//
+// Returns false, leaving the outputs untouched, when the byte totals cannot
+// be read. When no time has elapsed since the previous check, the cached
+// rates are returned unchanged.
+bool NetUsageCollector::NetInfoChecker::CheckNetUsage(int64_t cur_ts, int64_t* rx_usage, int64_t *tx_usage) {
+    int64_t new_rx_total = 0;
+    int64_t new_tx_total = 0;
+
+    if (!GetCurrentTotal(&new_rx_total, &new_tx_total)) {
+        return false;
+    }
+    const int64_t interval_ms = cur_ts - last_check_time_ms_;
+    if (interval_ms > 0) {
+        // update
+        net_rx_usage_ = (new_rx_total - last_rx_total_) * 1000 / interval_ms;
+        net_tx_usage_ = (new_tx_total - last_tx_total_) * 1000 / interval_ms;
+        last_rx_total_ = new_rx_total;
+        last_tx_total_ = new_tx_total;
+        last_check_time_ms_ = cur_ts;
+    }
+    // else: two checks in the same millisecond (or a clock step backwards).
+    // Dividing by a zero interval would crash the process, so the baseline
+    // and cached rates are left as they are.
+
+    if (rx_usage) {
+        *rx_usage = net_rx_usage_;
+    }
+
+    if (tx_usage) {
+        *tx_usage = net_tx_usage_;
+    }
+
+    VLOG(15) << "[Hardware Metric] Network RX/TX: " << last_rx_total_ << " / " << last_tx_total_;
+    return true;
+}
+ 
+} // end namespace tera 
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/hardware_collectors.h b/src/common/metric/hardware_collectors.h
new file mode 100644
index 000000000..be04e4165
--- /dev/null
+++ b/src/common/metric/hardware_collectors.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_HARDWARE_METRICS_H_
+#define TERA_COMMON_METRIC_HARDWARE_METRICS_H_
+ 
+#include <string>
+
+#include "common/metric/collector_report_publisher.h"
+#include "common/metric/collector.h"
+ 
+namespace tera { 
+
+const char* const kInstCpuMetricName = "tera_instance_cpu_usage_percent";
+const char* const kInstMemMetricName = "tera_instance_mem_usage_bytes";
+const char* const kInstNetRXMetricName = "tera_instance_net_receive_bytes";
+const char* const kInstNetTXMetricName = "tera_instance_net_transmit_bytes";
+
+// Collector for the CPU usage percentage of the current process, computed
+// from the tick counters in the process's /proc stat file.
+class CpuUsageCollector : public Collector {
+public:
+    CpuUsageCollector();
+    virtual ~CpuUsageCollector();
+    
+    // Returns the usage percentage; a cached value is served between
+    // measurements.
+    virtual int64_t Collect();
+private:
+    // Takes a new measurement at time cur_ts (ms). With is_irix_on false the
+    // result is scaled by the number of cores.
+    int64_t CheckCpuUsage(int64_t cur_ts, bool is_irix_on);
+
+private:
+    // proc info
+    int    pid_;
+    uint32_t cpu_core_num_;
+    int64_t cpu_hertz_;         // sysconf(_SC_CLK_TCK)
+    std::string stat_path_;     // /proc/<pid>/stat
+    
+    // last check info
+    int64_t last_check_time_ms_;
+    int64_t last_tick_total_; // cpu total ticks at last check
+    int64_t cpu_usage_;       // (new_tick_total - last_tick_total_) / (total ticks in interval)
+};
+
+// Collector for the memory usage of the current process in bytes, read from
+// the process's /proc statm file.
+class MemUsageCollector : public Collector {
+public:
+    MemUsageCollector();
+    virtual ~MemUsageCollector();
+    
+    // Returns the usage in bytes; a cached value is served between
+    // measurements.
+    virtual int64_t Collect();
+private:
+    // Takes a new measurement and records cur_ts (ms) as the check time.
+    int64_t CheckMemUsage(int64_t cur_ts);
+    
+private:
+    // proc info
+    int pid_;
+    std::string stat_path_;     // /proc/<pid>/statm
+    
+    // last check info
+    int64_t last_check_time_ms_;
+    int64_t mem_usage_;         // cached result of the last measurement
+};
+
+// Direction of network traffic reported by a NetUsageCollector.
+enum NetUsageType {
+    RECEIVE,    // net_rx
+    TRANSMIT,   // net_tx
+};
+
+// Collector for the network receive or transmit rate. All instances share
+// one static NetInfoChecker, so the receive and transmit collectors read
+// from the same measurement.
+class NetUsageCollector : public Collector {
+public:
+    explicit NetUsageCollector(NetUsageType n_type);
+    virtual ~NetUsageCollector();
+    
+    // Returns the rate for this collector's direction; a cached value is
+    // served between measurements.
+    virtual int64_t Collect();
+private:
+    // Measurement state shared by all NetUsageCollector instances.
+    struct NetInfoChecker {
+        // proc info
+        int pid_;
+        std::string stat_path_;  // /proc/<pid>/net/dev
+        
+        // last check info
+        int64_t last_check_time_ms_;
+        int64_t last_rx_total_;  // total rx bytes at last check
+        int64_t last_tx_total_;  // total tx bytes at last check
+        
+        // metric value cache
+        int64_t net_rx_usage_;  // (new_rx_total - last_rx_total_) / check_interval
+        int64_t net_tx_usage_;  // (new_tx_total - last_tx_total_) / check_interval
+        
+        NetInfoChecker();
+        
+        // Reads the current rx and tx byte totals into the two outputs.
+        bool GetCurrentTotal(int64_t*, int64_t*);
+        // Takes a new measurement; either output pointer may be NULL.
+        bool CheckNetUsage(int64_t cur_ts, int64_t* rx_usage, int64_t *tx_usage);
+    };
+    
+    static NetInfoChecker net_info_checker_;
+    
+private:
+    NetUsageType net_usage_type_;
+};
+ 
+} // end namespace tera 
+ 
+#endif // TERA_COMMON_METRIC_HARDWARE_METRICS_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/metric_counter.h b/src/common/metric/metric_counter.h
new file mode 100644
index 000000000..55b4c59fe
--- /dev/null
+++ b/src/common/metric/metric_counter.h
@@ -0,0 +1,93 @@
+#pragma once
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <stdexcept>
+#include <string>
+
+#include "common/metric/collector_report_publisher.h"
+#include "common/metric/counter_collector.h"
+#include "common/counter.h"
+
+namespace tera{
+class MetricCounter : public Counter {
+public:
+    // Creates a label-less metric and registers a CounterCollector for it with the
+    // CollectorReportPublisher singleton; throws std::invalid_argument if `name` is empty.
+    explicit MetricCounter(const std::string& name,
+                           SubscriberTypeList type_list = {SubscriberType::LATEST},
+                           bool is_periodic = true)
+        : Counter(),
+          registered_(false),
+          metric_id_(name),
+          type_list_(type_list),
+          is_periodic_(is_periodic) {
+        if (name.empty()) {
+            // per the original author: the exception is meant to end the process with a coredump
+            throw std::invalid_argument("metric name is empty");
+        }
+        registered_ = RegisterCollector();
+    }
+
+    // Creates a metric identified by `name` plus labels.
+    // label_str format: k1:v1,k2:v2,... which can be built with
+    //     LabelStringBuilder().Append("k1", "v1").Append("k2", "v2").ToString();
+    // MetricId::ParseFromStringWithThrow throws std::invalid_argument for a
+    // malformed name or label_str; in that case nothing is registered.
+    MetricCounter(const std::string& name,
+                  const std::string& label_str,
+                  SubscriberTypeList type_list = {SubscriberType::LATEST},
+                  bool is_periodic = true)
+        : Counter(),
+          registered_(false),
+          type_list_(type_list),
+          is_periodic_(is_periodic) {
+        MetricId::ParseFromStringWithThrow(name, label_str, &metric_id_);
+        registered_ = RegisterCollector();
+    }
+
+    // Takes over `counter`'s id, settings and value. The collector registered for `counter`
+    // was created with a pointer to it, so it is deleted before one bound to `this` is added.
+    MetricCounter(MetricCounter&& counter) {
+        if (counter.registered_) {
+            CollectorReportPublisher::GetInstance().DeleteCollector(counter.metric_id_);
+            counter.registered_ = false;
+        }
+        metric_id_ = counter.metric_id_;
+        type_list_ = counter.type_list_;
+        is_periodic_ = counter.is_periodic_;
+        Set(counter.Get());
+        registered_ = RegisterCollector();
+    }
+
+    // Deletes the collector from the publisher if one was registered.
+    virtual ~MetricCounter() {
+        if (!registered_) {
+            return;
+        }
+        CollectorReportPublisher::GetInstance().DeleteCollector(metric_id_);
+    }
+
+    // Result of the AddCollector call made during construction.
+    bool IsRegistered() const { return registered_; }
+
+    // Copying is disabled.
+    MetricCounter(const MetricCounter&) = delete;
+    MetricCounter& operator=(const MetricCounter&) = delete;
+
+private:
+    // Registers a CounterCollector pointing at this counter; returns AddCollector's result.
+    bool RegisterCollector() {
+        return CollectorReportPublisher::GetInstance().AddCollector(
+            metric_id_,
+            std::unique_ptr<Collector>(new CounterCollector(this, is_periodic_)),
+            type_list_);
+    }
+
+    bool registered_;
+    MetricId metric_id_;
+    SubscriberTypeList type_list_;
+    bool is_periodic_;
+};
+}
diff --git a/src/common/metric/metric_http_server.cc b/src/common/metric/metric_http_server.cc
new file mode 100644
index 000000000..fdb01910c
--- /dev/null
+++ b/src/common/metric/metric_http_server.cc
@@ -0,0 +1,232 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include "common/metric/metric_http_server.h" 
+
+#include <unordered_map>
+#include <algorithm>
+#include <string>
+
+#include "glog/logging.h"
+
+#include "common/timer.h"
+#include "common/metric/collector_report.h"
+
+using std::string;
+ 
+namespace tera { 
+
+void ResponseBodyBuilder::BuildType(string* body, const string& metric_name, const string& type) {
+    body->append("# TYPE ").append(metric_name).append(" ").append(type).append("\n");  // one "# TYPE <name> <type>" line
+}
+
+void ResponseBodyBuilder::BuildHelp(string* body, const string& metric_name, const string& help_info) {
+    body->append("# HELP ").append(metric_name).append(" ").append(help_info).append("\n");  // one "# HELP <name> <text>" line
+}
+
+// Appends one sample line for `metric_id` to *body, shaped as
+//     <name>{<label>="<value>",...,value_type="<type>"} <value> <time>\n
+// Labels appear in the order of the id's label map.
+// An item whose Time() is -1 is logged (VLOG 12) but produces no output.
+void ResponseBodyBuilder::BuildMetricItem(string* body, const MetricId& metric_id, const ReportItem& report_item) {
+
+    VLOG(12) << "[Building Metric] name: " << metric_id.GetName()
+             << "\tValue: " << static_cast<double>(report_item.Value())
+             << "\tTimeStamp: " << report_item.Time()
+             << "\tType: " << report_item.Type();
+
+    if (report_item.Time() == -1) {
+        return;
+    }
+
+    // metric name and opening brace
+    string line = metric_id.GetName() + "{";
+
+    // Each label is followed by a comma because the value_type entry
+    // added below always terminates the list.
+    for (const auto& label : metric_id.GetLabelMap()) {
+        line += label.first + "=" + "\"" + label.second + "\"";
+        line += ",";
+    }
+
+    // the report item's type is exposed as one more label
+    line += "value_type=\"" + report_item.Type() + "\"";
+
+    // closing brace, value and timestamp
+    line += "} " + std::to_string(report_item.Value()) + " " + std::to_string(report_item.Time());
+    line += "\n";
+
+    body->append(line);
+}
+
+static const int kMongoosePollTimeoutMs = 1000;  // timeout argument of every mg_mgr_poll() call in the background loop
+
+// Logs method, uri, proto, query string and body of a request at VLOG level 16.
+static void LogRequest(struct http_message *request) {
+    auto text = [](const struct mg_str& s) { return std::string(s.p, s.len); };
+    VLOG(16) << "[MetricHttpServer] Recv http request." << " method [" << text(request->method) << "]"
+             << " uri [" << text(request->uri) << "]" << " proto [" << text(request->proto) << "]"
+             << " query [" << text(request->query_string) << "]"
+             << " body [" << text(request->body) << "]";
+}
+
+// Mongoose event callback. HTTP requests are forwarded to the MetricHttpServer
+// stored in the manager's user_data; all other events are ignored.
+void MetricHttpServer::EventHandler(struct mg_connection *conn, int event, void *p_data) {
+    if (event != MG_EV_HTTP_REQUEST) {
+        return;  // ignore other events
+    }
+    if (conn == NULL || conn->mgr == NULL || p_data == NULL) {
+        LOG(WARNING) << "[MetricHttpServer] handle invalid request.";
+        return;
+    }
+
+    // get user data (Start() hands `this` to mg_mgr_init)
+    MetricHttpServer *owner = reinterpret_cast<MetricHttpServer *>(conn->mgr->user_data);
+    if (owner == NULL) {
+        LOG(WARNING) << "[MetricHttpServer] Connection missing user data.";
+        return;
+    }
+
+    owner->HandleHttpRequest(conn, reinterpret_cast<struct http_message *>(p_data));
+}
+ 
+// Creates a stopped server: not running, no stop request pending, port -1.
+MetricHttpServer::MetricHttpServer()
+    : is_running_(false), stop_(false), listen_port_(-1) {
+}
+
+MetricHttpServer::~MetricHttpServer() { Stop(); }  // join the poll thread: it uses `this` and mongoose_mgr_, so it must not outlive them
+
+// Binds `listen_port` and starts the background mongoose poll thread. Returns false when the port
+// is not positive, the server is already running, the bind fails or the thread cannot be started.
+bool MetricHttpServer::Start(int32_t listen_port) {
+    if (listen_port <= 0) {
+        LOG(WARNING) << "[MetricHttpServer] Start got invalid listen port: " << listen_port;
+        return false;
+    }
+    MutexLock lock(&mutex_);
+    if (IsRunning()) {
+        LOG(WARNING) << "[MetricHttpServer] Server is already running, listening: " << listen_port_;
+        return false;
+    }
+    // init mongoose; `this` is stored as user_data so EventHandler can find the server
+    mg_mgr_init(&mongoose_mgr_, this);
+    std::string bind_addr = std::to_string(listen_port);
+    struct mg_connection *conn = mg_bind(&mongoose_mgr_, bind_addr.c_str(), &MetricHttpServer::EventHandler);
+    if (conn == NULL) {
+        LOG(WARNING) << "[MetricHttpServer] Bind port [" << listen_port << "] failed.";
+        mg_mgr_free(&mongoose_mgr_);
+        return false;
+    }
+    mg_set_protocol_http_websocket(conn);
+    LOG(INFO) << "[MetricHttpServer] Bind port [" << listen_port << "] success.";
+    // Mark the server as running before the thread exists; otherwise a Stop() or a second
+    // Start() arriving before the thread's first statement would still see "not running".
+    stop_.store(false);
+    is_running_.store(true);
+    if (!bg_thread_.Start(std::bind(&MetricHttpServer::BackgroundWorkWrapper, this))) {
+        is_running_.store(false);
+        mg_mgr_free(&mongoose_mgr_);
+        LOG(WARNING) << "[MetricHttpServer] Start background thread failed.";
+        return false;
+    }
+    listen_port_ = listen_port;  // remembered for the "already running" log; Stop() resets it to -1
+    return true;
+}
+
+// Requests the poll loop to exit and joins the background thread.
+// Does nothing when the server is not running.
+void MetricHttpServer::Stop() {
+    MutexLock lock(&mutex_);
+    if (IsRunning()) {
+        stop_.store(true);
+        bg_thread_.Join();
+        listen_port_ = -1;
+    }
+}
+
+// Thread body: marks the server running, polls mongoose until stop_ is set, then frees the manager.
+void MetricHttpServer::BackgroundWorkWrapper() {
+    LOG(INFO) << "[MetricHttpServer] Start background work";
+    for (is_running_.store(true); !stop_.load(); ) {
+        mg_mgr_poll(&mongoose_mgr_, kMongoosePollTimeoutMs);
+    }
+    is_running_.store(false);
+    mg_mgr_free(&mongoose_mgr_);
+    LOG(INFO) << "[MetricHttpServer] Exit background work";
+}
+
+// Logs the request, dispatches it by URI ("/metrics" or anything else) and logs the time spent.
+void MetricHttpServer::HandleHttpRequest(struct mg_connection *conn, struct http_message *request) {
+    const int64_t begin_us = get_micros();
+    LogRequest(request);
+
+    const std::string uri(request->uri.p, request->uri.len);
+    if (uri != "/metrics") {
+        HandleUnknowUri(conn, request);
+    } else {
+        HandleMetrics(conn, request);
+    }
+    const int64_t cost_us = get_micros() - begin_us;
+    VLOG(16) << "[MetricHttpServer] Handle uri [" << uri << "] cost [" << cost_us << "] us.";
+}
+
+// Answers a URI that has no handler with a 404 head and a content length of 0.
+void MetricHttpServer::HandleUnknowUri(struct mg_connection *conn, struct http_message *request) {
+    VLOG(16) << "[MetricHttpServer] Handle unknow uri [" << std::string(request->uri.p, request->uri.len) << "] ...";
+    mg_send_head(conn, 404, 0, "Content-Type: text/plain");
+}
+
+// Serves the current report: 200 status line, text/plain content type, Content-Length, then the body.
+void MetricHttpServer::HandleMetrics(struct mg_connection *conn, struct http_message *request) {
+    const std::string payload = GetResponseBody();
+    mg_printf(conn, "HTTP/1.1 200 OK\r\nContent-Type: %s\r\n" "Content-Length: %lu\r\n\r\n", "text/plain", static_cast<unsigned long>(payload.size()));
+    mg_send(conn, payload.data(), payload.size());
+}
+
+// Renders the current subscriber report as the text served at /metrics.
+// For every metric name it emits a "# HELP" line, a "# TYPE ... gauge" line and
+// then one sample line per report entry carrying that name.
+// Returns an empty string (and logs a warning) when there is no report.
+string MetricHttpServer::GetResponseBody() {
+    const int64_t start_ts = get_millis();
+    std::shared_ptr<SubscriberReport> cur_report =
+        CollectorReportPublisher::GetInstance().GetSubscriberReport();
+    if (!cur_report) {
+        LOG(WARNING) << "[MetricHttpServer] Subscriber Report Is Empty";
+        return "";
+    }
+
+    // one report entry: `first` is the MetricId, `second` is passed on as a ReportItem
+    using MetricIdValuePair = SubscriberReport::value_type;
+    // entries sharing a metric name (pointers into *cur_report)
+    using MetricIdValueVec = std::vector<const MetricIdValuePair*>;
+    // metric name -> its entries
+    using MetricNameMap = std::unordered_map<std::string, MetricIdValueVec>;
+
+    MetricNameMap metric_name_map;
+    for (const auto& report_item : *cur_report) {
+        metric_name_map[report_item.first.GetName()].push_back(&report_item);
+    }
+
+    std::string body;
+    for (const auto& metric_item : metric_name_map) {
+        // the metric name doubles as its help text and every metric is typed "gauge"
+        ResponseBodyBuilder::BuildHelp(&body, metric_item.first, metric_item.first);
+        ResponseBodyBuilder::BuildType(&body, metric_item.first, "gauge");
+        for (const MetricIdValuePair* entry : metric_item.second) {
+            ResponseBodyBuilder::BuildMetricItem(&body, entry->first, entry->second);
+        }
+    }
+
+    VLOG(12) << "[MetricHttpServer] Get Response Body cost: " <<
+             get_millis() - start_ts << " ms";
+    // Return the local by name: `return std::move(body)` would disable copy elision.
+    return body;
+}
+} // end namespace tera
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/metric_http_server.h b/src/common/metric/metric_http_server.h
new file mode 100644
index 000000000..a0b735450
--- /dev/null
+++ b/src/common/metric/metric_http_server.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_METRIC_HTTP_SERVER_H_
+#define TERA_COMMON_METRIC_METRIC_HTTP_SERVER_H_
+ 
+#include <atomic>
+#include <string> 
+#include <vector>
+ 
+#include "mongoose.h" 
+
+#include "common/metric/collector_report_publisher.h"
+#include "common/mutex.h" 
+#include "common/thread.h"
+ 
+namespace tera {
+
+struct ResponseBodyBuilder {
+    // Appends "# TYPE <metric_name> <type>\n" to *body.
+    static void BuildType(std::string* body, const std::string& metric_name, const std::string& type);
+
+    // Appends "# HELP <metric_name> <help_info>\n" to *body.
+    static void BuildHelp(std::string* body, const std::string& metric_name, const std::string& help_info);
+
+    // Appends one sample line (name, labels, value_type, value, timestamp) to *body;
+    // nothing is appended when report_item.Time() is -1.
+    static void BuildMetricItem(std::string* body,
+                                const MetricId& metric_id,
+                                const ReportItem& report_item);
+};
+
+// a simple http server based on mongoose
+class MetricHttpServer {
+public:
+    MetricHttpServer();
+    ~MetricHttpServer();
+    // (the constructor only initialises the flags and the port; nothing is bound until Start())
+private:
+    // disallow copy
+    MetricHttpServer(const MetricHttpServer&) = delete;
+    MetricHttpServer& operator = (const MetricHttpServer&) = delete;
+
+private:    
+    static void EventHandler(struct mg_connection *conn, int event, void *p_data);  // mongoose callback registered through mg_bind() in Start()
+    
+public:
+    bool Start(int32_t listen_port);  // binds the port and launches the poll thread; false on any failure
+    void Stop();  // sets stop_ and joins the poll thread; no-op when not running
+    // reads the atomic running flag
+    bool IsRunning() const {
+        return is_running_.load();
+    }
+    
+private:
+    void BackgroundWorkWrapper();  // thread body: mg_mgr_poll() loop that ends once stop_ is set
+    
+    // http request handlers
+    void HandleHttpRequest(struct mg_connection *conn, struct http_message *request);  // logs the request, then dispatches on the URI
+    void HandleMetrics(struct mg_connection *conn, struct http_message *request);  // "/metrics": 200 response carrying GetResponseBody()
+    void HandleUnknowUri(struct mg_connection *conn, struct http_message *request);  // any other URI: 404
+    
+    // prometheus handle functions: builds the text body served for /metrics
+    std::string GetResponseBody();
+
+private:
+    mutable Mutex mutex_;  // serialises Start() and Stop()
+    std::atomic<bool> is_running_;
+    std::atomic<bool> stop_;  // set by Stop() to end the poll loop
+    int32_t listen_port_;  // initialised to -1 and reset to -1 by Stop()
+    
+    // background thread
+    common::Thread bg_thread_;
+    
+    // mongoose info
+    struct mg_mgr mongoose_mgr_;  // initialised in Start(); freed by the poll thread on exit, or by Start() on failure
+}; 
+ 
+} // end namespace tera 
+ 
+#endif // TERA_COMMON_METRIC_METRIC_HTTP_SERVER_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/common/metric/metric_id.cc b/src/common/metric/metric_id.cc
new file mode 100644
index 000000000..b77ee095c
--- /dev/null
+++ b/src/common/metric/metric_id.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include "common/metric/metric_id.h" 
+
+#include <boost/algorithm/string.hpp>
+#include <sstream>
+ 
+namespace tera { 
+
+static const std::string kInvalidLabel = "";  // returned by MetricId::GetLabel() when the label does not exist
+
+MetricId::MetricId(const std::string& name, const std::string& label_str) {
+    ParseFromStringWithThrow(name, label_str, this);  // throws std::invalid_argument on an empty name or malformed label_str
+}
+
+// Serialises the labels as "k1:v1,k2:v2,..." in map order, i.e. each pair is
+// written as key, kLabelKVDelimiter, value and pairs are joined with
+// kLabelPairDelimiter. An empty map gives an empty string.
+static std::string MetricLabelsToString(const MetricLabels& label_map) {
+    std::ostringstream label_oss;
+    const char* separator = "";
+
+    for (const auto& label : label_map) {
+        // nothing precedes the first pair; the delimiter precedes all others
+        label_oss << separator << label.first << kLabelKVDelimiter << label.second;
+        separator = kLabelPairDelimiter;
+    }
+
+    return label_oss.str();
+}
+
+// Builds the identifier string: `name` alone when there are no labels,
+// otherwise name, kNameLabelsDelimiter and the serialised labels.
+std::string MetricId::GenMetricIdStr(const std::string& name, const MetricLabels& label_map) {
+    if (!label_map.empty()) {
+        return name + kNameLabelsDelimiter + MetricLabelsToString(label_map);
+    }
+
+    return name;
+}
+
+// Fills *metric_id from `name` and a label string of the form "k1:v1,k2:v2,...".
+// Throws std::invalid_argument when metric_id is NULL, when name is empty, or when a
+// piece between pair delimiters does not split into exactly one key and one value.
+// *metric_id is written only after the whole string parsed, so a throw leaves it
+// untouched. The id string is generated from the ordered label map (as the other
+// constructors do), so the same labels given in another order yield an equal id.
+void MetricId::ParseFromStringWithThrow(const std::string& name,
+                                        const std::string& label_str,
+                                        MetricId* metric_id) throw(std::invalid_argument) {
+    if (metric_id == NULL) {
+        throw std::invalid_argument("metric_id is invalid");
+    }
+    if (name.empty()) {
+        throw std::invalid_argument("metric name is invalid");
+    }
+
+    MetricLabels parsed_labels;
+    if (!label_str.empty()) {
+        std::vector<std::string> label_str_splits;
+        boost::algorithm::split(label_str_splits, label_str,
+                boost::algorithm::is_any_of(kLabelPairDelimiter));
+        for (const std::string& label_kv_str : label_str_splits) {
+            std::vector<std::string> label_kv_splits;
+            boost::algorithm::split(label_kv_splits, label_kv_str,
+                    boost::algorithm::is_any_of(kLabelKVDelimiter));
+            if (label_kv_splits.size() != 2) {
+                throw std::invalid_argument("label_str");  // invalid label str format
+            }
+            // insert() keeps the first value if a key appears twice
+            parsed_labels.insert(std::make_pair(label_kv_splits[0], label_kv_splits[1]));
+        }
+    }
+
+    // commit the parsed result
+    metric_id->name_ = name;
+    metric_id->labels_.swap(parsed_labels);
+    metric_id->id_str_ = GenMetricIdStr(metric_id->name_, metric_id->labels_);
+}
+
+// Nothrow variant: returns false where ParseFromStringWithThrow would throw std::invalid_argument.
+bool MetricId::ParseFromString(const std::string& name, const std::string& label_str,
+                               MetricId* metric_id) throw() {
+    try {
+        ParseFromStringWithThrow(name, label_str, metric_id);
+    } catch (std::invalid_argument&) {
+        return false;
+    }
+    return true;
+}
+
+// Default id: empty name (so IsValid() is false), no labels, empty id string.
+MetricId::MetricId() {}
+
+// Label-less id.
+MetricId::MetricId(const std::string& name)
+    : name_(name), labels_(), id_str_(GenMetricIdStr(name_, labels_)) {}
+
+// Id with labels. id_str_ is declared after name_ and labels_, so both are
+// already initialised when its initialiser reads them.
+MetricId::MetricId(const std::string& name, const MetricLabels& label_map)
+    : name_(name), labels_(label_map), id_str_(GenMetricIdStr(name_, labels_)) {}
+
+// Copy constructor: member-wise copy.
+MetricId::MetricId(const MetricId& other)
+    : name_(other.name_), labels_(other.labels_), id_str_(other.id_str_) {}
+
+MetricId::~MetricId() {}
+
+// Copy assignment: member-wise copy.
+MetricId& MetricId::operator = (const MetricId& other) {
+    name_ = other.name_;
+    labels_ = other.labels_;
+    id_str_ = other.id_str_;
+    return *this;
+}
+ 
+// Returns the value stored for label `name`, or a reference to the empty
+// kInvalidLabel string when the id has no such label.
+const std::string& MetricId::GetLabel(const std::string& name) const {
+    const auto found = labels_.find(name);
+    const bool missing = (found == labels_.end());
+
+    return missing ? kInvalidLabel : found->second;
+}
+ 
+bool MetricId::ExistLabel(const std::string& name) const {
+    return labels_.count(name) != 0;  // std::map::count is 1 when the key is present, else 0
+}
+
+// True only when label `name` exists and its value equals `expected_value`.
+// (See GetLabel for reading the value itself.)
+bool MetricId::CheckLabel(const std::string& name, const std::string& expected_value) const {
+    const auto found = labels_.find(name);
+
+    // an absent label never matches
+    return found != labels_.end() && found->second == expected_value;
+}
+
+// Stores name -> value, replacing an earlier value; a pair with an empty name or value is ignored.
+LabelStringBuilder& LabelStringBuilder::Append(const std::string& name, const std::string& value) {
+    const bool usable = !(name.empty() || value.empty());
+    if (usable) { labels_[name] = value; }
+    return *this;
+}
+
+std::string LabelStringBuilder::ToString() const {
+    return MetricLabelsToString(labels_);  // "k1:v1,k2:v2,..." in map order; "" when no label was appended
+}
+ 
+} // end namespace tera 
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/metric_id.h b/src/common/metric/metric_id.h
new file mode 100644
index 000000000..cff30448e
--- /dev/null
+++ b/src/common/metric/metric_id.h
@@ -0,0 +1,143 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_METRIC_ID_H_
+#define TERA_COMMON_METRIC_METRIC_ID_H_ 
+ 
+#include <functional> 
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <map> 
+ 
+namespace tera { 
+
+// use ordered map to ensure the order of labels in id_str
+typedef std::map<std::string, std::string> MetricLabels;
+
+const char* const kNameLabelsDelimiter = "#";
+const char* const kLabelPairDelimiter = ",";
+const char* const kLabelKVDelimiter = ":";
+
+// A metric is identified by its name together with all of its labels
+//     name:   necessary, and should not be empty
+//     labels: optional
+//
+// Can get name and labels from MetricId
+class MetricId {
+public:
+    MetricId();
+    explicit MetricId(const std::string& name);
+    MetricId(const std::string& name, const MetricLabels& label_map);
+    MetricId(const std::string& name, const std::string& label_str);  // parses label_str via ParseFromStringWithThrow, so it can throw std::invalid_argument
+    MetricId(const MetricId& other);
+    ~MetricId();
+    
+    MetricId& operator = (const MetricId& other);
+    // an id is valid when its name is non-empty (a default-constructed id is not)
+    bool IsValid() const {
+        return !name_.empty();
+    }
+    
+    const std::string& GetName() const {
+        return name_;
+    }
+    
+    const MetricLabels& GetLabelMap() const {
+        return labels_;
+    }
+    // identifier string; the relational operators and std::hash below work on it
+    const std::string& ToString() const {
+        return id_str_;
+    }
+    
+    // access labels
+    const std::string& GetLabel(const std::string& name) const;  // empty string when the label is absent
+    bool ExistLabel(const std::string& name) const;
+    bool CheckLabel(const std::string& name, const std::string& expected_value) const;  // false when absent or different
+    
+public:
+    // Parse MetricId from name and formatted label string ("k1:v1,k2:v2,...")
+    // throws std::invalid_argument if metric_id is NULL, name is empty or label_str is malformed
+    static void ParseFromStringWithThrow(const std::string& name, 
+                                         const std::string& label_str, 
+                                         MetricId* metric_id) throw(std::invalid_argument);
+    // Parse MetricId from name and formatted label string
+    // nothrow version: returns false instead of throwing
+    static bool ParseFromString(const std::string& name, 
+                                const std::string& label_str,
+                                MetricId* metric_id) throw();
+
+private:
+    static std::string GenMetricIdStr(const std::string& name, const MetricLabels& label_map);
+private:  // keep this member order: two constructors initialise id_str_ from name_ and labels_
+    std::string name_;
+    MetricLabels labels_;
+    std::string id_str_;  // cached identifier string returned by ToString()
+}; 
+    
+// relational operators
+// make MetricId can be the key of std::map and std::unordered_map
+inline bool operator == (const MetricId& id1, const MetricId& id2) {
+    return id1.ToString().compare(id2.ToString()) == 0;
+}
+// the operators other than == and < are derived from those two
+inline bool operator != (const MetricId& id1, const MetricId& id2) {
+    return !(id1 == id2);
+}
+
+inline bool operator < (const MetricId& id1, const MetricId& id2) {
+    return id1.ToString().compare(id2.ToString()) < 0;
+}
+
+inline bool operator <= (const MetricId& id1, const MetricId& id2) {
+    return !(id2 < id1);
+}
+
+inline bool operator > (const MetricId& id1, const MetricId& id2) {
+    return id2 < id1;
+}
+
+inline bool operator >= (const MetricId& id1, const MetricId& id2) {
+    return !(id1 < id2);
+}
+
+// A helper class to build formated label string
+// Usage: label_str = LabelStringBuilder().Append("k1","v1").Append("k2","v2").ToString();
+class LabelStringBuilder {
+public:
+    LabelStringBuilder() {}
+    ~LabelStringBuilder() {}
+    
+    // append a k-v pair; a pair with an empty name or value is ignored, a repeated name overwrites
+    LabelStringBuilder& Append(const std::string& name, const std::string& value);
+    
+    // build formatted string "k1:v1,k2:v2,..." (empty when nothing was appended)
+    std::string ToString() const;
+    
+private:
+    MetricLabels labels_;  // std::map, so ToString() lists the labels ordered by name
+};
+ 
+} // end namespace tera 
+
+namespace std {
+// specialization std::hash for tera::MetricId
+// make MetricId can be the key of unordered_map
+template<>
+struct hash<::tera::MetricId> {
+    // Hashes the id string, the same string operator== compares, so equal
+    // ids always produce equal hashes.
+    size_t operator () (const ::tera::MetricId& id) const {
+        hash<string> string_hasher;
+        return string_hasher(id.ToString());
+    }
+};
+
+} // end namespace std
+ 
+#endif // TERA_COMMON_METRIC_METRIC_ID_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/prometheus_subscriber.cc b/src/common/metric/prometheus_subscriber.cc
new file mode 100644
index 000000000..9aca684df
--- /dev/null
+++ b/src/common/metric/prometheus_subscriber.cc
@@ -0,0 +1,142 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "common/metric/prometheus_subscriber.h"
+#include "common/metric/collector_report.h"
+
+DECLARE_int64(tera_metric_hold_max_time);
+
+namespace tera{
+
+void PrometheusSubscriber::OnUpdate(std::shared_ptr<CollectorReport> report) {
+    int64_t value = report->FindMetricValue(metric_id_);  // this subscriber's metric, looked up in the report
+    Append(report->timestamp_ms, value);  // queued under the report's timestamp
+}
+
+// Returns one ReportItem summarising the samples queued since the previous
+// Collect(): latest/max/min/sum according to type_, with QPS and THROUGHPUT
+// scaled by 1000 / elapsed (timestamps come from timestamp_ms). Returns a default item if nothing is queued.
+ReportItem PrometheusSubscriber::Collect() {
+    ReportItem item;
+    std::shared_ptr<TimeValueQueue> drained(new TimeValueQueue);
+    int64_t previous_ts;
+    {
+        std::lock_guard<std::mutex> guard(mtx_);
+        if (tera_queue_ptr_->empty()) {
+            VLOG(12) << "[PROMETHEUS SUBSCRIBER] Empty Tera Queue";
+            return item;
+        }
+        previous_ts = last_collect_ts_;
+        last_collect_ts_ = tera_queue_ptr_->back().first;
+        // swap in the fresh queue so the aggregation below runs without the lock
+        drained.swap(tera_queue_ptr_);
+    }
+
+    const auto newest_ts = drained->back().first;
+    int64_t value = GetSpecificValue(drained);
+    const bool is_rate = (type_ == SubscriberType::QPS || type_ == SubscriberType::THROUGHPUT);
+    if (is_rate) {
+        const int64_t elapsed = newest_ts - previous_ts;
+        value = (elapsed == 0) ? 0 : value * 1000 / elapsed;
+    }
+
+    item.SetTimeValue({newest_ts, value});
+    item.SetType(GetTypeName());
+    return item;
+}
+
+// Queues one sample. The very first sample only seeds last_collect_ts_; every later one also expires over-age samples.
+void PrometheusSubscriber::Append(int64_t time_stamp, int64_t current_value) {
+    std::lock_guard<std::mutex> guard(mtx_);
+    tera_queue_ptr_->emplace_back(time_stamp, current_value);
+    VLOG(12) << "[PROMETHEUS APPEND] " << metric_id_.GetName()
+             << "\tValue: "  << current_value << "\tQueue Size:" << tera_queue_ptr_->size();
+    if (!has_inited_) {
+        has_inited_ = true;
+        last_collect_ts_ = time_stamp;
+        return;
+    }
+    DropExpiredValue();
+}
+
+// Display name of type_; Collect() stores it as the type of the ReportItem it returns.
+// A value outside the known types is logged with its numeric value, then the process aborts.
+std::string PrometheusSubscriber::GetTypeName() {
+    switch (type_)
+    {
+    case SubscriberType::LATEST:
+        return "Latest";
+
+    case SubscriberType::MAX:
+        return "Max";
+
+    case SubscriberType::MIN:
+        return "Min";
+
+    case SubscriberType::SUM:
+        return "Sum";
+
+    case SubscriberType::QPS:
+        return "Qps";
+
+    case SubscriberType::THROUGHPUT:
+        return "ThroughPut";
+
+    default:
+        LOG(ERROR) << "Unknown collector type: " << static_cast<int>(type_);
+        abort();
+    }
+    //Never reach here
+    return "";
+}
+
+// Drops front samples that are >= FLAGS_tera_metric_hold_max_time older than the newest one and moves last_collect_ts_ along.
+// The newest sample is always kept: without that guard a flag value <= 0 empties the queue and front() is then called on it.
+void PrometheusSubscriber::DropExpiredValue() {
+    if (tera_queue_ptr_->empty()) {
+        return;
+    }
+    auto last_enqueue_ts = tera_queue_ptr_->back().first;
+    int64_t drop_cnt = 0;
+    while (tera_queue_ptr_->size() > 1 && last_enqueue_ts - tera_queue_ptr_->front().first >= FLAGS_tera_metric_hold_max_time) {
+        VLOG(12) << "[PROMETHEUS SUBSCRIBER] drop last_enqueue_ts: " << last_enqueue_ts << " first_ts: " << tera_queue_ptr_->front().first;
+        ++drop_cnt;
+        last_collect_ts_ = tera_queue_ptr_->front().first;
+        tera_queue_ptr_->pop_front();
+    }
+    if (drop_cnt != 0) {
+        VLOG(12) << "[PROMETHEUS SUBSCRIBER] drop " << drop_cnt << " values";
+    }
+}
+
+// Reduces the given queue to one number according to type_:
+//   LATEST -> GetLatest, MAX -> GetMax, MIN -> GetMin,
+//   SUM, QPS, THROUGHPUT -> GetSum.
+// Any other type is logged and aborts the process.
+int64_t PrometheusSubscriber::GetSpecificValue(std::shared_ptr<TimeValueQueue> tera_queue_ptr) {
+    if (type_ == SubscriberType::LATEST) {
+        return GetLatest(tera_queue_ptr);
+    }
+    if (type_ == SubscriberType::MAX) {
+        return GetMax(tera_queue_ptr);
+    }
+    if (type_ == SubscriberType::MIN) {
+        return GetMin(tera_queue_ptr);
+    }
+
+    // SUM, QPS and THROUGHPUT all use GetSum here
+    if (type_ == SubscriberType::SUM ||
+        type_ == SubscriberType::QPS ||
+        type_ == SubscriberType::THROUGHPUT) {
+        return GetSum(tera_queue_ptr);
+    }
+
+    LOG(ERROR) << "Unknown collector type";
+    abort();
+    //Never reach here
+    return -1;
+}
+}
diff --git a/src/common/metric/prometheus_subscriber.h b/src/common/metric/prometheus_subscriber.h
new file mode 100644
index 000000000..67affa7bb
--- /dev/null
+++ b/src/common/metric/prometheus_subscriber.h
@@ -0,0 +1,81 @@
+#pragma once
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <memory>
+#include <deque>
+#include <utility>
+#include <algorithm>
+#include <numeric>
+#include <mutex>
+#include <string>
+
+#include "common/metric/subscriber.h"
+
+namespace tera {
+
+using TimeValueQueue = std::deque<TimeValuePair>;
+
+// Subscriber that queues (timestamp, value) samples of one metric and reduces
+// them to a single ReportItem per Collect() call.
+class PrometheusSubscriber : public Subscriber {
+public:
+    PrometheusSubscriber(const MetricId& metric_id, SubscriberType type = SubscriberType::LATEST)
+        : tera_queue_ptr_(new TimeValueQueue),
+          last_collect_ts_(0),
+          has_inited_(false),
+          type_(type),
+          metric_id_(metric_id) {}
+    ~PrometheusSubscriber() override {}
+
+    // Subscriber interface; the first three are defined in prometheus_subscriber.cc.
+    ReportItem Collect() override;
+    void OnUpdate(const std::shared_ptr<CollectorReport>) override;
+    std::string GetTypeName() override;
+    const MetricId& GetMetricId() override { return metric_id_; }
+
+private:
+    void Append(int64_t time_stamp, int64_t current_value);
+    void DropExpiredValue();
+    int64_t GetSpecificValue(std::shared_ptr<TimeValueQueue>);
+
+    // Reducers over a queue of samples. GetMax, GetMin and GetLatest
+    // dereference an element, so they must be given a non-empty queue.
+    // largest sample value
+    int64_t GetMax(std::shared_ptr<TimeValueQueue> tera_queue_ptr) {
+        const auto largest = std::max_element(tera_queue_ptr->begin(), tera_queue_ptr->end(), ValueLess);
+        return largest->second;
+    }
+
+    // smallest sample value
+    int64_t GetMin(std::shared_ptr<TimeValueQueue> tera_queue_ptr) {
+        const auto smallest = std::min_element(tera_queue_ptr->begin(), tera_queue_ptr->end(), ValueLess);
+        return smallest->second;
+    }
+
+    // value of the most recently queued sample
+    int64_t GetLatest(std::shared_ptr<TimeValueQueue> tera_queue_ptr) { return tera_queue_ptr->back().second; }
+
+    // sum of all sample values (0 for an empty queue)
+    int64_t GetSum(std::shared_ptr<TimeValueQueue> tera_queue_ptr) {
+        int64_t total = 0;
+        for (const TimeValuePair& sample : *tera_queue_ptr) {
+            total += sample.second;
+        }
+        return total;
+    }
+
+    // strict ordering of samples by value only
+    static bool ValueLess(const TimeValuePair& lhs, const TimeValuePair& rhs) {
+        return lhs.second < rhs.second;
+    }
+
+    std::mutex mtx_;  // taken by Collect() and Append()
+    std::shared_ptr<TimeValueQueue> tera_queue_ptr_;  // samples queued since the last Collect()
+    int64_t last_collect_ts_;  // start of the interval Collect() uses for QPS/THROUGHPUT
+    bool has_inited_;  // true once the first sample has been appended
+    const SubscriberType type_;  // subscriber type
+    MetricId metric_id_;
+};
+
+}
\ No newline at end of file
diff --git a/src/common/metric/ratio_collector.h b/src/common/metric/ratio_collector.h
new file mode 100644
index 000000000..3a933adef
--- /dev/null
+++ b/src/common/metric/ratio_collector.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMOM_METRIC_RATIO_COLLECTOR_H_
+#define TERA_COMMOM_METRIC_RATIO_COLLECTOR_H_
+ 
+#include <cmath> 
+#include "common/metric/collector_report_publisher.h"
+ 
+namespace tera { 
+
+// Reports first_counter / second_counter as an integer percentage (ratio * 100).
+class RatioCollector : public Collector {
+public:
+    // Stores both pointers without taking ownership; is_periodic makes Collect() clear the counters.
+    explicit RatioCollector(Counter* first_counter, Counter* second_counter,
+                            bool is_periodic = true)
+        : first_counter_(first_counter), second_counter_(second_counter),
+          is_periodic_(is_periodic) {}
+    // Returns 0 if either counter is NULL, -1 if the ratio is undefined, else ratio * 100.
+    int64_t Collect() override {
+        if (NULL == first_counter_ || NULL == second_counter_) {
+            return 0;
+        }
+        double ratio = (double)first_counter_->Get() / second_counter_->Get();
+        if (is_periodic_) {
+            first_counter_->Clear();
+            second_counter_->Clear();
+        }
+        // x/0 is +-inf and 0/0 is NaN; casting either to int64_t is undefined, so report -1.
+        return std::isfinite(ratio) ? static_cast<int64_t>(ratio * 100) : -1;
+    }
+private:
+    Counter* const first_counter_;
+    Counter* const second_counter_;
+    const bool is_periodic_;
+};
+
+} // end namespace tera 
+ 
+#endif // TERA_COMMOM_METRIC_RATIO_COLLECTOR_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/ratio_subscriber.h b/src/common/metric/ratio_subscriber.h
new file mode 100644
index 000000000..32656b46b
--- /dev/null
+++ b/src/common/metric/ratio_subscriber.h
@@ -0,0 +1,58 @@
+#pragma once
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include "common/metric/subscriber.h"
+#include <cassert>
+#include <cmath>
+
+namespace tera {
+// Subscriber that reports the ratio between the values of two wrapped subscribers.
+class RatioSubscriber : public Subscriber {
+public:
+    // Takes ownership of both subscribers; type name is "Ratio: (name1:type1 / name2:type2)".
+    RatioSubscriber(const MetricId& metric_id, std::unique_ptr<Subscriber>&& subscriber1,
+                    std::unique_ptr<Subscriber>&& subscriber2):
+        metric_id_(metric_id),
+        subscriber1_(std::move(subscriber1)),
+        subscriber2_(std::move(subscriber2)) {
+        type_name_ = "Ratio: (" 
+                     + subscriber1_->GetMetricId().GetName() + ":" + subscriber1_->GetTypeName() + " / " 
+                     + subscriber2_->GetMetricId().GetName() + ":" + subscriber2_->GetTypeName() + ")";
+    }
+
+    virtual std::string GetTypeName() override {
+        return type_name_;
+    }
+
+    // Forwards the same report to both wrapped subscribers.
+    virtual void OnUpdate(const std::shared_ptr<CollectorReport> report_ptr) override {
+        subscriber1_->OnUpdate(report_ptr);
+        subscriber2_->OnUpdate(report_ptr);
+    }
+
+    // Collects both subscribers and returns <time, value1 / value2> truncated to an integer.
+    virtual ReportItem Collect() override {
+        auto subscriber1_ret = subscriber1_->Collect();
+        auto subscriber2_ret = subscriber2_->Collect();
+        //timestamp should be equal;
+        assert(subscriber1_ret.Time() == subscriber2_ret.Time());
+        double ratio = (double)subscriber1_ret.Value() / subscriber2_ret.Value();
+        // x/0 is +-inf and 0/0 is NaN; casting either to int64_t is undefined, so report -1.
+        int64_t value = std::isfinite(ratio) ? static_cast<int64_t>(ratio) : -1;
+        return ReportItem({subscriber1_ret.Time(), value}, GetTypeName());
+    }
+
+    const MetricId& GetMetricId() override {
+        return metric_id_;
+    }
+
+    virtual ~RatioSubscriber() override {}
+private:
+    MetricId metric_id_;
+    std::unique_ptr<Subscriber> subscriber1_;
+    std::unique_ptr<Subscriber> subscriber2_;
+    std::string type_name_;
+};
+}
+
diff --git a/src/common/metric/subscriber.h b/src/common/metric/subscriber.h
new file mode 100644
index 000000000..6b0eb394b
--- /dev/null
+++ b/src/common/metric/subscriber.h
@@ -0,0 +1,66 @@
+#pragma once
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <memory>
+#include <string>
+#include <unordered_map> 
+#include "common/metric/metric_id.h"
+
+namespace tera {
+
+using TimeValuePair = std::pair<int64_t, int64_t>;
+
+class CollectorReport;
+
+// One reported sample: a <time, value> pair together with a type label.
+// A default-constructed item holds <-1, -1> and an empty type.
+struct ReportItem {
+    // first = time, second = value
+    TimeValuePair time_value_pair;
+    // label set via the constructor or SetType()
+    std::string type;
+
+    ReportItem(TimeValuePair tvp = {-1, -1}, const std::string& t = "")
+        : time_value_pair(tvp),
+          type(t) {}
+
+    // Time component (first element of the pair).
+    int64_t Time() const { return time_value_pair.first; }
+
+    // Value component (second element of the pair).
+    int64_t Value() const { return time_value_pair.second; }
+
+    // Copy of the type label.
+    std::string Type() const { return type; }
+
+    // Replaces the stored <time, value> pair.
+    void SetTimeValue(const TimeValuePair& tvp) { time_value_pair = tvp; }
+
+    // Replaces the type label.
+    void SetType(const std::string& tp) { type = tp; }
+};
+
+class Subscriber {  // abstract interface: every operation below is pure virtual
+public:
+    enum class SubscriberType {
+        LATEST,
+        MAX,
+        MIN,
+        QPS,
+        SUM,
+        THROUGHPUT
+    };
+    virtual ~Subscriber() {}
+    // Returns a ReportItem carrying the <timestamp, aggregate value> pair reported to Prometheus.
+    virtual ReportItem Collect() = 0;
+    // Updates the subscriber; what happens depends on the subscriber type.
+    // Called in CollectorReportPublisher::Report()
+    virtual void OnUpdate(const std::shared_ptr<CollectorReport>) = 0;
+    virtual std::string GetTypeName() = 0;
+    virtual const MetricId& GetMetricId() = 0;
+};
+
+using SubscriberType = Subscriber::SubscriberType;
+using SubscriberReport = std::unordered_multimap<MetricId, ReportItem>;
+}
\ No newline at end of file
diff --git a/src/common/mutex.h b/src/common/mutex.h
old mode 100644
new mode 100755
index 46e89044f..381a69218
--- a/src/common/mutex.h
+++ b/src/common/mutex.h
@@ -12,7 +12,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include "timer.h"
+#include <cassert>
+#include "common/timer.h"
 
 namespace common {
 
@@ -45,7 +46,7 @@ class Mutex {
         #ifdef MUTEX_DEBUG
         int64_t s = 0;
         if (msg) {
-            s = timer::get_micros();
+            s = get_micros();
         }
         #endif
         PthreadCall("mutex lock", pthread_mutex_lock(&mu_));
@@ -74,16 +75,16 @@ class Mutex {
         msg_ = msg;
         msg_threshold_ = msg_threshold;
         if (msg_) {
-            lock_time_ = timer::get_micros();
+            lock_time_ = get_micros();
         }
         #endif
         owner_ = pthread_self();
     }
     void BeforeUnlock() {
         #ifdef MUTEX_DEBUG
-        if (msg_ && timer::get_micros() - lock_time_ > msg_threshold_) {
+        if (msg_ && get_micros() - lock_time_ > msg_threshold_) {
             printf("%s locked %.3f ms\n",
-                    msg_, (timer::get_micros() - lock_time_) / 1000.0);
+                    msg_, (get_micros() - lock_time_) / 1000.0);
         }
         msg_ = NULL;
         #endif
@@ -137,11 +138,14 @@ class CondVar {
     }
     // Time wait in us
     // timeout < 0 would cause ETIMEOUT and return false immediately
-    bool TimeWaitInUs(int timeout, const char* msg = NULL) {
+    bool TimeWaitInUs(int64_t timeout, const char* msg = NULL) {
         // ref: http://www.qnx.com/developers/docs/6.5.0SP1.update/com.qnx.doc.neutrino_lib_ref/p/pthread_cond_timedwait.html
         struct timespec ts;
         clock_gettime(CLOCK_MONOTONIC, &ts);
-        int64_t nsec = ((int64_t)timeout) * 1000 + ts.tv_nsec;
+        int64_t nsec = timeout * 1000 + ts.tv_nsec;
+
+        assert(nsec > 0);
+
         ts.tv_sec += nsec / 1000000000;
         ts.tv_nsec = nsec % 1000000000;
 
diff --git a/src/common/request_done_wrapper.h b/src/common/request_done_wrapper.h
new file mode 100644
index 000000000..cd6b7b3b7
--- /dev/null
+++ b/src/common/request_done_wrapper.h
@@ -0,0 +1,29 @@
+#pragma once
+#include <google/protobuf/stubs/common.h>
+
+namespace tera {
+// Closure that runs the wrapped `done` closure from its destructor, i.e. when it is deleted.
+class RequestDoneWrapper : public google::protobuf::Closure {
+public:
+    static google::protobuf::Closure* NewInstance(google::protobuf::Closure* done) {
+        return new RequestDoneWrapper(done);
+    }
+
+    //Self-Deleted, never access it after Run();
+    //Default do nothing;
+    virtual void Run() override {
+        delete this;
+    }
+
+    // Fires the wrapped closure; a null `done` is skipped instead of being dereferenced.
+    virtual ~RequestDoneWrapper() {
+        if (done_ != nullptr) { done_->Run(); }
+    }
+
+protected:
+    //Can Only Create on Heap;
+    RequestDoneWrapper(google::protobuf::Closure* done) : done_(done) { }
+private:
+    google::protobuf::Closure* done_;
+};
+}
\ No newline at end of file
diff --git a/src/common/test/collector_report_test.cc b/src/common/test/collector_report_test.cc
new file mode 100644
index 000000000..e01972cc9
--- /dev/null
+++ b/src/common/test/collector_report_test.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+ 
+#include "common/metric/metric_counter.h" 
+#include "common/metric/collector_report.h"
+#include "common/this_thread.h" 
+ 
+namespace tera { 
+ 
+// Fixture owning four MetricCounter objects preset to 1..4 plus a list of unrelated MetricIds.
+class CollectorReportTest : public ::testing::Test {
+public:
+    CollectorReportTest() 
+        : nonperiod_counter1_label(LabelStringBuilder().Append("key1", "value1").ToString()),
+          nonperiod_counter1("counter1", nonperiod_counter1_label, {}, false),
+          nonperiod_counter2("counter2", {}, false), 
+          period_counter1_label(LabelStringBuilder().Append("key2", "value2").ToString()),
+          period_counter1("counter1", period_counter1_label, {}, true),
+          period_counter3("counter3", {}, true) {
+        other_whatever_ids.push_back(MetricId());
+        other_whatever_ids.push_back(MetricId("whatevername"));
+        // the same two names again, this time carrying labels
+        MetricLabels whatever_labels;
+        whatever_labels["haha"] = "hehe";
+        whatever_labels["heihei"] = "hoho";
+        other_whatever_ids.push_back(MetricId("", whatever_labels));
+        other_whatever_ids.push_back(MetricId("whatevername", whatever_labels));
+    }
+
+    virtual void SetUp() {
+        nonperiod_counter1.Set(1);
+        nonperiod_counter2.Set(2);
+        period_counter1.Set(3);
+        period_counter3.Set(4);
+    }
+
+    virtual void TearDown() {
+        // reset cache to initial status
+        CollectorReportPublisher::GetInstance().last_collector_report_.reset(new CollectorReport());
+    }
+protected:  // TEST_F bodies are subclasses of the fixture, so members must not be private
+    std::string nonperiod_counter1_label;
+    MetricCounter nonperiod_counter1;
+    MetricCounter nonperiod_counter2;
+    std::string period_counter1_label;
+    MetricCounter period_counter1;
+    MetricCounter period_counter3;
+    std::vector<MetricId> other_whatever_ids;  // ids not used by any counter of this fixture
+};  
+
+TEST_F(CollectorReportTest, FindTest) {  // FindMetricValue() lookups by name, label and MetricId
+    int64_t value = 0;
+    CollectorReportPublisher::GetInstance().Refresh();
+    std::shared_ptr<CollectorReport> report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    
+    // the report must hold one entry per registered collector
+    EXPECT_EQ(report->report.size(), CollectorReportPublisher::GetInstance().collectors_.size());
+    
+    // nonperiod_counter1: found only with its exact name + label, anything else yields 0
+    value = report->FindMetricValue("counter1", nonperiod_counter1_label);
+    EXPECT_EQ(value, 1);
+    value = report->FindMetricValue(nonperiod_counter1.metric_id_);
+    EXPECT_EQ(value, 1);
+    value = report->FindMetricValue("counter1");
+    EXPECT_EQ(value, 0);
+    value = report->FindMetricValue("counter1", "other not exist label");
+    EXPECT_EQ(value, 0);
+    value = report->FindMetricValue("not exist name", nonperiod_counter1_label);
+    EXPECT_EQ(value, 0);
+    value = report->FindMetricValue(MetricId("counter1"));
+    EXPECT_EQ(value, 0);
+    
+    // nonperiod_counter2: registered without a label, so name-only lookups succeed
+    value = report->FindMetricValue("counter2");
+    EXPECT_EQ(value, 2);
+    value = report->FindMetricValue("counter2", "");
+    EXPECT_EQ(value, 2);
+    value = report->FindMetricValue(MetricId("counter2"));
+    EXPECT_EQ(value, 2);
+    value = report->FindMetricValue("counter2", "whatever_label");
+    EXPECT_EQ(value, 0);
+    
+    // period_counter1
+    value = report->FindMetricValue("counter1", period_counter1_label);
+    EXPECT_EQ(value, 3);
+    value = report->FindMetricValue(period_counter1.metric_id_);
+    EXPECT_EQ(value, 3);
+    
+    // period_counter3
+    value = report->FindMetricValue("counter3");
+    EXPECT_EQ(value, 4);
+    value = report->FindMetricValue(period_counter3.metric_id_);
+    EXPECT_EQ(value, 4);
+    
+    // ids that belong to no counter are expected to resolve to 0
+    for (const MetricId& not_exist_id : other_whatever_ids) {
+        value = report->FindMetricValue(not_exist_id.GetName());
+        EXPECT_EQ(value, 0);
+        value = report->FindMetricValue(not_exist_id.ToString());
+        EXPECT_EQ(value, 0);
+        value = report->FindMetricValue(not_exist_id);
+        EXPECT_EQ(value, 0);
+    }
+    
+    // report again after one increment each, with two extra counters registered
+    nonperiod_counter1.Inc();
+    nonperiod_counter2.Inc();
+    period_counter1.Inc();
+    period_counter3.Inc();
+    MetricCounter another_counter1("another1");
+    MetricCounter another_counter2("another2");
+    another_counter1.Inc();
+    CollectorReportPublisher::GetInstance().Refresh();
+    report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    EXPECT_EQ(report->report.size(), CollectorReportPublisher::GetInstance().collectors_.size());
+    
+    value = report->FindMetricValue(nonperiod_counter1.metric_id_);
+    EXPECT_EQ(value, 2);
+    value = report->FindMetricValue(nonperiod_counter2.metric_id_);
+    EXPECT_EQ(value, 3);
+    value = report->FindMetricValue(period_counter1.metric_id_);
+    EXPECT_EQ(value, 1);
+    value = report->FindMetricValue(period_counter3.metric_id_);
+    EXPECT_EQ(value, 1);
+    value = report->FindMetricValue(another_counter1.metric_id_);
+    EXPECT_EQ(value, 1);
+    value = report->FindMetricValue(another_counter2.metric_id_);
+    EXPECT_EQ(value, 0);
+}
+
+TEST_F(CollectorReportTest, CacheTest) {  // GetCollectorReport() returns the report cached by Refresh()
+    // no Refresh() yet: the cached report exists but is empty
+    std::shared_ptr<CollectorReport> initial_report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    EXPECT_TRUE(initial_report.get() != NULL);
+    EXPECT_TRUE(initial_report->report.empty());
+    
+    // update
+    CollectorReportPublisher::GetInstance().Refresh();
+    std::shared_ptr<CollectorReport> report1 = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    EXPECT_EQ(report1->report.size(), CollectorReportPublisher::GetInstance().collectors_.size());
+    EXPECT_TRUE(report1.get() == CollectorReportPublisher::GetInstance().last_collector_report_.get());
+    
+    // modify counters and register two more, without refreshing yet
+    nonperiod_counter1.Inc();
+    nonperiod_counter2.Inc();
+    period_counter1.Inc();
+    period_counter3.Inc();
+    MetricCounter another_counter1("another1");
+    MetricCounter another_counter2("another2");
+    another_counter1.Inc();
+    
+    // get report before update, return same ptr
+    std::shared_ptr<CollectorReport> report2 = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    EXPECT_TRUE(report2.get() == CollectorReportPublisher::GetInstance().last_collector_report_.get());
+    EXPECT_TRUE(report2.get() == report1.get());
+    EXPECT_EQ(report2->FindMetricValue(period_counter3.metric_id_), 4);
+    
+    // update and get: a new report object that also contains the two new counters
+    CollectorReportPublisher::GetInstance().Refresh();
+    std::shared_ptr<CollectorReport> report3 = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    EXPECT_TRUE(report3.get() == CollectorReportPublisher::GetInstance().last_collector_report_.get());
+    EXPECT_FALSE(report3.get() == report1.get());
+    EXPECT_EQ(report3->report.size(), report2->report.size() + 2);
+    EXPECT_EQ(report3->FindMetricValue(period_counter3.metric_id_), 1);
+}
+ 
+} // end namespace tera
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/test/common_test_main.cc b/src/common/test/common_test_main.cc
new file mode 100644
index 000000000..90c3b06dd
--- /dev/null
+++ b/src/common/test/common_test_main.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "utils/utils_cmd.h"
+
+int main(int argc, char** argv) {  // test-runner entry: set up logging, then hand over to gtest
+    ::google::InitGoogleLogging(argv[0]);
+    FLAGS_v = 16;
+    FLAGS_minloglevel=0;
+    FLAGS_log_dir = "./log";
+    if (access(FLAGS_log_dir.c_str(), F_OK)) {
+        mkdir(FLAGS_log_dir.c_str(), 0777);
+    }
+    std::string program_name("tera");
+    tera::utils::SetupLog(program_name);
+    // gtest must strip its --gtest_* flags before gflags parses argv: gflags rejects unknown flags.
+    ::testing::InitGoogleTest(&argc, argv);
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    return RUN_ALL_TESTS();
+} 
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/utils/test/counter_test.cc b/src/common/test/counter_test.cc
similarity index 95%
rename from src/utils/test/counter_test.cc
rename to src/common/test/counter_test.cc
index 526f9cae6..598c98f04 100644
--- a/src/utils/test/counter_test.cc
+++ b/src/common/test/counter_test.cc
@@ -11,7 +11,7 @@
 
 #include "common/mutex.h"
 #include "common/thread_pool.h"
-#include "counter.h"
+#include "common/counter.h"
 
 namespace tera {
 
@@ -69,7 +69,7 @@ TEST(CounterTest, Basic) {
     Counter counter;
     ThreadPool* pool = new ThreadPool(thread_num);
     for (int i = 0; i < thread_num / 4; ++i) {
-        std::function<void ()> callback =
+        std::function<void (int64_t)> callback =
             std::bind(&callback_add, &counter);
         pool->AddTask(callback);
 
@@ -99,7 +99,7 @@ TEST(CounterTest, Clear) {
     Counter counter;
     ThreadPool* pool = new ThreadPool(thread_num);
     for (int i = 0; i < thread_num / 3; ++i) {
-        std::function<void ()> callback =
+        std::function<void (int64_t)> callback =
             std::bind(&callback_add, &counter);
         pool->AddTask(callback);
 
diff --git a/src/common/test/log_cleaner_test.cc b/src/common/test/log_cleaner_test.cc
new file mode 100644
index 000000000..8fbf3ef9f
--- /dev/null
+++ b/src/common/test/log_cleaner_test.cc
@@ -0,0 +1,246 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+
+#include <fcntl.h>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <sys/stat.h>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "common/file/file_path.h"
+#include "common/log/log_cleaner.h"
+#include "common/this_thread.h"
+#include "utils/utils_cmd.h"
+
+DECLARE_string(log_dir);
+DECLARE_string(tera_log_prefix);
+DECLARE_string(tera_leveldb_log_path);
+DECLARE_int64(tera_info_log_clean_period_second);
+DECLARE_int64(tera_info_log_expire_second);
+
+using namespace std::placeholders;
+
+namespace common {
+
+static size_t g_touch_file_count = 0;
+static size_t g_expect_clean_count = 0;
+const static int64_t kTestLogExpireSecond = 5;
+
+// Creates (or truncates) dir_path/filename and returns its path; need_close=false keeps the fd open.
+std::string TouchFile(const std::string& dir_path, const std::string& filename, bool need_close = true) {
+    std::string full_path = dir_path + "/" + filename;
+    int fd = open(full_path.c_str(), O_RDWR | O_CREAT | O_TRUNC, 0777);
+    // 0 is a valid descriptor; only a negative return value means open() failed
+    if (need_close && fd >= 0) { close(fd); }
+    ++g_touch_file_count;
+    return full_path;
+}
+
+void SetupTestEnv() {  // points the log flags at ./test_log and fills it with sample files
+    std::string leveldb_log_prefix = "leveldb.log";
+    FLAGS_tera_leveldb_log_path = "./log/" + leveldb_log_prefix;
+    // fake options, change log dir for cleaner
+    FLAGS_log_dir = "./test_log";
+    FLAGS_tera_log_prefix = "tera_test";
+    FLAGS_tera_info_log_clean_period_second = 1;
+    FLAGS_tera_info_log_expire_second = kTestLogExpireSecond;
+    std::string other_prefix = "tera_other_prefix";
+    
+    // make test log dir, ignore failure; then restart both global tallies
+    mkdir(FLAGS_log_dir.c_str(), 0777);
+    g_touch_file_count = 0;
+    g_expect_clean_count = 0;
+    
+    // touch file unlinked
+    std::string unlinked_info = FLAGS_tera_log_prefix + ".INFO.unlink";
+    TouchFile(FLAGS_log_dir, unlinked_info);
+    std::string unlinked_warn = FLAGS_tera_log_prefix + ".WARNING.unlink";
+    TouchFile(FLAGS_log_dir, unlinked_warn);
+    std::string unlinked_err = FLAGS_tera_log_prefix + ".stderr.unlink";
+    TouchFile(FLAGS_log_dir, unlinked_err);
+    g_expect_clean_count += 3; // expect clean unlinked file
+    
+    // touch file linked
+    std::string linked_info = FLAGS_tera_log_prefix + ".INFO.linked";
+    std::string info_link_path = FLAGS_log_dir + "/" + FLAGS_tera_log_prefix + ".INFO";
+    std::string linked_info_path = TouchFile(FLAGS_log_dir, linked_info);
+    // link full path
+    remove(info_link_path.c_str());
+    symlink(linked_info_path.c_str(), info_link_path.c_str());
+    ++g_touch_file_count;
+    
+    std::string linked_warn = FLAGS_tera_log_prefix + ".WARNING.linked";
+    std::string warn_link_path = FLAGS_log_dir + "/" + FLAGS_tera_log_prefix + ".WARNING";
+    TouchFile(FLAGS_log_dir, linked_warn);
+    // link filename only
+    remove(warn_link_path.c_str());
+    symlink(linked_warn.c_str(), warn_link_path.c_str());
+    ++g_touch_file_count;
+    
+    // touch files and leave their descriptors open (need_close = false)
+    std::string opened_info = FLAGS_tera_log_prefix + ".INFO.opened";
+    TouchFile(FLAGS_log_dir, opened_info, false);
+    std::string opened_warn = FLAGS_tera_log_prefix + ".WARNING.opened";
+    TouchFile(FLAGS_log_dir, opened_warn, false);
+    std::string opened_err = FLAGS_tera_log_prefix + ".stderr.opened";
+    TouchFile(FLAGS_log_dir, opened_err, false);
+    
+    // touch file not start with prefix
+    std::string other_pre_info = other_prefix + ".INFO.otherpre";
+    TouchFile(FLAGS_log_dir, other_pre_info);
+    std::string other_pre_warn = other_prefix + ".WARNING.otherpre";
+    TouchFile(FLAGS_log_dir, other_pre_warn);
+    std::string other_pre_err = other_prefix + ".stderr.otherpre";
+    TouchFile(FLAGS_log_dir, other_pre_err);
+    
+    // touch file start with leveldb_log_prefix and open one of them
+    std::string ldb_pre_info = leveldb_log_prefix;
+    TouchFile(FLAGS_log_dir, ldb_pre_info, false);
+    std::string ldb_pre_info_lod = leveldb_log_prefix + ".old";
+    TouchFile(FLAGS_log_dir, ldb_pre_info_lod);
+    g_expect_clean_count++; // expect clean leveldb_log_prefix.old
+}
+
+TEST(LogCleanerTest, InitialStatus) {  // state of a freshly obtained, not yet started cleaner
+    // ensure stop firstly
+    LogCleaner::StopCleaner();
+    ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL);
+    SetupTestEnv();
+    LogCleaner *cleaner = LogCleaner::GetInstance();
+    
+    ASSERT_FALSE(cleaner == NULL);
+    ASSERT_FALSE(cleaner->IsRunning());
+    ASSERT_TRUE(cleaner->CheckOptions());
+    ASSERT_FALSE(cleaner->stop_);
+}
+
+TEST(LogCleanerTest, Basic) {  // runs the cleaner over the sample files and counts what is left
+    SetupTestEnv();
+    // get instance
+    LogCleaner *cleaner = LogCleaner::GetInstance();
+    ASSERT_FALSE(cleaner == NULL);
+    
+    // check log dir before clean
+    std::vector<std::string> reserved_file_list;
+    bool list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list);
+    ASSERT_TRUE(list_ret);
+    
+    // print filelist before clean
+    std::cout << "before clean. file count: " << reserved_file_list.size() << std::endl;
+    for (size_t i = 0; i < reserved_file_list.size(); ++i) {
+        std::cout << reserved_file_list[i] << std::endl;
+    }
+    ASSERT_EQ(reserved_file_list.size(), g_touch_file_count);
+    
+    // start the cleaner
+    cleaner->Start();
+    ASSERT_TRUE(cleaner->IsRunning());
+    ASSERT_FALSE(cleaner->stop_);
+    
+    {
+        // block on bg_cond_ until it is signalled (first scheduled clean)
+        MutexLock l(&(cleaner->mutex_), "log cleaner unittest");
+        cleaner->bg_cond_.Wait();
+    }
+    
+    // check clean result
+    reserved_file_list.clear();
+    list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list);
+    ASSERT_TRUE(list_ret);
+    // nothing is expected to be removed yet
+    std::cout << "first clean. expect clean nothing since not expire yet" << std::endl;
+    EXPECT_EQ(reserved_file_list.size(), g_touch_file_count);
+    
+    {
+        // block on bg_cond_ until it is signalled (second scheduled clean)
+        MutexLock l(&(cleaner->mutex_), "log cleaner unittest");
+        cleaner->bg_cond_.Wait();
+    }
+    // check clean result
+    reserved_file_list.clear();
+    list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list);
+    ASSERT_TRUE(list_ret);
+    std::cout << "second clean. expect clean nothing since not expire yet" << std::endl;
+    EXPECT_EQ(reserved_file_list.size(), g_touch_file_count);
+    
+    for (size_t i = 3; i < kTestLogExpireSecond + 5; ++i) {
+        // keep waiting for further cleans, past the expire time configured in SetupTestEnv()
+        std::cout << "wait " << i << " times clean." << std::endl;
+        MutexLock l(&(cleaner->mutex_), "log cleaner unittest");
+        cleaner->bg_cond_.Wait();
+    }
+    // check clean result
+    reserved_file_list.clear();
+    list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list);
+    ASSERT_TRUE(list_ret);
+    std::cout << "after " << kTestLogExpireSecond 
+              << " times clean. expect clean " << g_expect_clean_count
+              << " logs: " << std::endl;
+    // print filelist after clean
+    for (size_t i = 0; i < reserved_file_list.size(); ++i) {
+        std::cout << reserved_file_list[i] << std::endl;
+    }
+    EXPECT_EQ(reserved_file_list.size(), g_touch_file_count - g_expect_clean_count);
+    
+    // stop cleaner
+    cleaner->Stop();
+    ASSERT_FALSE(cleaner->IsRunning());
+    ASSERT_TRUE(cleaner->stop_);
+    ASSERT_FALSE(cleaner == NULL);
+    
+    // destroy
+    LogCleaner::StopCleaner();
+    ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL);
+}
+
+TEST(LogCleanerTest, MultiStartAndStop) {  // repeated Start()/Stop() calls keep IsRunning() consistent
+    // ensure stop firstly
+    LogCleaner::StopCleaner();
+    ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL);
+    
+    SetupTestEnv();
+    // get instance
+    LogCleaner *cleaner = LogCleaner::GetInstance();
+    
+    // stop while not start
+    cleaner->Stop();
+    ASSERT_FALSE(cleaner->IsRunning());
+    ASSERT_TRUE(cleaner->stop_);
+    
+    // start three times
+    cleaner->Start();
+    ASSERT_TRUE(cleaner->IsRunning());
+    cleaner->Start();
+    ASSERT_TRUE(cleaner->IsRunning());
+    cleaner->Start();
+    ASSERT_TRUE(cleaner->IsRunning());
+    
+    {
+        // block on bg_cond_ until it is signalled (one scheduled clean)
+        MutexLock l(&(cleaner->mutex_), "log cleaner unittest");
+        cleaner->bg_cond_.Wait();
+    }
+    
+    // stop twice
+    cleaner->Stop();
+    ASSERT_FALSE(cleaner->IsRunning());
+    cleaner->Stop();
+    ASSERT_FALSE(cleaner->IsRunning());
+    
+    // start again
+    cleaner->Start();
+    ASSERT_TRUE(cleaner->IsRunning());
+    
+    // stop and destroy
+    LogCleaner::StopCleaner();
+    ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL);
+}
+
+} // end namespace common
+
diff --git a/src/common/test/metric_counter_test.cc b/src/common/test/metric_counter_test.cc
new file mode 100644
index 000000000..00062b8ff
--- /dev/null
+++ b/src/common/test/metric_counter_test.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+ 
+#include "common/metric/metric_counter.h" 
+ 
+namespace tera { 
+ 
+// Fixture that builds a two-pair label string shared by the MetricCounter tests.
+class MetricCounterTest : public ::testing::Test {
+public:
+    virtual void SetUp() {
+        label_str_ = LabelStringBuilder()
+                .Append("test_label1", "test_value1")
+                .Append("test_label2", "test_value2")
+                .ToString();
+    }
+    virtual void TearDown() {}
+
+protected:  // TEST_F bodies are subclasses of the fixture, so members must not be private
+    std::string label_str_;
+};
+
+TEST_F(MetricCounterTest, RegisterTest) {  // a counter is registered only while it is alive
+    MetricId test_id;
+    {
+        // with name and labels
+        MetricCounter counter1("counter1", label_str_);
+        test_id = counter1.metric_id_;
+        
+        EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(counter1.metric_id_))
+                << "metric_id " << counter1.metric_id_.ToString() << std::endl;
+        EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id))
+                << "metric_id " << test_id.ToString() << std::endl;
+        EXPECT_TRUE(counter1.IsRegistered());
+    }
+    EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(test_id))
+            << "metric_id " << test_id.ToString() << std::endl;
+    
+    {
+        // with name only
+        MetricCounter counter2("counter2", {}, true);
+        test_id = counter2.metric_id_;
+        
+        EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(counter2.metric_id_))
+                << "metric_id " << counter2.metric_id_.ToString() << std::endl;
+        EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id)) 
+                << "metric_id " << test_id.ToString() << std::endl;
+        EXPECT_TRUE(counter2.IsRegistered());
+    }
+    EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(test_id))
+            << "metric_id " << test_id.ToString() << std::endl;
+    
+    // with illegal label string
+    ASSERT_THROW(MetricCounter("counter3", "illegal_label_string", {}, true), std::invalid_argument);
+    
+    // with empty name
+    ASSERT_THROW(MetricCounter("", label_str_, {}, true), std::invalid_argument);
+    ASSERT_THROW(MetricCounter("", {}, true), std::invalid_argument);
+}
+ 
+TEST_F(MetricCounterTest, CollectTest) {  // Refresh() zeroes the periodic counter only
+    MetricCounter periodic_counter("periodic", label_str_, {}, true);
+    MetricCounter nonperiodic_counter("nonperiodic", label_str_, {}, false);
+    
+    for (size_t i = 0; i < 3; ++i) {
+        periodic_counter.Inc();
+        nonperiodic_counter.Inc();
+    }
+    EXPECT_EQ(periodic_counter.Get(), 3);
+    EXPECT_EQ(nonperiodic_counter.Get(), 3);
+    
+    // do collect
+    CollectorReportPublisher::GetInstance().Refresh();
+    
+    EXPECT_EQ(periodic_counter.Get(), 0);
+    EXPECT_EQ(nonperiodic_counter.Get(), 3);
+    
+    periodic_counter.Inc();
+    nonperiodic_counter.Inc();
+    EXPECT_EQ(periodic_counter.Get(), 1);
+    EXPECT_EQ(nonperiodic_counter.Get(), 4);
+}
+ 
+} // end namespace tera 
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/test/metric_http_server_test.cc b/src/common/test/metric_http_server_test.cc
new file mode 100644
index 000000000..c911b438e
--- /dev/null
+++ b/src/common/test/metric_http_server_test.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+ 
+#include "common/metric/metric_counter.h" 
+#include "common/metric/metric_http_server.h" 
+#include "common/metric/collector_report.h" 
+#include "common/base/string_ext.h" 
+
+namespace tera { 
+ 
+class MetricHttpServerTest : public ::testing::Test { 
+public:
+    virtual void SetUp() {
+        // Each test starts with a "counter" metric (created with SubscriberType::LATEST) set to 1
+        test_counter = new MetricCounter("counter", {SubscriberType::LATEST});
+        server = new MetricHttpServer;
+        test_counter->Set(1);
+    }
+    virtual void TearDown() {
+        delete test_counter;
+        delete server;
+    }
+
+private:
+    MetricCounter* test_counter;  // allocated in SetUp(), freed in TearDown()
+    MetricHttpServer* server;     // allocated in SetUp(), freed in TearDown()
+}; 
+ 
+TEST_F(MetricHttpServerTest, BuildType) {
+    // The assertions expect every BuildType call to append one "# TYPE <name> <type>" line.
+    std::string response;
+    ResponseBodyBuilder::BuildType(&response, "good", "gauge");
+    EXPECT_STREQ(response.c_str(), "# TYPE good gauge\n");
+    ResponseBodyBuilder::BuildType(&response, "bad", "summary");
+    EXPECT_STREQ(response.c_str(), "# TYPE good gauge\n# TYPE bad summary\n");
+}
+
+TEST_F(MetricHttpServerTest, BuildHelp) {
+    // The assertions expect every BuildHelp call to append one "# HELP <name> <text>" line.
+    std::string response;
+    ResponseBodyBuilder::BuildHelp(&response, "good", "good");
+    EXPECT_STREQ(response.c_str(), "# HELP good good\n");
+    ResponseBodyBuilder::BuildHelp(&response, "bad", "bad");
+    EXPECT_STREQ(response.c_str(), "# HELP good good\n# HELP bad bad\n");
+}
+
+TEST_F(MetricHttpServerTest, BuildMetricItem) {
+    // Renders the fixture's "counter" entry after each of two refreshes and expects every
+    // BuildMetricItem call to append "counter{value_type=\"Latest\"} <value> <timestamp>\n".
+    CollectorReportPublisher::GetInstance().Refresh();
+    auto report = CollectorReportPublisher::GetInstance().GetSubscriberReport();
+    std::string body;
+    int64_t time_stamp = 0;  // stays well-defined even if the report has no "counter" entry
+    bool found = false;
+    for (const auto& item : *report) {
+        if (item.first.GetName() == "counter") {
+            ResponseBodyBuilder::BuildMetricItem(&body, item.first, item.second);
+            time_stamp = item.second.Time();
+            found = true;
+        }
+    }
+    ASSERT_TRUE(found) << "metric \"counter\" missing from subscriber report";
+    std::string expect_body = "counter{value_type=\"Latest\"} 1 " + std::to_string(time_stamp) + "\n";
+    EXPECT_EQ(body, expect_body);
+    EXPECT_EQ(test_counter->Get(), 0);
+    test_counter->Set(2);
+
+    CollectorReportPublisher::GetInstance().Refresh();
+    report = CollectorReportPublisher::GetInstance().GetSubscriberReport();
+    found = false;
+    for (const auto& item : *report) {
+        if (item.first.GetName() == "counter") {
+            ResponseBodyBuilder::BuildMetricItem(&body, item.first, item.second);
+            time_stamp = item.second.Time();
+            found = true;
+        }
+    }
+    ASSERT_TRUE(found) << "metric \"counter\" missing from subscriber report";
+    expect_body += "counter{value_type=\"Latest\"} 2 " + std::to_string(time_stamp) + "\n";
+    EXPECT_EQ(body, expect_body);
+}
+
+TEST_F(MetricHttpServerTest, GetResponseBody) {
+    // Expects the rendered body to carry "# HELP", "# TYPE" and sample lines for "counter".
+    CollectorReportPublisher::GetInstance().Refresh();
+    int64_t timestamp = CollectorReportPublisher::GetInstance().GetCollectorReport()->timestamp_ms;
+    std::string body = server->GetResponseBody();
+    std::vector<std::string> splited_string;
+    SplitString(body, "\n", &splited_string);
+    bool find_counter = false;
+    for (size_t idx = 0; idx < splited_string.size(); ++idx) {
+        if (splited_string[idx].substr(0, 8) == "counter{") {
+            find_counter = true;
+            // The two lines before the sample are read below; stop here rather than let idx - 2 wrap.
+            ASSERT_GE(idx, 2u) << "sample line is not preceded by HELP and TYPE lines";
+            EXPECT_STREQ(splited_string[idx - 2].c_str(), "# HELP counter counter");
+            EXPECT_STREQ(splited_string[idx - 1].c_str(), "# TYPE counter gauge");
+            std::string expected_line = "counter{value_type=\"Latest\"} 1 " + std::to_string(timestamp);
+            EXPECT_EQ(expected_line, splited_string[idx]);
+        }
+    }
+    EXPECT_TRUE(find_counter);
+    EXPECT_EQ(test_counter->Get(), 0);
+    test_counter->Set(19);
+    find_counter = false;
+
+    CollectorReportPublisher::GetInstance().Refresh();
+    timestamp = CollectorReportPublisher::GetInstance().GetCollectorReport()->timestamp_ms;
+    body = server->GetResponseBody();
+    splited_string.clear();
+    SplitString(body, "\n", &splited_string);
+    for (size_t idx = 0; idx < splited_string.size(); ++idx) {
+        if (splited_string[idx].substr(0, 8) == "counter{") {
+            find_counter = true;
+            ASSERT_GE(idx, 2u) << "sample line is not preceded by HELP and TYPE lines";
+            EXPECT_STREQ(splited_string[idx - 2].c_str(), "# HELP counter counter");
+            EXPECT_STREQ(splited_string[idx - 1].c_str(), "# TYPE counter gauge");
+            std::string expected_line = "counter{value_type=\"Latest\"} 19 " + std::to_string(timestamp);
+            EXPECT_EQ(expected_line, splited_string[idx]);
+        }
+    }
+
+    EXPECT_TRUE(find_counter);
+    EXPECT_EQ(test_counter->Get(), 0);
+}
+} // end namespace tera
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/test/metric_id_test.cc b/src/common/test/metric_id_test.cc
new file mode 100644
index 000000000..ad2795073
--- /dev/null
+++ b/src/common/test/metric_id_test.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+ 
+#include "common/metric/metric_id.h" 
+ 
+namespace tera { 
+ 
+static const std::string kTestMetricName = "test_name";
+ 
+class MetricIdTest : public ::testing::Test {
+public:
+    // Builds one MetricId for each combination of (name given?, labels given?).
+    virtual void SetUp() {
+        MetricLabels labels;
+        labels["test_label1"] = "test_value1";
+        labels["test_label2"] = "test_value2";
+        // String form the tests expect for the two labels above.
+        label_str_ = "test_label1:test_value1,test_label2:test_value2";
+        empty_id_ = new MetricId();
+        id_with_name_ = new MetricId(kTestMetricName);
+        id_with_label_ = new MetricId("", labels);
+        id_with_name_and_label_ = new MetricId(kTestMetricName, labels);
+    }
+
+    virtual void TearDown() {
+        delete empty_id_;
+        delete id_with_name_;
+        delete id_with_label_;
+        delete id_with_name_and_label_;
+    }
+
+private:
+    MetricId *empty_id_;
+    MetricId *id_with_name_;
+    MetricId *id_with_label_;
+    MetricId *id_with_name_and_label_;
+    std::string label_str_;
+};
+
+TEST_F(MetricIdTest, BasicTest) {
+    // Default-constructed id: expected to be invalid, with every accessor coming back empty.
+    MetricId& blank = *empty_id_;
+    ASSERT_FALSE(blank.IsValid());
+    ASSERT_TRUE(blank.GetName().empty());
+    ASSERT_TRUE(blank.GetLabelMap().empty());
+    ASSERT_TRUE(blank.ToString().empty());
+    ASSERT_TRUE(blank.GetLabel("whatever_label").empty());
+    ASSERT_FALSE(blank.ExistLabel("whatever_label"));
+    ASSERT_FALSE(blank.CheckLabel("whatever_label", "whatever_value"));
+    // Name only: valid, and its string form is just the name.
+    MetricId& named = *id_with_name_;
+    ASSERT_TRUE(named.IsValid());
+    ASSERT_STREQ(named.GetName().c_str(), kTestMetricName.c_str());
+    ASSERT_TRUE(named.GetLabelMap().empty());
+    ASSERT_STREQ(named.ToString().c_str(), kTestMetricName.c_str());
+    ASSERT_TRUE(named.GetLabel("whatever_label").empty());
+    ASSERT_FALSE(named.ExistLabel("whatever_label"));
+    ASSERT_FALSE(named.CheckLabel("whatever_label", "whatever_value"));
+
+    // Name plus two labels: string form is name, delimiter, then the label string.
+    MetricId& full = *id_with_name_and_label_;
+    ASSERT_TRUE(full.IsValid());
+    ASSERT_STREQ(full.GetName().c_str(), kTestMetricName.c_str());
+    ASSERT_EQ(full.GetLabelMap().size(), 2);
+    std::string expected_id_str = kTestMetricName + kNameLabelsDelimiter + label_str_;
+    ASSERT_STREQ(full.ToString().c_str(), expected_id_str.c_str());
+    ASSERT_STREQ(full.GetLabel("test_label1").c_str(), "test_value1");
+    ASSERT_TRUE(full.ExistLabel("test_label1"));
+    ASSERT_TRUE(full.CheckLabel("test_label1", "test_value1"));
+
+    ASSERT_TRUE(full.GetLabel("not_exist_label").empty());
+    ASSERT_FALSE(full.ExistLabel("not_exist_label"));
+    ASSERT_FALSE(full.CheckLabel("not_exist_label", "test_value1"));
+    ASSERT_FALSE(full.CheckLabel("test_label1", "test_value2"));
+    // Labels without a name do not make a valid id.
+    ASSERT_FALSE(id_with_label_->IsValid());
+}
+ 
+TEST_F(MetricIdTest, CopyTest) {
+    MetricId& original = *id_with_name_and_label_;
+    // Copy construction must yield an id that matches the original in every accessor.
+    MetricId cloned(original);
+    ASSERT_TRUE(cloned.IsValid());
+    ASSERT_STREQ(cloned.GetName().c_str(), original.GetName().c_str());
+    ASSERT_EQ(cloned.GetLabelMap().size(), original.GetLabelMap().size());
+    ASSERT_STREQ(cloned.ToString().c_str(), original.ToString().c_str());
+    ASSERT_STREQ(cloned.GetLabel("test_label1").c_str(), "test_value1");
+    ASSERT_TRUE(cloned.ExistLabel("test_label1"));
+    ASSERT_TRUE(cloned.CheckLabel("test_label1", "test_value1"));
+    ASSERT_TRUE(cloned.GetLabel("not_exist_label").empty());
+    ASSERT_FALSE(cloned.ExistLabel("not_exist_label"));
+    ASSERT_FALSE(cloned.CheckLabel("not_exist_label", "test_value1"));
+    ASSERT_FALSE(cloned.CheckLabel("test_label1", "test_value2"));
+    ASSERT_TRUE(cloned == original);
+
+    // Assignment onto a default-constructed id must give the same result.
+    MetricId assigned;
+    assigned = original;
+    ASSERT_TRUE(assigned.IsValid());
+    ASSERT_STREQ(assigned.GetName().c_str(), original.GetName().c_str());
+    ASSERT_EQ(assigned.GetLabelMap().size(), original.GetLabelMap().size());
+    ASSERT_STREQ(assigned.ToString().c_str(), original.ToString().c_str());
+    ASSERT_STREQ(assigned.GetLabel("test_label1").c_str(), "test_value1");
+    ASSERT_TRUE(assigned.ExistLabel("test_label1"));
+    ASSERT_TRUE(assigned.CheckLabel("test_label1", "test_value1"));
+
+    ASSERT_TRUE(assigned.GetLabel("not_exist_label").empty());
+    ASSERT_FALSE(assigned.ExistLabel("not_exist_label"));
+    ASSERT_FALSE(assigned.CheckLabel("not_exist_label", "test_value1"));
+    ASSERT_FALSE(assigned.CheckLabel("test_label1", "test_value2"));
+    ASSERT_TRUE(assigned == original);
+}
+
+TEST_F(MetricIdTest, BuildTest) {
+    MetricId parsed;
+
+    // Two labels: the builder output must equal the fixture's label string and parse cleanly.
+    const std::string two_labels = LabelStringBuilder()
+        .Append("test_label1", "test_value1")
+        .Append("test_label2", "test_value2")
+        .ToString();
+    ASSERT_STREQ(two_labels.c_str(), label_str_.c_str());
+    ASSERT_TRUE(MetricId::ParseFromString(kTestMetricName, two_labels, &parsed))
+        << "Parse label string: " << two_labels << ", failed" << std::endl;
+    ASSERT_TRUE(parsed.IsValid());
+    ASSERT_STREQ(parsed.GetName().c_str(), kTestMetricName.c_str());
+    ASSERT_EQ(parsed.GetLabelMap().size(), id_with_name_and_label_->GetLabelMap().size());
+    std::string expected_id_str = kTestMetricName + kNameLabelsDelimiter + two_labels;
+    ASSERT_STREQ(parsed.ToString().c_str(), expected_id_str.c_str());
+
+    // One label: no separator is expected in the built string.
+    const std::string one_label = LabelStringBuilder()
+        .Append("test_label1", "test_value1")
+        .ToString();
+    ASSERT_STREQ(one_label.c_str(), "test_label1:test_value1");
+    ASSERT_TRUE(MetricId::ParseFromString(kTestMetricName, one_label, &parsed))
+        << "Parse label string: " << one_label << ", failed" << std::endl;
+    ASSERT_TRUE(parsed.IsValid());
+    ASSERT_STREQ(parsed.GetName().c_str(), kTestMetricName.c_str());
+    ASSERT_EQ(parsed.GetLabelMap().size(), 1);
+    expected_id_str = kTestMetricName + kNameLabelsDelimiter + one_label;
+    ASSERT_STREQ(parsed.ToString().c_str(), expected_id_str.c_str());
+
+    // No labels: the id's string form is expected to be just the metric name.
+    const std::string no_labels = LabelStringBuilder().ToString();
+    ASSERT_STREQ(no_labels.c_str(), "");
+    ASSERT_TRUE(MetricId::ParseFromString(kTestMetricName, no_labels, &parsed));
+    ASSERT_TRUE(parsed.IsValid());
+    ASSERT_STREQ(parsed.GetName().c_str(), kTestMetricName.c_str());
+    ASSERT_TRUE(parsed.GetLabelMap().empty());
+    ASSERT_STREQ(parsed.ToString().c_str(), kTestMetricName.c_str());
+
+    // Malformed label strings must all be rejected.
+    const std::vector<std::string> malformed = {
+        "haha:hehe,,,,",
+        "haha:hehe,hoho",
+        "haha:hehe,hoho:heihei,",
+        "haha",
+        ",lalala",
+    };
+    for (const std::string& bad_labels : malformed) {
+        ASSERT_FALSE(MetricId::ParseFromString(kTestMetricName, bad_labels, &parsed));
+    }
+}
+ 
+} // end namespace tera 
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/test/metrics_test.cc b/src/common/test/metrics_test.cc
new file mode 100644
index 000000000..7bc5e9abb
--- /dev/null
+++ b/src/common/test/metrics_test.cc
@@ -0,0 +1,187 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+ 
+#include "common/metric/metric_counter.h" 
+#include "common/metric/hardware_collectors.h"
+#include "common/metric/collector_report_publisher.h"
+#include "common/this_thread.h" 
+
+DECLARE_int64(tera_hardware_collect_period_second);
+ 
+namespace tera { 
+ 
+class MetricsTest : public ::testing::Test {
+public:
+    virtual void SetUp() {
+        // shorter period for test
+        FLAGS_tera_hardware_collect_period_second = 1;
+        CollectorReportPublisher::GetInstance().AddHardwareCollectors();
+        // Default labels for the tests; RegisterTest overwrites "test_label2" as it goes.
+        label_map_["test_label1"] = "test_value1";
+        label_map_["test_label2"] = "test_value2";
+    }
+    // Empties the publisher singleton's collector map and the fixture's labels.
+    virtual void TearDown() {
+        CollectorReportPublisher::GetInstance().collectors_.clear();
+        label_map_.clear();
+    }
+
+private:
+    MetricLabels label_map_;
+};
+
+// Prints the id of every collector currently held by the publisher singleton to stdout.
+static void PrintCollectorReportPublisher() {
+    std::cout << "Print Metric Registry: " << std::endl;
+    // Only the keys (metric ids) are printed; the collectors themselves are not touched.
+    for (auto& registered : CollectorReportPublisher::GetInstance().collectors_) {
+        std::cout << registered.first.ToString() << std::endl;
+    }
+}
+
+TEST_F(MetricsTest, RegisterTest) {
+    // Exercises AddCollector / HasCollector / DeleteCollector on the publisher singleton.
+    // hardware metrics (registered by the fixture's SetUp)
+    ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstCpuMetricName)));
+    ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstMemMetricName)));
+    ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstNetRXMetricName)));
+    ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstNetTXMetricName)));
+
+    bool ret = false;
+    Counter* test_counters = new Counter[5];
+    // register a counter
+    MetricId test_id_1("test_counter", label_map_);
+    ret = CollectorReportPublisher::GetInstance().AddCollector(
+            test_id_1, std::unique_ptr<Collector>(new CounterCollector(&test_counters[0])));
+    EXPECT_TRUE(ret);
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_1));
+    PrintCollectorReportPublisher();
+
+    // register a counter with different name, backed by its own slot of test_counters
+    MetricId test_id_2("test_counter_2", label_map_);
+    ret = CollectorReportPublisher::GetInstance().AddCollector(
+            test_id_2, std::unique_ptr<Collector>(new CounterCollector(&test_counters[1])));
+    EXPECT_TRUE(ret);
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_2));
+    PrintCollectorReportPublisher();
+
+    // register a counter with name only
+    ret = CollectorReportPublisher::GetInstance().AddCollector(
+            MetricId("test_counter3"), std::unique_ptr<Collector>(new CounterCollector(&test_counters[2])));
+    EXPECT_TRUE(ret);
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId("test_counter3")));
+    PrintCollectorReportPublisher();
+
+    // register a counter with same name and different labels
+    label_map_["test_label2"] = "other_label_value";
+    MetricId test_id_4("test_counter", label_map_);
+    ret = CollectorReportPublisher::GetInstance().AddCollector(
+            test_id_4, std::unique_ptr<Collector>(new CounterCollector(&test_counters[3])));
+    EXPECT_TRUE(ret);
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_4));
+    PrintCollectorReportPublisher();
+
+    // register a counter with same id: rejected, the existing registration stays
+    ret = CollectorReportPublisher::GetInstance().AddCollector(
+            test_id_1, std::unique_ptr<Collector>(new CounterCollector(&test_counters[4])));
+    EXPECT_FALSE(ret);
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_1));
+    PrintCollectorReportPublisher();
+    ret = CollectorReportPublisher::GetInstance().AddCollector(
+            MetricId("test_counter3"), std::unique_ptr<Collector>(new CounterCollector(&test_counters[4])));
+    EXPECT_FALSE(ret);
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId("test_counter3")));
+    PrintCollectorReportPublisher();
+
+    // unregister
+    ret = CollectorReportPublisher::GetInstance().DeleteCollector(test_id_1);
+    EXPECT_TRUE(ret);
+    EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(test_id_1));
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_2));
+
+    ret = CollectorReportPublisher::GetInstance().DeleteCollector(MetricId("test_counter3"));
+    EXPECT_TRUE(ret);
+    EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(MetricId("test_counter3")));
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_2));
+
+    MetricId not_registered_id("not_registered_name", label_map_);
+    ret = CollectorReportPublisher::GetInstance().DeleteCollector(not_registered_id);
+    EXPECT_FALSE(ret);
+    label_map_["test_label2"] = "not_registered_value";
+    MetricId not_registered_id_2("test_counter", label_map_);
+    ret = CollectorReportPublisher::GetInstance().DeleteCollector(not_registered_id_2);
+    EXPECT_FALSE(ret);
+    ret = CollectorReportPublisher::GetInstance().DeleteCollector(MetricId("not_registered_name"));
+    EXPECT_FALSE(ret);
+    // Drop the collectors that still point into test_counters before the array is freed.
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().DeleteCollector(test_id_2));
+    EXPECT_TRUE(CollectorReportPublisher::GetInstance().DeleteCollector(test_id_4));
+    delete[] test_counters;
+}
+
+TEST_F(MetricsTest, ReportTest) {
+    // check report cache
+    int64_t value = 0;
+    
+    // register 2 counters that differ only in the last constructor argument (true vs. false)
+    std::string label_str = LabelStringBuilder()
+            .Append("test_label1", "test_value1")
+            .Append("test_label2", "test_value2")
+            .ToString();
+    MetricCounter periodic_counter("periodic", label_str, {}, true);
+    MetricCounter nonperiodic_counter("nonperiodic", label_str, {}, false);
+    
+    for (size_t i = 0; i < 3; ++i) {
+        periodic_counter.Inc();
+        nonperiodic_counter.Inc();
+    }
+    EXPECT_EQ(periodic_counter.Get(), 3);
+    EXPECT_EQ(nonperiodic_counter.Get(), 3);
+    
+    // do collect
+    ThisThread::Sleep(10);
+    
+    CollectorReportPublisher::GetInstance().Refresh();
+    std::shared_ptr<CollectorReport> report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    // expected after the refresh: the `true` counter reads 0 again, the `false` one keeps 3
+    EXPECT_EQ(periodic_counter.Get(), 0);
+    EXPECT_EQ(nonperiodic_counter.Get(), 3);
+    
+    // check report: one entry per registered collector, and both counters reported as 3
+    EXPECT_EQ(report->report.size(), CollectorReportPublisher::GetInstance().collectors_.size());
+    value = report->FindMetricValue("periodic", label_str);
+    EXPECT_EQ(value, 3);
+    value = report->FindMetricValue("nonperiodic", label_str);
+    EXPECT_EQ(value, 3);
+    
+    // change counter value
+    periodic_counter.Inc();
+    nonperiodic_counter.Dec();
+    EXPECT_EQ(periodic_counter.Get(), 1);
+    EXPECT_EQ(nonperiodic_counter.Get(), 2);
+    
+    // report again
+    CollectorReportPublisher::GetInstance().Refresh();
+    report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    EXPECT_EQ(periodic_counter.Get(), 0);
+    EXPECT_EQ(nonperiodic_counter.Get(), 2);
+    // the second report is expected to carry the values the counters held just before it
+    value = report->FindMetricValue("periodic", label_str);
+    EXPECT_EQ(value, 1);
+    value = report->FindMetricValue("nonperiodic", label_str);
+    EXPECT_EQ(value, 2);
+}
+ 
+} // end namespace tera
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/test/profiler_test.cc b/src/common/test/profiler_test.cc
new file mode 100644
index 000000000..623d1c0f4
--- /dev/null
+++ b/src/common/test/profiler_test.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include <string>
+
+#include <gperftools/profiler.h>
+#include <gperftools/heap-profiler.h>
+
+#include "gtest/gtest.h"
+ 
+#include "common/cpu_profiler.h" 
+#include "common/heap_profiler.h" 
+#include "common/this_thread.h" 
+ 
+namespace tera { 
+ 
+class ProfilerTest : public ::testing::Test {
+public:
+    virtual void SetUp() {}
+    
+    virtual void TearDown() {}
+
+private:
+    CpuProfiler cpu_profiler_;    // default-constructed; tests read enable_/interval_/profiler_file_
+    HeapProfiler heap_profiler_;  // default-constructed; tests read enable_/interval_/profiler_file_
+};
+
+TEST_F(ProfilerTest, SetEnableTest) {
+    ProfilerState ps;  // filled by gperftools' ProfilerGetCurrentState()
+    EXPECT_FALSE(cpu_profiler_.enable_);
+    EXPECT_FALSE(heap_profiler_.enable_);
+    ProfilerGetCurrentState(&ps);
+    EXPECT_FALSE(ps.enabled);
+    EXPECT_FALSE(IsHeapProfilerRunning());
+    // Turn both profilers on, each with an output file name.
+    cpu_profiler_.SetProfilerFile("Cpu")
+                 .SetEnable(true);
+
+    heap_profiler_.SetProfilerFile("Heap")
+                  .SetEnable(true);
+
+    EXPECT_TRUE(cpu_profiler_.enable_);
+    EXPECT_TRUE(heap_profiler_.enable_);
+    // enable_ is checked right away; the gperftools state only after Sleep(2000) (presumably ms - confirm)
+    ThisThread::Sleep(2000);
+    ProfilerGetCurrentState(&ps);
+    EXPECT_TRUE(ps.enabled);
+    EXPECT_TRUE(IsHeapProfilerRunning());
+    // Turn both off again and repeat the same two-step check.
+    cpu_profiler_.SetEnable(false);
+    heap_profiler_.SetEnable(false);
+    
+    EXPECT_FALSE(cpu_profiler_.enable_);
+    EXPECT_FALSE(heap_profiler_.enable_);
+
+    ThisThread::Sleep(2000);
+    ProfilerGetCurrentState(&ps);
+    EXPECT_FALSE(ps.enabled);
+    EXPECT_FALSE(IsHeapProfilerRunning());
+}
+
+TEST_F(ProfilerTest, SetInvervalTest) {
+    EXPECT_EQ(cpu_profiler_.interval_, std::chrono::seconds(10));  // expected initial interval
+    EXPECT_EQ(heap_profiler_.interval_, std::chrono::seconds(10));
+    cpu_profiler_.SetInterval(1000);  // the checks below expect the argument to be stored as seconds
+    heap_profiler_.SetInterval(2000);
+    EXPECT_EQ(cpu_profiler_.interval_, std::chrono::seconds(1000));
+    EXPECT_EQ(heap_profiler_.interval_, std::chrono::seconds(2000));
+}
+
+TEST_F(ProfilerTest, SetProfilerFileTest) {
+    EXPECT_EQ(cpu_profiler_.profiler_file_, std::string(""));  // expected to start out empty
+    EXPECT_EQ(heap_profiler_.profiler_file_, std::string(""));
+    cpu_profiler_.SetProfilerFile("Good");  // the two profilers get different names on purpose
+    heap_profiler_.SetProfilerFile("Bad");
+    EXPECT_EQ(cpu_profiler_.profiler_file_, std::string("Good"));
+    EXPECT_EQ(heap_profiler_.profiler_file_, std::string("Bad"));
+}
+} // end namespace tera 
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/test/thread_pool_test.cc b/src/common/test/thread_pool_test.cc
index 6c1e421cb..8462b78a5 100644
--- a/src/common/test/thread_pool_test.cc
+++ b/src/common/test/thread_pool_test.cc
@@ -56,7 +56,7 @@ TEST(TimerTest, test1) {
 
     clock_gettime(CLOCK_REALTIME, &ts1);
     gettimeofday(&tv, NULL);
-    int64_t ts = common::timer::get_micros();
+    int64_t ts = get_micros();
 
     int delta = 0;
     delta = ts1.tv_sec - tv.tv_sec;
diff --git a/src/common/timer.h b/src/common/timer.h
index 1b335bb6b..b035e18c9 100644
--- a/src/common/timer.h
+++ b/src/common/timer.h
@@ -1,18 +1,31 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+#pragma once
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
 // Author: yanshiguang02@baidu.com
 
-#ifndef  TERA_COMMON_TIMER_H_
-#define  TERA_COMMON_TIMER_H_
 
 #include <stdio.h>
 #include <sys/time.h>
 #include <string>
+#include <cstring>
 
-namespace common {
-namespace timer {
+namespace tera{
+
+// Parses local time "YYYYMMDD-HH:MM:SS" into seconds since the epoch; returns -1 if malformed.
+static inline int64_t get_timestamp_from_str(const std::string& time) {
+    struct tm tm;
+    memset(&tm, 0, sizeof(tm));
+    const int matched = sscanf(time.c_str(), "%4d%2d%2d-%d:%d:%d", &tm.tm_year, &tm.tm_mon,
+                               &tm.tm_mday, &tm.tm_hour, &tm.tm_min, &tm.tm_sec);
+    if (matched != 6) return -1;  // partial parse: do not hand half-filled fields to mktime()
+
+    tm.tm_year -= 1900;  // struct tm counts years from 1900
+    tm.tm_mon--;         // struct tm months are 0-based
+    tm.tm_isdst = -1;    // unknown: let mktime() decide whether DST applies
+    return mktime(&tm);
+}
 
 static inline std::string get_time_str(int64_t timestamp) {
     struct tm tt;
@@ -26,12 +39,24 @@ static inline std::string get_curtime_str() {
     return get_time_str(time(NULL));
 }
 
+static inline std::string get_curtime_str_plain() {  // current local time as "YYYYMMDDHHMMSS"
+    const time_t now = time(NULL);
+    struct tm local_parts;
+    char stamp[20];  // room for the formatted text plus its terminating NUL
+    strftime(stamp, sizeof(stamp), "%Y%m%d%H%M%S", localtime_r(&now, &local_parts));
+    return stamp;
+}
+
 static inline int64_t get_micros() {
     struct timespec ts;
     clock_gettime(CLOCK_REALTIME, &ts);
     return static_cast<int64_t>(ts.tv_sec) * 1000000 + static_cast<int64_t>(ts.tv_nsec) / 1000;
 }
 
+static inline int64_t get_millis() {
+    return get_micros() / 1000;  // get_micros() (CLOCK_REALTIME) truncated to whole milliseconds
+}
+
 static inline int64_t get_unique_micros(int64_t ref) {
     int64_t now;
     do {
@@ -40,7 +65,12 @@ static inline int64_t get_unique_micros(int64_t ref) {
     return now;
 }
 
-}  // namespace timer
-}  // namespace common
+static inline int64_t GetTimeStampInUs() {
+    return get_micros();  // CamelCase alias for get_micros()
+}
+
+static inline int64_t GetTimeStampInMs() {
+    return get_millis();  // CamelCase alias for get_millis()
+}
+}
 
-#endif  // TERA_COMMON_TIMER_H_
diff --git a/src/io/default_compact_strategy.cc b/src/io/default_compact_strategy.cc
index b667b8e6e..4e34a6060 100644
--- a/src/io/default_compact_strategy.cc
+++ b/src/io/default_compact_strategy.cc
@@ -256,6 +256,7 @@ bool DefaultCompactStrategy::InternalMergeProcess(leveldb::Iterator* it,
 }
 
 bool DefaultCompactStrategy::ScanDrop(const Slice& tera_key, uint64_t n) {
+    bool key_col_qual_same = false;
     Slice key, col, qual;
     int64_t ts = -1;
     leveldb::TeraKeyType type;
@@ -345,6 +346,7 @@ bool DefaultCompactStrategy::ScanDrop(const Slice& tera_key, uint64_t n) {
         }
         return true;
     } else {
+        key_col_qual_same = true;
         last_type_ = type;
     }
 
@@ -362,8 +364,7 @@ bool DefaultCompactStrategy::ScanDrop(const Slice& tera_key, uint64_t n) {
 
     CHECK(cf_id >= 0) << "illegel column family";
     if (type == leveldb::TKT_VALUE) {
-        if (cur_ts_ == last_ts_ && last_qual_ == qual.ToString() &&
-            last_col_ == col.ToString() && last_key_ == key.ToString()) {
+        if (cur_ts_ == last_ts_ && key_col_qual_same) {
             // this is the same key, do not chang version num
         } else {
             version_num_++;
diff --git a/src/io/tablet_io.cc b/src/io/tablet_io.cc
index 81222e447..de97994c7 100644
--- a/src/io/tablet_io.cc
+++ b/src/io/tablet_io.cc
@@ -27,11 +27,14 @@
 #include "leveldb/filter_policy.h"
 #include "leveldb/raw_key_operator.h"
 #include "types.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 #include "utils/scan_filter.h"
 #include "utils/string_util.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "utils/utils_cmd.h"
+#include "common/metric/prometheus_subscriber.h"
+#include "common/metric/ratio_subscriber.h"
+#include "tabletnode/tabletnode_metric_name.h"
 
 DECLARE_string(tera_leveldb_env_type);
 DECLARE_int64(tera_tablet_log_file_size);
@@ -69,11 +72,47 @@ DECLARE_bool(tera_tablet_use_memtable_on_leveldb);
 DECLARE_int64(tera_tablet_memtable_ldb_write_buffer_size);
 DECLARE_int64(tera_tablet_memtable_ldb_block_size);
 
-tera::Counter row_read_delay;
+DECLARE_bool(tera_leveldb_ignore_corruption_in_open);
+DECLARE_int32(tera_leveldb_slow_down_level0_score_limit);
+DECLARE_int32(tera_leveldb_max_background_compactions);
+DECLARE_int32(tera_tablet_max_sub_parallel_compaction);
 
 namespace tera {
 namespace io {
 
+using tera::tabletnode::kRowDelayMetric;
+using tera::tabletnode::kRowCountMetric;
+using tera::tabletnode::kRowThroughPutMetric;
+
+using tera::tabletnode::kApiLabelRead;
+using tera::tabletnode::kApiLabelScan;
+using tera::tabletnode::kApiLabelWrite;
+
+using tera::tabletnode::kLowLevelReadMetric;
+
+tera::MetricCounter low_level_read_count(kLowLevelReadMetric, {SubscriberType::QPS});
+// Per-API row metrics; the second argument is the read/scan/write API label.
+tera::MetricCounter row_read_delay(kRowDelayMetric, kApiLabelRead, {});
+tera::MetricCounter row_read_count(kRowCountMetric, kApiLabelRead, {SubscriberType::QPS});
+tera::MetricCounter row_read_bytes(kRowThroughPutMetric, kApiLabelRead, {SubscriberType::THROUGHPUT});
+
+tera::MetricCounter row_scan_delay(kRowDelayMetric, kApiLabelScan, {});
+tera::MetricCounter row_scan_count(kRowCountMetric, kApiLabelScan, {SubscriberType::QPS});
+tera::MetricCounter row_scan_bytes(kRowThroughPutMetric, kApiLabelScan, {SubscriberType::THROUGHPUT});
+
+tera::MetricCounter row_write_bytes(kRowThroughPutMetric, kApiLabelWrite, {SubscriberType::THROUGHPUT});
+// Derived metrics built from two SUM subscribers: delay first, row count second (presumably delay / count - confirm).
+tera::AutoSubscriberRegister row_read_delay_per_row(std::unique_ptr<Subscriber>(new tera::RatioSubscriber(
+    MetricId("tera_ts_row_read_delay_us_per_row"),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRowDelayMetric, kApiLabelRead), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRowCountMetric, kApiLabelRead), SubscriberType::SUM)))));
+
+tera::AutoSubscriberRegister row_scan_delay_per_row(std::unique_ptr<Subscriber>(new tera::RatioSubscriber(
+    MetricId("tera_ts_row_scan_delay_us_per_row"),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRowDelayMetric, kApiLabelScan), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRowCountMetric, kApiLabelScan), SubscriberType::SUM)))));
+
+
 std::ostream& operator << (std::ostream& o, const TabletIO& tablet_io) {
     o << tablet_io.short_path_
       << " [" << DebugString(tablet_io.start_key_)
@@ -81,6 +120,17 @@ std::ostream& operator << (std::ostream& o, const TabletIO& tablet_io) {
     return o;
 }
 
+// Builds the "table"/"tablet" label string for a tablet path.
+std::string MetricLabelToString(const std::string& tablet_path) {
+    const std::string::size_type last_slash = tablet_path.rfind('/');
+    // A path without any '/' (the meta tablet) doubles as its own table name;
+    // otherwise the table name is everything before the last '/'.
+    const std::string table_name =
+        (last_slash == std::string::npos) ? tablet_path : tablet_path.substr(0, last_slash);
+    return LabelStringBuilder()
+        .Append("table", table_name).Append("tablet", tablet_path).ToString();
+}
+
 TabletIO::TabletIO(const std::string& key_start, const std::string& key_end,
                    const std::string& path)
     : async_writer_(NULL),
@@ -90,10 +140,12 @@ TabletIO::TabletIO(const std::string& key_start, const std::string& key_end,
       short_path_(path),
       compact_status_(kTableNotCompact),
       status_(kNotInit),
+      tablet_status_(static_cast<tera::TabletStatus>(kTabletReady)),
       ref_count_(1), db_ref_count_(0), db_(NULL),
       m_memory_cache(NULL),
       kv_only_(false),
       key_operator_(NULL),
+      counter_(short_path_),
       mock_env_(NULL) {
 }
 
@@ -138,6 +190,10 @@ std::string TabletIO::GetEndKey() const {
     return end_key_;
 }
 
+const std::string& TabletIO::GetMetricLabel() const {
+    return counter_.label;  // counter_ is constructed from short_path_ in the TabletIO constructor
+}
+
 CompactStatus TabletIO::GetCompactStatus() const {
     return compact_status_;
 }
@@ -167,6 +223,7 @@ void TabletIO::SetMemoryCache(leveldb::Cache* cache) {
 bool TabletIO::Load(const TableSchema& schema,
                     const std::string& path,
                     const std::vector<uint64_t>& parent_tablets,
+                    const std::set<std::string>& ignore_err_lgs,
                     std::map<uint64_t, uint64_t> snapshots,
                     std::map<uint64_t, uint64_t> rollbacks,
                     leveldb::Logger* logger,
@@ -226,6 +283,7 @@ bool TabletIO::Load(const TableSchema& schema,
     ldb_options_.key_start = raw_start_key_;
     ldb_options_.key_end = raw_end_key_;
     ldb_options_.l0_slowdown_writes_trigger = FLAGS_tera_tablet_level0_file_limit;
+    ldb_options_.max_sub_parallel_compaction = FLAGS_tera_tablet_max_sub_parallel_compaction;
     ldb_options_.ttl_percentage = FLAGS_tera_tablet_ttl_percentage;
     ldb_options_.del_percentage = FLAGS_tera_tablet_del_percentage;
     ldb_options_.block_size = FLAGS_tera_tablet_write_block_size * 1024;
@@ -234,6 +292,9 @@ bool TabletIO::Load(const TableSchema& schema,
     ldb_options_.log_async_mode = FLAGS_tera_log_async_mode;
     ldb_options_.info_log = logger;
     ldb_options_.max_open_files = FLAGS_tera_memenv_table_cache_size;
+    ldb_options_.max_background_compactions = FLAGS_tera_leveldb_max_background_compactions;
+    ldb_options_.slow_down_level0_score_limit = FLAGS_tera_leveldb_slow_down_level0_score_limit;
+    ldb_options_.ignore_corruption_in_open = FLAGS_tera_leveldb_ignore_corruption_in_open;
 
     ldb_options_.use_memtable_on_leveldb = FLAGS_tera_tablet_use_memtable_on_leveldb;
     ldb_options_.memtable_ldb_write_buffer_size =
@@ -277,7 +338,7 @@ bool TabletIO::Load(const TableSchema& schema,
     ldb_options_.ignore_corruption_in_compaction = FLAGS_tera_leveldb_ignore_corruption_in_compaction;
     ldb_options_.use_file_lock = FLAGS_tera_leveldb_use_file_lock;
     ldb_options_.disable_wal = table_schema_.disable_wal();
-    SetupOptionsForLG();
+    SetupOptionsForLG(ignore_err_lgs);
 
     std::string path_prefix = FLAGS_tera_tabletnode_path_prefix;
     if (*path_prefix.rbegin() != '/') {
@@ -328,6 +389,23 @@ bool TabletIO::Load(const TableSchema& schema,
     return true;
 }
 
+bool TabletIO::ShouldForceUnloadOnError() {
+    {
+        MutexLock lock(&mutex_);
+        if (status_ != kReady) {
+            return false;  // only a tablet in kReady state is queried
+        }
+        db_ref_count_++;  // incremented under mutex_ before db_ is used below
+    }
+    // Tablet is ready: ask the DB (outside mutex_) whether it has hit a fatal error.
+    bool ret = db_->ShouldForceUnloadOnError();
+    {
+        MutexLock lock(&mutex_);
+        db_ref_count_--;
+    }
+    return ret;
+}
+
 bool TabletIO::Unload(StatusCode* status) {
     {
         MutexLock lock(&mutex_);
@@ -341,7 +419,6 @@ bool TabletIO::Unload(StatusCode* status) {
 
     LOG(INFO) << "[Unload] start shutdown1 " << tablet_path_;
     leveldb::Status s = db_->Shutdown1();
-
     {
         MutexLock lock(&mutex_);
         status_ = kUnLoading2;
@@ -566,13 +643,13 @@ bool TabletIO::IsBusy() {
         db_ref_count_++;
     }
     bool is_busy = db_->BusyWrite();
+    is_busy = is_busy ? true : async_writer_->IsBusy();
     {
         MutexLock lock(&mutex_);
         db_ref_count_--;
     }
     return is_busy;
 }
-
 bool TabletIO::Workload(double* write_workload) {
     {
         MutexLock lock(&mutex_);
@@ -581,7 +658,14 @@ bool TabletIO::Workload(double* write_workload) {
         }
         db_ref_count_++;
     }
+
+    // If the tablet is busy because of log writes, raise the workload score above 10:
+    // level 0 is limited to 20 sst files by default, which corresponds to a score of 10.
     db_->Workload(write_workload);
+    if (*write_workload < 10.618 && async_writer_->IsBusy()) {
+        *write_workload = 10.618;
+    }
+
     {
         MutexLock lock(&mutex_);
         db_ref_count_--;
@@ -700,6 +784,7 @@ bool TabletIO::LowLevelScan(const std::string& start_tera_key,
     ScanContext* context = new ScanContext;
     context->compact_strategy = ldb_options_.compact_strategy_factory->NewInstance();
     context->version_num = 1;
+    context->qu_num = 1;
     bool ret = LowLevelScan(start_tera_key, end_row_key, scan_options, it, context,
                             value_list, next_start_point, read_row_count, read_bytes,
                             is_complete, status);
@@ -849,6 +934,7 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
     std::string& last_col = scan_context->last_col;
     std::string& last_qual = scan_context->last_qual;
     uint32_t& version_num = scan_context->version_num;
+    uint64_t& qu_num = scan_context->qu_num;
 
     std::list<KeyValuePair> row_buf;
     uint32_t buffer_size = 0;
@@ -861,13 +947,18 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
     KeyValuePair next_start_kv_pair;
     VLOG(9) << "ll-scan timeout set to be " << scan_options.timeout
         << ", start_tera_key " << DebugString(start_tera_key)
-        << ", end_row_key " << DebugString(end_row_key);
+        << ", end_row_key " << DebugString(end_row_key)
+        << ", max_size " << scan_options.max_size
+        << ", number_limit " << scan_options.number_limit
+        << ", max_versions " << scan_options.max_versions
+        << ", max_qualifiers " << scan_options.max_qualifiers;
 
     *is_complete = false;
     for (; it->Valid();) {
         bool has_merged = false;
         std::string merged_value;
         counter_.low_read_cell.Inc();
+        low_level_read_count.Inc();
         *read_bytes += it->key().size() + it->value().size();
         now_time = GetTimeStampInMs();
 
@@ -886,7 +977,21 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
             << "] key=[" << DebugString(key.ToString())
             << "] column=[" << DebugString(col.ToString())
             << ":" << DebugString(qual.ToString())
-            << "] ts=[" << ts << "] type=[" << type << "]";
+            << "] ts=[" << ts << "] type=[" << type << "]"
+            << " buffer_size=[" << buffer_size << "]"
+            << " number_limit=[" << number_limit << "]"
+            << " read_bytes=[" << *read_bytes << "]"
+            << " qu_num=[" << qu_num << "]";
+
+        if (now_time > time_out) {
+            VLOG(9) << "ll-scan timeout, now_time: " << now_time << ", time_out: " << time_out;
+            if (next_start_point != NULL) {
+                VLOG(9) << "Mark next start key: " << DebugString(tera_key.ToString());
+                MakeKvPair(key, col, qual, ts, "", next_start_point);
+            }
+            SetStatusCode(kRPCTimeout, status);
+            break;
+        }
 
         if (end_row_key.size() && key.compare(end_row_key) >= 0) {
             // scan finished
@@ -932,15 +1037,8 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
             *read_row_count += 1;
             ProcessRowBuffer(row_buf, scan_options, value_list, &buffer_size, &number_limit);
             row_buf.clear();
-
-            if (now_time > time_out && (next_start_point != NULL)) {
-                VLOG(9) << "ll-scan timeout. Mark next start key: " << DebugString(tera_key.ToString());
-                MakeKvPair(key, col, qual, ts, "", next_start_point);
-                break;
-            }
         }
 
-        // max version filter
         if (key.compare(last_key) == 0 &&
             col.compare(last_col) == 0 &&
             qual.compare(last_qual) == 0) {
@@ -949,6 +1047,16 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
                 continue;
             }
         } else {
+            if (key.compare(last_key) == 0 && col.compare(last_col) == 0 ) {
+                if (++qu_num > scan_options.max_qualifiers) {
+                    VLOG(10) << "max_qualifiers triggered, max_qualifiers: " << scan_options.max_qualifiers;
+                    it->Next();
+                    continue;
+                }
+            } else {
+                qu_num = 1;
+            }
+
             last_key.assign(key.data(), key.size());
             last_col.assign(col.data(), col.size());
             last_qual.assign(qual.data(), qual.size());
@@ -957,6 +1065,7 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
             has_merged = compact_strategy->ScanMergedValue(it, &merged_value, &merged_num);
             if (has_merged) {
                 counter_.low_read_cell.Add(merged_num - 1);
+                low_level_read_count.Add(merged_num - 1);
                 value = merged_value;
                 key = last_key;
                 col = last_col;
@@ -977,7 +1086,9 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
 
         // check scan buffer
         if (buffer_size >= scan_options.max_size || number_limit >= scan_options.number_limit) {
-            VLOG(10) << "stream scan, break scan context, version_num " << version_num
+            VLOG(10) << "stream scan, break scan context"
+                <<", buffer_size " << buffer_size
+                <<", number_limit " << number_limit
                 << ", key " << DebugString(key.ToString()) << ", col " << DebugString(col.ToString())
                 << ", qual " << DebugString(qual.ToString());
             it->Next();
@@ -1000,6 +1111,9 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key,
         ProcessRowBuffer(row_buf, scan_options, value_list, &buffer_size, &number_limit);
     }
 
+    if (*status == kRPCTimeout) {
+        return false;
+    }
     if (!it->Valid() && !(it->status().ok())) {
         SetStatusCode(it->status(), status);
         VLOG(10) << "ll-scan fail: " << "tablet=[" << tablet_path_ << "], "
@@ -1054,6 +1168,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key,
                                   leveldb::TKT_FORSEEK, &row_seek_key);
     it_data->Seek(row_seek_key);
     counter_.low_read_cell.Inc();
+    low_level_read_count.Inc();
     if (it_data->Valid()) {
         VLOG(10) << "ll-seek: " << "tablet=[" << tablet_path_
             << "] row_key=[" << row_key << "]";
@@ -1087,6 +1202,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key,
                                       leveldb::TKT_FORSEEK, &cf_seek_key);
         it_data->Seek(cf_seek_key);
         counter_.low_read_cell.Inc();
+        low_level_read_count.Inc();
         if (it_data->Valid()) {
             VLOG(10) << "ll-seek: " << "tablet=[" << tablet_path_
                 << "] row_key=[" << row_key
@@ -1122,6 +1238,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key,
             uint32_t version_num = 0;
             for (; it_data->Valid();) {
                 counter_.low_read_cell.Inc();
+                low_level_read_count.Inc();
                 VLOG(10) << "ll-seek: " << "tablet=[" << tablet_path_
                     << "] row_key=[" << row_key << "] cf=[" << cf_name
                     << "] qu=[" << qu_name << "]";
@@ -1134,7 +1251,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key,
                     break;
                 }
 
-                // skip qu delete mark
+                // skip qu delete mark and out-of-range version
                 if (compact_strategy->ScanDrop(it_data->key(), 0)) {
                     VLOG(10) << "ll-seek: scan drop " << "tablet=[" << tablet_path_
                         << "] row_key=[" << row_key << "] cf=[" << cf_name
@@ -1143,6 +1260,14 @@ bool TabletIO::LowLevelSeek(const std::string& row_key,
                     continue;
                 }
 
+                if (scan_options.ts_start > timestamp) {
+                    break;
+                }
+                if (scan_options.ts_end < timestamp) {
+                    it_data->Next();
+                    continue;
+                }
+
                 // version filter
                 if (++version_num > scan_options.max_versions) {
                     break;
@@ -1160,6 +1285,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key,
                     compact_strategy->ScanMergedValue(it_data, &merged_value, &merged_num);
                 if (has_merged) {
                     counter_.low_read_cell.Add(merged_num - 1);
+                    low_level_read_count.Add(merged_num - 1);
                     kv->set_value(merged_value);
                     VLOG(10) << "ll-seek merge: " << "key=[" << DebugString(row_key)
                         << "] column=[" << DebugString(cf_name)
@@ -1188,7 +1314,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key,
 }
 
 bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
-                         uint64_t snapshot_id, StatusCode* status) {
+                         uint64_t snapshot_id, StatusCode* status, int64_t timeout_ms) {
     {
         MutexLock lock(&mutex_);
         if (status_ != kReady && status_ != kOnSplit
@@ -1205,7 +1331,7 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
         db_ref_count_++;
     }
 
-    int64_t read_ms = get_micros();
+    int64_t start_read_us = get_micros();
 
     if (kv_only_) {
         std::string key(row_reader.key());
@@ -1215,7 +1341,8 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
         }
         if (!Read(key, &value, snapshot_id, status)) {
             counter_.read_rows.Inc();
-            row_read_delay.Add(get_micros() - read_ms);
+            row_read_count.Inc();
+            row_read_delay.Add(get_micros() - start_read_us);
             {
                 MutexLock lock(&mutex_);
                 db_ref_count_--;
@@ -1226,8 +1353,10 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
         result->set_key(row_reader.key());
         result->set_value(value);
         counter_.read_rows.Inc();
+        row_read_count.Inc();
         counter_.read_size.Add(result->ByteSize());
-        row_read_delay.Add(get_micros() - read_ms);
+        row_read_bytes.Add(result->ByteSize());
+        row_read_delay.Add(get_micros() - start_read_us);
         {
             MutexLock lock(&mutex_);
             db_ref_count_--;
@@ -1258,12 +1387,23 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
     if (row_reader.has_max_version()) {
         scan_options.max_versions = row_reader.max_version();
     }
+
+    if (row_reader.has_max_qualifiers()) {
+        scan_options.max_qualifiers = row_reader.max_qualifiers();
+    } else {
+        scan_options.max_qualifiers = std::numeric_limits<uint64_t>::max();
+    }
+
     if (row_reader.has_time_range()) {
         scan_options.ts_start = row_reader.time_range().ts_start();
         scan_options.ts_end = row_reader.time_range().ts_end();
+        VLOG(10) << "ReadCells: " << "timerange=[" << scan_options.ts_start
+            << "," << scan_options.ts_end << "]";
     }
 
     scan_options.snapshot_id = snapshot_id;
+    scan_options.timeout = timeout_ms;
+
 
     VLOG(10) << "ReadCells: " << "key=[" << DebugString(row_reader.key()) << "]";
 
@@ -1284,7 +1424,8 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
                            &is_complete, status);
     }
     counter_.read_rows.Inc();
-    row_read_delay.Add(get_micros() - read_ms);
+    row_read_count.Inc();
+    row_read_delay.Add(get_micros() - start_read_us);
     {
         MutexLock lock(&mutex_);
         db_ref_count_--;
@@ -1293,6 +1434,7 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
         return false;
     } else {
         counter_.read_size.Add(value_list->ByteSize());
+        row_read_bytes.Add(value_list->ByteSize());
     }
 
     if (value_list->key_values_size() == 0) {
@@ -1310,7 +1452,6 @@ bool TabletIO::WriteBatch(leveldb::WriteBatch* batch, bool disable_wal, bool syn
 
     CHECK_NOTNULL(db_);
 
-    counter_.write_size.Add(batch->DataSize());
     leveldb::Status db_status = db_->Write(options, batch);
     if (!db_status.ok()) {
         LOG(ERROR) << "fail to batch write to tablet: " << tablet_path_
@@ -1318,6 +1459,8 @@ bool TabletIO::WriteBatch(leveldb::WriteBatch* batch, bool disable_wal, bool syn
         SetStatusCode(kIOError, status);
         return false;
     }
+    counter_.write_size.Add(batch->DataSize());
+    row_write_bytes.Add(batch->DataSize());
     SetStatusCode(kTabletNodeOk, status);
     return true;
 }
@@ -1349,6 +1492,10 @@ bool TabletIO::Write(std::vector<const RowMutationSequence*>* row_mutation_vec,
     }
     bool ret = async_writer_->Write(row_mutation_vec, status_vec, is_instant,
                                      callback, status);
+    if (!ret) {
+        counter_.write_reject_rows.Add(row_mutation_vec->size());
+    }
+
     {
         MutexLock lock(&mutex_);
         db_ref_count_--;
@@ -1426,12 +1573,18 @@ bool TabletIO::ScanRowsRestricted(const ScanTabletRequest* request,
 
     StatusCode status = kTabletNodeOk;
     bool ret = false;
+
+    int64_t start_scan_us = get_micros();
+
     if (LowLevelScan(start_tera_key, end_row_key, scan_options,
                      response->mutable_results(), response->mutable_next_start_point(),
                      &read_row_count, &read_bytes, &is_complete, &status)) {
         response->set_complete(is_complete);
         counter_.scan_rows.Add(read_row_count);
         counter_.scan_size.Add(read_bytes);
+        row_scan_count.Add(read_row_count);
+        row_scan_bytes.Add(read_bytes);
+        row_scan_delay.Add(get_micros() - start_scan_us);
         ret = true;
     }
 
@@ -1464,17 +1617,26 @@ bool TabletIO::HandleScan(const ScanTabletRequest* request,
 void TabletIO::ProcessScan(ScanContext* context) {
     uint32_t rows_scan_num = 0;
     uint32_t size_scan_bytes = 0;
+
+    int64_t start_scan_us = get_micros();
+
     if (LowLevelScan(context->start_tera_key, context->end_row_key,
                      context->scan_options, context->it, context,
                      context->result, NULL, &rows_scan_num, &size_scan_bytes,
                      &context->complete, &context->ret_code)) {
         counter_.scan_rows.Add(rows_scan_num);
         counter_.scan_size.Add(size_scan_bytes);
+        row_scan_count.Add(rows_scan_num);
+        row_scan_bytes.Add(size_scan_bytes);
+        row_scan_delay.Add(get_micros() - start_scan_us);
     }
 }
 
 bool TabletIO::Scan(const ScanOption& option, KeyValueList* kv_list,
                     bool* complete, StatusCode* status) {
+
+    int64_t start_scan_us = get_micros();
+
     std::string start = option.key_range().key_start();
     std::string end = option.key_range().key_end();
     if (start < start_key_) {
@@ -1558,8 +1720,13 @@ bool TabletIO::Scan(const ScanOption& option, KeyValueList* kv_list,
     if (!it->Valid()) {
         *complete = true;
     }
+
     counter_.scan_rows.Add(kv_list->size());
     counter_.scan_size.Add(pack_size);
+    row_scan_count.Add(kv_list->size());
+    row_scan_bytes.Add(pack_size);
+    row_scan_delay.Add(get_micros() - start_scan_us);
+
     delete it;
     delete strategy;
 
@@ -1618,6 +1785,11 @@ void TabletIO::SetupScanRowOptions(const ScanTabletRequest* request,
     if (request->has_max_version()) {
         scan_options->max_versions = request->max_version();
     }
+    if (request->has_max_qualifiers()) {
+        scan_options->max_qualifiers = request->max_qualifiers();
+    } else {
+        scan_options->max_qualifiers = std::numeric_limits<uint64_t>::max();
+    }
     if (request->has_timerange()) {
         scan_options->ts_start = request->timerange().ts_start();
         scan_options->ts_end = request->timerange().ts_end();
@@ -1635,7 +1807,7 @@ void TabletIO::SetupScanRowOptions(const ScanTabletRequest* request,
 }
 
 // no concurrent, so no lock on schema_mutex_
-void TabletIO::SetupOptionsForLG() {
+void TabletIO::SetupOptionsForLG(const std::set<std::string>& ignore_err_lgs) {
     if (kv_only_) {
         if (RawKeyType() == TTLKv) {
             ldb_options_.compact_strategy_factory =
@@ -1656,6 +1828,7 @@ void TabletIO::SetupOptionsForLG() {
     std::set<uint32_t>* exist_lg_list = new std::set<uint32_t>;
     std::map<uint32_t, leveldb::LG_info*>* lg_info_list =
         new std::map<uint32_t, leveldb::LG_info*>;
+    std::set<uint32_t> ignore_corruption_in_open_lg_list;
 
     int64_t triggered_log_size = 0;
     for (int32_t lg_i = 0; lg_i < table_schema_.locality_groups_size();
@@ -1721,6 +1894,9 @@ void TabletIO::SetupOptionsForLG() {
         triggered_log_size += lg_info->write_buffer_size;
         exist_lg_list->insert(lg_i);
         (*lg_info_list)[lg_i] = lg_info;
+        if (ignore_err_lgs.find(lg_schema.name()) != ignore_err_lgs.end()) {
+            ignore_corruption_in_open_lg_list.insert(lg_i);
+        }
     }
     if (mock_env_ != NULL) {
         ldb_options_.env = LeveldbMockEnv();
@@ -1738,6 +1914,8 @@ void TabletIO::SetupOptionsForLG() {
         delete lg_info_list;
     } else {
         ldb_options_.lg_info_list = lg_info_list;
+        ldb_options_.ignore_corruption_in_open_lg_list
+            = ignore_corruption_in_open_lg_list;
     }
 
     IndexingCfToLG();
@@ -1994,23 +2172,6 @@ const leveldb::RawKeyOperator* TabletIO::GetRawKeyOperator() {
     return key_operator_;
 }
 
-void TabletIO::GetAndClearCounter(TabletCounter* counter) {
-    counter->set_low_read_cell(counter_.low_read_cell.Clear());
-    counter->set_scan_rows(counter_.scan_rows.Clear());
-    counter->set_scan_kvs(counter_.scan_kvs.Clear());
-    counter->set_scan_size(counter_.scan_size.Clear());
-    counter->set_read_rows(counter_.read_rows.Clear());
-    counter->set_read_kvs(counter_.read_kvs.Clear());
-    counter->set_read_size(counter_.read_size.Clear());
-    counter->set_write_rows(counter_.write_rows.Clear());
-    counter->set_write_kvs(counter_.write_kvs.Clear());
-    counter->set_write_size(counter_.write_size.Clear());
-    counter->set_is_on_busy(IsBusy());
-    double write_workload = 0;
-    Workload(&write_workload);
-    counter->set_write_workload(write_workload);
-}
-
 int32_t TabletIO::AddRef() {
     MutexLock lock(&mutex_);
     ++ref_count_;
@@ -2040,6 +2201,36 @@ void TabletIO::ApplySchema(const TableSchema& schema) {
     ldb_options_.compact_strategy_factory->SetArg(&schema);
 }
 
+// True if the seek for row_key/family:qualifier reports kKeyNotExist or yields no key-values;
+// false if a value already exists or the seek fails for any other reason.
+bool TabletIO::PutIfAbsentCheck(const std::string& row_key, const Mutation& mutation) {
+    RowResult value_list;
+    ScanOptions scan_options;
+    scan_options.column_family_list[mutation.family()].insert(mutation.qualifier());
+    scan_options.iter_cf_set.insert(mutation.family());
+    scan_options.max_versions = 1;
+    StatusCode status = kTabletNodeOk;  // defined value even if the seek never writes it
+    if (!LowLevelSeek(row_key, scan_options, &value_list, &status)) {
+        if (status == kKeyNotExist) {
+            return true;
+        }
+        VLOG(9) << "txn of row (PutIfAbsent) " << DebugString(row_key)
+                << ":" << DebugString(mutation.family())
+                << ":" << DebugString(mutation.qualifier())
+                << " is interrupted: lowlevelseek fail";
+        return false;
+    }
+
+    if (value_list.key_values_size() > 0) {
+        VLOG(9) << "txn of row (PutIfAbsent) " << DebugString(row_key)
+                << ":" << DebugString(mutation.family())
+                << ":" << DebugString(mutation.qualifier())
+                << " is interrupted: already exist";
+        return false;
+    }
+    return true;
+}
+
 bool TabletIO::SingleRowTxnCheck(const std::string& row_key,
                                  const SingleRowTxnReadInfo& txn_read_info,
                                  StatusCode* status) {
@@ -2098,5 +2289,33 @@ bool TabletIO::SingleRowTxnCheck(const std::string& row_key,
     return true;
 }
 
+bool TabletIO::GetDBStatus(tera::TabletStatus* tablet_status, bool slow_check) {
+    *tablet_status = static_cast<tera::TabletStatus>(kTabletReady);
+    {
+        MutexLock lock(&mutex_);
+        if (status_ != kReady) {
+            return false;  // *tablet_status keeps the kTabletReady default set above
+        }
+        db_ref_count_++;
+    }
+    // With slow_check set, refresh the cached tablet_status_ from the DB property below.
+    std::string db_property_key = "leveldb.verify-db-integrity";
+    std::string db_property_val;
+    if (slow_check && db_->GetProperty(db_property_key, &db_property_val)) {
+        if (db_property_val.find("verify_fail") != std::string::npos) {
+            tablet_status_ = kTabletCorruption;
+        } else {
+            tablet_status_ = static_cast<tera::TabletStatus>(kTabletReady);
+        }
+    }
+    *tablet_status = tablet_status_;  // cached value; unchanged if no check ran or the read failed
+
+    {
+        MutexLock lock(&mutex_);
+        db_ref_count_--;
+    }
+    return true;
+}
+
 } // namespace io
 } // namespace tera
diff --git a/src/io/tablet_io.h b/src/io/tablet_io.h
index ba5cd99cf..9ce73d96a 100644
--- a/src/io/tablet_io.h
+++ b/src/io/tablet_io.h
@@ -13,6 +13,7 @@
 #include <vector>
 
 #include "common/base/scoped_ptr.h"
+#include "common/metric/metric_counter.h"
 #include "common/mutex.h"
 #include "io/tablet_scanner.h"
 #include "leveldb/db.h"
@@ -26,9 +27,23 @@
 #include "proto/table_schema.pb.h"
 #include "proto/tabletnode_rpc.pb.h"
 #include "types.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 
 namespace tera {
+
+// Metric names used by the per-tablet counters in io::TabletIO::StatCounter.
+const char* const kLowReadCellMetricName = "tera_ts_tablet_low_read_cell_count";
+const char* const kScanRowsMetricName = "tera_ts_tablet_scan_row_count";
+const char* const kScanKvsMetricName = "tera_ts_tablet_scan_kv_count";
+const char* const kScanThroughPutMetricName = "tera_ts_tablet_scan_through_put";
+const char* const kReadRowsMetricName = "tera_ts_tablet_read_row_count";
+const char* const kReadKvsMetricName = "tera_ts_tablet_read_kv_count";
+const char* const kReadThroughPutMetricName = "tera_ts_tablet_read_through_put";
+const char* const kWriteRowsMetricName = "tera_ts_tablet_write_row_count";
+const char* const kWriteKvsMetricName = "tera_ts_tablet_write_kv_count";
+const char* const kWriteThroughPutMetricName = "tera_ts_tablet_write_through_put";
+const char* const kWriteRejectRowsMetricName = "tera_ts_tablet_write_reject_row_count";
+
 namespace io {
 
 class TabletWriter;
@@ -36,6 +51,8 @@ struct ScanOptions;
 struct ScanContext;
 class ScanContextManager;
 
+std::string MetricLabelToString(const std::string& tablet_path);
+
 class TabletIO {
 public:
     enum CompactionType {
@@ -54,16 +71,32 @@ class TabletIO {
     };
 
     struct StatCounter {
-        tera::Counter low_read_cell;
-        tera::Counter scan_rows;
-        tera::Counter scan_kvs;
-        tera::Counter scan_size;
-        tera::Counter read_rows;
-        tera::Counter read_kvs;
-        tera::Counter read_size;
-        tera::Counter write_rows;
-        tera::Counter write_kvs;
-        tera::Counter write_size;
+        const std::string label;  // declared first: every counter below is constructed with it
+        tera::MetricCounter low_read_cell;
+        tera::MetricCounter scan_rows;
+        tera::MetricCounter scan_kvs;
+        tera::MetricCounter scan_size;
+        tera::MetricCounter read_rows;
+        tera::MetricCounter read_kvs;
+        tera::MetricCounter read_size;
+        tera::MetricCounter write_rows;
+        tera::MetricCounter write_kvs;
+        tera::MetricCounter write_size;
+        tera::MetricCounter write_reject_rows;
+        // explicit: a bare path string must not convert implicitly into a StatCounter.
+        explicit StatCounter(const std::string& tablet_path)
+            : label(MetricLabelToString(tablet_path)),
+              low_read_cell(tera::kLowReadCellMetricName, label, {SubscriberType::QPS}),
+              scan_rows(tera::kScanRowsMetricName, label, {SubscriberType::QPS}),
+              scan_kvs(tera::kScanKvsMetricName, label, {SubscriberType::QPS}),
+              scan_size(tera::kScanThroughPutMetricName, label, {SubscriberType::THROUGHPUT}),
+              read_rows(tera::kReadRowsMetricName, label, {SubscriberType::QPS}),
+              read_kvs(tera::kReadKvsMetricName, label, {SubscriberType::QPS}),
+              read_size(tera::kReadThroughPutMetricName, label, {SubscriberType::THROUGHPUT}),
+              write_rows(tera::kWriteRowsMetricName, label, {SubscriberType::QPS}),
+              write_kvs(tera::kWriteKvsMetricName, label, {SubscriberType::QPS}),
+              write_size(tera::kWriteThroughPutMetricName, label, {SubscriberType::THROUGHPUT}),
+              write_reject_rows(tera::kWriteRejectRowsMetricName, label, {SubscriberType::QPS}) {}
     };
 
     typedef std::function<void (std::vector<const RowMutationSequence*>*,
@@ -83,6 +116,7 @@ class TabletIO {
     std::string GetTablePath() const;
     std::string GetStartKey() const;
     std::string GetEndKey() const;
+    const std::string& GetMetricLabel() const;
     virtual CompactStatus GetCompactStatus() const;
     virtual TableSchema GetSchema() const;
     RawKey RawKeyType() const;
@@ -94,6 +128,7 @@ class TabletIO {
     virtual bool Load(const TableSchema& schema,
                       const std::string& path,
                       const std::vector<uint64_t>& parent_tablets,
+                      const std::set<std::string>& ignore_err_lgs,
                       std::map<uint64_t, uint64_t> snapshots,
                       std::map<uint64_t, uint64_t> rollbacks,
                       leveldb::Logger* logger = NULL,
@@ -118,7 +153,8 @@ class TabletIO {
 
     // read a row
     virtual bool ReadCells(const RowReaderInfo& row_reader, RowResult* value_list,
-                           uint64_t snapshot_id = 0, StatusCode* status = NULL);
+                           uint64_t snapshot_id = 0, StatusCode* status = NULL,
+                           int64_t timeout_ms = std::numeric_limits<int64_t>::max());
     /// scan from leveldb return ture means complete flase means not complete
     bool LowLevelScan(const std::string& start_tera_key,
                       const std::string& end_row_key,
@@ -162,8 +198,6 @@ class TabletIO {
     void SetStatus(TabletStatus status);
     TabletStatus GetStatus();
 
-    void GetAndClearCounter(TabletCounter* counter);
-
     int32_t AddRef();
     int32_t DecRef();
     int32_t GetRef() const;
@@ -173,6 +207,10 @@ class TabletIO {
     void ProcessScan(ScanContext* context);
     void ApplySchema(const TableSchema& schema);
 
+    bool ShouldForceUnloadOnError();
+
+    bool GetDBStatus(tera::TabletStatus* tablet_status, bool slow_check);
+
 private:
     friend class TabletWriter;
     friend class ScanConextManager;
@@ -180,7 +218,7 @@ class TabletIO {
                           bool sync = false, StatusCode* status = NULL);
 //     int64_t GetDataSizeWithoutLock(StatusCode* status = NULL);
 
-    void SetupOptionsForLG();
+    void SetupOptionsForLG(const std::set<std::string>& ignore_err_lgs);
     void TearDownOptionsForLG();
     void IndexingCfToLG();
 
@@ -245,6 +283,8 @@ class TabletIO {
                      KeyValuePair* next);
     void SetSchema(const TableSchema& schema);
 
+    bool PutIfAbsentCheck(const std::string& row_key, const Mutation& mutation);
+
     bool SingleRowTxnCheck(const std::string& row_key,
                            const SingleRowTxnReadInfo& txn_read_info,
                            StatusCode* status);
@@ -263,6 +303,7 @@ class TabletIO {
     CompactStatus compact_status_;
 
     TabletStatus status_;
+    tera::TabletStatus tablet_status_; // whether the db is corrupted; updated by GetDBStatus()
     volatile int32_t ref_count_;
     volatile int32_t db_ref_count_;
     leveldb::Options ldb_options_;
diff --git a/src/io/tablet_scanner.cc b/src/io/tablet_scanner.cc
index d799f3fe9..47f082126 100644
--- a/src/io/tablet_scanner.cc
+++ b/src/io/tablet_scanner.cc
@@ -134,6 +134,9 @@ bool ScanContextManager::ScheduleScanContext(ScanContext* context) {
             // complete or io error, return all the rest request to client
             if (context->complete || (context->ret_code != kTabletNodeOk)) {
                 DeleteScanContext(context); // never use context
+                if (context->ret_code != kTabletNodeOk) {
+                    return false;
+                }
                 return true;
             }
             if (context->jobs.size() == 0) {
@@ -148,6 +151,7 @@ bool ScanContextManager::ScheduleScanContext(ScanContext* context) {
         MutexLock l(&lock_);
         if (context->ret_code != kTabletNodeOk) {
             DeleteScanContext(context); // never use context
+            return false;
         }
     }
     return true;
diff --git a/src/io/tablet_scanner.h b/src/io/tablet_scanner.h
index e816e1b11..d468bdb6f 100644
--- a/src/io/tablet_scanner.h
+++ b/src/io/tablet_scanner.h
@@ -33,12 +33,15 @@ struct ScanOptions {
     ColumnFamilyMap column_family_list;
     std::set<std::string> iter_cf_set;
     int64_t timeout;
+    uint64_t max_qualifiers;
 
     ScanOptions()
             : max_versions(std::numeric_limits<uint32_t>::max()),
               max_size(std::numeric_limits<uint32_t>::max()),
               number_limit(std::numeric_limits<int64_t>::max()),
-              ts_start(kOldestTs), ts_end(kLatestTs), snapshot_id(0), timeout(std::numeric_limits<int64_t>::max() / 2)
+              ts_start(kOldestTs), ts_end(kLatestTs), snapshot_id(0),
+              timeout(std::numeric_limits<int64_t>::max() / 2),
+              max_qualifiers(std::numeric_limits<uint64_t>::max())
     {}
 };
 
@@ -55,6 +58,7 @@ struct ScanContext {
     leveldb::Iterator* it; // init to NULL
     leveldb::CompactStrategy* compact_strategy;
     uint32_t version_num;
+    uint64_t qu_num;
     std::string last_key;
     std::string last_col;
     std::string last_qual;
diff --git a/src/io/tablet_writer.cc b/src/io/tablet_writer.cc
index 5e8791cda..81954d5bd 100644
--- a/src/io/tablet_writer.cc
+++ b/src/io/tablet_writer.cc
@@ -5,6 +5,8 @@
 #include "io/tablet_writer.h"
 
 #include <set>
+#include <unordered_set>
+#include <memory>
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
@@ -16,9 +18,13 @@
 #include "leveldb/lg_coding.h"
 #include "proto/proto_helper.h"
 #include "tera/table_descriptor.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 #include "utils/string_util.h"
-#include "utils/timer.h"
+#include "common/timer.h"
+
+#include "tabletnode/tabletnode_metric_name.h"
+#include "common/metric/ratio_subscriber.h"
+#include "common/metric/prometheus_subscriber.h"
 
 DECLARE_int32(tera_asyncwriter_pending_limit);
 DECLARE_bool(tera_enable_level0_limit);
@@ -30,6 +36,20 @@ DECLARE_bool(tera_sync_log);
 namespace tera {
 namespace io {
 
+using tera::tabletnode::kRowDelayMetric;
+using tera::tabletnode::kRowCountMetric;
+
+using tera::tabletnode::kApiLabelWrite;
+using tera::Subscriber;
+// Write-path metrics fed by FinishTask(): number of rows written and summed per-task write delay.
+tera::MetricCounter row_write_count(kRowCountMetric, kApiLabelWrite, {SubscriberType::QPS});
+tera::MetricCounter row_write_delay(kRowDelayMetric, kApiLabelWrite, {});
+// NOTE(review): presumably publishes SUM(delay) / SUM(row count) as a per-row delay -- confirm RatioSubscriber semantics.
+tera::AutoSubscriberRegister row_write_delay_per_row(std::unique_ptr<Subscriber>(new tera::RatioSubscriber(
+    MetricId("tera_ts_row_write_delay_us_per_row"),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRowDelayMetric, kApiLabelWrite), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRowCountMetric, kApiLabelWrite), SubscriberType::SUM)))));
+
 TabletWriter::TabletWriter(TabletIO* tablet_io)
     : tablet_(tablet_io), stopped_(true),
       sync_timestamp_(0),
@@ -157,15 +177,21 @@ void TabletWriter::DoWork() {
         }
         // 否则 flush
         VLOG(7) << "write data, sleep_duration: " << sleep_duration;
-
+        sync_timestamp_ = GetTimeStampInMs();
         FlushToDiskBatch(sealed_buffer_);
         sealed_buffer_->clear();
-        sync_timestamp_ = GetTimeStampInMs();
     }
     LOG(INFO) << "AsyncWriter::DoWork done";
     worker_done_event_.Set();
 }
 
+bool TabletWriter::IsBusy() {
+    // Busy once the active buffer has grown to the pending limit (flag value * 1024).
+    const uint64_t pending_limit = FLAGS_tera_asyncwriter_pending_limit * 1024UL;
+    // active_buffer_size_ is read while holding task_mutex_
+    MutexLock guard(&task_mutex_);
+    return active_buffer_size_ >= pending_limit;
+}
+
 bool TabletWriter::SwapActiveBuffer(bool force) {
     const uint64_t SYNC_SIZE = FLAGS_tera_asyncwriter_sync_size_threshold * 1024UL;
     if (FLAGS_tera_enable_level0_limit == true) {
@@ -200,12 +226,14 @@ void TabletWriter::BatchRequest(WriteTaskBuffer* task_buffer,
         WriteTask& task = (*task_buffer)[task_idx];
         const std::vector<const RowMutationSequence*>& row_mutation_vec = *(task.row_mutation_vec);
         std::vector<StatusCode>* status_vec = task.status_vec;
+        const std::vector<IgnoreCellFlags>& ignore_row_vec = task.ignore_row_vec;
 
         for (uint32_t i = 0; i < row_mutation_vec.size(); ++i) {
             StatusCode* status = &((*status_vec)[i]);
+            const IgnoreCellFlags& ignore_cell_flags = ignore_row_vec[i];
             const RowMutationSequence& row_mu = *row_mutation_vec[i];
             const std::string& row_key = row_mu.row_key();
-            int32_t mu_num = row_mu.mutation_sequence().size();
+            uint32_t mu_num = row_mu.mutation_sequence().size();
             if (*status != kTabletNodeOk) {
                 VLOG(11) << "batch write fail, row " << DebugString(row_key)
                     << ", status " << StatusCodeToString(*status);
@@ -235,7 +263,12 @@ void TabletWriter::BatchRequest(WriteTaskBuffer* task_buffer,
                     batch->Delete(tera_key);
                 }
             } else {
-                for (int32_t t = 0; t < mu_num; ++t) {
+                for (uint32_t t = 0; t < mu_num; ++t) {
+                    if (t < ignore_cell_flags.size() && ignore_cell_flags[t]) {
+                        VLOG(11) << "batch write ignore cell @ " << DebugString(row_key) 
+                                 << "[" << task_idx << "," << i << "," << t << "]";
+                        continue;
+                    }
                     const Mutation& mu = row_mu.mutation_sequence().Get(t);
                     std::string tera_key;
                     leveldb::TeraKeyType type = leveldb::TKT_VALUE;
@@ -258,9 +291,11 @@ void TabletWriter::BatchRequest(WriteTaskBuffer* task_buffer,
                         case kAddInt64:
                             type = leveldb::TKT_ADDINT64;
                             break;
+                        /*
                         case kPutIfAbsent:
                             type = leveldb::TKT_PUT_IFABSENT;
                             break;
+                        */
                         case kAppend:
                             type = leveldb::TKT_APPEND;
                             break;
@@ -317,6 +352,8 @@ void TabletWriter::FinishTask(WriteTaskBuffer* task_buffer, StatusCode status) {
     for (uint32_t task_idx = 0; task_idx < task_buffer->size(); ++task_idx) {
         WriteTask& task = (*task_buffer)[task_idx];
         tablet_->GetCounter().write_rows.Add(task.row_mutation_vec->size());
+        row_write_count.Add(task.row_mutation_vec->size());
+        row_write_delay.Add(get_micros() - task.start_time);
         for (uint32_t i = 0; i < task.row_mutation_vec->size(); i++) {
             tablet_->GetCounter().write_kvs.Add((*task.row_mutation_vec)[i]->mutation_sequence_size());
             // set batch_write status for row_mu
@@ -329,7 +366,7 @@ void TabletWriter::FinishTask(WriteTaskBuffer* task_buffer, StatusCode status) {
     return;
 }
 
-// set status to kTxnFail, if transaction conflicts.
+// set status to kTxnFail, if single row transaction or putifabsent conflicts
 bool TabletWriter::CheckSingleRowTxnConflict(const RowMutationSequence& row_mu,
                                              std::set<std::string>* commit_row_key_set,
                                              StatusCode* status) {
@@ -359,6 +396,36 @@ bool TabletWriter::CheckSingleRowTxnConflict(const RowMutationSequence& row_mu,
     return false;
 }
 
+void TabletWriter::MarkPutIfAbsentConflict(const RowMutationSequence& row_mu,
+                                           IgnoreCellFlags* ignore_cell_flags,
+                                           std::unordered_set<std::string>* not_exist_cell_set) {
+    // Sets (*ignore_cell_flags)[i] for each kPutIfAbsent mutation whose cell key is already in
+    // not_exist_cell_set or that PutIfAbsentCheck() rejects; every checked key is then recorded.
+    const std::string& row_key = row_mu.row_key();
+    for (int32_t i = 0; i < row_mu.mutation_sequence_size(); ++i) {
+        const Mutation& mutation = row_mu.mutation_sequence(i);
+        if (mutation.type() != kPutIfAbsent) {
+            continue;
+        }
+        std::string cell_key;
+        tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key,
+                mutation.family(), mutation.qualifier(), kLatestTs,
+                leveldb::TKT_FORSEEK, &cell_key);
+        const bool dup_in_batch = not_exist_cell_set->count(cell_key) > 0;
+        // NOTE(review): the check still runs for keys already in the set, exactly as before;
+        // it could be short-circuited if PutIfAbsentCheck() is confirmed side-effect free.
+        const bool rejected = !tablet_->PutIfAbsentCheck(row_key, mutation);
+        if (dup_in_batch || rejected) {
+            VLOG(9) << "txn of row (PutIfAbsent) " << DebugString(row_key)
+                    << ":" << DebugString(mutation.family())
+                    << ":" << DebugString(mutation.qualifier())
+                    << (dup_in_batch ? ", already seen in batch" : ", rejected by PutIfAbsentCheck");
+            (*ignore_cell_flags)[i] = true;
+        }
+        not_exist_cell_set->insert(cell_key);
+    }
+}
+
 bool TabletWriter::CheckIllegalRowArg(const RowMutationSequence& row_mu,
                                       const std::set<std::string>& cf_set,
                                       StatusCode* status) {
@@ -401,6 +468,8 @@ void TabletWriter::CheckRows(WriteTaskBuffer* task_buffer) {
     }
 
     std::set<std::string> commit_row_key_set;
+    // for PutIfAbsent, make sure only one PutIfAbsent operation in a cell
+    std::unordered_set<std::string> not_exist_cell_set;
     for (uint32_t task_idx = 0; task_idx < task_buffer->size(); ++task_idx) {
         WriteTask& task = (*task_buffer)[task_idx];
         std::vector<const RowMutationSequence*>& row_mutation_vec = *task.row_mutation_vec;
@@ -408,9 +477,15 @@ void TabletWriter::CheckRows(WriteTaskBuffer* task_buffer) {
 
         for (uint32_t row_idx = 0; row_idx < row_mutation_vec.size(); ++row_idx) {
             const RowMutationSequence* row_mu = row_mutation_vec[row_idx];
+            IgnoreCellFlags ignore_cell_flags;
+            // init all cell not ignored
+            ignore_cell_flags.assign(row_mu->mutation_sequence_size(), false);
+            task.ignore_row_vec.push_back(ignore_cell_flags);
+
             if(CheckSingleRowTxnConflict(*row_mu, &commit_row_key_set, &status_vec[row_idx])) {
                 continue;
             }
+            MarkPutIfAbsentConflict(*row_mu, &(task.ignore_row_vec.back()), &not_exist_cell_set);
             if (CheckIllegalRowArg(*row_mu, cf_set, &status_vec[row_idx])) {
                 continue;
             }
@@ -421,18 +496,28 @@ void TabletWriter::CheckRows(WriteTaskBuffer* task_buffer) {
 }
 
 StatusCode TabletWriter::FlushToDiskBatch(WriteTaskBuffer* task_buffer) {
-    int64_t ts = get_micros();
+    // Runs check -> batch -> write -> finish for one task buffer and logs, at VLOG(7),
+    // the get_micros() delta of each phase. Returns the status reported via WriteBatch().
+    const int64_t start_ts = get_micros();
     CheckRows(task_buffer);
+    const int64_t checked_ts = get_micros();
 
     leveldb::WriteBatch batch;
     BatchRequest(task_buffer, &batch);
+    const int64_t batched_ts = get_micros();
     StatusCode status = kTabletNodeOk;
     const bool disable_wal = false;
     tablet_->WriteBatch(&batch, disable_wal, FLAGS_tera_sync_log, &status);
     batch.Clear();
+    const int64_t written_ts = get_micros();
 
     FinishTask(task_buffer, status);
-    VLOG(7) << "finish a batch: " << task_buffer->size() << ", use " << get_micros() - ts;
+    const int64_t finished_ts = get_micros();
+    VLOG(7) << "finish a batch: " << task_buffer->size() << ", cost(check/batch/write/finish): "
+        << checked_ts - start_ts << "/"
+        << batched_ts - checked_ts << "/"
+        << written_ts - batched_ts << "/"
+        << finished_ts - written_ts;
     return status;
 }
 
diff --git a/src/io/tablet_writer.h b/src/io/tablet_writer.h
index 561db7b1d..b0019ec8b 100644
--- a/src/io/tablet_writer.h
+++ b/src/io/tablet_writer.h
@@ -6,6 +6,8 @@
 #define TERA_TABLETNODE_TABLET_WRITER_H_
 
 #include <functional>
+#include <unordered_set>
+#include <set>
 
 #include "common/event.h"
 #include "common/mutex.h"
@@ -27,11 +29,16 @@ class TabletWriter {
 public:
     typedef std::function<void (std::vector<const RowMutationSequence*>*, \
                                 std::vector<StatusCode>*)> WriteCallback;
+    
+    typedef std::vector<bool> IgnoreCellFlags;
 
     struct WriteTask {
+        WriteTask() : row_mutation_vec(nullptr), status_vec(nullptr), start_time(get_micros()) {}
         std::vector<const RowMutationSequence*>* row_mutation_vec;
         std::vector<StatusCode>* status_vec;
+        std::vector<IgnoreCellFlags> ignore_row_vec; // per row: mutations to skip, filled by CheckRows()
         WriteCallback callback;
+        int64_t start_time; // get_micros() at construction; FinishTask() uses it for the write-delay metric
     };
 
     typedef std::vector<WriteTask> WriteTaskBuffer;
@@ -47,6 +54,7 @@ class TabletWriter {
                                      bool kv_only);
     void Start();
     void Stop();
+    bool IsBusy();
 
 private:
     void DoWork();
@@ -57,6 +65,11 @@ class TabletWriter {
     bool CheckSingleRowTxnConflict(const RowMutationSequence& row_mu,
                                    std::set<std::string>* commit_row_key_set,
                                    StatusCode* status);
+    // mark conflict of PutIfAbsent
+    void MarkPutIfAbsentConflict(const RowMutationSequence& row_mu,
+                                 IgnoreCellFlags* ignore_cell_flags,
+                                 std::unordered_set<std::string>* not_exist_cell_set);
+
     bool CheckIllegalRowArg(const RowMutationSequence& row_mu,
                             const std::set<std::string>& cf_set,
                             StatusCode* status);
diff --git a/src/io/test/load_test.cc b/src/io/test/load_test.cc
index 714758a5f..7351488ea 100644
--- a/src/io/test/load_test.cc
+++ b/src/io/test/load_test.cc
@@ -24,7 +24,7 @@
 #include "leveldb/table_utils.h"
 #include "proto/proto_helper.h"
 #include "proto/status_code.pb.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "utils/utils_cmd.h"
 
 DECLARE_int32(tera_io_retry_max_times);
@@ -104,7 +104,8 @@ TEST_F(TabletIOTest, General) {
     leveldb::Status s = leveldb::Env::Default()->NewLogger("./log/leveldblog", &ldb_logger);
     assert(s.ok());
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     std::string key = "555";
     std::string value = "value of 555";
@@ -147,7 +148,8 @@ TEST_F(TabletIOTest, CurrentLost) {
     assert(s.ok());
 
     ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                             std::set<std::string>(), empty_snaphsots_,
+                             empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
 }
@@ -178,7 +180,8 @@ TEST_F(TabletIOTest, CurrentReadFailed) {
     assert(s.ok());
 
     ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                             std::set<std::string>(), empty_snaphsots_,
+                             empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
 }
@@ -216,7 +219,8 @@ TEST_F(TabletIOTest, CurrentCorrupted) {
     assert(s.ok());
 
     ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                             std::set<std::string>(), empty_snaphsots_,
+                             empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
 }
@@ -254,7 +258,8 @@ TEST_F(TabletIOTest, ManifestLost) {
     assert(s.ok());
 
     ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                             std::set<std::string>(), empty_snaphsots_,
+                             empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
 }
@@ -284,7 +289,8 @@ TEST_F(TabletIOTest, ManifestReadFailed) {
     assert(s.ok());
 
     ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                             std::set<std::string>(), empty_snaphsots_,
+                             empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
 }
@@ -322,7 +328,8 @@ TEST_F(TabletIOTest, ManifestCorrupted) {
     assert(s.ok());
 
     ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                             std::set<std::string>(), empty_snaphsots_,
+                             empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
 }
@@ -353,7 +360,8 @@ TEST_F(TabletIOTest, SstLost) {
     assert(s.ok());
 
     ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+                             std::set<std::string>(), empty_snaphsots_,
+                             empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
 }
@@ -367,26 +375,30 @@ TEST_F(TabletIOTest, SstLostButIgnore) {
     TabletIO tablet(key_start, key_end, tablet_path);
     leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv();
 
-    std::string fname = mock_env_prefix + tablet_path + "/0/__oops";
-    int fd = open(fname.c_str(), O_RDWR | O_CREAT);
-    if (fd == -1) {
-        std::cout << strerror(errno) << fname << std::endl;
-        abort();
-    }
     env->SetPrefix(mock_env_prefix);
-
     env->SetGetChildrenCallback(DropSst);
     tablet.SetMockEnv(env);
 
     leveldb::Logger* ldb_logger;
     leveldb::Status s = leveldb::Env::Default()->NewLogger("./log/leveldblog", &ldb_logger);
     assert(s.ok());
+    std::set<std::string> ignore_err_lgs;
+    ignore_err_lgs.insert("lg0");
+    TableSchema schema = TableSchema();
+
+    LocalityGroupSchema* lg = schema.add_locality_groups();
+    lg->set_name("lg0");
+
+    ColumnFamilySchema* cf = schema.add_column_families();
+    cf->set_name("column");
+    cf->set_locality_group("lg0");
+    cf->set_max_versions(3);
 
-    ASSERT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status));
+    ASSERT_TRUE(tablet.Load(schema, tablet_path, std::vector<uint64_t>(),
+                            ignore_err_lgs, empty_snaphsots_,
+                            empty_rollback_, ldb_logger, NULL, NULL, &status));
 
     env->ResetMock();
-    close(fd);
 }
 //#endif
 
diff --git a/src/io/test/tablet_io_test.cc b/src/io/test/tablet_io_test.cc
index 90da431f9..5aa7f12fa 100644
--- a/src/io/test/tablet_io_test.cc
+++ b/src/io/test/tablet_io_test.cc
@@ -18,7 +18,7 @@
 #include "leveldb/table_utils.h"
 #include "proto/proto_helper.h"
 #include "proto/status_code.pb.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "utils/utils_cmd.h"
 #include "utils/string_util.h"
 #include "io/tablet_scanner.h"
@@ -93,7 +93,8 @@ TEST_F(TabletIOTest, General) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     std::string key = "555";
     std::string value = "value of 555";
@@ -118,7 +119,8 @@ TEST_F(TabletIOTest, Split) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     // prepare test data
     EXPECT_TRUE(PrepareTestData(&tablet, N));
@@ -139,7 +141,8 @@ TEST_F(TabletIOTest, Split) {
     key_end = "8000";
     TabletIO other_tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(other_tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                                  std::set<std::string>(), empty_snaphsots_,
+                                  empty_rollback_, NULL, NULL, NULL, &status));
     other_tablet.GetDataSize(&size, NULL, &status);
     LOG(INFO) << "table[" << key_start << ", " << key_end
         << "]: size = " << size;
@@ -155,7 +158,8 @@ TEST_F(TabletIOTest, Split) {
     key_end = "5000";
     TabletIO l_tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(l_tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                              std::set<std::string>(), empty_snaphsots_,
+                              empty_rollback_, NULL, NULL, NULL, &status));
     l_tablet.GetDataSize(&size, NULL, &status);
     LOG(INFO) << "table[" << key_start << ", " << key_end
         << "]: size = " << size;
@@ -165,7 +169,8 @@ TEST_F(TabletIOTest, Split) {
     key_end = "";
     TabletIO r_tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(r_tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                              std::set<std::string>(), empty_snaphsots_,
+                              empty_rollback_, NULL, NULL, NULL, &status));
     r_tablet.GetDataSize(&size, NULL, &status);
     LOG(INFO) << "table[" << key_start << ", " << key_end
         << "]: size = " << size;
@@ -182,7 +187,8 @@ TEST_F(TabletIOTest, SplitAndCheckSize) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     // prepare test data
     EXPECT_TRUE(PrepareTestData(&tablet, N));
@@ -202,7 +208,8 @@ TEST_F(TabletIOTest, SplitAndCheckSize) {
     // open from split key to check scope size
     TabletIO l_tablet(key_start, split_key, tablet_path);
     EXPECT_TRUE(l_tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                              std::set<std::string>(), empty_snaphsots_,
+                              empty_rollback_, NULL, NULL, NULL, &status));
     l_tablet.GetDataSize(&size, NULL, &status);
     LOG(INFO) << "table[" << key_start << ", " << split_key
         << "]: size = " << size;
@@ -210,7 +217,8 @@ TEST_F(TabletIOTest, SplitAndCheckSize) {
 
     TabletIO r_tablet(split_key, key_end, tablet_path);
     EXPECT_TRUE(r_tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                              std::set<std::string>(), empty_snaphsots_,
+                              empty_rollback_, NULL, NULL, NULL, &status));
     r_tablet.GetDataSize(&size, NULL, &status);
     LOG(INFO) << "table[" << split_key << ", " << key_end
         << "]: size = " << size;
@@ -227,7 +235,8 @@ TEST_F(TabletIOTest, OverWrite) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     std::string key = "555";
     std::string value = "value of 555";
@@ -253,7 +262,8 @@ TEST_F(TabletIOTest, Compact) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     // prepare test data
     EXPECT_TRUE(PrepareTestData(&tablet, 100));
@@ -269,7 +279,8 @@ TEST_F(TabletIOTest, Compact) {
     std::string new_key_end = StringFormat("%011llu", 50); // NumberToString(800);
     TabletIO new_tablet(new_key_start, new_key_end, tablet_path);
     EXPECT_TRUE(new_tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                                std::set<std::string>(), empty_snaphsots_,
+                                empty_rollback_, NULL, NULL, NULL, &status));
     EXPECT_TRUE(new_tablet.Compact(0, &status));
 
     uint64_t new_table_size = 0;
@@ -291,6 +302,110 @@ TEST_F(TabletIOTest, Compact) {
     EXPECT_TRUE(new_tablet.Unload());
 }
 
+TEST_F(TabletIOTest, LowLevelSeek) {
+    std::string tablet_path = working_dir + "llseek_tablet";
+    std::string key_start = "";
+    std::string key_end = "";
+    StatusCode status;
+    // Exercises TabletIO::LowLevelSeek on a single row: delete markers, max_versions and time range.
+    TabletIO tablet(key_start, key_end, tablet_path);
+    EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector<uint64_t>(),
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
+
+    // scan options: family "column" restricted to qualifiers "qualifer", "2a" and "1a"
+    ScanOptions scan_options;
+    ColumnFamilyMap cf_map;
+    std::set<std::string> qu_set;
+    qu_set.insert("qualifer");
+    qu_set.insert("2a");
+    qu_set.insert("1a");
+    cf_map["column"] = qu_set;
+    scan_options.column_family_list = cf_map;
+    scan_options.iter_cf_set.insert("column");
+
+    std::string tkey1;
+    // start clean: write TKT_DEL markers (empty family/qualifier) for "row" and "row1"
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1);
+    tablet.WriteOne(tkey1, "" , false, NULL);
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row1", "", "", get_micros(), leveldb::TKT_DEL, &tkey1);
+    tablet.WriteOne(tkey1, "" , false, NULL);
+
+    // write one cell and expect the seek to return exactly that cell
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "qualifer", get_micros(), leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala" , false, NULL);
+    RowResult value_list;
+
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 1);
+
+    // TKT_DEL marker on "row": the cell written above must no longer be returned
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1);
+    tablet.WriteOne(tkey1, "" , false, NULL);
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 0);
+
+    // write another qualifier ("2a") after the delete; it must be returned
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "2a", get_micros(), leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala" , false, NULL);
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 1);
+
+    // clean up again: TKT_DEL markers for both rows
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1);
+    tablet.WriteOne(tkey1, "", false, NULL);
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row1", "", "", get_micros(), leveldb::TKT_DEL, &tkey1);
+    tablet.WriteOne(tkey1, "", false, NULL);
+
+    // write 5 versions of row/column:1a (start_ts is the 2nd, end_ts the 4th) plus one cell in row1
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala1", false, NULL);
+    int64_t start_ts = get_micros();
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", start_ts, leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala2", false, NULL);
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala3", false, NULL);
+    int64_t end_ts = get_micros();
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", end_ts, leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala4", false, NULL);
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala5", false, NULL);
+    tablet.GetRawKeyOperator()->EncodeTeraKey("row1", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1);
+    tablet.WriteOne(tkey1, "lala5", false, NULL);
+
+    // read all versions (5 versions were written, but the schema sets max_versions = 3)
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 3);
+
+    // max_versions in the scan options
+    // expect 2 versions when the scan asks for at most 2
+    scan_options.max_versions = 2;
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 2);
+
+    // time range combined with max_versions
+    // expect 2 versions in [start_ts, end_ts] (5 versions written, but the schema sets max_versions = 3)
+    scan_options.max_versions = 4;
+    scan_options.ts_start = start_ts;
+    scan_options.ts_end = end_ts;
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 2);
+
+    // the version written at start_ts is not among the top 3 versions: expect nothing
+    scan_options.ts_start = start_ts;
+    scan_options.ts_end = start_ts;
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 0);
+
+    // the version written at end_ts is among the top 3 versions: expect exactly it
+    scan_options.ts_start = end_ts;
+    scan_options.ts_end = end_ts;
+    EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status));
+    EXPECT_EQ(value_list.key_values_size(), 1);
+
+    EXPECT_TRUE(tablet.Unload());
+}
+
 TEST_F(TabletIOTest, LowLevelScan) {
     std::string tablet_path = working_dir + "llscan_tablet";
     std::string key_start = "";
@@ -299,7 +414,8 @@ TEST_F(TabletIOTest, LowLevelScan) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     std::string tkey1;
 
@@ -322,19 +438,19 @@ TEST_F(TabletIOTest, LowLevelScan) {
     uint32_t read_bytes = 0;
     bool is_complete = false;
     EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, "", ScanOptions(),
-                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL));
+                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status));
     EXPECT_EQ(value_list.key_values_size(), 1);
 
     tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1);
     tablet.WriteOne(tkey1, "lala" , false, NULL);
     EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, "", ScanOptions(),
-                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL));
+                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status));
     EXPECT_EQ(value_list.key_values_size(), 0);
 
     tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "2a", get_micros(), leveldb::TKT_VALUE, &tkey1);
     tablet.WriteOne(tkey1, "lala" , false, NULL);
     EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, "", ScanOptions(),
-                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL));
+                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status));
     EXPECT_EQ(value_list.key_values_size(), 1);
 
     tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1);
@@ -357,17 +473,17 @@ TEST_F(TabletIOTest, LowLevelScan) {
     end_row_key = std::string("row1\0", 5);
     ScanOptions scan_options;
     EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, scan_options,
-                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL));
+                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status));
     EXPECT_EQ(value_list.key_values_size(), 5);
     tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", 0, leveldb::TKT_FORSEEK, &start_tera_key);
     end_row_key = std::string("row\0", 5);
     scan_options.column_family_list["column"].insert("1a");
     EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, scan_options,
-                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL));
+                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status));
     EXPECT_EQ(value_list.key_values_size(), 3);
     scan_options.max_versions = 2;
     EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, scan_options,
-                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL));
+                                    &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status));
     EXPECT_EQ(value_list.key_values_size(), 2);
     EXPECT_TRUE(tablet.Unload());
 }
@@ -382,7 +498,8 @@ TEST_F(TabletIOTest, SplitToSubTable) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     // prepare test data
     EXPECT_TRUE(PrepareTestData(&tablet, N / 2, 0));
@@ -391,7 +508,8 @@ TEST_F(TabletIOTest, SplitToSubTable) {
     // make sure all data are dumped into sst
     EXPECT_TRUE(tablet.Unload());
     EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     // for first tablet
     tablet.GetDataSize(&size, NULL, &status);
@@ -418,7 +536,8 @@ TEST_F(TabletIOTest, SplitToSubTable) {
     // 1. load sub-table 1
     TabletIO l_tablet(key_start, split_key, split_path_1);
     EXPECT_TRUE(l_tablet.Load(TableSchema(), split_path_1, parent_tablet,
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                              std::set<std::string>(), empty_snaphsots_,
+                              empty_rollback_, NULL, NULL, NULL, &status));
     l_tablet.GetDataSize(&size, NULL, &status);
     LOG(INFO) << "table[" << key_start << ", " << split_key
         << "]: size = " << size;
@@ -436,7 +555,8 @@ TEST_F(TabletIOTest, SplitToSubTable) {
     // 2. load sub-table 2
     TabletIO r_tablet(split_key, key_end, split_path_2);
     EXPECT_TRUE(r_tablet.Load(TableSchema(), split_path_2, parent_tablet,
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                              std::set<std::string>(), empty_snaphsots_,
+                              empty_rollback_, NULL, NULL, NULL, &status));
     r_tablet.GetDataSize(&size, NULL, &status);
     LOG(INFO) << "table[" << split_key << ", " << key_end
         << "]: size = " << size;
@@ -554,7 +674,8 @@ TEST_F(TabletIOTest, RowBloomFilter) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     // prepare data
     leveldb::WriteBatch batch;
@@ -594,7 +715,7 @@ TEST_F(TabletIOTest, RowBloomFilter) {
         bool is_complete = false;
         ASSERT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, ScanOptions(), &value_list,
                                         &next_start_point, &read_row_count, &read_bytes,
-                                        &is_complete, NULL));
+                                        &is_complete, &status));
         ASSERT_EQ(value_list.key_values_size(), CR);
         for (int32_t j = 0; j < CR; j++) {
             char buf[16];
diff --git a/src/io/test/tablet_scanner_test.cc b/src/io/test/tablet_scanner_test.cc
index a53f2d52a..915ad6a92 100644
--- a/src/io/test/tablet_scanner_test.cc
+++ b/src/io/test/tablet_scanner_test.cc
@@ -20,7 +20,7 @@
 #include "leveldb/table_utils.h"
 #include "proto/proto_helper.h"
 #include "proto/status_code.pb.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "utils/utils_cmd.h"
 
 DECLARE_string(tera_tabletnode_path_prefix);
@@ -225,7 +225,8 @@ TEST_F(TabletScannerTest, General) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     PrepareData(&tablet, 1000000);
     uint64_t nr = 400;
@@ -246,7 +247,8 @@ TEST_F(TabletScannerTest, CacheEvict) {
 
     TabletIO tablet(key_start, key_end, tablet_path);
     EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector<uint64_t>(),
-                            empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status));
+                            std::set<std::string>(), empty_snaphsots_,
+                            empty_rollback_, NULL, NULL, NULL, &status));
 
     PrepareData(&tablet, 1000000);
 
diff --git a/src/io/utils_leveldb.cc b/src/io/utils_leveldb.cc
index 033ad4d37..48a6fa2c8 100644
--- a/src/io/utils_leveldb.cc
+++ b/src/io/utils_leveldb.cc
@@ -14,6 +14,8 @@
 #include "common/base/string_number.h"
 #include "common/file/file_path.h"
 #include "common/mutex.h"
+#include "common/timer.h"
+#include "db/filename.h"
 #include "io/timekey_comparator.h"
 #include "leveldb/comparator.h"
 #include "leveldb/env_dfs.h"
@@ -21,7 +23,7 @@
 #include "leveldb/env_inmem.h"
 #include "leveldb/env_mock.h"
 #include "leveldb/table_utils.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 DECLARE_string(tera_leveldb_env_type);
 DECLARE_string(tera_leveldb_env_dfs_type);
@@ -31,6 +33,7 @@ DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list);
 DECLARE_string(tera_tabletnode_path_prefix);
 DECLARE_string(tera_dfs_so_path);
 DECLARE_string(tera_dfs_conf);
+DECLARE_int64(tera_master_gc_trash_expire_time_s);
 
 namespace tera {
 namespace io {
@@ -99,6 +102,11 @@ std::string GetTrashDir() {
     return FLAGS_tera_tabletnode_path_prefix + "/" + trash;
 }
 
+std::string GetTrackableGcTrashDir() {
+    // Trash directory used by trackable gc; it sits directly under the tabletnode path prefix.
+    return FLAGS_tera_tabletnode_path_prefix + "/" + "#trackable_gc_trash";
+}
+
 bool MoveEnvDirToTrash(const std::string& tablename) {
     leveldb::Env* env = LeveldbBaseEnv();
     std::string src_dir = FLAGS_tera_tabletnode_path_prefix + "/" + tablename;
@@ -140,6 +148,75 @@ bool MoveEnvDirToTrash(const std::string& tablename) {
     return true;
 }
 
+leveldb::Status MoveSstToTrackableGcTrash(const std::string& table_name,
+                                          uint64_t tablet_id,
+                                          uint32_t lg_id,
+                                          uint64_t file_id) {
+    // Renames one sst file into the trackable gc trash dir, creating trash dirs on demand.
+    // Returns OK on success or when the source file is absent, else a non-OK status.
+    leveldb::Env* env = LeveldbBaseEnv();
+    // Join with "/" as MoveEnvDirToTrash() does; the prefix may lack a trailing slash.
+    std::string table_path = FLAGS_tera_tabletnode_path_prefix + "/" + table_name;
+    std::string src_path = leveldb::BuildTableFilePath(table_path, tablet_id, lg_id, file_id);
+
+    leveldb::Status s = env->FileExists(src_path);
+    if (s.IsNotFound()) {
+        // not found, so no need to move
+        return leveldb::Status::OK();
+    } else if (!s.ok()) {
+        // unknown status
+        return s;
+    }
+
+    std::string trash_dir = GetTrackableGcTrashDir();
+    s = env->FileExists(trash_dir);
+    if (s.IsNotFound()) {
+        if (!env->CreateDir(trash_dir).ok()) {
+            LOG(ERROR) << "[gc] fail to create trackable gc trash dir: " << trash_dir;
+            return leveldb::Status::IOError("fail to create trackable gc trash dir");
+        }
+        LOG(INFO) << "[gc] succeed in creating trackable gc trash dir: " << trash_dir;
+    } else if (!s.ok()) {
+        // unknown status
+        return s;
+    }
+
+    std::string time = get_curtime_str();
+    std::replace(time.begin(), time.end(), ':', '-');
+    std::string dest_path = leveldb::BuildTrashTableFilePath(
+            trash_dir + "/" + table_name, tablet_id, lg_id, file_id, time);
+
+    size_t dir_pos = dest_path.rfind("/");
+    if (dir_pos == std::string::npos) {
+        LOG(ERROR) << "[gc] invalid dest path: " << dest_path;
+        return leveldb::Status::IOError("invalid dest path");
+    }
+    std::string lg_path = dest_path.substr(0, dir_pos);
+    s = env->FileExists(lg_path);
+    if (s.IsNotFound()) {
+        // lg dir is missing in trash, so create it before renaming the file into it
+        s = env->CreateDir(lg_path);
+        if (!s.ok()) {
+            LOG(ERROR) << "[gc] create lg dir in trash: " << lg_path
+                       << " failed: " << s.ToString();
+            return s;
+        }
+    } else if (!s.ok()) {
+        // unknown status
+        return s;
+    }
+
+    s = env->RenameFile(src_path, dest_path);
+    if (!s.ok()) {
+        LOG(ERROR) << "[gc] fail to move file to trackable gc trash, src_path: " << src_path
+            << ", dest_path: " << dest_path << ", status: " << s.ToString();
+        return s;
+    }
+    LOG(INFO) << "[gc] move file to trackable gc trash, src_path: " << src_path
+            << ", dest_path: " << dest_path;
+    return leveldb::Status::OK();
+}
+
 void CleanTrashDir() {
     leveldb::Env* env = LeveldbBaseEnv();
     std::string trash_dir = GetTrashDir();
@@ -156,6 +233,136 @@ void CleanTrashDir() {
     return;
 }
 
+bool TryDeleteEmptyDir(const std::string& dir_path,
+                       size_t total_children_size,
+                       size_t deleted_children_size) {
+    // Removes dir_path only when every child counted by the caller was deleted.
+    // Returns true only if DeleteDir was attempted and succeeded.
+    if (deleted_children_size != total_children_size) {
+        // some children remain, so the directory is kept
+        return false;
+    }
+
+    leveldb::Env* env = LeveldbBaseEnv();
+    leveldb::Status s = env->DeleteDir(dir_path);
+    if (!s.ok()) {
+        LOG(WARNING) << "[gc] fail to delete empty dir: "
+            << dir_path <<" status: " << s.ToString();
+        return false;
+    }
+
+    LOG(INFO) << "[gc] delete empty dir: " << dir_path;
+    return true;
+}
+
+leveldb::Status DeleteTrashFileIfExpired(const std::string& file_path) {
+    // Deletes file_path when the time string taken from its path is more than
+    // FLAGS_tera_master_gc_trash_expire_time_s seconds old. OK only if the file was deleted.
+    leveldb::Env* env = LeveldbBaseEnv();
+
+    std::string file_time_str = leveldb::GetTimeStrFromTrashFile(file_path);
+    // Time format: "20170801-15-54-23" becomes "20170801-15:54:23". Find both '-' first,
+    // because std::string::replace() throws std::out_of_range on the npos of a failed rfind().
+    size_t sec_pos = file_time_str.rfind("-");
+    size_t min_pos = (sec_pos == std::string::npos || sec_pos == 0)
+        ? std::string::npos : file_time_str.rfind("-", sec_pos - 1);
+    if (min_pos == std::string::npos) {
+        LOG(ERROR) << "[gc] skip invalid trash file path: " << file_path;
+        return leveldb::Status::Corruption("invalid trash file path");
+    }
+    file_time_str[sec_pos] = ':';
+    file_time_str[min_pos] = ':';
+
+    int64_t file_time = get_timestamp_from_str(file_time_str);
+    int64_t current_time = time(nullptr);
+    if (current_time - file_time <= FLAGS_tera_master_gc_trash_expire_time_s) {
+        return leveldb::Status::Corruption("file not expired");
+    }
+    leveldb::Status s = env->DeleteFile(file_path);
+    if (!s.ok()) {
+        LOG(ERROR) << "[gc] fail to delete expired trash file: " << file_path
+            <<" status: " << s.ToString();
+        return s;
+    }
+    LOG(INFO) << "[gc] delete expired trash file: " << file_path
+        << ", file added to trash time: " << get_time_str(file_time)
+        << ", current time: " << get_time_str(current_time);
+    return s;
+}
+
+void CleanTrackableGcTrash() {
+    leveldb::Status s;
+    leveldb::Env* env = LeveldbBaseEnv();
+    std::string trash_dir = GetTrackableGcTrashDir();
+    // Walks trash_dir/<table>/<tablet>/<lg>/<file>: deletes expired files, then each dir whose listed children were all deleted.
+    s = env->FileExists(trash_dir);
+    if (s.IsNotFound()) {
+        LOG(INFO) << "[gc] skip empty trash dir: " << trash_dir
+            <<" status: " << s.ToString();
+        return;
+    }
+
+    std::vector<std::string> tables;
+    s = env->GetChildren(trash_dir, &tables);
+    if (!s.ok()) {
+        LOG(ERROR) << "[gc] fail to list trash dir: " << trash_dir
+            <<" status: " << s.ToString();
+        return;
+    }
+
+    for (const auto& table : tables) {
+        std::string table_path = trash_dir + "/" + table;
+        std::vector<std::string> tablets;
+        s = env->GetChildren(table_path, &tablets);
+        if (!s.ok()) {
+            LOG(ERROR) << "[gc] skip due to fail to list table dir: " << table_path
+                <<" status: " << s.ToString();
+            continue;
+        }
+        // number of tablet dirs under this table that get removed below
+        size_t deleted_empty_tablet_num = 0;
+        for (const auto& tablet : tablets) {
+            std::string tablet_path = table_path + "/" + tablet;
+            std::vector<std::string> lgs;
+            s = env->GetChildren(tablet_path, &lgs);
+            if (!s.ok()) {
+                LOG(ERROR) << "[gc] skip due to fail to list tablet dir: " << tablet_path
+                    <<" status: " << s.ToString();
+                continue;
+            }
+            // number of lg dirs under this tablet that get removed below
+            size_t deleted_empty_lg_num = 0;
+            for (const auto& lg : lgs) {
+                std::string lg_path = tablet_path + "/" + lg;
+                std::vector<std::string> files;
+                s = env->GetChildren(lg_path, &files);
+                if (!s.ok()) {
+                    LOG(ERROR) << "[gc] skip due to fail to list lg dir: " << lg_path
+                        <<" status: " << s.ToString();
+                    continue;
+                }
+                // a file counts as deleted only when DeleteTrashFileIfExpired() returns ok
+                size_t deleted_file_num = 0;
+                for (const auto& file : files) {
+                    std::string file_path = lg_path + "/" + file;
+                    if (DeleteTrashFileIfExpired(file_path).ok()) {
+                        ++deleted_file_num;
+                    }
+                }
+                if (TryDeleteEmptyDir(lg_path, files.size(), deleted_file_num)) {
+                    ++ deleted_empty_lg_num;
+                }
+            }
+            if (TryDeleteEmptyDir(tablet_path, lgs.size(), deleted_empty_lg_num)) {
+                ++ deleted_empty_tablet_num;
+            }
+        }
+        TryDeleteEmptyDir(table_path, tablets.size(), deleted_empty_tablet_num);
+    }
+
+    return;
+}
+
 leveldb::Status DeleteEnvDir(const std::string& dir) {
     leveldb::Status s;
     static bool is_support_rmdir = true;
diff --git a/src/io/utils_leveldb.h b/src/io/utils_leveldb.h
index 307c270aa..9654ce5b9 100644
--- a/src/io/utils_leveldb.h
+++ b/src/io/utils_leveldb.h
@@ -30,10 +30,25 @@ leveldb::Env* LeveldbMockEnv();
 
 std::string GetTrashDir();
 
+std::string GetTrackableGcTrashDir();
+
 bool MoveEnvDirToTrash(const std::string& subdir);
 
+leveldb::Status MoveSstToTrackableGcTrash(const std::string& table_name,
+                                          uint64_t tablet_id,
+                                          uint32_t lg_id,
+                                          uint64_t file_id);
+
 void CleanTrashDir();
 
+bool TryDeleteEmptyDir(const std::string& dir_path,
+                       size_t total_children_size,
+                       size_t deleted_children_size);
+
+leveldb::Status DeleteTrashFileIfExpired(const std::string& file_path);
+
+void CleanTrackableGcTrash();
+
 leveldb::Status DeleteEnvDir(const std::string& subdir);
 
 } // namespace io
diff --git a/src/lbcli_main.cc b/src/lbcli_main.cc
new file mode 100644
index 000000000..c1149812c
--- /dev/null
+++ b/src/lbcli_main.cc
@@ -0,0 +1,314 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <readline/history.h>
+#include <readline/readline.h>
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <sstream>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "common/thread_pool.h"
+#include "common/base/string_ext.h"
+#include "common/base/string_number.h"
+#include "common/console/progress_bar.h"
+#include "common/file/file_path.h"
+#include "io/coding.h"
+#include "proto/kv_helper.h"
+#include "proto/lb_client.h"
+#include "proto/load_balancer_rpc.pb.h"
+#include "proto/proto_helper.h"
+#include "proto/tabletnode.pb.h"
+#include "proto/tabletnode_client.h"
+#include "sdk/client_impl.h"
+#include "sdk/cookie.h"
+#include "sdk/sdk_utils.h"
+#include "sdk/sdk_zk.h"
+#include "sdk/table_impl.h"
+#include "tera.h"
+#include "types.h"
+#include "utils/crypt.h"
+#include "utils/string_util.h"
+#include "utils/tprinter.h"
+#include "utils/utils_cmd.h"
+#include "version.h"
+
+DECLARE_string(flagfile);
+
+// using FLAGS instead of isatty() for compatibility
+DEFINE_bool(stdout_is_tty, true, "is stdout connected to a tty");
+DEFINE_bool(reorder_tablets, false, "reorder tablets by ts list");
+DEFINE_bool(readable, true, "readable input");
+
+DECLARE_string(tera_lb_server_addr);
+DECLARE_string(tera_lb_server_port);
+
+tera::TPrinter::PrintOpt g_printer_opt;
+
+using namespace tera;
+
+typedef std::shared_ptr<Table> TablePtr;
+typedef std::shared_ptr<TableImpl> TableImplPtr;
+typedef std::map<std::string, int32_t(*)(Client*, int32_t, std::string*, ErrorCode*)> CommandTable;
+
+static CommandTable& GetCommandTable() {
+    static CommandTable s_commands;  // function-local static: one table shared by every caller
+    return s_commands;
+}
+
+static std::string GetServerAddr() {
+    return FLAGS_tera_lb_server_addr + ":" + FLAGS_tera_lb_server_port;  // "<addr>:<port>" built from the two lb gflags
+}
+
+const char* builtin_cmd_list[] = {  // flat list of pairs: even index = command name, odd index = its help text
+    "safemode",
+    "safemode [enter | leave | get]",
+
+    "help",
+    "help [cmd]                                                           \n\
+          show manual for a or all cmd(s)",
+
+    "version",
+    "version                                                              \n\
+             show version info",
+};
+
+static void PrintCmdHelpInfo(const char* msg) {
+    // Prints the help text of the builtin command named msg (at most the first 32
+    // chars are compared); prints nothing for NULL or an unknown name.
+    const int total = sizeof(builtin_cmd_list) / sizeof(char*);
+    for (int name_idx = 0; msg != NULL && name_idx < total; name_idx += 2) {
+        if (strncmp(msg, builtin_cmd_list[name_idx], 32) != 0) {
+            continue;
+        }
+        std::cout << builtin_cmd_list[name_idx + 1] << std::endl;
+        return;
+    }
+}
+
+static void PrintCmdHelpInfo(const std::string& msg) {
+    PrintCmdHelpInfo(msg.c_str());  // forwards to the const char* overload
+}
+
+static void PrintAllCmd() {
+    // Lists every builtin command name, two left-aligned 20-char columns per row.
+    std::cout << "there is cmd list:" << std::endl;
+    const int total = sizeof(builtin_cmd_list) / sizeof(char*);
+    bool row_full = false;  // true once the first column of the current row is printed
+    for (int name_idx = 0; name_idx < total; name_idx += 2) {
+        std::cout << std::setiosflags(std::ios::left) << std::setw(20)
+                  << builtin_cmd_list[name_idx];
+        if (row_full) {
+            std::cout << std::endl;
+        }
+        row_full = !row_full;
+    }
+
+    std::cout << std::endl << "help [cmd] for details." << std::endl;
+}
+
+// Suggests builtin commands close to msg by edit distance; returns false if none is similar.
+static bool PromptSimilarCmd(const char* msg) {
+    if (msg == NULL) {
+        return false;
+    }
+    const int64_t msg_len = strlen(msg);
+    const int64_t threshold = int64_t((msg_len * 0.3 < 3) ? 3 : msg_len * 0.3);
+    const int total = sizeof(builtin_cmd_list) / sizeof(char*);
+    int hits = 0;
+    for (int name_idx = 0; name_idx < total; name_idx += 2) {
+        if (EditDistance(msg, builtin_cmd_list[name_idx]) > threshold) {
+            continue;
+        }
+        if (hits++ == 0) {
+            std::cout << "Did you mean:" << std::endl;
+        }
+        std::cout << "    " << builtin_cmd_list[name_idx] << std::endl;
+    }
+    return hits > 0;
+}
+
+static void PrintUnknownCmdHelpInfo(const char* msg) {
+    // Reports msg as invalid, then shows similar commands or, failing that, the full list.
+    if (msg != NULL) {
+        std::cout << "'" << msg << "' is not a valid command." << std::endl << std::endl;
+        if (PromptSimilarCmd(msg)) {
+            return;
+        }
+    }
+    PrintAllCmd();
+}
+
+int32_t SafemodeOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
+    // Handles "safemode [enter | leave | get]": forwards the sub-command to the
+    // load balancer server at GetServerAddr() through a CmdCtrl call.
+    //   client: not used by this command
+    //   argv:   argv[1] is the command name, argv[2] the sub-command
+    //   err:    set to kSystem with the failure reason when the call fails or
+    //           the response carries a status other than kLoadBalancerOk
+    // Returns 0 on success, -1 on bad usage or failure; "get" prints true/false.
+    // bad usage: sub-command missing or not one of the three known ones
+    const bool known_op = argc >= 3 &&
+        (argv[2] == "get" || argv[2] == "leave" || argv[2] == "enter");
+    if (!known_op) {
+        PrintCmdHelpInfo(argv[1]);
+        return -1;
+    }
+    const std::string op = argv[2];
+
+    load_balancer::LBClient lb_client(GetServerAddr());
+    CmdCtrlRequest request;
+    CmdCtrlResponse response;
+
+    request.set_sequence_id(0);
+    request.set_command("safemode");
+    request.add_arg_list(op);
+
+    string reason;
+    if (!lb_client.CmdCtrl(&request, &response)) {
+        reason = "fail to CmdCtrl";
+    } else if (response.status() != tera::kLoadBalancerOk) {
+        reason = StatusCodeToString(response.status());
+    } else {
+        // success: only "get" has a result to show
+        if (op == "get") {
+            const char* shown = response.bool_result() ? "true" : "false";
+            std::cout << shown << std::endl;
+        }
+        return 0;
+    }
+
+    // both failure kinds are reported the same way: log, stdout and err
+    LOG(ERROR) << reason;
+    std::cout << reason << std::endl;
+    err->SetFailed(ErrorCode::kSystem, reason);
+    return -1;
+}
+
+int32_t HelpOp(Client*, int32_t argc, std::string* argv, ErrorCode*) {
+    // argc == 2 lists all commands, argc == 3 shows the help of argv[2];
+    // any other argument count shows the usage of "help" itself. Always returns 0.
+    switch (argc) {
+        case 2:  PrintAllCmd(); break;
+        case 3:  PrintCmdHelpInfo(argv[2]); break;
+        default: PrintCmdHelpInfo("help"); break;
+    }
+    return 0;
+}
+
+int32_t HelpOp(int32_t argc, char** argv) {
+    std::vector<std::string> args(argv, argv + argc);  // copy so the std::string* overload can be reused
+    return HelpOp(NULL, argc, args.data(), NULL);
+}
+
+bool ParseCommand(int argc, char** arg_list, std::vector<std::string>* parsed_arg_list) {
+    for (int arg_idx = 0; arg_idx < argc; ++arg_idx) {
+        std::string decoded = arg_list[arg_idx];  // used as-is when --readable is off
+        if (FLAGS_readable && !ParseDebugString(arg_list[arg_idx], &decoded)) {
+            std::cout << "invalid debug format of argument: " << arg_list[arg_idx] << std::endl;
+            return false;  // stop at the first argument that fails to parse
+        }
+        parsed_arg_list->push_back(decoded);
+    }
+    return true;
+}
+
+static void InitializeCommandTable() {
+    // Registers the handlers that ExecuteCommand looks up by command name.
+    GetCommandTable()["safemode"] = SafemodeOp;
+    GetCommandTable()["help"] = HelpOp;
+}
+
+int ExecuteCommand(Client* client, int argc, char** arg_list) {
+    // Runs the command named by the second argument; returns its result, or 1 on parse failure or unknown command.
+    int ret = 0;
+    ErrorCode error_code;
+    std::vector<std::string> parsed_arg_list;
+    if (!ParseCommand(argc, arg_list, &parsed_arg_list)) {
+        return 1;
+    }
+    std::string* argv = &parsed_arg_list[0];
+    const std::string cmd = argv[1];
+    CommandTable& command_table = GetCommandTable();
+    CommandTable::iterator handler = command_table.find(cmd);
+    if (cmd == "version") {
+        PrintSystemVersion();
+    } else if (handler != command_table.end()) {
+        ret = handler->second(client, argc, argv, &error_code);
+    } else {
+        PrintUnknownCmdHelpInfo(cmd.c_str());
+        ret = 1;
+    }
+
+    if (error_code.GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "fail reason: " << error_code.ToString();
+    }
+    return ret;
+}
+
+int main(int argc, char* argv[]) {  // runs one command taken from argv, or an interactive "lb> " prompt when none is given
+    FLAGS_minloglevel = 2;
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    // "version" and "help" are answered before any client is created
+    if (argc > 1 && std::string(argv[1]) == "version") {
+        PrintSystemVersion();
+        return 0;
+    } else if (argc > 1 && std::string(argv[1]) == "help") {
+        HelpOp(argc, argv);
+        return 0;
+    }
+
+    Client* client = Client::NewClient(FLAGS_flagfile, NULL);
+    if (client == NULL) {
+        LOG(ERROR) << "client instance not exist";
+        return -1;
+    }
+    g_printer_opt.print_head = FLAGS_stdout_is_tty;
+
+    InitializeCommandTable();
+    // ret holds the result of the last executed command and becomes the exit code
+    int ret  = 0;
+    if (argc == 1) {
+        char* line = NULL;
+        while ((line = readline("lb> ")) != NULL) {
+            char* line_copy = strdup(line);  // strtok_r below modifies line; the untouched copy goes to the history
+            std::vector<char*> arg_list;
+            arg_list.push_back(argv[0]);  // keep the program name in slot 0, as in a real argv
+            char* tmp = NULL;
+            char* token = strtok_r(line, " \t", &tmp);
+            while (token != NULL) {
+                arg_list.push_back(token);
+                token = strtok_r(NULL, " \t", &tmp);
+            }
+            if (arg_list.size() == 2 &&
+                (strcmp(arg_list[1], "quit") == 0 || strcmp(arg_list[1], "exit") == 0)) {
+                free(line_copy);
+                free(line);
+                break;
+            }
+            if (arg_list.size() > 1) {  // size 1 means the line had no tokens
+                add_history(line_copy);
+                ret = ExecuteCommand(client, arg_list.size(), &arg_list[0]);
+            }
+            free(line_copy);
+            free(line);
+        }
+    } else {
+        ret = ExecuteCommand(client, argc, argv);
+    }
+
+    delete client;
+    return ret;
+}
diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile
index c9162d2eb..175c916dc 100644
--- a/src/leveldb/Makefile
+++ b/src/leveldb/Makefile
@@ -19,7 +19,7 @@ include ../../depends.mk
 include build_config.mk
 
 CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
-CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
+CXXFLAGS += -std=c++11 -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT)
 
 LDFLAGS += $(PLATFORM_LDFLAGS) -L$(SNAPPY_LIBDIR) -lrt -ldl -lsnappy
 LIBS += $(PLATFORM_LIBS)
diff --git a/src/leveldb/build_detect_platform b/src/leveldb/build_detect_platform
index 8e230d950..325dfaf01 100755
--- a/src/leveldb/build_detect_platform
+++ b/src/leveldb/build_detect_platform
@@ -22,7 +22,6 @@
 #
 #       -DLEVELDB_CSTDATOMIC_PRESENT if <cstdatomic> is present
 #       -DLEVELDB_PLATFORM_POSIX     for Posix-based platforms
-#       -DSNAPPY                     if the Snappy library is present
 #
 
 OUTPUT=$1
@@ -176,15 +175,6 @@ EOF
         COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX"
     fi
 
-    # Test whether Snappy library is installed
-    # http://code.google.com/p/snappy/
-    $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT 2>/dev/null  <<EOF
-      #include <snappy.h>
-      int main() {}
-EOF
-    COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
-    PLATFORM_LIBS="$PLATFORM_LIBS"
-
     # Test whether tcmalloc is available
     $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc_minimal 2>/dev/null  <<EOF
       int main() {}
diff --git a/src/leveldb/db/builder.cc b/src/leveldb/db/builder.cc
index fdbae74af..0d89df1bc 100644
--- a/src/leveldb/db/builder.cc
+++ b/src/leveldb/db/builder.cc
@@ -110,6 +110,7 @@ Status BuildTable(const std::string& dbname,
       *saved_size = 0;
       if (s.ok() && builder->NumEntries()) {
         meta->file_size = builder->FileSize();
+        meta->data_size = meta->file_size;
         assert(meta->file_size > 0);
         *saved_size = builder->SavedSize();
 
@@ -164,7 +165,7 @@ Status BuildTable(const std::string& dbname,
 
   if (s.ok() && meta->file_size > 0) {
     // Keep it
-  } else {
+  } else if (!s.IsIOPermissionDenied()) {
     env->DeleteFile(fname);
   }
   return s;
diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc
index c076008de..e8758a8ce 100644
--- a/src/leveldb/db/db_impl.cc
+++ b/src/leveldb/db/db_impl.cc
@@ -9,13 +9,14 @@
 #include "db/db_impl.h"
 
 #include <iostream>
-
 #include <algorithm>
 #include <set>
 #include <string>
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
+#include <thread>
+
 #include "db/builder.h"
 #include "db/db_iter.h"
 #include "db/dbformat.h"
@@ -44,11 +45,17 @@
 
 namespace leveldb {
 
+extern Status WriteStringToFileSync(Env* env, const Slice& data,
+                                    const std::string& fname);
+
 const int kNumNonTableCacheFiles = 10;
 
 // if this file exists, ignore error in db-opening
 const static std::string mark_file_name = "/__oops";
 
+// if this file exists,  
+const static std::string init_load_filelock = "/__init_load_filelock";
+
 // Information kept for every waiting writer
 struct DBImpl::Writer {
   WriteBatch* batch;
@@ -87,6 +94,7 @@ struct DBImpl::CompactionState {
   TableBuilder* builder;
 
   uint64_t total_bytes;
+  Status status;
 
   Output* current_output() { return &outputs[outputs.size()-1]; }
 
@@ -129,8 +137,10 @@ Options SanitizeOptions(const std::string& dbname,
     result.block_cache = NewLRUCache(8 << 20);
   }
 
+  if (result.ignore_corruption_in_open) {
+    Log(result.info_log, "[%s] caution: open with ignore_corruption_in_open", dbname.c_str());
+  }
   {
-    // Maybe mark error flag in option
     std::string oops = dbname + mark_file_name;
     Status s = src.env->FileExists(oops);
     if (s.ok()) {
@@ -139,7 +149,6 @@ Options SanitizeOptions(const std::string& dbname,
     }
     // Ignore error from FileExists since there is no harm
   }
-
   return result;
 }
 
@@ -166,10 +175,6 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
       logfile_number_(0),
       log_(NULL),
       bound_log_size_(0),
-      bg_compaction_scheduled_(false),
-      bg_compaction_score_(0),
-      bg_compaction_timeout_(0),
-      bg_schedule_id_(0),
       manual_compaction_(NULL),
       consecutive_compaction_errors_(0),
       flush_on_destroy_(false) {
@@ -188,6 +193,11 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
                              &internal_comparator_);
 }
 
+bool DBImpl::ShouldForceUnloadOnError() {
+    MutexLock guard(&mutex_);  // the MutexLock on mutex_ stays in scope while bg_error_ is read
+    return bg_error_.IsIOPermissionDenied();
+}
+
 Status DBImpl::Shutdown1() {
   assert(state_ == kOpened);
   state_ = kShutdown1;
@@ -196,12 +206,17 @@ Status DBImpl::Shutdown1() {
   shutting_down_.Release_Store(this);  // Any non-NULL value is ok
 
   Log(options_.info_log, "[%s] wait bg compact finish", dbname_.c_str());
-  if (bg_compaction_scheduled_) {
-    env_->ReSchedule(bg_schedule_id_, kDumpMemTableUrgentScore, 0);
+  std::vector<CompactionTask*>::iterator it = bg_compaction_tasks_.begin();
+  for (; it != bg_compaction_tasks_.end(); ++it) {
+    env_->ReSchedule((*it)->id, kDumpMemTableUrgentScore, 0);
   }
-  while (bg_compaction_scheduled_) {
+  while (bg_compaction_tasks_.size() > 0) {
     bg_cv_.Wait();
   }
+  // has encountered an IOPermissionDenied error: return immediately and do not try to compact the memory table anymore
+  if (bg_error_.IsIOPermissionDenied()) {
+      return bg_error_;
+  }
 
   Status s;
   if (!options_.dump_mem_on_shutdown) {
@@ -231,6 +246,9 @@ Status DBImpl::Shutdown2() {
   state_ = kShutdown2;
 
   MutexLock l(&mutex_);
+  if(bg_error_.IsIOPermissionDenied()) {
+      return bg_error_;
+  }
   Status s;
   if (!options_.dump_mem_on_shutdown) {
     return s;
@@ -315,26 +333,30 @@ void DBImpl::MaybeIgnoreError(Status* s) const {
 }
 
 void DBImpl::DeleteObsoleteFiles() {
+  mutex_.AssertHeld();
   if (!bg_error_.ok()) {
     // After a background error, we don't know whether a new version may
     // or may not have been committed, so we cannot safely garbage collect.
     return;
   }
 
+  // check filesystem, and then check pending_outputs_
+  std::vector<std::string> filenames;
+  mutex_.Unlock();
+  env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
+  mutex_.Lock();
+
   // Make a set of all of the live files
   std::set<uint64_t> live = pending_outputs_;
   versions_->AddLiveFiles(&live);
 
   // manifest file set, keep latest 3 manifest files for backup
-  std::set<std::string> manifest_set;
+  //std::set<std::string> manifest_set;
 
-  Log(options_.info_log, "[%s] try DeleteObsoleteFiles, total live file num: %llu\n",
-      dbname_.c_str(), static_cast<unsigned long long>(live.size()));
-
-  std::vector<std::string> filenames;
-  mutex_.Unlock();
-  env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose
-  mutex_.Lock();
+  Log(options_.info_log, "[%s] try DeleteObsoleteFiles, total live file num: %llu,"
+      " pending_outputs %lu, children_nr %lu\n",
+      dbname_.c_str(), static_cast<unsigned long long>(live.size()),
+      pending_outputs_.size(), filenames.size());
   uint64_t number;
   FileType type;
   for (size_t i = 0; i < filenames.size(); i++) {
@@ -345,28 +367,28 @@ void DBImpl::DeleteObsoleteFiles() {
           keep = ((number >= versions_->LogNumber()) ||
                   (number == versions_->PrevLogNumber()));
           break;
-        case kDescriptorFile:
-          manifest_set.insert(filenames[i]);
-          if (manifest_set.size() > 3) {
-              std::set<std::string>::iterator it = manifest_set.begin();
-              ParseFileName(*it, &number, &type);
-              if (number < versions_->ManifestFileNumber()) {
-                // Keep my manifest file, and any newer incarnations'
-                // (in case there is a race that allows other incarnations)
-                filenames[i] = *it;
-                keep = false;
-                manifest_set.erase(it);
-              }
-          }
-          break;
+        //case kDescriptorFile:
+        //  manifest_set.insert(filenames[i]);
+        //  if (manifest_set.size() > 3) {
+        //      std::set<std::string>::iterator it = manifest_set.begin();
+        //      ParseFileName(*it, &number, &type);
+        //      if (number < versions_->ManifestFileNumber()) {
+        //        // Keep my manifest file, and any newer incarnations'
+        //        // (in case there is a race that allows other incarnations)
+        //        filenames[i] = *it;
+        //        keep = false;
+        //        manifest_set.erase(it);
+        //      }
+        //  }
+        //  break;
         case kTableFile:
           keep = (live.find(BuildFullFileNumber(dbname_, number)) != live.end());
           break;
-        case kTempFile:
-          // Any temp files that are currently being written to must
-          // be recorded in pending_outputs_, which is inserted into "live"
-          keep = (live.find(number) != live.end());
-          break;
+        //case kTempFile:
+        //  // Any temp files that are currently being written to must
+        //  // be recorded in pending_outputs_, which is inserted into "live"
+        //  keep = (live.find(number) != live.end());
+        //  break;
         case kCurrentFile:
         case kDBLockFile:
         case kInfoLogFile:
@@ -381,9 +403,9 @@ void DBImpl::DeleteObsoleteFiles() {
         if (type == kTableFile) {
           table_cache_->Evict(dbname_, BuildFullFileNumber(dbname_, number));
         }
-        Log(options_.info_log, "[%s] Delete type=%s #%lld\n",
+        Log(options_.info_log, "[%s] Delete type=%s #%lld, fname %s\n",
             dbname_.c_str(), FileTypeToString(type),
-            static_cast<unsigned long long>(number));
+            static_cast<unsigned long long>(number), filenames[i].c_str());
         mutex_.Unlock();
         env_->DeleteFile(dbname_ + "/" + filenames[i]);
         mutex_.Lock();
@@ -548,15 +570,7 @@ Status DBImpl::DbExists(bool* exists) {
 
 Status DBImpl::Recover(VersionEdit* edit) {
   mutex_.AssertHeld();
-  if (options_.ignore_corruption_in_open) {
-      Status s = env_->DeleteFile(dbname_ + mark_file_name);
-      if (!s.ok()) {
-        // legacy mark-file is dangerous
-        Log(options_.info_log, "[%s] delete mark-file failed for %s",
-            dbname_.c_str(), s.ToString().c_str());
-        return Status::IOError("delete mark-file failed");
-      }
-  }
+  bool need_newdb_txn = false;
 
   {
       Status s = env_->FileExists(dbname_);
@@ -567,14 +581,24 @@ Status DBImpl::Recover(VersionEdit* edit) {
               dbname_.c_str(), s.ToString().c_str());
           return s;
         }
+        need_newdb_txn = true;
       } else if (s.ok()) {
-        // Directory exists, do nothing
+        // lg directory exists and corruption is not ignored in open
+        if (!options_.ignore_corruption_in_open) {
+          s = env_->FileExists(dbname_ + init_load_filelock);
+          if (s.ok()) {
+            need_newdb_txn = true;
+          } else if (!s.IsNotFound()) {
+            // Unknown status
+            return s;
+          }
+        }
       } else {
         // Unknown status
         return s;
       }
   }
-
+  
   if (options_.use_file_lock) {
     Status s = env_->LockFile(LockFileName(dbname_), &db_lock_);
     if (!s.ok()) {
@@ -582,6 +606,36 @@ Status DBImpl::Recover(VersionEdit* edit) {
     }
   }
 
+  if (options_.ignore_corruption_in_open) {
+    Status s = env_->FileExists(dbname_ + init_load_filelock);
+    if (s.ok()) {
+      s = env_->DeleteFile(dbname_ + init_load_filelock);
+      if (!s.ok()) {
+        // legacy initlock-file is dangerous
+        Log(options_.info_log, "[%s] delete initlock-file failed for %s",
+            dbname_.c_str(), s.ToString().c_str());
+        return Status::IOError("delete initlock-file failed");
+      }
+    }
+    s = env_->FileExists(dbname_ + mark_file_name);
+    if (s.ok()) {
+      s = env_->DeleteFile(dbname_ + mark_file_name);
+      if (!s.ok()) {
+        // legacy mark-file is dangerous
+        Log(options_.info_log, "[%s] delete mark-file failed for %s",
+            dbname_.c_str(), s.ToString().c_str());
+        return Status::IOError("delete mark-file failed");
+      }
+    }
+  }
+
+  if (need_newdb_txn) {
+    Status s = BeginNewDbTransaction();
+    if (!s.ok()) {
+      return s;
+    }
+  }
+
   bool db_exists;
   Status s = DbExists(&db_exists);
   if (!s.ok()) {
@@ -664,7 +718,12 @@ Status DBImpl::Recover(VersionEdit* edit) {
       }
     }
   }
-
+  if (need_newdb_txn) {
+    Status s = CommitNewDbTransaction();
+    if (!s.ok()) {
+      return s;
+    }
+  }
   if (s.ok()) {
     state_ = kOpened;
   }
@@ -672,11 +731,14 @@ Status DBImpl::Recover(VersionEdit* edit) {
 }
 
 Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
-                                Version* base) {
+                                Version* base, uint64_t* number) {
   mutex_.AssertHeld();
   const uint64_t start_micros = env_->NowMicros();
   FileMetaData meta;
   meta.number = BuildFullFileNumber(dbname_, versions_->NewFileNumber());
+  if (number) {
+    *number = meta.number;
+  }
   pending_outputs_.insert(meta.number);
   Iterator* iter = mem->NewIterator();
   Log(options_.info_log, "[%s] Level-0 table #%u: started",
@@ -724,15 +786,39 @@ Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit,
   return s;
 }
 
-Status DBImpl::CompactMemTable() {
+// multithread safe
+Status DBImpl::CompactMemTable(bool* sched_idle) {
   mutex_.AssertHeld();
   assert(imm_ != NULL);
+  Status s;
+  if (sched_idle) {
+      *sched_idle = true;
+  }
+  if (imm_->BeingFlushed()) {
+    //Log(options_.info_log, "[%s] CompactMemTable conflict, seq %lu",
+    //    dbname_.c_str(), GetLastSequence(false));
+    return s;
+  }
+  imm_->SetBeingFlushed(true);
+
+  if (imm_->ApproximateMemoryUsage() <= 0) { // imm is empty, do nothing
+    Log(options_.info_log, "[%s] CompactMemTable empty memtable %lu",
+        dbname_.c_str(), GetLastSequence(false));
+    imm_->Unref();
+    imm_ = NULL;
+    has_imm_.Release_Store(NULL);
+    return s;
+  }
+  if (sched_idle) {
+    *sched_idle = false;
+  }
 
   // Save the contents of the memtable as a new Table
   VersionEdit edit;
+  uint64_t number;
   Version* base = versions_->current();
   base->Ref();
-  Status s = WriteLevel0Table(imm_, &edit, base);
+  s = WriteLevel0Table(imm_, &edit, base, &number);
   base->Unref();
 
   if (s.ok() && shutting_down_.Acquire_Load()) {
@@ -741,6 +827,7 @@ Status DBImpl::CompactMemTable() {
 
   // Replace immutable memtable with the generated Table
   if (s.ok()) {
+    pending_outputs_.insert(number); // LogAndApply donot holds lock, so use pending_outputs_ to make sure new file will not be deleted
     edit.SetPrevLogNumber(0);
     edit.SetLogNumber(logfile_number_);  // Earlier logs no longer needed
     if (imm_->GetLastSequence()) {
@@ -749,6 +836,7 @@ Status DBImpl::CompactMemTable() {
     Log(options_.info_log, "[%s] CompactMemTable SetLastSequence %lu",
         dbname_.c_str(), edit.GetLastSequence());
     s = versions_->LogAndApply(&edit, &mutex_);
+    pending_outputs_.erase(number);
   }
 
   if (s.ok()) {
@@ -756,6 +844,9 @@ Status DBImpl::CompactMemTable() {
     imm_->Unref();
     imm_ = NULL;
     has_imm_.Release_Store(NULL);
+  } else {
+    // imm dump failed, reset the being-flushed flag
+    imm_->SetBeingFlushed(false);
   }
 
   return s;
@@ -787,6 +878,8 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
   ManualCompaction manual;
   manual.level = level;
   manual.done = false;
+  manual.being_sched = false;
+  manual.compaction_conflict = kManualCompactIdle;
   if (begin == NULL) {
     manual.begin = NULL;
   } else {
@@ -805,6 +898,9 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
     if (manual_compaction_ == NULL) { // Idle
         manual_compaction_ = &manual;
         MaybeScheduleCompaction();
+    } else if (manual_compaction_->compaction_conflict == kManualCompactWakeup) {
+        manual_compaction_->compaction_conflict = kManualCompactIdle;
+        MaybeScheduleCompaction();
     } else { // Running either my compaction or another compaction.
         bg_cv_.Wait();
     }
@@ -877,117 +973,144 @@ void DBImpl::AddInheritedLiveFiles(std::vector<std::set<uint64_t> >* live) {
 }
 
 Status DBImpl::RecoverInsertMem(WriteBatch* batch, VersionEdit* edit) {
-    MutexLock lock(&mutex_);
+  MutexLock lock(&mutex_);  // replays one recovered WriteBatch into recover_mem_, with mutex_ held throughout
 
-    if (recover_mem_ == NULL) {
-        recover_mem_ = NewMemTable();
-        recover_mem_->Ref();
-    }
-    uint64_t log_sequence = WriteBatchInternal::Sequence(batch);
-    uint64_t last_sequence = log_sequence + WriteBatchInternal::Count(batch) - 1;
+  if (recover_mem_ == NULL) {  // create the recovery memtable on first use
+    recover_mem_ = NewMemTable();
+    recover_mem_->Ref();
+  }
+  uint64_t log_sequence = WriteBatchInternal::Sequence(batch);
+  uint64_t last_sequence = log_sequence + WriteBatchInternal::Count(batch) - 1;  // last sequence number covered by batch
 
-    // if duplicate record, ignore
-    if (log_sequence <= recover_mem_->GetLastSequence()) {
-        assert (last_sequence <= recover_mem_->GetLastSequence());
-        Log(options_.info_log, "[%s] duplicate record, ignore %lu ~ %lu",
-            dbname_.c_str(), log_sequence, last_sequence);
-        return Status::OK();
-    }
+  // The batch starts at or before the last sequence already in recover_mem_: it is a duplicate, ignore it
+  if (log_sequence <= recover_mem_->GetLastSequence()) {
+    assert (last_sequence <= recover_mem_->GetLastSequence());
+    Log(options_.info_log, "[%s] duplicate record, ignore %lu ~ %lu",
+        dbname_.c_str(), log_sequence, last_sequence);
+    return Status::OK();
+  }
 
-    Status status = WriteBatchInternal::InsertInto(batch, recover_mem_);
-    MaybeIgnoreError(&status);
+  Status status = WriteBatchInternal::InsertInto(batch, recover_mem_);
+  MaybeIgnoreError(&status);
+  if (!status.ok()) {
+    return status;
+  }
+  if (recover_mem_->ApproximateMemoryUsage() > options_.write_buffer_size) {  // memtable exceeds write_buffer_size: dump it as a level-0 table
+    edit->SetLastSequence(recover_mem_->GetLastSequence());
+    status = WriteLevel0Table(recover_mem_, edit, NULL);
     if (!status.ok()) {
-        return status;
-    }
-    if (recover_mem_->ApproximateMemoryUsage() > options_.write_buffer_size) {
-        edit->SetLastSequence(recover_mem_->GetLastSequence());
-        status = WriteLevel0Table(recover_mem_, edit, NULL);
-        if (!status.ok()) {
-            // Reflect errors immediately so that conditions like full
-            // file-systems cause the DB::Open() to fail.
-            return status;
-        }
-        recover_mem_->Unref();
-        recover_mem_ = NULL;
+      // Reflect errors immediately so that conditions like full
+      // file-systems cause the DB::Open() to fail.
+      return status;
     }
-    return status;
+    recover_mem_->Unref();
+    recover_mem_ = NULL;  // the next call creates a fresh memtable
+  }
+  return status;
 }
 
 Status DBImpl::RecoverLastDumpToLevel0(VersionEdit* edit) {
-    MutexLock lock(&mutex_);
-    Status status;
-    if (recover_mem_ == NULL) {
-        return status;
-    }
+  MutexLock lock(&mutex_);  // dumps what is left in recover_mem_ and applies edit to the manifest, with mutex_ held
+  Status s;
+  if (recover_mem_ != NULL) {
     if (recover_mem_->GetLastSequence() > 0) {
-        edit->SetLastSequence(recover_mem_->GetLastSequence());
-        status = WriteLevel0Table(recover_mem_, edit, NULL);
+      edit->SetLastSequence(recover_mem_->GetLastSequence());
+      s = WriteLevel0Table(recover_mem_, edit, NULL);  // write the remaining recovered entries as a level-0 table
     }
     recover_mem_->Unref();
     recover_mem_ = NULL;
-    return status;
-}
+  }
+  assert(recover_mem_ == NULL);  // released above whether or not the dump succeeded
 
+  // Apply the edit to the lg's manifest; only on success clean up obsolete files and schedule compaction
+  if (s.ok()) {
+    s = versions_->LogAndApply(edit, &mutex_);
+    if (s.ok()) {
+      DeleteObsoleteFiles();
+      MaybeScheduleCompaction();
+    } else {
+      Log(options_.info_log, "[%s] Fail to modify manifest",
+          dbname_.c_str());
+    }
+  } else {
+    Log(options_.info_log, "[%s] Fail to dump log to level 0", dbname_.c_str());
+  }
+  return s;  // first failure of the dump or the manifest update, OK otherwise
+}
 // end of tera-specific
 
+// Strict ordering for (score, timeout) pairs, used to rank compaction work:
+// the smaller timeout sorts first; on equal timeouts the larger score sorts first.
+bool ScoreSortGreater(std::pair<double, uint64_t> i, std::pair<double, uint64_t> j) {
+  const bool same_timeout = (i.second == j.second);
+  // .first is the score, .second is the timeout
+  return same_timeout ? (i.first > j.first) : (i.second < j.second);
+}
 void DBImpl::MaybeScheduleCompaction() {
   mutex_.AssertHeld();
   if (shutting_down_.Acquire_Load()) {
     // DB is being deleted; no more background compactions
+  } else if (bg_error_.IsIOPermissionDenied()) {
+    // A permission-denied error has been recorded: do not schedule any more compactions, the tablet will be unloaded soon
   } else {
-    uint64_t timeout = 0;
-    double score = versions_->CompactionScore(&timeout);
-    if (manual_compaction_ != NULL) {
-      score = kManualCompactScore;
-      timeout = 0;
-    }
-    if (imm_ != NULL) {
-      score = kDumpMemTableScore;
-      timeout = 0;
-    }
-    if (score > 0) {
-      if (!bg_compaction_scheduled_) {
-        bg_schedule_id_ = env_->Schedule(&DBImpl::BGWork, this, score, timeout);
-        Log(options_.info_log, "[%s] Schedule Compact[%ld] score= %.2f, timeout=%lu",
-            dbname_.c_str(), bg_schedule_id_, score, timeout);
-        bg_compaction_score_ = score;
-        bg_compaction_timeout_ = timeout;
-        bg_compaction_scheduled_ = true;
-        assert(score <= 1 || timeout == 0); // if score > 1, then timeout MUST be 0
-      } else {
-        // use the same way to compute priority score, like util/thread_pool.h
-        bool need_resched = false;
-        if (timeout != bg_compaction_timeout_) {
-          need_resched = timeout < bg_compaction_timeout_;
-        } else if (score != bg_compaction_score_) {
-          need_resched = score > bg_compaction_score_;
-        }
-
-        if (need_resched) {
-          env_->ReSchedule(bg_schedule_id_, score, timeout);
-          Log(options_.info_log, "[%s] ReSchedule Compact[%ld] score= %.2f, timeout=%lu",
-              dbname_.c_str(), bg_schedule_id_, score, timeout);
-          bg_compaction_score_ = score;
-          bg_compaction_timeout_ = timeout;
-          assert(score <= 1 || timeout == 0); // if score > 1, then timeout MUST be 0
+    std::vector<std::pair<double, uint64_t> > scores;  // candidate (score, timeout) pairs
+    if (imm_ && !imm_->BeingFlushed()) {  // an immutable memtable exists and nobody is flushing it yet
+      scores.push_back(std::pair<double, uint64_t>(kDumpMemTableScore, 0));
+    }
+    if (manual_compaction_ && !manual_compaction_->being_sched &&
+        (manual_compaction_->compaction_conflict != kManualCompactConflict)) {  // manual compaction pending, not running, not in conflict
+      scores.push_back(std::pair<double, uint64_t>(kManualCompactScore, 0));
+    }
+    versions_->CompactionScore(&scores);  // NOTE(review): presumably appends the version set's own candidates — confirm
+
+    size_t qlen = scores.size() > bg_compaction_tasks_.size() ? scores.size(): bg_compaction_tasks_.size();  // max(#candidates, #scheduled tasks), taken before the tasks' own scores are appended
+    for (size_t i = 0; i < bg_compaction_tasks_.size(); i++) {  // add the priorities of the already scheduled tasks
+      CompactionTask* task = bg_compaction_tasks_[i];
+      scores.push_back(std::pair<double, uint64_t>(task->score, task->timeout));
+    }
+    std::sort(scores.begin(), scores.end(), ScoreSortGreater);  // smaller timeout first, then larger score
+
+    for (size_t i = 0; i < qlen; i++) {
+      if (bg_compaction_tasks_.size() < options_.max_background_compactions) {  // nothing is (re)scheduled once the task limit is reached
+        if (i < bg_compaction_tasks_.size()) { // slot i already has a task: try to reschedule it
+          CompactionTask* task = bg_compaction_tasks_[i];
+          if (ScoreSortGreater(scores[i], std::pair<double, uint64_t>(task->score, task->timeout))) { // scores[i] ranks ahead of the task's current priority
+            task->score = scores[i].first;
+            task->timeout = scores[i].second;
+            env_->ReSchedule(task->id, task->score, task->timeout);
+            Log(options_.info_log, "[%s] ReSchedule Compact[%ld] score= %.2f, timeout=%lu, currency %d",
+              dbname_.c_str(), task->id, task->score, task->timeout, (int)bg_compaction_tasks_.size());
+            assert(scores[i].first <= 1 || scores[i].second == 0); // if score > 1, then timeout MUST be 0
+          }
+        } else { // no task in slot i: schedule a new compaction task
+          CompactionTask* task = new CompactionTask;  // deleted by BackgroundCall() when the task finishes
+          task->db = this;
+          task->score = scores[i].first;
+          task->timeout = scores[i].second;
+          bg_compaction_tasks_.push_back(task);
+          task->id = env_->Schedule(&DBImpl::BGWork, task, task->score, task->timeout);
+          Log(options_.info_log, "[%s] Schedule Compact[%ld] score= %.2f, timeout=%lu, currency %d",
+            dbname_.c_str(), task->id, task->score, task->timeout, (int)bg_compaction_tasks_.size());
+          assert(scores[i].first <= 1 || scores[i].second == 0); // if score > 1, then timeout MUST be 0
         }
       }
-    } else {
-      // No work to be done
     }
   }
+  return;
 }
 
-void DBImpl::BGWork(void* db) {
-  reinterpret_cast<DBImpl*>(db)->BackgroundCall();
+void DBImpl::BGWork(void* task) {  // scheduler callback; task is the CompactionTask passed to env_->Schedule()
+  CompactionTask* const scheduled = static_cast<CompactionTask*>(task);
+  reinterpret_cast<DBImpl*>(scheduled->db)->BackgroundCall(scheduled);
 }
 
-void DBImpl::BackgroundCall() {
-  Log(options_.info_log, "[%s] BackgroundCall", dbname_.c_str());
+void DBImpl::BackgroundCall(CompactionTask* task) {
   MutexLock l(&mutex_);
-  assert(bg_compaction_scheduled_);
+  Log(options_.info_log, "[%s] BackgroundCompact[%ld] score= %.2f currency %d",
+    dbname_.c_str(), task->id, task->score, (int)bg_compaction_tasks_.size());
+  bool sched_idle = false;
   if (!shutting_down_.Acquire_Load()) {
-    Status s = BackgroundCompaction();
+    Status s = BackgroundCompaction(&sched_idle);
     if (s.ok()) {
       // Success
       consecutive_compaction_errors_ = 0;
@@ -1001,12 +1124,12 @@ void DBImpl::BackgroundCall() {
       bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
       Log(options_.info_log, "[%s] Waiting after background compaction error: %s, retry: %d",
           dbname_.c_str(), s.ToString().c_str(), consecutive_compaction_errors_);
-      mutex_.Unlock();
       ++consecutive_compaction_errors_;
-      if (consecutive_compaction_errors_ > 100000) {
+      if (s.IsIOPermissionDenied() || consecutive_compaction_errors_ > 100000) {
           bg_error_ = s;
           consecutive_compaction_errors_ = 0;
       }
+      mutex_.Unlock();
       int seconds_to_sleep = 1;
       for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) {
         seconds_to_sleep *= 2;
@@ -1014,36 +1137,53 @@ void DBImpl::BackgroundCall() {
       env_->SleepForMicroseconds(seconds_to_sleep * 1000000);
       mutex_.Lock();
     }
+  } else {
+    sched_idle = true;
   }
 
-  bg_compaction_scheduled_ = false;
+  std::vector<CompactionTask*>::iterator task_id = std::find(bg_compaction_tasks_.begin(),
+                                                            bg_compaction_tasks_.end(),
+                                                            task);
+  assert(task_id != bg_compaction_tasks_.end());
+  bg_compaction_tasks_.erase(task_id);
+  delete task;
 
   // Previous compaction may have produced too many files in a level,
   // so reschedule another compaction if needed.
-  MaybeScheduleCompaction();
+  if (!sched_idle) {
+    MaybeScheduleCompaction();
+  }
   bg_cv_.SignalAll();
 }
 
-Status DBImpl::BackgroundCompaction() {
+Status DBImpl::BackgroundCompaction(bool* sched_idle) {
   mutex_.AssertHeld();
 
-  if (imm_ != NULL) {
-    return CompactMemTable();
+  *sched_idle = false;
+  if (imm_ && !imm_->BeingFlushed()) {
+    return CompactMemTable(sched_idle);
   }
 
-  Compaction* c;
+  Status status;
+  Compaction* c = NULL;
   bool is_manual = (manual_compaction_ != NULL);
   InternalKey manual_end;
   if (is_manual) {
     ManualCompaction* m = manual_compaction_;
-    c = versions_->CompactRange(m->level, m->begin, m->end);
-    m->done = (c == NULL);
+    if (m->being_sched) { // other thread doing manual compaction or range being compacted
+      return status;
+    }
+    m->being_sched = true;
+    bool conflict = false;
+    c = versions_->CompactRange(m->level, m->begin, m->end, &conflict);
+    m->compaction_conflict = conflict? kManualCompactConflict : kManualCompactIdle;
+    m->done = (c == NULL && !conflict);
     if (c != NULL) {
       manual_end = c->input(0, c->num_input_files(0) - 1)->largest;
     }
     Log(options_.info_log,
-        "[%s] Manual compaction at level-%d from %s .. %s; will stop at %s\n",
-        dbname_.c_str(), m->level,
+        "[%s] Manual compaction, conflit %u, at level-%d from %s .. %s; will stop at %s\n",
+        dbname_.c_str(), conflict, m->level,
         (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
         (m->end ? m->end->DebugString().c_str() : "(end)"),
         (m->done ? "(end)" : manual_end.DebugString().c_str()));
@@ -1051,9 +1191,9 @@ Status DBImpl::BackgroundCompaction() {
     c = versions_->PickCompaction();
   }
 
-  Status status;
   if (c == NULL) {
     // Nothing to do
+    *sched_idle = true;
   } else if (!is_manual && c->IsTrivialMove()) {
     // Move file to next level
     assert(c->num_input_files(0) == 1);
@@ -1070,12 +1210,9 @@ Status DBImpl::BackgroundCompaction() {
         static_cast<unsigned long long>(f->file_size),
         status.ToString().c_str(),
         versions_->LevelSummary(&tmp));
+    versions_->ReleaseCompaction(c, status);
   } else {
-    CompactionState* compact = new CompactionState(c);
-    status = DoCompactionWork(compact);
-    CleanupCompaction(compact);
-    c->ReleaseInputs();
-    DeleteObsoleteFiles();
+    status = ParallelCompaction(c);
   }
   delete c;
 
@@ -1094,16 +1231,127 @@ Status DBImpl::BackgroundCompaction() {
 
   if (is_manual) {
     ManualCompaction* m = manual_compaction_;
-    if (!status.ok()) {
-      m->done = true;
+    m->being_sched = false;
+    if (m->compaction_conflict != kManualCompactConflict) { // PickRange success
+      if (!status.ok()) {
+        m->done = true;
+      }
+      if (!m->done) {
+        // We only compacted part of the requested range.  Update *m
+        // to the range that is left to be compacted.
+        m->tmp_storage = manual_end;
+        m->begin = &m->tmp_storage;
+      }
+      manual_compaction_ = NULL;
     }
-    if (!m->done) {
-      // We only compacted part of the requested range.  Update *m
-      // to the range that is left to be compacted.
-      m->tmp_storage = manual_end;
-      m->begin = &m->tmp_storage;
+  } else if (manual_compaction_ != NULL) { // non manual compact
+    ManualCompaction* m = manual_compaction_;
+    m->compaction_conflict = kManualCompactWakeup;// Wakeup here, ManualCompact thread check it
+    Log(options_.info_log,
+        "[%s] Wakeup Manual compaction at level-%d from %s .. %s",
+        dbname_.c_str(), m->level,
+        (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
+        (m->end ? m->end->DebugString().c_str() : "(end)"));
+  }
+  return status;
+}
+
+// Runs compaction c as the sub-compactions produced by GenerateSubCompaction(): every
+// sub-compaction but the last gets its own std::thread, the last runs on the calling
+// thread. mutex_ must be held on entry; it is released while the sub-compactions run
+// and re-acquired before the merged result is installed. Returns the final status.
+Status DBImpl::ParallelCompaction(Compaction* c) {
+  mutex_.AssertHeld();
+  const uint64_t start_micros = env_->NowMicros();
+  std::vector<Compaction*> compaction_vec;
+  std::vector<CompactionState*> compaction_state_vec;
+  std::vector<CompactStrategy*> compact_stragety_vec;
+  assert(versions_->NumLevelFiles(c->level()) > 0);
+  SequenceNumber smallest_snapshot = snapshots_.empty() ? kMaxSequenceNumber : *(snapshots_.begin());
+  versions_->GenerateSubCompaction(c, &compaction_vec, &mutex_);
+  mutex_.Unlock();
+
+  // handle compaction without Lock
+  std::vector<std::thread> thread_pool;
+  thread_pool.reserve(compaction_vec.size() - 1);  // NOTE(review): assumes GenerateSubCompaction() yields at least one entry — confirm
+  Log(options_.info_log,  "[%s] parallel compacting %d@%d + %d@%d files, "
+      "sub_compact %lu, snapshot %lu\n",
+      dbname_.c_str(), c->num_input_files(0), c->level(),
+      c->num_input_files(1), c->output_level(),
+      compaction_vec.size(), smallest_snapshot);
+  for (size_t i = 0; i < compaction_vec.size(); i++) {
+    CompactionState* compaction = new CompactionState(compaction_vec[i]);
+    assert(compaction->builder == NULL);
+    assert(compaction->outfile == NULL);
+    compaction->smallest_snapshot = smallest_snapshot;
+    compaction_state_vec.push_back(compaction);
+
+    CompactStrategy* compact_strategy = NewCompactStrategy(compaction);
+    compact_stragety_vec.push_back(compact_strategy);
+    if (i == 0) {
+      Log(options_.info_log,  "[%s] compact strategy: %s, snapshot %lu\n",
+          dbname_.c_str(),
+          compact_strategy ? compact_strategy->Name() : "(none)",  // strategy is NULL when no factory is configured
+          compaction->smallest_snapshot);
     }
-    manual_compaction_ = NULL;
+
+    if (i < compaction_vec.size() - 1) {
+      thread_pool.emplace_back(&DBImpl::HandleCompactionWork, this, compaction, compact_strategy);
+    } else {
+      HandleCompactionWork(compaction, compact_strategy);
+    }
+  }
+  for (auto& t : thread_pool) {
+    t.join();
+  }
+
+  CompactionStats stats;
+  CompactionState* compact = new CompactionState(c);
+  compact->smallest_snapshot = smallest_snapshot;
+  for (size_t i = 0; i < compaction_vec.size(); i++) {
+    CompactionState* compaction = compaction_state_vec[i];
+    for (auto & out : compaction->outputs) {
+      compact->outputs.push_back(out);
+      stats.bytes_written += out.file_size;
+    }
+    compact->total_bytes += compaction->total_bytes;
+    if (compact->status.ok()) {
+      compact->status = compaction->status;
+    }
+
+    CompactStrategy* compact_stragety = compact_stragety_vec[i];
+    delete compact_stragety;
+  }
+  for (int which = 0; which < 2; which++) {
+    for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
+      stats.bytes_read += compact->compaction->input(which, i)->file_size;
+    }
+  }
+
+  mutex_.Lock();
+  Status status = compact->status;
+  if (status.ok()) {
+    status = InstallCompactionResults(compact);
+  }
+  VersionSet::LevelSummaryStorage tmp;
+  Log(options_.info_log, "[%s] compacted to: %s, compacte stat %s",
+      dbname_.c_str(), versions_->LevelSummary(&tmp), status.ToString().c_str());
+  stats.micros = env_->NowMicros() - start_micros;
+  stats_[compact->compaction->output_level()].Add(stats);
+
+  for (size_t i = 0; i < compaction_vec.size(); i++) {
+    CompactionState* compaction = compaction_state_vec[i];
+    CleanupCompaction(compaction); // erases this sub-compaction's outputs from pending_outputs_ and deletes the state
+    delete compaction_vec[i];
+  }
+  assert(compact->builder == NULL);
+  assert(compact->outfile == NULL);
+  CleanupCompaction(compact);
+
+  versions_->ReleaseCompaction(c, status); // current_version has reference to c->inputs_[0,1]
+  c->ReleaseInputs();
+  if (!status.IsIOPermissionDenied()) {
+    DeleteObsoleteFiles();
   }
   return status;
 }
@@ -1120,7 +1368,9 @@ void DBImpl::CleanupCompaction(CompactionState* compact) {
   delete compact->outfile;
   for (size_t i = 0; i < compact->outputs.size(); i++) {
     const CompactionState::Output& out = compact->outputs[i];
-    pending_outputs_.erase(BuildFullFileNumber(dbname_, out.number));
+    if (pending_outputs_.erase(BuildFullFileNumber(dbname_, out.number)) > 0) {
+      Log(options_.info_log, "[%s] erase pending_output #%lu", dbname_.c_str(), out.number);
+    }
   }
   delete compact;
 }
@@ -1138,6 +1388,8 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) {
     out.smallest.Clear();
     out.largest.Clear();
     compact->outputs.push_back(out);
+
+    Log(options_.info_log, "[%s] insert pending_output #%lu", dbname_.c_str(), file_number);
     mutex_.Unlock();
   }
 
@@ -1253,64 +1505,63 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
   return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
 }
 
-Status DBImpl::DoCompactionWork(CompactionState* compact) {
-  const uint64_t start_micros = env_->NowMicros();
-  int64_t imm_micros = 0;  // Micros spent doing imm_ compactions
-
-  Log(options_.info_log,  "[%s] Compacting %d@%d + %d@%d files",
-      dbname_.c_str(),
-      compact->compaction->num_input_files(0),
-      compact->compaction->level(),
-      compact->compaction->num_input_files(1),
-      compact->compaction->output_level());
-
-  assert(versions_->NumLevelFiles(compact->compaction->level()) > 0);
-  assert(compact->builder == NULL);
-  assert(compact->outfile == NULL);
-  if (snapshots_.empty()) {
-    compact->smallest_snapshot = kMaxSequenceNumber;
-  } else {
-    compact->smallest_snapshot = *(snapshots_.begin());
-  }
-
+CompactStrategy* DBImpl::NewCompactStrategy(CompactionState* compact) {  // returns NULL when options_.compact_strategy_factory is not set
   CompactStrategy* compact_strategy = NULL;
   if (options_.compact_strategy_factory) {
     compact_strategy = options_.compact_strategy_factory->NewInstance();
-    if (snapshots_.empty()) {
-      compact_strategy->SetSnapshot(kMaxSequenceNumber);
-    } else {
-      compact_strategy->SetSnapshot(*(snapshots_.begin()));
-    }
-    Log(options_.info_log,  "[%s] Compact strategy: %s",
-      dbname_.c_str(),
-      compact_strategy->Name());
+    compact_strategy->SetSnapshot(compact->smallest_snapshot);  // snapshot comes from the compaction state, so snapshots_ is not read here
   }
+  return compact_strategy;  // ParallelCompaction() deletes the returned strategy
+}
 
-  // Release mutex while we're actually doing the compaction work
-  mutex_.Unlock();
+// ** Handle sub compaction without LOCK **
+void DBImpl::HandleCompactionWork(CompactionState* compact,
+                                  CompactStrategy* compact_strategy) {
+  Compaction* c = compact->compaction;
+  Status& status = compact->status;
+  Iterator* input = versions_->MakeInputIterator(c);
+  if (c->sub_compact_start_ == "") {
+    input->SeekToFirst();
+  } else {
+    input->Seek(c->sub_compact_start_);
+  }
+  Slice end_key(c->sub_compact_end_);
+  Log(options_.info_log,  "[%s] handle %d@%d + %d@%d compact, range [%s, %s)\n",
+      dbname_.c_str(),
+      c->num_input_files(0),
+      c->level(),
+      c->num_input_files(1),
+      c->output_level(),
+      c->sub_compact_start_.c_str(),
+      c->sub_compact_end_.c_str());
 
-  Iterator* input = versions_->MakeInputIterator(compact->compaction);
-  input->SeekToFirst();
-  Status status;
   ParsedInternalKey ikey;
   std::string current_user_key;
   bool has_current_user_key = false;
   SequenceNumber last_sequence_for_key = kMaxSequenceNumber;
-
   for (; input->Valid() && !shutting_down_.Acquire_Load(); ) {
     // Prioritize immutable compaction work
     if (has_imm_.NoBarrier_Load() != NULL) {
-      const uint64_t imm_start = env_->NowMicros();
       mutex_.Lock();
-      if (imm_ != NULL) {
-        CompactMemTable();
+      if (imm_ && !imm_->BeingFlushed()) {
+        CompactMemTable(); // no need check failure, because imm_ not null if dump fail.
         bg_cv_.SignalAll();  // Wakeup MakeRoomForWrite() if necessary
       }
       mutex_.Unlock();
-      imm_micros += (env_->NowMicros() - imm_start);
     }
 
     Slice key = input->key();
+    if (end_key.size() > 0 && internal_comparator_.InternalKeyComparator::Compare(input->key(), end_key) >= 0) {
+      Log(options_.info_log,  "[%s] handle %d@%d + %d@%d compact, stop at %s\n",
+          dbname_.c_str(),
+          c->num_input_files(0),
+          c->level(),
+          c->num_input_files(1),
+          c->output_level(),
+          end_key.data());
+      break; // reach end_key, stop this sub compaction
+    }
+
     if (compact->compaction->ShouldStopBefore(key) &&
         compact->builder != NULL) { // should not overlap level() + 2 too much
       status = FinishCompactionOutputFile(compact, input);
@@ -1431,10 +1682,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
     }
   }
 
-  if (compact_strategy) {
-    delete compact_strategy;
-  }
-
   if (status.ok() && shutting_down_.Acquire_Load()) {
     status = Status::IOError("Deleting DB during compaction");
   }
@@ -1451,28 +1698,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) {
   }
   delete input;
   input = NULL;
-
-  CompactionStats stats;
-  stats.micros = env_->NowMicros() - start_micros - imm_micros;
-  for (int which = 0; which < 2; which++) {
-    for (int i = 0; i < compact->compaction->num_input_files(which); i++) {
-      stats.bytes_read += compact->compaction->input(which, i)->file_size;
-    }
-  }
-  for (size_t i = 0; i < compact->outputs.size(); i++) {
-    stats.bytes_written += compact->outputs[i].file_size;
-  }
-
-  mutex_.Lock();
-  stats_[compact->compaction->output_level()].Add(stats);
-
-  if (status.ok()) {
-    status = InstallCompactionResults(compact);
-  }
-  VersionSet::LevelSummaryStorage tmp;
-  Log(options_.info_log,
-      "[%s] compacted to: %s", dbname_.c_str(), versions_->LevelSummary(&tmp));
-  return status;
 }
 
 struct IterState {
@@ -1652,8 +1877,9 @@ bool DBImpl::BusyWrite() {
 
 void DBImpl::Workload(double* write_workload) {
   MutexLock l(&mutex_);
-  uint64_t timeout = 0;
-  double wwl = versions_->CompactionScore(&timeout);
+  std::vector<std::pair<double, uint64_t> > scores;
+  versions_->CompactionScore(&scores);
+  double wwl = scores.size() > 0? scores[0].first: 0;
   if (wwl >= 0) {
     *write_workload = wwl;
   } else {
@@ -1837,6 +2063,71 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) {
   } else if (in == "sstables") {
     *value = versions_->current()->DebugString();
     return true;
+  } else if (in == "verify-db-integrity") {
+    std::map<uint64_t, uint64_t> check_file_list;
+    versions_->AddLiveFilesWithSize(&check_file_list);
+    mutex_.Unlock();
+
+    std::set<uint64_t> tablet_num;
+    std::map<uint64_t, uint64_t>::iterator it = check_file_list.begin();
+    for (; it != check_file_list.end(); ++it) {
+      uint64_t tablet;
+      ParseFullFileNumber(it->first, &tablet, NULL);
+      tablet_num.insert(tablet);
+    }
+
+    Status s;
+    std::set<uint64_t>::iterator it_tablet = tablet_num.begin();
+    for (; s.ok() && it_tablet != tablet_num.end(); ++it_tablet) {
+      std::vector<std::string> filenames;
+      std::string tablet_path = RealDbName(dbname_, *it_tablet);
+      s = env_->GetChildren(tablet_path, &filenames);
+      //Log(options_.info_log, "[%s] verify db(slow), GetChildren %s, files_nr %lu, status %s",
+      //    dbname_.c_str(), tablet_path.c_str(), filenames.size(), s.ToString().c_str());
+
+      uint64_t number;
+      FileType type;
+      for (size_t i = 0; i < filenames.size(); i++) {
+        if (ParseFileName(filenames[i], &number, &type) && (type == kTableFile)) {
+          uint64_t tablet_no = BuildFullFileNumber(tablet_path, number);
+          if (check_file_list.find(tablet_no) == check_file_list.end()) {
+            continue;
+          }
+
+          uint64_t fsize = 0;
+          Status s1 = env_->GetFileSize(tablet_path + "/" + filenames[i], &fsize);
+          if (!s1.ok() || check_file_list[tablet_no] == fsize) {
+            check_file_list.erase(tablet_no);
+          } else {
+            Log(options_.info_log, "[%s] verify db, size mismatch, "
+                "path %s, tablet %s, size(in meta) %lu, size(in fs) %lu",
+                dbname_.c_str(), tablet_path.c_str(), filenames[i].c_str(), check_file_list[tablet_no], fsize);
+          }
+        }
+      }
+    }
+
+    mutex_.Lock();
+    std::map<uint64_t, uint64_t> live;
+    versions_->AddLiveFilesWithSize(&live);
+
+    it = check_file_list.begin();
+    while (it != check_file_list.end()) {
+      if (live.find(it->first) == live.end()) {
+        it = check_file_list.erase(it);
+      } else {
+        ++it;
+      }
+    }
+
+    if (s.ok() && check_file_list.empty()) { // verify success
+      value->append("verify_success");
+    } else if (s.ok()) { //sst file lost
+      value->append("verify_fail");
+      Log(options_.info_log, "[%s] db_corruption, lost %lu",
+          dbname_.c_str(), check_file_list.size());
+    }
+    return s.ok();
   }
 
   return false;
@@ -1894,29 +2185,29 @@ uint64_t DBImpl::GetLastSequence(bool is_locked) {
     retval = versions_->LastSequence();
   }
   if (is_locked) {
-    mutex_.Unlock();
+      mutex_.Unlock();
   }
   return retval;
 }
 
 MemTable* DBImpl::NewMemTable() const {
-  if (!options_.use_memtable_on_leveldb) {
-    return new MemTable(internal_comparator_,
-              options_.enable_strategy_when_get ? options_.compact_strategy_factory : NULL);
-  } else {
-    Logger* info_log = NULL;
-    //Logger* info_log = options_.info_log;
-    MemTableOnLevelDB* new_mem = new MemTableOnLevelDB(dbname_, internal_comparator_,
-                                 options_.compact_strategy_factory,
-                                 options_.memtable_ldb_write_buffer_size,
-                                 options_.memtable_ldb_block_size,
-                                 info_log);
-    std::multiset<uint64_t>::iterator i = snapshots_.begin();
-    for (; i != snapshots_.end(); ++i) {
-      new_mem->GetSnapshot(*i);
-    }
-    return new_mem;
-  }
+    if (!options_.use_memtable_on_leveldb) {
+        return new MemTable(internal_comparator_,
+                  options_.enable_strategy_when_get ? options_.compact_strategy_factory : NULL);
+    } else {
+        Logger* info_log = NULL;
+        //Logger* info_log = options_.info_log;
+        MemTableOnLevelDB* new_mem = new MemTableOnLevelDB(dbname_, internal_comparator_,
+                                     options_.compact_strategy_factory,
+                                     options_.memtable_ldb_write_buffer_size,
+                                     options_.memtable_ldb_block_size,
+                                     info_log);
+        std::multiset<uint64_t>::iterator i = snapshots_.begin();
+        for (; i != snapshots_.end(); ++i) {
+          new_mem->GetSnapshot(*i);
+        }
+        return new_mem;
+    }
 }
 
 uint64_t DBImpl::GetLastVerSequence() {
@@ -1929,4 +2220,70 @@ Iterator* DBImpl::NewInternalIterator() {
   return NewInternalIterator(ReadOptions(), &ignored);
 }
 
+Status DBImpl::BeginNewDbTransaction() {  // create the init-load lock file, or clean up after an earlier unfinished attempt
+  std::string lock_file_name = dbname_ + init_load_filelock;
+  Status s = env_->FileExists(lock_file_name);
+  if (s.IsNotFound()) {
+    // no lock file yet: first creation of this lg (by split or merge), so write the lock file
+    s = WriteStringToFileSync(env_, "\n", lock_file_name);
+    if (!s.ok()) {
+      Log(options_.info_log, "[%s] fail to start new db transaction: %s", 
+          dbname_.c_str(), s.ToString().c_str());
+      return s;
+    }
+  } else if (s.ok()) {
+    // The lock file already exists, which means an earlier attempt to open
+    // this db failed (and the ignore-corruption option was not enabled).
+    // If the directory holds no sst files, delete every file in this db
+    // except the __init_load_filelock file so creation can start over.
+    Log(options_.info_log, "[%s] begin to re-new db: %s",
+        dbname_.c_str(), s.ToString().c_str());
+    std::vector<std::string> files;
+    s = env_->GetChildren(dbname_, &files);
+    if (!s.ok()) {
+      Log(options_.info_log, "[%s] fail to re-new db: %s",
+          dbname_.c_str(), s.ToString().c_str());
+      return s;
+    }
+    uint64_t number;
+    FileType type;
+    for (size_t f = 0; f < files.size(); ++f) {
+      if (ParseFileName(files[f], &number, &type) && kTableFile == type) {
+        return s;  // an sst file is present: delete nothing, return the ok GetChildren status
+      }
+    }
+    for (size_t f = 0; f < files.size(); ++f) {
+      std::string old_file_name = dbname_ + "/" + files[f];
+      if ("/" + files[f] != init_load_filelock) {  // keep the lock file itself
+        s = env_->DeleteFile(old_file_name);
+        if (!s.ok()) {
+          Log(options_.info_log, "[%s] fail to re-new db: %s",
+              dbname_.c_str(), s.ToString().c_str());
+          return s;
+        }
+      }
+    }
+  }
+  return s; // a FileExists status that is neither ok nor NotFound falls through to here unchanged
+}
+
+Status DBImpl::CommitNewDbTransaction() {
+  // Finish the new-db transaction by removing its lock file.
+  const std::string lock_path = dbname_ + init_load_filelock;
+  Status probe = env_->FileExists(lock_path);
+  if (probe.IsNotFound()) {
+    // the lock file vanished while the new db was being set up
+    Log(options_.info_log, "[%s] find transaction lock file fail: %s",
+        dbname_.c_str(), probe.ToString().c_str());
+    return Status::Corruption("newdb transaction lock disappeared");
+  }
+  if (!probe.ok()) return probe;  // any other lookup error is passed through as is
+  // the lock file is present: deleting it is the whole commit step
+  Status removal = env_->DeleteFile(lock_path);
+  if (removal.ok()) return removal;
+  Log(options_.info_log, "[%s] delete transaction lock file fail: %s",
+      dbname_.c_str(), removal.ToString().c_str());
+  return Status::Corruption("newdb transaction clean lock faild");
+}
+
 }  // namespace leveldb
diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h
index 05b1ae623..8f23fb1c2 100644
--- a/src/leveldb/db/db_impl.h
+++ b/src/leveldb/db/db_impl.h
@@ -14,6 +14,8 @@
 #include "db/db_table.h"
 #include "db/dbformat.h"
 #include "db/log_writer.h"
+#include "db/version_set.h"
+#include "leveldb/compact_strategy.h"
 #include "leveldb/db.h"
 #include "leveldb/env.h"
 #include "port/port.h"
@@ -52,6 +54,8 @@ class DBImpl : public DB {
   virtual void GetApproximateSizes(uint64_t* size, std::vector<uint64_t>* lgsize = NULL);
   virtual void CompactRange(const Slice* begin, const Slice* end, int lg_no = -1);
 
+  virtual bool ShouldForceUnloadOnError();
+
   void AddBoundLogSize(uint64_t size);
 
   // tera-specific
@@ -96,6 +100,12 @@ class DBImpl : public DB {
   friend class DBTable;
   struct CompactionState;
   struct Writer;
+  struct CompactionTask {  // bookkeeping record; pointers to these are kept in bg_compaction_tasks_
+    int64_t id; // compaction thread id
+    double score; // compaction score
+    uint64_t timeout; // compaction task delay time
+    DBImpl* db;  // NOTE(review): presumably the DBImpl this task belongs to - confirm where tasks are created
+  };
 
   Iterator* NewInternalIterator(const ReadOptions&,
                                 SequenceNumber* latest_snapshot);
@@ -105,15 +115,23 @@ class DBImpl : public DB {
 
   void MaybeIgnoreError(Status* s) const;
 
+  // parallel compaction
+  Status ParallelCompaction(Compaction* c);
+
+  CompactStrategy* NewCompactStrategy(CompactionState* compact);
+
+  void HandleCompactionWork(CompactionState* compact,
+                            CompactStrategy* compact_strategy);
+
   // Delete any unneeded files and stale in-memory entries.
   void DeleteObsoleteFiles();
 
   // Compact the in-memory write buffer to disk.  Switches to a new
   // log-file/memtable and writes a new descriptor iff successful.
-  Status CompactMemTable()
+  Status CompactMemTable(bool* sched_idle = NULL)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
-  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base)
+  Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base, uint64_t* number = NULL)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   Status MakeRoomForWrite(bool force /* compact even if there is room? */)
@@ -121,12 +139,10 @@ class DBImpl : public DB {
 
   void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
   static void BGWork(void* db);
-  void BackgroundCall();
-  Status BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+  void BackgroundCall(CompactionTask* task);
+  Status BackgroundCompaction(bool* sched_idle) EXCLUSIVE_LOCKS_REQUIRED(mutex_);
   void CleanupCompaction(CompactionState* compact)
       EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-  Status DoCompactionWork(CompactionState* compact)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
 
   Status OpenCompactionOutputFile(CompactionState* compact);
   Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input);
@@ -154,6 +170,10 @@ class DBImpl : public DB {
   bool CheckMemTableCompaction(uint64_t last_sequence);
   MemTable* NewMemTable() const;
 
+  // new db transaction process
+  Status BeginNewDbTransaction();
+  Status CommitNewDbTransaction();
+
   // Constant after construction
   Env* const env_;
   const InternalKeyComparator internal_comparator_;
@@ -196,18 +216,24 @@ class DBImpl : public DB {
   std::set<uint64_t> pending_outputs_;
 
   // Has a background compaction been scheduled or is running?
-  bool bg_compaction_scheduled_;
-  double bg_compaction_score_;
-  uint64_t bg_compaction_timeout_;
-  int64_t bg_schedule_id_;
+  std::vector<CompactionTask*> bg_compaction_tasks_;
+  std::vector<double> bg_compaction_score_;
+  std::vector<int64_t> bg_schedule_id_;
 
   // Information for a manual compaction
+  enum ManualCompactState {
+    kManualCompactIdle,         // manual compact inited
+    kManualCompactConflict,     // manual compact run simultaneously
+    kManualCompactWakeup,       // restart delay compact task
+  };
   struct ManualCompaction {
     int level;
     bool done;
+    bool being_sched;
     const InternalKey* begin;   // NULL means beginning of key range
     const InternalKey* end;     // NULL means end of key range
     InternalKey tmp_storage;    // Used to keep track of compaction progress
+    ManualCompactState compaction_conflict; // 0 == idle, 1 == conflict, 2 == wake
   };
   ManualCompaction* manual_compaction_;
 
diff --git a/src/leveldb/db/db_table.cc b/src/leveldb/db/db_table.cc
index d9a4a725c..89ea76a1e 100644
--- a/src/leveldb/db/db_table.cc
+++ b/src/leveldb/db/db_table.cc
@@ -98,6 +98,10 @@ Options InitOptionsLG(const Options& options, uint32_t lg_id) {
   opt.sst_size = lg_info->sst_size;
   opt.write_buffer_size = lg_info->write_buffer_size;
   opt.seek_latency = lg_info->seek_latency;
+  if (options.ignore_corruption_in_open_lg_list.find(lg_id) 
+          != options.ignore_corruption_in_open_lg_list.end()) {
+    opt.ignore_corruption_in_open = true;
+  }
   return opt;
 }
 
@@ -311,22 +315,6 @@ Status DBTable::Init() {
     uint32_t i = *it;
     DBImpl* impl = lg_list_[i];
     s = impl->RecoverLastDumpToLevel0(lg_edits[i]);
-
-    // LogAndApply to lg's manifest
-    if (s.ok()) {
-      MutexLock lock(&impl->mutex_);
-      s = impl->versions_->LogAndApply(lg_edits[i], &impl->mutex_);
-      if (s.ok()) {
-        impl->DeleteObsoleteFiles();
-        impl->MaybeScheduleCompaction();
-      } else {
-        Log(options_.info_log, "[%s] Fail to modify manifest of lg %d",
-            dbname_.c_str(),
-            i);
-      }
-    } else {
-      Log(options_.info_log, "[%s] Fail to dump log to level 0", dbname_.c_str());
-    }
     delete lg_edits[i];
   }
 
@@ -497,6 +485,9 @@ Status DBTable::Write(const WriteOptions& options, WriteBatch* my_batch) {
       break;
     }
     mutex_.Lock();
+    if (s.IsIOPermissionDenied()) {
+        fatal_error_ = s;
+    }
   }
   if (s.ok()) {
     std::vector<WriteBatch*> lg_updates;
@@ -525,7 +516,6 @@ Status DBTable::Write(const WriteOptions& options, WriteBatch* my_batch) {
         Log(options_.info_log, "[%s] [Fatal] Write to lg%u fail",
             dbname_.c_str(), i);
         s = lg_s;
-        fatal_error_ = lg_s;
         break;
       }
     }
@@ -534,7 +524,10 @@ Status DBTable::Write(const WriteOptions& options, WriteBatch* my_batch) {
       for (uint32_t i = 0; i < lg_list_.size(); ++i) {
         lg_list_[i]->AddBoundLogSize(updates->DataSize());
       }
+    } else {
+        fatal_error_ = s;
     }
+
     // Commit updates
     if (s.ok() && lg_list_.size() > 1) {
       for (uint32_t i = 0; i < lg_list_.size(); ++i) {
@@ -696,6 +689,19 @@ void DBTable::ReleaseSnapshot(uint64_t sequence_number) {
   }
 }
 
+bool DBTable::ShouldForceUnloadOnError() {
+    MutexLock lock(&mutex_);
+    if (fatal_error_.IsIOPermissionDenied()) {
+        return true;  // the table itself already recorded a permission error
+    }
+    bool any_lg_wants_unload = false;  // OR of the answers of all existing lgs
+    std::set<uint32_t>::iterator lg = options_.exist_lg_list->begin();
+    for (; lg != options_.exist_lg_list->end(); ++lg) {
+        any_lg_wants_unload = lg_list_[*lg]->ShouldForceUnloadOnError() || any_lg_wants_unload;
+    }
+    return any_lg_wants_unload;
+}
+
 const uint64_t DBTable::Rollback(uint64_t snapshot_seq, uint64_t rollback_point) {
   std::set<uint32_t>::iterator it = options_.exist_lg_list->begin();
   uint64_t rollback_seq = rollback_point == kMaxSequenceNumber ? last_sequence_ : rollback_point;;
@@ -708,21 +714,28 @@ const uint64_t DBTable::Rollback(uint64_t snapshot_seq, uint64_t rollback_point)
 bool DBTable::GetProperty(const Slice& property, std::string* value) {
   bool ret = true;
   std::string ret_string;
+  // Query every existing locality group and concatenate the answers.
   std::set<uint32_t>::iterator it = options_.exist_lg_list->begin();
   for (; it != options_.exist_lg_list->end(); ++it) {
     std::string lg_value;
     bool lg_ret = lg_list_[*it]->GetProperty(property, &lg_value);
     if (lg_ret) {
       if (options_.exist_lg_list->size() > 1) {
-        ret_string.append(Uint64ToString(*it) + ": {\n");
+        ret_string.append("LG:" + Uint64ToString(*it) + ":");  // tag each answer with its lg id
       }
       ret_string.append(lg_value);
       if (options_.exist_lg_list->size() > 1) {
-        ret_string.append("\n}\n");
+        ret_string.append(" ");  // separator between lg answers
       }
+    } else {  // one lg could not answer: stop and report failure for the whole table
+      ret = false;
+      break;
     }
   }
-  *value = ret_string;
+  // *value is left untouched unless every lg answered
+  if (ret) {
+    *value = ret_string;
+  }
   return ret;
 }
 
@@ -936,7 +949,6 @@ Status DBTable::RecoverLogFile(uint64_t log_number, uint64_t recover_limit,
     }
   }
   delete file;
-
   return status;
 }
 
@@ -1131,6 +1143,14 @@ int64_t DBTable::TEST_MaxNextLevelOverlappingBytes() {
 }
 
 int DBTable::SwitchLog(bool blocked_switch) {
+  {
+    MutexLock l(&mutex_);
+    if (fatal_error_.IsIOPermissionDenied()) {
+      Log(options_.info_log, "[%s] can not switch log becasue %s",
+          dbname_.c_str(), fatal_error_.ToString().c_str());
+     return 2;
+    }
+  }
   if (!blocked_switch ||
       log::AsyncWriter::BlockLogNum() < options_.max_block_log_number) {
     if (current_log_size_ == 0) {
@@ -1156,6 +1176,10 @@ int DBTable::SwitchLog(bool blocked_switch) {
         Log(options_.info_log, "[%s] SwitchLog", dbname_.c_str());
       }
       return 0;   // success
+    } else if (s.IsIOPermissionDenied()) {
+        MutexLock l(&mutex_);
+        fatal_error_ = s;
+        return 2;  // posix error EACCES = 13
     } else {
       Log(options_.info_log, "[%s] fail to open logfile %s. SwitchLog failed",
           dbname_.c_str(), log_file_name.c_str());
diff --git a/src/leveldb/db/db_table.h b/src/leveldb/db/db_table.h
index 4fa0a11c4..4ff14f46a 100644
--- a/src/leveldb/db/db_table.h
+++ b/src/leveldb/db/db_table.h
@@ -88,6 +88,8 @@ class DBTable : public DB {
 
     virtual const uint64_t Rollback(uint64_t snapshot_seq, uint64_t rollback_point = kMaxSequenceNumber);
 
+    virtual bool ShouldForceUnloadOnError();
+
     // DB implementations can export properties about their state
     // via this method.  If "property" is a valid property understood by this
     // DB implementation, fills "*value" with its current value and returns
diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc
index 7c25f2de6..a12a0536a 100644
--- a/src/leveldb/db/db_test.cc
+++ b/src/leveldb/db/db_test.cc
@@ -98,6 +98,23 @@ class SpecialEnv : public EnvWrapper {
   }
 
   Status NewWritableFile(const std::string& f, WritableFile** r) {
+    class InitLoadLockFile : public WritableFile {  // pass-through wrapper handed out for "__init_load_filelock" files
+     private:
+      SpecialEnv* env_;  // stored by the constructor; not read by any method of this class
+      WritableFile* base_;  // the wrapped file; deleted in the destructor
+
+     public:
+      InitLoadLockFile(SpecialEnv* env, WritableFile* base)
+          : env_(env),
+            base_(base) {
+      }
+      ~InitLoadLockFile() { delete base_; }
+      Status Append(const Slice& data) { return base_->Append(data); }
+      Status Close() { return base_->Close(); }
+      Status Flush() { return base_->Flush(); }
+      Status Sync() { return base_->Sync(); }
+    };
+
     class SSTableFile : public WritableFile {
      private:
       SpecialEnv* env_;
@@ -165,6 +182,8 @@ class SpecialEnv : public EnvWrapper {
         *r = new SSTableFile(this, *r);
       } else if (strstr(f.c_str(), "MANIFEST") != NULL) {
         *r = new ManifestFile(this, *r);
+      } else if (strstr(f.c_str(), "__init_load_filelock") != NULL) {
+        *r = new InitLoadLockFile(this, *r);
       }
     }
     return s;
@@ -872,6 +891,40 @@ TEST(DBTest, Recover) {
   } while (ChangeOptions());
 }
 
+TEST(DBTest, RecoverWithLostCurrent) {
+  // Before writing anything, delete the CURRENT file and leave an init-load lock file behind.
+  ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_ + "/0")));
+  leveldb::WritableFile* lock_file;
+  ASSERT_OK(env_->NewWritableFile(dbname_ + "/0/__init_load_filelock", &lock_file));
+  ASSERT_OK(lock_file->Append("\n"));
+  ASSERT_OK(lock_file->Sync());
+  ASSERT_OK(lock_file->Close());
+  delete lock_file;
+  do {
+    Reopen();
+    ASSERT_OK(Put("foo", "v3"));
+    Reopen();
+    ASSERT_EQ("v3", Get("foo"));
+  } while (ChangeOptions());
+}
+
+TEST(DBTest, RecoverWithLostManifest) {
+  // Before writing anything, delete the MANIFEST (descriptor file number 1) and leave an init-load lock file behind.
+  ASSERT_OK(env_->DeleteFile(DescriptorFileName(dbname_ + "/0", 1)));
+  leveldb::WritableFile* lock_file;
+  ASSERT_OK(env_->NewWritableFile(dbname_ + "/0/__init_load_filelock", &lock_file));
+  ASSERT_OK(lock_file->Append("\n"));
+  ASSERT_OK(lock_file->Sync());
+  ASSERT_OK(lock_file->Close());
+  delete lock_file;
+  do {
+    Reopen();
+    ASSERT_OK(Put("foo", "v3"));
+    Reopen();
+    ASSERT_EQ("v3", Get("foo"));
+  } while (ChangeOptions());
+}
+
 TEST(DBTest, RecoveryWithEmptyLog) {
   do {
     ASSERT_OK(Put("foo", "v1"));
diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc
index d56ea2ff7..4ac4a3864 100644
--- a/src/leveldb/db/filename.cc
+++ b/src/leveldb/db/filename.cc
@@ -129,7 +129,7 @@ bool ParseFileName(const std::string& fname,
   if (rest == "CURRENT") {
     *number = 0;
     *type = kCurrentFile;
-  } else if (rest == "LOCK") {
+  } else if (rest == "LOCK" || rest == "__init_load_filelock") {
     *number = 0;
     *type = kDBLockFile;
   } else if (rest == "LOG" || rest == "LOG.old") {
@@ -242,6 +242,15 @@ std::string BuildTabletPath(const std::string& prefix, uint64_t tablet) {
   return dbname;
 }
 
+std::string BuildTabletLgPath(const std::string& prefix, uint64_t tablet, uint64_t lg) {
+  // Render "/tablet<tablet, zero-padded to 8 digits>/<lg>" and append it to prefix.
+  const unsigned long long tablet_no = tablet;
+  const unsigned long long lg_no = lg;
+  char suffix[100];
+  snprintf(suffix, sizeof(suffix), "/tablet%08llu/%llu", tablet_no, lg_no);
+  return prefix + suffix;
+}
+
 std::string BuildTableFilePath(const std::string& prefix, uint64_t tablet,
                                uint64_t lg, uint64_t number) {
   char buf[100];
@@ -252,6 +261,35 @@ std::string BuildTableFilePath(const std::string& prefix, uint64_t tablet,
   return MakeFileName(dbname, number & 0xffffffff, "sst");
 }
 
+std::string BuildTrashTableFilePath(const std::string& prefix, uint64_t tablet,
+                                    uint32_t lg_id, uint64_t number,
+                                    const std::string& time) {
+    // Only the bounded numeric part goes through the fixed-size buffer; the
+    // caller-supplied time suffix is appended as a string so it is never truncated.
+    char buf[100];
+    snprintf(buf, sizeof(buf), "/tablet%08llu/%lu/%08llu.sst.",
+            static_cast<unsigned long long>(tablet),
+            static_cast<unsigned long>(lg_id),
+            static_cast<unsigned long long>(number));
+    return prefix + buf + time;  // <prefix>/tablet<tablet>/<lg_id>/<number>.sst.<time>
+}
+
+std::string GetTimeStrFromTrashFile(const std::string& path) {
+    // The time string is whatever follows the last '.' of the final path
+    // component; a path with no '/' or one ending in '/' yields "".
+    const std::string::size_type slash = path.rfind('/');
+    if (slash == std::string::npos || slash + 1 == path.length()) {
+        return "";
+    }
+    // A '.' located before the last '/' belongs to a directory name, so the
+    // file name itself carries no suffix in that case.
+    const std::string::size_type dot = path.rfind('.');
+    if (dot == std::string::npos || dot < slash) {
+        return "";
+    }
+    return path.substr(dot + 1);
+}
+
 std::string BuildTableFilePath(const std::string& prefix, uint64_t lg, uint64_t full_number) {
   uint64_t tablet, number;
   ParseFullFileNumber(full_number, &tablet, &number);
diff --git a/src/leveldb/db/filename.h b/src/leveldb/db/filename.h
index ede91c51a..b151c165b 100644
--- a/src/leveldb/db/filename.h
+++ b/src/leveldb/db/filename.h
@@ -97,12 +97,27 @@ extern uint64_t BuildFullFileNumber(const std::string& dbname,
 //      from (/table1, 3)
 std::string BuildTabletPath(const std::string& prefix, uint64_t tablet);
 
+std::string BuildTabletLgPath(const std::string& prefix, uint64_t tablet, uint64_t lg);  // e.g. "/table1/tablet00000003/0" from (/table1, 3, 0)
+
 // Build file path from tablet_num & lg_num & file number
 // E.g. construct "/table1/tablet000003/0/00000001.sst"
 //      from (/table1, 3, 0, 1)
 std::string BuildTableFilePath(const std::string& prefix, uint64_t tablet,
                                uint64_t lg, uint64_t number);
 
+// Build trash file path from tablet_num & lg & file number & time
+// E.g. construct "/table1/tablet000003/0/00000001.sst.20170718-17-08-30"
+//      from (/table1, 3, 0, 1, 20170718-17-08-30)
+std::string BuildTrashTableFilePath(const std::string& prefix, uint64_t tablet,
+                                    uint32_t lg_id, uint64_t number,
+                                    const std::string& time);
+
+// get time string from trash file path
+// E.g. get "20170718-17-08-30"
+//      from  "/table1/tablet000003/0/00000001.sst.20170718-17-08-30"
+// if path is invalid, return ""
+std::string GetTimeStrFromTrashFile(const std::string& path);
+
 // Build file path from lg_num & full file number
 // E.g. construct "/table1/tablet000003/0/00000001.sst"
 //      from (/table1, 0, 0x8000000300000001)
diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc
index c9f284110..ddee41b1d 100644
--- a/src/leveldb/db/memtable.cc
+++ b/src/leveldb/db/memtable.cc
@@ -26,6 +26,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, CompactStrategyFactory* com
     : last_seq_(0),
       comparator_(cmp),
       refs_(0),
+      being_flushed_(false),
       table_(comparator_, &arena_),
       empty_(true),
       compact_strategy_factory_(compact_strategy_factory) {
diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h
index ba608550e..a2a1a073a 100644
--- a/src/leveldb/db/memtable.h
+++ b/src/leveldb/db/memtable.h
@@ -79,6 +79,13 @@ class MemTable {
     empty_ = false;
   }
 
+  bool BeingFlushed() { return being_flushed_;}  // current value of the flag toggled by SetBeingFlushed()
+  void SetBeingFlushed(bool flag) {  // updates being_flushed_; every call must be a real state change
+    assert(flag ? !being_flushed_  // switching on requires the flag to be off ...
+                : being_flushed_);  // ... and switching off requires it to be on
+    being_flushed_ = flag;
+  }
+
   virtual ~MemTable();
 
  protected:
@@ -97,6 +104,7 @@ class MemTable {
 
   KeyComparator comparator_;
   int refs_;
+  bool being_flushed_;
 
   Arena arena_;
   Table table_;
diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc
index fc95284e6..244733915 100644
--- a/src/leveldb/db/version_edit.cc
+++ b/src/leveldb/db/version_edit.cc
@@ -11,7 +11,6 @@
 #include "db/filename.h"
 #include "db/version_set.h"
 #include "util/coding.h"
-
 namespace leveldb {
 
 // Tag numbers for serialized VersionEdit.  These numbers are written to
@@ -29,6 +28,7 @@ enum Tag {
   kNewFile              = 10,
   kDeletedFile          = 11,
   kNewFileInfo          = 12,
+  kSstFileDataSize      = 13,
 
   // no more than 1<<20
   kMaxTag               = 1 << 20,
@@ -147,6 +147,13 @@ void VersionEdit::EncodeTo(std::string* dst) const {
     PutVarint32(dst, str.size() + kMaxTag);
     PutVarint32(dst, kNewFileInfo);
     dst->append(str.data(), str.size());
+
+    // record sst FileData
+    str.clear();
+    PutVarint64(&str, f.data_size);
+    PutVarint32(dst, str.size() + kMaxTag);
+    PutVarint32(dst, kSstFileDataSize);
+    dst->append(str.data(), str.size());
   }
 }
 
@@ -171,6 +178,43 @@ static bool GetLevel(Slice* input, int* level) {
   }
 }
 
+Status VersionEdit::DecodeNewFileInfo(Slice* input, FileMetaData* f) {  // consume the optional per-file records following a new-file entry
+  bool decode_continue = true;
+  // Each record is first peeked at on a copy of *input; *input itself only advances for the tags handled below.
+  while (decode_continue && input->size() > 0) {
+    uint32_t len = 0;
+    uint32_t tag = 0;
+    Slice file_input = *input;  // look-ahead copy
+    GetVarint32(&file_input, &len);
+    if (len <= kMaxTag) {  // not a (size + kMaxTag)-prefixed record: leave it unconsumed
+      break;
+    }
+
+    GetVarint32(&file_input, &tag);
+    switch (tag) {
+      case kNewFileInfo:
+        GetVarint32(input, &len);// ignore len
+        GetVarint32(input, &tag);// ignore tag
+        GetVarint64(input, &f->del_percentage);
+        GetVarint64(input, &f->ttl_percentage);
+        GetVarint64(input, &f->check_ttl_ts);
+        break;
+      case kSstFileDataSize:
+        GetVarint32(input, &len);  // skip the length prefix
+        GetVarint32(input, &tag);  // skip the tag
+        GetVarint64(input, &f->data_size);
+        break;
+      default:  // any other tag: nothing is consumed, report it on stderr and stop
+        fprintf(stderr, "NewFile %lu without info, skip tag %d, len %d\n",
+                f->number & 0xffffffff,
+                tag, len);
+        decode_continue = false;
+        break;
+    }
+  }
+  return Status::OK();  // always OK; the results of the GetVarint calls above are not checked
+}
+
 Status VersionEdit::DecodeFrom(const Slice& src) {
   Clear();
   Slice input = src;
@@ -285,29 +329,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
               f.largest_fake = true;
             }
 
-            // new file format parser
-            Slice file_ptr = input;
-            uint32_t file_tag;
-            GetVarint32(&file_ptr, &file_tag);
-            if (file_tag > kMaxTag) {
-              // file_tag - kMaxTag;
-              GetVarint32(&file_ptr, &tag);
-            }
-            switch (tag) {
-              case kNewFileInfo:
-                GetVarint32(&input, &tag);// ignore len
-                GetVarint32(&input, &tag);// ignore tag
-                GetVarint64(&input, &f.del_percentage);
-                GetVarint64(&input, &f.ttl_percentage);
-                GetVarint64(&input, &f.check_ttl_ts);
-                break;
-
-              default:
-                fprintf(stderr, "NewFile %lu without info, skip tag %d, len %d\n",
-                        f.number & 0xffffffff,
-                        tag, file_tag);
-                break;
-            }
+            DecodeNewFileInfo(&input, &f);
             new_files_.push_back(std::make_pair(level, f));
           } else {
             msg = "new-file entry 1";
@@ -400,6 +422,8 @@ std::string VersionEdit::DebugString() const {
     AppendNumberTo(&r, file_number);
     r.append(" size ");
     AppendNumberTo(&r, f.file_size);
+    r.append(" data_size ");
+    AppendNumberTo(&r, f.data_size);
     r.append(" ");
     r.append(f.smallest.DebugString());
     r.append(" .. ");
diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h
index 0c64728d0..17b9bfc36 100644
--- a/src/leveldb/db/version_edit.h
+++ b/src/leveldb/db/version_edit.h
@@ -33,6 +33,7 @@ struct FileMetaData {
   InternalKey largest;        // Largest internal key served by table
   bool smallest_fake;         // smallest is not real, have out-of-range keys
   bool largest_fake;          // largest is not real, have out-of-range keys
+  bool being_compacted;       // Is this file undergoing compaction?
 
   FileMetaData() :
       refs(0),
@@ -44,7 +45,8 @@ struct FileMetaData {
       file_size(0),
       data_size(0),
       smallest_fake(false),
-      largest_fake(false) { }
+      largest_fake(false),
+      being_compacted(false) { }
 };
 
 class VersionEdit {
@@ -157,6 +159,7 @@ class VersionEdit {
     FileMetaData f;
     f.number = file;
     f.file_size = file_size;
+    f.data_size = f.file_size;
     f.smallest = smallest;
     f.largest = largest;
     f.del_percentage = del_percentage;
@@ -185,6 +188,7 @@ class VersionEdit {
 
   void EncodeTo(std::string* dst) const;
   Status DecodeFrom(const Slice& src);
+  Status DecodeNewFileInfo(Slice* input, FileMetaData* f);
 
   std::string DebugString() const;
 
diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc
index c728af4cc..44a5d308f 100644
--- a/src/leveldb/db/version_edit_test.cc
+++ b/src/leveldb/db/version_edit_test.cc
@@ -26,6 +26,7 @@ enum Tag {
   kNewFile              = 10,
   kDeletedFile          = 11,
   kNewFileInfo          = 12,
+  kSstFileDataSize      = 13,
 
   // no more than 1<<20
   kMaxTag               = 1 << 20,
@@ -53,7 +54,7 @@ class VersionEditTest: public VersionEdit {
         dst->append(str.data(), str.size());
       }
     }
-    void EncodeToOld(std::string* dst) {
+    void EncodeToOld(std::string* dst, bool with_sst, bool with_data_size) {
       DumpToOldFormat();
       if (has_comparator_) {
         PutVarint32(dst, kComparator);
@@ -71,7 +72,43 @@ class VersionEditTest: public VersionEdit {
         PutVarint32(dst, kLastSequence);
         PutVarint64(dst, last_sequence_);
       }
+      if (!with_sst) {
+          return;
+      }
+      for (uint32_t i = 0; i < 5; i++) {
+        FileMetaData f;
+        f.number = 100 + i;
+        f.file_size = 200 + i;
+        f.data_size = f.file_size;
+        f.smallest = InternalKey("apple", 300 + i, kTypeValue);
+        f.largest = InternalKey("zookeeper", 400 + i, kTypeDeletion);
+        EncodeSstFile(i, f, dst, with_data_size);
+      }
+    }
+
+    void EncodeSstFile(uint32_t level, const FileMetaData& f, std::string* dst, bool with_data_size) {
+        std::string str;          // payload of the kNewFile record
+        PutVarint32(&str,level);  // payload starts with the level
+        PutVarint64(&str, f.number);
+        PutVarint64(&str, f.file_size);
+        PutLengthPrefixedSlice(&str, f.smallest.Encode());
+        PutLengthPrefixedSlice(&str, f.largest.Encode());
+        PutVarint32(&str, 0);      // smallest_fake slot: always written as 0 here, f is not consulted
+        PutVarint32(&str, 0);     // largest_fake slot: always written as 0 here, f is not consulted
+        // Append the record to dst as: (payload size + kMaxTag), kNewFile tag, payload bytes.
+        PutVarint32(dst, str.size() + kMaxTag);
+        PutVarint32(dst, kNewFile);
+        dst->append(str.data(), str.size());
+        // Optionally append a kSstFileDataSize record carrying f.data_size, framed the same way.
+        if (with_data_size) {
+          str.clear();
+          PutVarint64(&str, f.data_size);
+          PutVarint32(dst, str.size() + kMaxTag);
+          PutVarint32(dst, kSstFileDataSize);
+          dst->append(str.data(), str.size());
+        }
     }
+
     void DumpToOldFormat() {
       has_comparator_ = HasComparator();
       comparator_ = GetComparatorName();
@@ -126,22 +163,28 @@ static void CreateEditContent(VersionEditTest* edit) {
   edit->SetLastSequence(900);
   TestEncodeDecode(*edit);
 }
-static void CreateEditContentV2(VersionEditTest* edit) {
+static void CreateOldEncodedContent(VersionEditTest* edit, std::string* dst,
+        bool with_sst, bool with_data_size) {
   edit->SetComparatorName("test_nil_cmp");
   edit->SetLogNumber(700);
   edit->SetNextFile(800);
   edit->SetLastSequence(900);
   TestEncodeDecode(*edit);
+  edit->EncodeToOld(dst, with_sst, with_data_size);
 }
-static void CreateEditWithTtlInfo(VersionEditTest* edit) {
+static void CreateEditWithSstDetail(VersionEditTest* edit) {
   for (int i = 0; i < 5; i++) {
     TestEncodeDecode(*edit);
-    edit->AddFile(i, 100 + i, 200 + i,
-                  InternalKey("apple", 300 + i, kTypeValue),
-                  InternalKey("zookeeper", 400 + i, kTypeDeletion),
-                  20 + i/* del percentage */,
-                  1000000000 + i/* timeout */,
-                  50 + i/* del percentage */);
+    FileMetaData f;
+    f.number = 100 + i;
+    f.file_size = 200 + i;
+    f.data_size = f.file_size;
+    f.smallest = InternalKey("apple", 300 + i, kTypeValue);
+    f.largest = InternalKey("zookeeper", 400 + i, kTypeDeletion);
+    f.del_percentage = 20 + i;
+    f.ttl_percentage = 50 + i;
+    f.check_ttl_ts = 1000000000 + i;
+    edit->AddFile(i, f);
     edit->DeleteFile(i, 500 + i);
     edit->SetCompactPointer(i, InternalKey("x00", 600 + i, kTypeValue));
   }
@@ -154,14 +197,13 @@ static void CreateEditWithTtlInfo(VersionEditTest* edit) {
 }
 TEST(VersionEditTest, EncodeFileInfoTag) {
   VersionEditTest edit;
-  CreateEditWithTtlInfo(&edit);
+  CreateEditWithSstDetail(&edit);
   fprintf(stderr, "%s\n", edit.DebugString().c_str());
 }
 TEST(VersionEditTest, OldFormatRead) {
   VersionEditTest edit;
-  CreateEditContentV2(&edit);
   std::string c1, c3;
-  edit.EncodeToOld(&c1); // dump into old format
+  CreateOldEncodedContent(&edit, &c1, false, false);
   edit.EncodeTo(&c3); // dump into new format
 
   VersionEditTest parsed;
@@ -174,6 +216,23 @@ TEST(VersionEditTest, OldFormatRead) {
   fprintf(stderr, "%s\n", parsed.DebugString().c_str());
 }
 
+TEST(VersionEditTest, DecodeFormatWithoutSstFileDataSize) {
+  VersionEditTest edit;
+  std::string c1, c3;
+  CreateOldEncodedContent(&edit, &c1, true, false);  // with_sst = true, with_data_size = false
+  edit.EncodeTo(&c3); // dump into new format
+  // c1 holds the old-format bytes: kNewFile records without any kSstFileDataSize record.
+  VersionEditTest parsed;
+  Status s = parsed.DecodeFrom(c1); // use new Decode to parse old format
+  ASSERT_TRUE(s.ok()) << s.ToString();
+  std::string c2;
+  parsed.EncodeTo(&c2);
+  // NOTE(review): presumably c2 differs from c3 because the files were only written into c1, never added to "edit" - confirm.
+  ASSERT_NE(c2, c3);
+  fprintf(stderr, "%s\n", parsed.DebugString().c_str());
+
+}
+
 TEST(VersionEditTest, EncodeUnknowTag) {
   VersionEditTest edit;
   CreateEditContent(&edit);
diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc
index 088acd090..4c5d328aa 100644
--- a/src/leveldb/db/version_set.cc
+++ b/src/leveldb/db/version_set.cc
@@ -70,6 +70,15 @@ static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
   }
   return sum;
 }
+// Adds up file_size over the entries of "files" whose being_compacted flag is clear.
+static int64_t TotalFileSizeNotBeingCompacted(const std::vector<FileMetaData*>& files) {
+  int64_t total = 0;
+  for (std::vector<FileMetaData*>::const_iterator it = files.begin(); it != files.end(); ++it) {
+    if ((*it)->being_compacted) continue;  // file is undergoing compaction, leave it out
+    total += (*it)->file_size;
+  }
+  return total;
+}
 
 Version::~Version() {
   assert(refs_ == 0);
@@ -129,11 +138,11 @@ static bool BeforeFile(const Comparator* ucmp,
 
 bool SomeFileOverlapsRange(
     const InternalKeyComparator& icmp,
+    const Comparator* ucmp,
     bool disjoint_sorted_files,
     const std::vector<FileMetaData*>& files,
     const Slice* smallest_user_key,
     const Slice* largest_user_key) {
-  const Comparator* ucmp = icmp.user_comparator();
   if (!disjoint_sorted_files) {
     // Need to check against all files
     for (size_t i = 0; i < files.size(); i++) {
@@ -473,8 +482,17 @@ void Version::Unref() {
 bool Version::OverlapInLevel(int level,
                              const Slice* smallest_user_key,
                              const Slice* largest_user_key) {
-  return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level],
-                               smallest_user_key, largest_user_key);
+  // Checks files_[level] against the given user-key range via SomeFileOverlapsRange.
+  // Compare with the compact strategy's row key comparator when it provides one.
+  CompactStrategy* strategy = vset_->options_->compact_strategy_factory->NewInstance();
+  const Comparator* ucmp = strategy->RowKeyComparator();
+  if (ucmp == NULL) {
+    ucmp = vset_->icmp_.user_comparator();  // no row key comparator: use the user comparator
+  }
+  bool overlap = SomeFileOverlapsRange(vset_->icmp_, ucmp, (level > 0), files_[level],
+                                       smallest_user_key, largest_user_key);
+  delete strategy;  // released only after the comparison has finished
+  return overlap;
 }
 
 int Version::PickLevelForMemTableOutput(
@@ -516,12 +534,10 @@ void Version::GetOverlappingInputs(
   if (end != NULL) {
     user_end = end->user_key();
   }
-  const Comparator* user_cmp = NULL;
-  CompactStrategy* strategy = NULL;
-  if (!vset_->options_->drop_base_level_del_in_compaction) { // use row key comparator
-    strategy = vset_->options_->compact_strategy_factory->NewInstance();
-    user_cmp = strategy->RowKeyComparator();
-  }
+
+  // use row key comparator
+  CompactStrategy* strategy = vset_->options_->compact_strategy_factory->NewInstance();
+  const Comparator* user_cmp = strategy->RowKeyComparator();
   if (user_cmp == NULL) {
     user_cmp = vset_->icmp_.user_comparator();
   }
@@ -818,11 +834,7 @@ class VersionSetBuilder {
 
       FileMetaData* f = new FileMetaData(f_new);
       f->refs = 1;
-
-      if (f->data_size == 0 && !f->smallest_fake && !f->largest_fake) {
-        // Make sure this is a new file generated by compaction.
-        f->data_size = f->file_size;
-      }
+      f->being_compacted = false;
 
       // We arrange to automatically compact this file after
       // a certain number of seeks.  Let's assume:
@@ -927,6 +939,7 @@ class VersionSetBuilder {
             vset_->db_key_start_.DebugString().c_str());
         f->smallest = vset_->db_key_start_;
         f->smallest_fake = true;
+        f->data_size = 0;
       } else {
         // file out of tablet range, skip it;
         return false;
@@ -943,6 +956,7 @@ class VersionSetBuilder {
             vset_->db_key_end_.DebugString().c_str());
         f->largest = vset_->db_key_end_;
         f->largest_fake = true;
+        f->data_size = 0;
       } else {
         // file out of tablet range, skip it;
         return false;
@@ -1014,7 +1028,18 @@ void VersionSet::AppendVersion(Version* v) {
   v->next_->prev_ = v;
 }
 
-Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
+// Per-call record that LogAndApply pushes onto manifest_writers_ so that
+// concurrent callers take turns; one is kept for every waiting manifest writer.
+struct VersionSet::ManifestWriter {
+  Status status;
+  VersionEdit* edit;   // edit to apply; assigned by LogAndApply after construction
+  bool done;
+  port::CondVar cv;    // waited on until this writer is at the front of the queue
+  // edit starts as NULL so the pointer is never left indeterminate.
+  explicit ManifestWriter(port::Mutex* mu) : edit(NULL), done(false), cv(mu) { }
+};
+void VersionSet::LogAndApplyHelper(VersionSetBuilder* builder,
+                                   VersionEdit* edit) {
   if (edit->has_log_number_) {
     assert(edit->log_number_ >= log_number_);
     assert(edit->log_number_ < next_file_number_);
@@ -1036,13 +1061,28 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     edit->SetLastSequence(last_sequence_);
   }
 
+  builder->Apply(edit);
+}
+
+Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
+  mu->AssertHeld();
+  // Serialize concurrent callers: edits are not batched, each writer waits for its turn.
+  ManifestWriter w(mu);
+  w.edit = edit;
+  manifest_writers_.push_back(&w);
+  while (!w.done && &w != manifest_writers_.front()) {
+    w.cv.Wait();
+  }
+  assert(manifest_writers_.front() == &w);
+
+  // This writer is now first in the queue; apply its own edit.
   Version* v = new Version(this);
   {
     VersionSetBuilder builder(this, current_);
-    builder.Apply(edit);
+    LogAndApplyHelper(&builder, w.edit);
     builder.SaveTo(v);
   }
-  Finalize(v);
+  Finalize(v); // recalculate new version score
 
   const uint64_t switch_interval = options_->manifest_switch_interval * 1000000UL;
   if (descriptor_log_ != NULL &&
@@ -1050,6 +1090,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     force_switch_manifest_ = true;
   }
 
+  uint64_t manifest_file_num = manifest_file_number_;
   int retry_count = 0;
   Status s;
   // Unlock during expensive MANIFEST log write
@@ -1063,13 +1104,14 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     }
     mu->Unlock();
 
+    // close current manifest
     if (force_switch_manifest_) {
       delete descriptor_log_;
       delete descriptor_file_;
       descriptor_log_ = NULL;
       descriptor_file_ = NULL;
-      Log(options_->info_log, "[%s] force switch MANIFEST to %lu",
-        dbname_.c_str(), manifest_file_number_);
+      Log(options_->info_log, "[%s] force switch MANIFEST #%lu to #%lu",
+        dbname_.c_str(), manifest_file_num, manifest_file_number_);
       force_switch_manifest_ = false;
     }
 
@@ -1113,15 +1155,65 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     if (s.ok() && !new_manifest_file.empty()) {
       s = SetCurrentFile(env_, dbname_, manifest_file_number_);
       if (s.ok()) {
-        Log(options_->info_log, "[%s] set CURRENT to %llu\n",
-          dbname_.c_str(), static_cast<unsigned long long>(manifest_file_number_));
+        Log(options_->info_log, "[%s] set CURRENT #%lu to #%llu success\n",
+          dbname_.c_str(),manifest_file_num,
+          static_cast<unsigned long long>(manifest_file_number_));
+        manifest_file_num = manifest_file_number_;
       } else {
-        Log(options_->info_log, "[%s][dfs error] set CURRENT error: %s\n",
-          dbname_.c_str(), s.ToString().c_str());
+        Log(options_->info_log, "[%s][dfs error] set CURRENT #%lu to #%lu error: %s\n",
+          dbname_.c_str(), manifest_file_num, manifest_file_number_,
+          s.ToString().c_str());
       }
     }
 
-    if (!s.ok()) {
+    // switch manifest success, try delete obsolete file
+    if (!new_manifest_file.empty() && s.ok()) {
+      // manifest file set, keep latest 3 manifest files for backup
+      std::set<std::string> manifest_set;
+      std::vector<std::string> filenames;
+      env_->GetChildren(dbname_, &filenames);
+
+      uint64_t number;
+      FileType type;
+      for (size_t i = 0; i < filenames.size(); i++) {
+        if (ParseFileName(filenames[i], &number, &type)) {
+          bool keep = true;
+          switch (type) {
+            case kDescriptorFile:
+              manifest_set.insert(filenames[i]);
+              if (manifest_set.size() > 3) {
+                std::set<std::string>::iterator it = manifest_set.begin();
+                ParseFileName(*it, &number, &type);
+                if (number < manifest_file_number_) {
+                  // Keep my manifest file, and any newer incarnations'
+                  // (in case there is a race that allows other incarnations)
+                  filenames[i] = *it;
+                  keep = false;
+                  manifest_set.erase(it);
+                }
+              }
+              break;
+            case kTempFile:
+              // Temp files are removed unconditionally; no live-file set
+              // (e.g. pending_outputs_) is consulted in this cleanup pass.
+              keep = false;
+              break;
+            default:
+              break;
+          }
+
+          if (!keep) {
+            Log(options_->info_log, "[%s] version_set Delete type=%s #%llu, fname %s\n",
+                dbname_.c_str(), FileTypeToString(type),
+                static_cast<unsigned long long>(number), filenames[i].c_str());
+            env_->DeleteFile(dbname_ + "/" + filenames[i]);
+          }
+        }
+      }
+    }
+    // if the MANIFEST or CURRENT write failed because the directory lock was lost
+    // (IsIOPermissionDenied), do not try to switch manifest anymore
+    if (!s.ok() && !s.IsIOPermissionDenied()) {
       force_switch_manifest_ = true;
       if (!new_manifest_file.empty()) {
         env_->DeleteFile(new_manifest_file);
@@ -1141,7 +1233,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     }
 
     mu->Lock();
-  } while (force_switch_manifest_);
+  } while (force_switch_manifest_); // bugfix issue=tera-10, dfs sync fail, but eventually success, cause reload fail
 
   // Install the new version
   if (s.ok()) {
@@ -1155,6 +1247,10 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) {
     Log(options_->info_log, "[%s][dfs error] set force_switch_manifest", dbname_.c_str());
   }
 
+  manifest_writers_.pop_front();
+  if (!manifest_writers_.empty()) {
+    manifest_writers_.front()->cv.Signal();
+  }
   return s;
 }
 
@@ -1462,9 +1558,11 @@ Status VersionSet::Recover() {
       FileMetaData* f = files[i];
       ModifyFileSize(f);
       // Debug
-      Log(options_->info_log, "[%s] recover: %s, level: %d, del_p: %lu, check_ttl_ts %lu, ttl_p %lu, s: %d %s, l: %d %s\n",
+      Log(options_->info_log, "[%s] recover: %s, level: %d, file_size %lu, data_size %lu, "
+          "del_p: %lu, check_ttl_ts %lu, ttl_p %lu, s: %d %s, l: %d %s\n",
           dbname_.c_str(),
           FileNumberDebugString(f->number).c_str(), level,
+          f->file_size, f->data_size,
           f->del_percentage,
           f->check_ttl_ts,
           f->ttl_percentage,
@@ -1480,6 +1578,9 @@ Status VersionSet::Recover() {
 
 // Modify data_size of file meta
 bool VersionSet::ModifyFileSize(FileMetaData* f) {
+  if (f->data_size != 0) {
+    return true;
+  }
   // Try modify data_size in file meta
   // data_size = largest_key_offset - smallest_key_offset
   if (f->largest_fake || f->smallest_fake) {
@@ -1508,8 +1609,7 @@ bool VersionSet::ModifyFileSize(FileMetaData* f) {
         static_cast<unsigned long long>(f->file_size),
         static_cast<unsigned long long>(f->data_size));
     delete iter;
-  } else {
-    // do not need modify
+  } else { // for compatibility, we have not decoded f->data_size from MANIFEST
     f->data_size = f->file_size;
   }
   return true;
@@ -1523,8 +1623,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) {
 
 void VersionSet::Finalize(Version* v) {
   // Precomputed best level for next compaction
-  int best_level = -1;
-  double best_score = -1;
   int best_del_level = -1;
   int best_del_idx = -1;
   int best_ttl_level = -1;
@@ -1532,8 +1630,8 @@ void VersionSet::Finalize(Version* v) {
 
   int base_level =  -1;
   for (int level = config::kNumLevels - 1; level >= 0; level--) {
-    double score;
-    if (level == 0) {
+    double score = 0;
+    if (level == 0 && level0_compactions_in_progress_.empty()) {
       // We treat level-0 specially by bounding the number of files
       // instead of number of bytes for two reasons:
       //
@@ -1548,11 +1646,16 @@ void VersionSet::Finalize(Version* v) {
       //
       // (3) More level0 files means write hotspot.
       // We give lower score to avoid too much level0 compaction.
-      score = sqrt(v->files_[level].size() /
-          static_cast<double>(config::kL0_CompactionTrigger));
-    } else {
+      if (v->files_[level].size() <= (size_t)options_->slow_down_level0_score_limit) {
+        score = v->files_[level].size() /
+          static_cast<double>(config::kL0_CompactionTrigger);
+      } else {
+        score = sqrt(v->files_[level].size() /
+            static_cast<double>(config::kL0_CompactionTrigger));
+      }
+    } else if (level > 0) {
       // Compute the ratio of current size to size limit.
-      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
+      const uint64_t level_bytes = TotalFileSizeNotBeingCompacted(v->files_[level]);
       score = static_cast<double>(level_bytes)
           / MaxBytesForLevel(level, options_->sst_size);
     }
@@ -1562,16 +1665,15 @@ void VersionSet::Finalize(Version* v) {
       base_level = level;
     }
 
-    // size compaction does not allow trigger by base level
-    if ((score > best_score) && (level < config::kNumLevels - 1)) {
-      best_level = level;
-      best_score = score;
+    if (level < config::kNumLevels - 1) {
+      v->compaction_level_[level] = level;
+      v->compaction_score_[level] = (score < 1.0) ? 0: score;
     }
 
     for (size_t i = 0; i < v->files_[level].size(); i++) {
       FileMetaData* f = v->files_[level][i];
       // del compaction does not allow trigger by base level
-      if ((level > 0) && (level < base_level) &&
+      if ((!f->being_compacted) && (level > 0) && (level < base_level) &&
           (f->del_percentage > options_->del_percentage) &&
           (best_del_level < 0 ||
            v->files_[best_del_level][best_del_idx]->del_percentage < f->del_percentage)) {
@@ -1580,7 +1682,7 @@ void VersionSet::Finalize(Version* v) {
       }
 
       // ttl compaction can trigger in base level
-      if ((f->check_ttl_ts > 0) &&
+      if ((!f->being_compacted) && (f->check_ttl_ts > 0) &&
           (best_ttl_level < 0 ||
            v->files_[best_ttl_level][best_ttl_idx]->check_ttl_ts > f->check_ttl_ts)) {
         best_ttl_level = level;
@@ -1589,30 +1691,44 @@ void VersionSet::Finalize(Version* v) {
     }
   }
 
-  v->compaction_level_ = best_level;
-  v->compaction_score_ = best_score;
+  // sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries are small.
+  for (int i = 0; i < config::kNumLevels - 2; i++) {
+    for (int j = i + 1; j < config::kNumLevels - 1; j++) {
+      if (v->compaction_score_[i] < v->compaction_score_[j]) {
+        int level = v->compaction_level_[i];
+        double score = v->compaction_score_[i];
+        v->compaction_level_[i] = v->compaction_level_[j];
+        v->compaction_score_[i] = v->compaction_score_[j];
+        v->compaction_level_[j] = level;
+        v->compaction_score_[j] = score;
+      }
+    }
+  }
+
   if (best_del_level >= 0) {
     v->del_trigger_compact_ = v->files_[best_del_level][best_del_idx];
     v->del_trigger_compact_level_ = best_del_level;
     Log(options_->info_log,
-        "[%s] del_strategy(current), level %d, num #%lu, file_size %lu, del_p %lu\n",
-        dbname_.c_str(),
-        v->del_trigger_compact_level_,
-        (v->del_trigger_compact_->number) & 0xffffffff,
-        v->del_trigger_compact_->file_size,
-        v->del_trigger_compact_->del_percentage);
+      "[%s] del_strategy(current), level %d, num #%lu, file_size %lu, del_p %lu\n",
+      dbname_.c_str(),
+      v->del_trigger_compact_level_,
+      (v->del_trigger_compact_->number) & 0xffffffff,
+      v->del_trigger_compact_->file_size,
+      v->del_trigger_compact_->del_percentage);
   }
+
   if (best_ttl_level >= 0) {
     v->ttl_trigger_compact_ = v->files_[best_ttl_level][best_ttl_idx];
     v->ttl_trigger_compact_level_ = best_ttl_level;
     Log(options_->info_log,
-        "[%s] ttl_strategy(current), level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n",
-        dbname_.c_str(),
-        v->ttl_trigger_compact_level_,
-        (v->ttl_trigger_compact_->number) & 0xffffffff,
-        v->ttl_trigger_compact_->file_size,
-        v->ttl_trigger_compact_->ttl_percentage,
-        v->ttl_trigger_compact_->check_ttl_ts);
+      "[%s] ttl_strategy(current), level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n",
+      dbname_.c_str(),
+      v->ttl_trigger_compact_level_,
+      (v->ttl_trigger_compact_->number) & 0xffffffff,
+      v->ttl_trigger_compact_->file_size,
+      v->ttl_trigger_compact_->ttl_percentage,
+      v->ttl_trigger_compact_->check_ttl_ts);
   }
 }
 
@@ -1757,6 +1873,19 @@ void VersionSet::AddLiveFiles(std::map<uint64_t, int>* live) {
   }
 }
 
+// Records number -> file_size in *live for each file of each version in the list.
+void VersionSet::AddLiveFilesWithSize(std::map<uint64_t, uint64_t>* live) {
+  for (Version* ver = dummy_versions_.next_; ver != &dummy_versions_; ver = ver->next_) {
+    for (int lvl = 0; lvl < config::kNumLevels; ++lvl) {
+      const std::vector<FileMetaData*>& level_files = ver->files_[lvl];
+      std::vector<FileMetaData*>::const_iterator it = level_files.begin();
+      for (; it != level_files.end(); ++it) {
+        (*live)[(*it)->number] = (*it)->file_size;
+      }
+    }
+  }
+}
+
 int64_t VersionSet::NumLevelBytes(int level) const {
   assert(level >= 0);
   assert(level < config::kNumLevels);
@@ -1854,97 +1983,472 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
   return result;
 }
 
+// Debug helper: logs the first file in "inputs" that is marked being_compacted.
+void VersionSet::PrintFilesInCompaction(const std::vector<FileMetaData*>& inputs) {
+  char buf[30];  // holds one formatted file number plus a trailing space
+  std::string fstr = "file: ";
+  for (size_t i = 0; i < inputs.size(); i++) {
+    FileMetaData* f = inputs[i];
+    if (f->being_compacted) {
+      snprintf(buf, sizeof(buf), "%lu ", f->number);
+      fstr.append(buf);
+      break;  // only the first match is reported
+    }
+  }
+  Log(options_->info_log, "[%s] test mark level [%s] being compacted.", dbname_.c_str(),
+    fstr.c_str());
+}
+
+// Returns true when at least one file in "inputs" is marked being_compacted.
+bool VersionSet::FilesInCompaction(const std::vector<FileMetaData*>& inputs) {
+  bool any_compacting = false;
+  // Stop scanning at the first hit; nothing after it can change the answer.
+  for (size_t idx = 0; !any_compacting && idx < inputs.size(); ++idx) {
+    any_compacting = inputs[idx]->being_compacted;
+  }
+  return any_compacting;
+}
+
+// Debug helper: hands the files of "level" overlapping [smallest, largest] to PrintFilesInCompaction.
+void VersionSet::PrintRangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level) {
+  assert(level < config::kNumLevels);
+  std::vector<FileMetaData*> overlapped;
+  current_->GetOverlappingInputs(level, smallest, largest, &overlapped);
+  PrintFilesInCompaction(overlapped);
+}
+
+bool VersionSet::RangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level) {
+  assert(level < config::kNumLevels);
+  std::vector<FileMetaData*> overlapped;  // files of "level" overlapping [smallest, largest]
+  current_->GetOverlappingInputs(level, smallest, largest, &overlapped);
+  return FilesInCompaction(overlapped);  // true if any of them is marked being_compacted
+}
+
+bool VersionSet::PickFutureCompaction(int level, std::vector<FileMetaData*>* inputs) {
+  inputs->clear();  // on return holds at most one file of "level"; presumably the next one to compact
+  std::vector<FileMetaData*> candidate;  // files lying at or before compact_pointer_[level]
+  double low_level_score = 0;   // score recorded in current_ for "level"
+  double high_level_score = 0;  // score recorded in current_ for "level + 1"
+  for (size_t li = 0; li < current_->compaction_score_.size(); li++) {
+    if (current_->compaction_level_[li] == level) {
+        low_level_score = current_->compaction_score_[li];
+    } else if (current_->compaction_level_[li] == level + 1) {
+        high_level_score = current_->compaction_score_[li];
+    }
+  }
+  if (low_level_score < 1.0 ||
+      low_level_score <= high_level_score) {
+      return false;
+  }
+  // Reaching here means "level" scored at least 1.0 and strictly above "level + 1".
+  // Take the first file that is not being compacted and ends after compact_pointer_[level].
+  for (size_t i = 0; i < current_->files_[level].size(); i++) {
+    FileMetaData* f = current_->files_[level][i];
+    if (f->being_compacted) {
+      continue;
+    }
+    // Files ending at or before the compact pointer are only remembered as fallbacks.
+    if (!compact_pointer_[level].empty() &&
+        icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) <= 0) {
+      candidate.push_back(f);
+      continue;
+    }
+
+    inputs->push_back(f);
+    break;
+  }
+  // First fallback: the first file of the level, unless it is being compacted.
+  if (inputs->empty()) {
+    FileMetaData* f = current_->files_[level][0];
+    if (!f->being_compacted) {
+      inputs->push_back(f);
+    }
+  }
+  if (inputs->empty() && candidate.size() > 0) {  // second fallback: last remembered candidate
+    inputs->push_back(candidate[candidate.size() - 1]);
+  }
+  return !inputs->empty();
+}
+
+bool VersionSet::IsOverlapInFileRange(FileMetaData* lf, FileMetaData* f) {
+  // A missing file on either side cannot overlap anything.
+  if (lf == NULL || f == NULL) return false;
+  // The key ranges are disjoint when one file ends strictly before the other starts.
+  const bool lf_ends_first = icmp_.Compare(lf->largest.Encode(), f->smallest.Encode()) < 0;
+  const bool disjoint = lf_ends_first ||
+      icmp_.Compare(f->largest.Encode(), lf->smallest.Encode()) < 0;
+  // Debug trace, kept disabled:
+  //Log(options_->info_log, "[%s] file range overlap, lfile #%d, [%s, %s] being_compact %d, "
+  //    "file #%d, [%s, %s] being_compact %d\n",
+  //    dbname_.c_str(),
+  //    static_cast<uint32_t>(lf->number & 0xffffffff),
+  //    lf->smallest.Encode().ToString().c_str(),
+  //    lf->largest.Encode().ToString().c_str(),
+  //    lf->being_compacted,
+  //    static_cast<uint32_t>(f->number & 0xffffffff),
+  //    f->smallest.Encode().ToString().c_str(),
+  //    f->largest.Encode().ToString().c_str(),
+  //    f->being_compacted);
+  return !disjoint;
+}
+
+// Picks at most one file of "level" into *inputs; returns false when none qualifies. Note:
+//  1) if f in level1 being compacted, level0 may be blocked;
+//  2) compacting pointer may cause other f in the same level to be blocked.
+bool VersionSet::PickCompactionBySize(int level, std::vector<FileMetaData*>* inputs) {
+  // Pick low level file, which will be compact next time
+  std::vector<FileMetaData*> low_level_inputs;
+  PickFutureCompaction(level - 1, &low_level_inputs);
+  FileMetaData* low_level_file = NULL;
+  if (low_level_inputs.size() > 0) {
+    low_level_file = low_level_inputs[0];
+    //Log(options_->info_log, "[%s] PickCompactionBySize, low_level %d, f[%s, %s] being_compact %d\n",
+    //    dbname_.c_str(), level - 1,
+    //    low_level_file->smallest.Encode().ToString().c_str(),
+    //    low_level_file->largest.Encode().ToString().c_str(),
+    //    low_level_file->being_compacted);
+  }
+
+  inputs->clear();
+  std::vector<FileMetaData*> candidate;
+  // Pick the first file that comes after compact_pointer_[level]
+  for (size_t i = 0; i < current_->files_[level].size(); i++) {
+    FileMetaData* f = current_->files_[level][i];
+    if (f->being_compacted) {
+      //Log(options_->info_log, "[%s] PickCompactionBySize, level %d, f[%s, %s] being_compact %d\n",
+      //    dbname_.c_str(), level,
+      //    f->smallest.Encode().ToString().c_str(), f->largest.Encode().ToString().c_str(),
+      //    f->being_compacted);
+      continue;
+    }
+    if (!compact_pointer_[level].empty() &&
+        icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) <= 0) {
+      //Log(options_->info_log, "[%s] PickCompactionBySize, skip by compact_pointer_[%d] %s, f[%s, %s] being_compacted %d\n",
+      //    dbname_.c_str(), level, compact_pointer_[level].c_str(),
+      //    f->smallest.Encode().ToString().c_str(), f->largest.Encode().ToString().c_str(),
+      //    f->being_compacted);
+      if (!RangeInCompaction(&f->smallest, &f->largest, level + 1) &&
+          !IsOverlapInFileRange(low_level_file, f)) {
+        candidate.push_back(f);
+      }
+      continue;
+    }
+    if (RangeInCompaction(&f->smallest, &f->largest, level + 1) ||
+        IsOverlapInFileRange(low_level_file, f)) {
+      //PrintRangeInCompaction(&f->smallest, &f->largest, level + 1);
+      continue;
+    }
+    inputs->push_back(f);
+    break;
+  }
+  if (inputs->empty() && !current_->files_[level].empty()) {  // never index [0] of an empty level
+    // Wrap-around to the beginning of the key space
+    FileMetaData* f = current_->files_[level][0];
+    if (!f->being_compacted && !RangeInCompaction(&f->smallest, &f->largest, level + 1) &&
+        !IsOverlapInFileRange(low_level_file, f)) {
+      inputs->push_back(f);
+    }
+    //Log(options_->info_log, "[%s] PickCompactBySize, wrap-arroud level %d, f[%s, %s] being_compacted %d\n",
+    //    dbname_.c_str(), level,
+    //    f->smallest.Encode().ToString().c_str(), f->largest.Encode().ToString().c_str(),
+    //    f->being_compacted);
+    //PrintRangeInCompaction(&f->smallest, &f->largest, level + 1);
+  }
+  if (inputs->empty() && candidate.size() > 0) {
+    inputs->push_back(candidate[candidate.size() - 1]);
+  }
+  return !inputs->empty();
+}
+
 // timeout for micro_second
-double VersionSet::CompactionScore(uint64_t* timeout) const {
-  *timeout = 0;
+void VersionSet::CompactionScore(std::vector<std::pair<double, uint64_t> >* scores) {  // appends (score, timeout) pairs; *scores is not cleared
   uint64_t ts = env_->NowMicros();
   Version* v = current_;
-  if (v->compaction_score_ >= 1) {
-    return v->compaction_score_;
-  } else if (v->del_trigger_compact_ != NULL &&
-            v->del_trigger_compact_->del_percentage > options_->del_percentage) {
-    return (double)(v->del_trigger_compact_->del_percentage / 100.0);
-  } else if (v->ttl_trigger_compact_ != NULL &&
-            ts >= v->ttl_trigger_compact_->check_ttl_ts) {
-    return (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0);
-  } else if (v->file_to_compact_ != NULL) {
-    return 0.1f;
+  for (size_t i = 0; i < v->compaction_score_.size(); i++) {  // one entry per stored score that is >= 1
+    if (v->compaction_score_[i] >= 1) {
+      scores->push_back(std::pair<double, uint64_t>(v->compaction_score_[i], 0));
+    }
+  }
+  if (v->del_trigger_compact_ != NULL &&  // delete-percentage trigger, skipped while that file is being compacted
+      !v->del_trigger_compact_->being_compacted &&
+      v->del_trigger_compact_->del_percentage > options_->del_percentage) {
+    scores->push_back(std::pair<double, uint64_t>(
+                     (double)(v->del_trigger_compact_->del_percentage / 100.0), 0));
+  }
+  if (v->ttl_trigger_compact_ != NULL &&  // ttl trigger whose check time has already been reached
+      !v->ttl_trigger_compact_->being_compacted &&
+      ts >= v->ttl_trigger_compact_->check_ttl_ts) {
+    scores->push_back(std::pair<double, uint64_t>(
+                     (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0), 0));
+  }
+  if (v->file_to_compact_ != NULL &&
+             !v->file_to_compact_->being_compacted) {
+    scores->push_back(std::pair<double, uint64_t>(0.1, 0));  // fixed score 0.1 for file_to_compact_
   }
 
   // delay task
   if (v->ttl_trigger_compact_ != NULL &&
+     !v->ttl_trigger_compact_->being_compacted &&
      ts < v->ttl_trigger_compact_->check_ttl_ts) {
-    *timeout = (v->ttl_trigger_compact_->check_ttl_ts - ts + 1000000) / 1000;
-    return (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0);
+    scores->push_back(std::pair<double, uint64_t>(  // NOTE(review): NowMicros-based gap divided by 1000 looks like milliseconds - confirm unit
+                     (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0),
+                     ((v->ttl_trigger_compact_->check_ttl_ts - ts + 1000000) / 1000)));
+  }
+}
+
+Compaction* VersionSet::NewSubCompact(Compaction* compact) {
+  Compaction* c = new Compaction(compact->level_);  // new object filled from the fields of "compact" below
+  c->output_level_ = compact->output_level_;
+  c->max_output_file_size_ = compact->max_output_file_size_;
+  c->input_version_ = compact->input_version_;
+  c->input_version_->Ref(); // take a reference so the input version is not deleted while in use
+  // Copy both input file lists (inputs_[0] and inputs_[1]) element by element.
+  for (size_t i = 0; i < 2; i++) {
+    for (size_t j = 0; j < compact->inputs_[i].size(); j++) {
+      c->inputs_[i].push_back((compact->inputs_[i])[j]);
+    }
+  }
+  // Copy the grandparent files together with the index / seen-key / overlap state.
+  for (size_t i = 0; i < compact->grandparents_.size(); i++) {
+    c->grandparents_.push_back(compact->grandparents_[i]);
+  }
+  c->grandparent_index_ = compact->grandparent_index_;
+  c->seen_key_ = compact->seen_key_;
+  c->overlapped_bytes_ = compact->overlapped_bytes_;
+
+  c->drop_lower_bound_ = compact->drop_lower_bound_;
+  c->force_non_trivial_ = compact->force_non_trivial_;
+  return c;
+}
+
+// Less-than functor over encoded internal keys, used as the std::set comparator below.
+struct InternalKeyCompare {
+  InternalKeyCompare(const InternalKeyComparator* cmp)
+    : icmp(cmp) {}
+
+  InternalKeyCompare(const InternalKeyCompare& key_cmp)
+    : icmp(key_cmp.icmp) {}
+
+  // return true if a < b; const so the container can call it through a const reference
+  bool operator () (const std::string& ikey_a, const std::string& ikey_b) const {
+    InternalKey ikey1, ikey2;
+    ikey1.DecodeFrom(ikey_a);
+    ikey2.DecodeFrom(ikey_b);
+    return icmp->InternalKeyComparator::Compare(ikey1.Encode(), ikey2.Encode()) < 0;
+  }
+
+  const InternalKeyComparator* icmp;
+};
+
+// Approximate number of bytes stored in "level" of version "v" that sort before "ikey".
+uint64_t VersionSet::GetApproximateSizeByLevel(Version* v, int level, const InternalKey& ikey) {
+  uint64_t result = 0;
+  const std::vector<FileMetaData*>& files = v->files_[level];
+  for (size_t i = 0; i < files.size(); i++) {
+    if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
+      // Entire file is before "ikey", so just add the file size
+      result += files[i]->file_size;
+    } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
+      // Entire file is after "ikey", so ignore
+      if (level > 0) {
+        // Files other than level 0 are sorted by meta->smallest, so
+        // no further files in this level will contain data for "ikey".
+        break;
+      }
+    } else {
+      // "ikey" falls in the range for this table.  Add the
+      // approximate offset of "ikey" within the table.
+      Table* tableptr = NULL;  // never read uninitialized if NewIterator leaves it unset
+      Slice smallest = files[i]->smallest_fake ? files[i]->smallest.Encode() : "";
+      Slice largest = files[i]->largest_fake ? files[i]->largest.Encode() : "";
+      Iterator* iter = table_cache_->NewIterator(
+          ReadOptions(options_), dbname_, files[i]->number, files[i]->file_size,
+          smallest, largest, &tableptr);
+      if (tableptr != NULL) {
+        result += tableptr->ApproximateOffsetOf(ikey.Encode());
+      }
+      delete iter;
+    }
+  }
+  return result;
+}
+
+void VersionSet::GenerateSubCompaction(Compaction* compact, std::vector<Compaction*> * compact_vec,
+                                       port::Mutex* mu) {
+  mu->AssertHeld();  // caller holds *mu; it is released and re-acquired further down
+  if (options_->max_sub_parallel_compaction <= 1) {
+    Compaction* c = NewSubCompact(compact);
+    compact_vec->push_back(c);
+    return;
+  }
+
+  // candidate split keys: both bounds of each input file below the output level, plus the smallest key of each output-level input after the first
+  InternalKeyCompare icmp(&icmp_);
+  std::set<std::string, InternalKeyCompare> boundary(icmp);
+  for (int i = compact->level_; i < compact->output_level_; i++ ) {
+    for (size_t j = 0; j < compact->inputs_[i - compact->level_].size(); j++) {
+      FileMetaData* f = compact->inputs_[i - compact->level_][j];
+      boundary.insert(f->smallest.Encode().ToString());
+      boundary.insert(f->largest.Encode().ToString());
+    }
+  }
+  for (size_t j = 1; j < compact->inputs_[compact->output_level_ - compact->level_].size(); j++) {
+    FileMetaData* f = compact->inputs_[compact->output_level_ - compact->level_][j];
+    boundary.insert(f->smallest.Encode().ToString());
+  }
+
+  mu->Unlock();  // the size estimation loop below runs without the mutex
+  // drop every candidate with fewer than max_output_file_size_ (approximate) bytes between it and the last kept one
+  uint64_t sum = 0, prev_sum = 0;
+  std::set<std::string, InternalKeyCompare>::iterator it = boundary.begin();
+  while (it != boundary.end()) {
+    sum = 0;
+    InternalKey ikey;
+    ikey.DecodeFrom(*it);
+    for (int i = compact->level_; i <= compact->output_level_; i++ ) {
+      sum += GetApproximateSizeByLevel(compact->input_version_, i, ikey);
+    }
+
+    assert(sum >= prev_sum);
+    if (compact->max_output_file_size_ > sum - prev_sum) {
+      it = boundary.erase(it);
+    } else {
+      ++it;
+      prev_sum = sum;
+    }
+  }
+  mu->Lock();
+
+  // limit max sub compaction: keep only every avg_num-th remaining split key
+  assert(options_->max_sub_parallel_compaction > 1);
+  uint64_t avg_num = (boundary.size() + 1) / options_->max_sub_parallel_compaction + 1;
+  it = boundary.begin();
+  uint64_t i = 1;
+  while (avg_num > 1 && it != boundary.end()) {
+    if (i % avg_num != 0) {
+      it = boundary.erase(it);
+    } else {
+      ++it;
+    }
+    i++;
   }
 
-  // nothing to do
-  return -1.0;
+  // build one sub compaction per range between consecutive split keys (first start and last end stay empty)
+  if (boundary.size() == 0) {
+    Compaction* c = NewSubCompact(compact);
+    compact_vec->push_back(c);
+  } else {
+    std::set<std::string, InternalKeyCompare>::iterator it = boundary.begin();
+    std::string prev_key;
+    while (true) {
+      Compaction* c = NewSubCompact(compact);
+      c->sub_compact_start_ = prev_key;
+      c->sub_compact_end_ = *it;
+      compact_vec->push_back(c);
+
+      ++it;
+      prev_key = c->sub_compact_end_;
+      if (it == boundary.end()) {
+        Compaction* c1 = NewSubCompact(compact);
+        c1->sub_compact_start_ = prev_key;
+        compact_vec->push_back(c1);
+        break;
+      }
+    }
+  }
 }
 
 Compaction* VersionSet::PickCompaction() {
-  Compaction* c;
-  int level;
+  int level = -1;
+  std::vector<FileMetaData*> inputs;
+  bool set_non_trivial = false;
 
   // We prefer compactions triggered by too much data in a level over
   // the compactions triggered by seeks.
-  const bool size_compaction = (current_->compaction_score_ >= 1);
+  const bool size_compaction = (current_->compaction_score_[0] >= 1);
   const bool seek_compaction = (current_->file_to_compact_ != NULL);
   const bool del_compaction = (current_->del_trigger_compact_ != NULL);
   const bool ttl_compaction = (current_->ttl_trigger_compact_ != NULL);
-  if (size_compaction) {
-    level = current_->compaction_level_;
-    assert(level >= 0);
-    assert(level+1 < config::kNumLevels);
-    c = new Compaction(level);
 
-    // Pick the first file that comes after compact_pointer_[level]
-    for (size_t i = 0; i < current_->files_[level].size(); i++) {
-      FileMetaData* f = current_->files_[level][i];
-      if (compact_pointer_[level].empty() ||
-          icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) {
-        c->inputs_[0].push_back(f);
+  // check size compaction
+  assert(level0_compactions_in_progress_.size() <= 1);
+  bool skipped_l0 = false;
+  for (size_t li = 0; size_compaction && li < current_->compaction_score_.size(); li++) {
+    double score = current_->compaction_score_[li];
+    level = current_->compaction_level_[li];
+    assert(li == 0 || score <= current_->compaction_score_[li - 1]);
+    if (score >= 1) {
+      assert(level >= 0);
+      assert(level+1 < config::kNumLevels);
+      if (skipped_l0 && level <= 1) {
+        // level0 in progress and level 0 will not directly compact to level > 1
+        //Log(options_->info_log, "[%s] lock level %d, conflict, score %.2f\n",
+        //    dbname_.c_str(), level, score);
+        continue;
+      }
+      if (level == 0 && !level0_compactions_in_progress_.empty()) {
+          skipped_l0 = true;
+        //Log(options_->info_log, "[%s] level %d in progress, conflict, score %.2f\n",
+        //    dbname_.c_str(), level, score);
+        continue;
+      }
+      if (PickCompactionBySize(level, &inputs)) {
         break;
       }
+      //Log(options_->info_log, "[%s] pick level %d, conflict, score %.2f\n",
+      //    dbname_.c_str(), level, score);
     }
-    if (c->inputs_[0].empty()) {
-      // Wrap-around to the beginning of the key space
-      c->inputs_[0].push_back(current_->files_[level][0]);
-    }
-  } else if (seek_compaction) {
-    // compaction trigger by seek percentage
-    // TODO: multithread should lock it
+  }
+
+  // check seek compaction
+  if (inputs.empty() && seek_compaction) {
     level = current_->file_to_compact_level_;
-    c = new Compaction(level);
-    c->inputs_[0].push_back(current_->file_to_compact_);
-  } else if (del_compaction) {
+    assert(level >= 0);
+    assert(level+1 < config::kNumLevels);
+    FileMetaData* f = current_->file_to_compact_;
+    if (!f->being_compacted &&
+       (level > 0 || level0_compactions_in_progress_.empty()) &&
+        !RangeInCompaction(&f->smallest, &f->largest, level + 1)) {
+      inputs.push_back(f);
+    }
+  }
+
+  // check del compaction
+  if (inputs.empty() && del_compaction) {
     // compaction trigger by delete tags percentage;
     // TODO: multithread should lock it
     level = current_->del_trigger_compact_level_;
     assert(level >= 0);
     assert(level+1 < config::kNumLevels);
-    c = new Compaction(level);
-    c->SetNonTrivial(true);
-    c->inputs_[0].push_back(current_->del_trigger_compact_);
-    Log(options_->info_log,
+    FileMetaData* f = current_->del_trigger_compact_;
+    if (!f->being_compacted &&
+       (level > 0 || level0_compactions_in_progress_.empty()) &&
+        !RangeInCompaction(&f->smallest, &f->largest, level + 1)) {
+      inputs.push_back(f);
+      set_non_trivial = true;
+      Log(options_->info_log,
         "[%s] compact trigger by del stragety, level %d, num #%lu, file_size %lu, del_p %lu\n",
         dbname_.c_str(),
         current_->del_trigger_compact_level_,
         (current_->del_trigger_compact_->number) & 0xffffffff,
         current_->del_trigger_compact_->file_size,
         current_->del_trigger_compact_->del_percentage);
-  } else if (ttl_compaction) {
+    }
+  }
+
+  // check ttl compaction
+  if (inputs.empty() && ttl_compaction) {
     // compaction trigger by ttl tags percentage
     // TODO: multithread should lock it
     level = current_->ttl_trigger_compact_level_;
     assert(level >= 0);
-    c = new Compaction(level);
-    c->SetNonTrivial(true);
-    c->inputs_[0].push_back(current_->ttl_trigger_compact_);
-    if (level == config::kNumLevels - 1) {// level in last level
-      c->set_output_level(level);
-    }
-    Log(options_->info_log,
+    FileMetaData* f = current_->ttl_trigger_compact_;
+    if (!f->being_compacted &&
+       (level > 0 || level0_compactions_in_progress_.empty()) &&
+       (level+1 == config::kNumLevels || !RangeInCompaction(&f->smallest, &f->largest, level + 1))) {
+      inputs.push_back(f);
+      set_non_trivial = true;
+      Log(options_->info_log,
         "[%s] compact trigger by ttl stragety, level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n",
         dbname_.c_str(),
         current_->ttl_trigger_compact_level_,
@@ -1952,32 +2456,57 @@ Compaction* VersionSet::PickCompaction() {
         current_->ttl_trigger_compact_->file_size,
         current_->ttl_trigger_compact_->ttl_percentage,
         current_->ttl_trigger_compact_->check_ttl_ts);
-  } else {
+    }
+  }
+  if (inputs.empty()) {
     return NULL;
   }
 
-  c->input_version_ = current_;
-  c->input_version_->Ref();
-  c->max_output_file_size_ =
-      MaxFileSizeForLevel(c->output_level(), current_->vset_->options_->sst_size);
-
+  assert(inputs.size() == 1);
+  assert(level >= 0);
   // Files in level 0 may overlap each other, so pick up all overlapping ones
   if (level == 0) {
+    assert(level0_compactions_in_progress_.size() == 0);
     InternalKey smallest, largest;
-    GetRange(c->inputs_[0], &smallest, &largest);
+    GetRange(inputs, &smallest, &largest);
     // Note that the next call will discard the file we placed in
     // c->inputs_[0] earlier and replace it with an overlapping set
     // which will include the picked file.
-    current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
-    assert(!c->inputs_[0].empty());
+    current_->GetOverlappingInputs(level, &smallest, &largest, &inputs);
+    GetRange(inputs, &smallest, &largest);
+    if (RangeInCompaction(&smallest, &largest, level + 1)) { // make sure level1 not in compaction
+      Log(options_->info_log, "[%s] level1 in compacting, level0 conflict\n",
+        dbname_.c_str());
+      return NULL;
+    }
+    assert(!inputs.empty());
+    assert(!FilesInCompaction(inputs));
+  }
+
+  // expand inputs
+  Compaction* c = new Compaction(level);
+  c->SetNonTrivial(set_non_trivial);
+  c->input_version_ = current_;
+  c->input_version_->Ref(); // make sure compacting version will not delete
+  if (level == config::kNumLevels - 1) {// level in last level
+    c->set_output_level(level);
   }
+  c->max_output_file_size_ =
+      MaxFileSizeForLevel(c->output_level(), current_->vset_->options_->sst_size);
+  c->inputs_[0] = inputs;
   SetupOtherInputs(c);
   // tera-specific: calculate the smallest rowkey which overlap with file not
   // in this compaction.
   SetupCompactionBoundary(c);
+
+  // mark being compacted
+  c->MarkBeingCompacted(true);
+  if (level == 0) {
+    level0_compactions_in_progress_.push_back(c);
+  }
+  Finalize(current_); // reculate level score
   return c;
 }
-
 void VersionSet::SetupOtherInputs(Compaction* c) {
   if (c->level() == c->output_level()) { // self level compaction, should select next level
     return;
@@ -2008,7 +2537,10 @@ void VersionSet::SetupOtherInputs(Compaction* c) {
       std::vector<FileMetaData*> expanded1;
       current_->GetOverlappingInputs(c->output_level(), &new_start, &new_limit,
                                      &expanded1);
-      if (expanded1.size() == c->inputs_[1].size()) {
+      // check whether the expanded files are already being compacted
+      if ((expanded1.size() == c->inputs_[1].size()) &&
+          !RangeInCompaction(&new_start, &new_limit, level) &&
+          !RangeInCompaction(&new_start, &new_limit, c->output_level())) {
         Log(options_->info_log,
             "[%s] Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n",
             dbname_.c_str(),
@@ -2084,11 +2616,18 @@ void VersionSet::SetupCompactionBoundary(Compaction* c) {
 Compaction* VersionSet::CompactRange(
     int level,
     const InternalKey* begin,
-    const InternalKey* end) {
+    const InternalKey* end, bool* being_compacted) {
+  *being_compacted = false;
   std::vector<FileMetaData*> inputs;
   current_->GetOverlappingInputs(level, begin, end, &inputs);
   if (inputs.empty()) {
-      return NULL;
+    return NULL;
+  }
+
+  // check whether level0 is already being compacted
+  if (level == 0 && !level0_compactions_in_progress_.empty()) {
+    *being_compacted = true;
+    return NULL;
   }
 
   // Avoid compacting too much in one shot in case the range is large.
@@ -2109,6 +2648,18 @@ Compaction* VersionSet::CompactRange(
     }
   }
 
+  // check whether the inputs or the next-level range are already being compacted
+  InternalKey smallest, largest;
+  GetRange(inputs, &smallest, &largest);
+  if (FilesInCompaction(inputs) || RangeInCompaction(&smallest, &largest, level + 1)) {
+    PrintFilesInCompaction(inputs);
+    PrintRangeInCompaction(&smallest, &largest, level + 1);
+    Log(options_->info_log, "[%s] RangeCompaction : %s...%s, level: %d or %d, in compaction",
+          dbname_.c_str(), smallest.DebugString().c_str(), largest.DebugString().c_str(), level, level + 1);
+    *being_compacted = true;
+    return NULL;
+  }
+
   Compaction* c = new Compaction(level);
   c->input_version_ = current_;
   c->input_version_->Ref();
@@ -2119,9 +2670,28 @@ Compaction* VersionSet::CompactRange(
   // tera-specific: calculate the smallest rowkey which overlap with file not
   // in this compaction.
   SetupCompactionBoundary(c);
+
+  // mark being compacted
+  c->MarkBeingCompacted(true);
+  if (level == 0) {
+    level0_compactions_in_progress_.push_back(c);
+  }
+  Finalize(current_); // reculate level score
   return c;
 }
 
+// Unmark c's input files and free the level0 slot if c holds it (the list may be empty).
+void VersionSet::ReleaseCompaction(Compaction* c, Status& s) {
+  c->MarkBeingCompacted(false);
+  assert(level0_compactions_in_progress_.size() <= 1);
+  if (c->level() == 0 && !level0_compactions_in_progress_.empty() && level0_compactions_in_progress_[0] == c) {
+    level0_compactions_in_progress_.clear();  // empty() is tested first so [0] is never read from an empty list
+  }
+  if (!s.ok()) {
+    Finalize(current_);  // on an error status, recalculate the level scores
+  }
+}
+
 Compaction::Compaction(int level)
     : level_(level),
       output_level_(level + 1),
@@ -2209,6 +2779,16 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) {
   }
 }
 
+// Set being_compacted to "flag" on every input file of this compaction.
+void Compaction::MarkBeingCompacted(bool flag) {
+  for (size_t which = 0; which < 2; which++) {
+    for (FileMetaData* f : inputs_[which]) {
+      assert(flag ? !f->being_compacted : f->being_compacted);  // the flag must actually flip
+      f->being_compacted = flag;
+    }
+  }
+}
+
 void Compaction::ReleaseInputs() {
   if (input_version_ != NULL) {
     input_version_->Unref();
diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h
index 5a01d8dba..c933efced 100644
--- a/src/leveldb/db/version_set.h
+++ b/src/leveldb/db/version_set.h
@@ -19,6 +19,7 @@
 #ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_
 #define STORAGE_LEVELDB_DB_VERSION_SET_H_
 
+#include <deque>
 #include <map>
 #include <set>
 #include <vector>
@@ -56,6 +57,7 @@ extern int FindFile(const InternalKeyComparator& icmp,
 //           in sorted order.
 extern bool SomeFileOverlapsRange(
     const InternalKeyComparator& icmp,
+    const Comparator* ucmp,
     bool disjoint_sorted_files,
     const std::vector<FileMetaData*>& files,
     const Slice* smallest_user_key,
@@ -147,8 +149,8 @@ class Version {
   // Level that should be compacted next and its compaction score.
   // Score < 1 means compaction is not strictly needed.  These fields
   // are initialized by Finalize().
-  double compaction_score_;
-  int compaction_level_;
+  std::vector<double> compaction_score_;
+  std::vector<int> compaction_level_;
 
   explicit Version(VersionSet* vset)
       : vset_(vset), next_(this), prev_(this), refs_(0),
@@ -157,9 +159,13 @@ class Version {
         ttl_trigger_compact_(NULL),
         ttl_trigger_compact_level_(-1),
         del_trigger_compact_(NULL),
-        del_trigger_compact_level_(-1),
-        compaction_score_(-1),
-        compaction_level_(-1) {
+        del_trigger_compact_level_(-1) {
+    compaction_score_.resize(config::kNumLevels - 1);
+    compaction_level_.resize(config::kNumLevels - 1);
+    for (size_t i = 0; i < config::kNumLevels - 1; i++) {
+      compaction_score_[i] = -1.0;
+      compaction_level_[i] = -1;
+    }
   }
 
   ~Version();
@@ -182,6 +188,8 @@ class VersionSet {
   // current version.  Will release *mu while actually writing to the file.
   // REQUIRES: *mu is held on entry.
   // REQUIRES: no other thread concurrently calls LogAndApply()
+  void LogAndApplyHelper(VersionSetBuilder* builder,
+                         VersionEdit* edit);
   Status LogAndApply(VersionEdit* edit, port::Mutex* mu)
       EXCLUSIVE_LOCKS_REQUIRED(mu);
 
@@ -231,7 +239,8 @@ class VersionSet {
   // being compacted, or zero if there is no such log file.
   uint64_t PrevLogNumber() const { return prev_log_number_; }
 
-  double CompactionScore(uint64_t* timeout) const;
+  // <compaction score, task delay time>
+  void CompactionScore(std::vector<std::pair<double, uint64_t> >* scores);
   // Pick level and inputs for a new compaction.
   // Returns NULL if there is no compaction to be done.
   // Otherwise returns a pointer to a heap-allocated object that
@@ -245,7 +254,10 @@ class VersionSet {
   Compaction* CompactRange(
       int level,
       const InternalKey* begin,
-      const InternalKey* end);
+      const InternalKey* end, bool* being_compacted);
+
+  // release file's being_compacted flag, and release level0's lock
+  void ReleaseCompaction(Compaction* c, Status& s);
 
   // Return the maximum overlapping data (in bytes) at next level for any
   // file at a level >= 1.
@@ -259,6 +271,7 @@ class VersionSet {
   // May also mutate some internal state.
   void AddLiveFiles(std::set<uint64_t>* live);
   void AddLiveFiles(std::map<uint64_t, int>* live);
+  void AddLiveFilesWithSize(std::map<uint64_t, uint64_t>* live);
 
   // Return the approximate offset in the database of the data for
   // "key" as of version "v".
@@ -271,10 +284,17 @@ class VersionSet {
   };
   const char* LevelSummary(LevelSummaryStorage* scratch) const;
 
+  void GenerateSubCompaction(Compaction* compact, std::vector<Compaction*> * compact_vec,
+                             port::Mutex* mu);
+
  private:
   friend class Compaction;
   friend class Version;
   friend class VersionSetBuilder;
+  struct ManifestWriter;
+
+  Compaction* NewSubCompact(Compaction* compact);
+  uint64_t GetApproximateSizeByLevel(Version* v, int level, const InternalKey& ikey);
 
   void Finalize(Version* v);
 
@@ -301,6 +321,15 @@ class VersionSet {
 
   bool ModifyFileSize(FileMetaData* f);
 
+  // helpers for multi-thread compaction
+  void PrintFilesInCompaction(const std::vector<FileMetaData*>& inputs);
+  bool FilesInCompaction(const std::vector<FileMetaData*>& inputs);
+  void PrintRangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level);
+  bool RangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level);
+  bool IsOverlapInFileRange(FileMetaData* lf, FileMetaData* f);
+  bool PickFutureCompaction(int level, std::vector<FileMetaData*>* inputs);
+  bool PickCompactionBySize(int level, std::vector<FileMetaData*>* inputs);
+
   Env* const env_;
   const std::string dbname_;
   const Options* const options_;
@@ -316,6 +345,8 @@ class VersionSet {
   uint64_t log_number_;
   uint64_t prev_log_number_;  // 0 or backing store for memtable being compacted
 
+  std::deque<ManifestWriter*> manifest_writers_;
+
   // Opened lazily
   WritableFile* descriptor_file_;
   log::Writer* descriptor_log_;
@@ -325,6 +356,7 @@ class VersionSet {
   // Per-level key at which the next compaction at that level should start.
   // Either an empty string, or a valid InternalKey.
   std::string compact_pointer_[config::kNumLevels];
+  std::vector<Compaction*> level0_compactions_in_progress_;
 
   // No copying allowed
   VersionSet(const VersionSet&);
@@ -372,6 +404,8 @@ class Compaction {
   // before processing "internal_key".
   bool ShouldStopBefore(const Slice& internal_key);
 
+  void MarkBeingCompacted(bool flag);
+
   // Release the input version for the compaction, once the compaction
   // is successful.
   void ReleaseInputs();
@@ -384,6 +418,7 @@ class Compaction {
  private:
   friend class Version;
   friend class VersionSet;
+  friend class DBImpl;
 
   explicit Compaction(int level);
 
@@ -420,6 +455,10 @@ class Compaction {
 
   // support self compaction
   bool force_non_trivial_;
+
+  // support parallel compaction
+  std::string sub_compact_start_;   // own by child
+  std::string sub_compact_end_; // own by child
 };
 
 }  // namespace leveldb
diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc
index f4ad56367..4292ab0e7 100644
--- a/src/leveldb/db/version_set_test.cc
+++ b/src/leveldb/db/version_set_test.cc
@@ -6,10 +6,15 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
 
+#define private public
 #include "db/version_set.h"
+#undef private
+
+#include "db/dbformat.h"
 #include "util/logging.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
+#include "leveldb/compact_strategy.h"
 
 namespace leveldb {
 
@@ -46,7 +51,7 @@ class FindFileTest {
     InternalKeyComparator cmp(BytewiseComparator());
     Slice s(smallest != NULL ? smallest : "");
     Slice l(largest != NULL ? largest : "");
-    return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_,
+    return SomeFileOverlapsRange(cmp, cmp.user_comparator(), disjoint_sorted_files_, files_,
                                  (smallest != NULL ? &s : NULL),
                                  (largest != NULL ? &l : NULL));
   }
@@ -90,7 +95,6 @@ TEST(FindFileTest, Single) {
   ASSERT_TRUE(Overlaps(NULL, NULL));
 }
 
-
 TEST(FindFileTest, Multiple) {
   Add("150", "200");
   Add("200", "250");
@@ -176,6 +180,57 @@ TEST(FindFileTest, OverlappingFiles) {
   ASSERT_TRUE(Overlaps("600", "700"));
 }
 
+// Test fixture: builds a VersionSet on the scratch directory /tmp/db/test.
+class VersionSetTest {
+ public:
+  VersionSetTest()
+      : icmp(opt.comparator),
+        t_log_number(10),
+        t_next_file(20),
+        t_last_seq(100) {
+    opt.compact_strategy_factory = new DummyCompactStrategyFactory();
+    opt.env->DeleteDirRecursive("/tmp/db/test");
+    opt.env->CreateDir("/tmp/db/test");
+    t_vset = new VersionSet(std::string("/tmp/db/test"), &opt, NULL, &icmp);
+    t_vset->manifest_file_number_ = 100;
+  }
+
+  Options opt;
+  const InternalKeyComparator icmp;
+  VersionSet* t_vset;
+  uint64_t t_log_number;
+  uint64_t t_next_file;
+  uint64_t t_last_seq;
+  port::Mutex t_mu;
+};
+
+TEST(VersionSetTest, PickCompactionTest) {
+  VersionEdit edit;
+  // Step 1: add two level-0 files and apply the edit under the mutex.
+  edit.AddFile(0, t_vset->NewFileNumber(), 200,
+               InternalKey("a0001", 1, kTypeValue),
+               InternalKey("a0002", 1, kTypeDeletion));
+  edit.AddFile(0, t_vset->NewFileNumber(), 200,
+               InternalKey("a0003", 1, kTypeValue),
+               InternalKey("a0004", 1, kTypeValue));
+  edit.SetComparatorName(leveldb::BytewiseComparator()->Name());
+  t_mu.Lock();
+  t_vset->LogAndApply(&edit, &t_mu);
+  t_mu.Unlock();
+  Compaction* c = t_vset->PickCompaction();  // NOTE(review): c is never released or deleted in this test — confirm intended
+  ASSERT_TRUE((uint64_t)t_vset->level0_compactions_in_progress_[0] == (uint64_t)c);
+  // Step 2: add a third level-0 file while c is still registered; the next pick must return NULL.
+  VersionEdit edit1;
+  edit1.AddFile(0, t_vset->NewFileNumber(), 200,
+               InternalKey("a0005", 1, kTypeValue),
+               InternalKey("a0006", 1, kTypeValue));
+  edit1.SetComparatorName(leveldb::BytewiseComparator()->Name());
+  t_mu.Lock();
+  t_vset->LogAndApply(&edit1, &t_mu);
+  t_mu.Unlock();
+  ASSERT_TRUE(t_vset->PickCompaction() == NULL);
+}
+
 }  // namespace leveldb
 
 int main(int argc, char** argv) {
diff --git a/src/leveldb/include/leveldb/db.h b/src/leveldb/include/leveldb/db.h
index 1d235801a..1b93fe8df 100644
--- a/src/leveldb/include/leveldb/db.h
+++ b/src/leveldb/include/leveldb/db.h
@@ -174,6 +174,8 @@ class DB {
   // Add all sst files inherited from other tablets
   virtual void AddInheritedLiveFiles(std::vector<std::set<uint64_t> >* live) = 0;
 
+  virtual bool ShouldForceUnloadOnError() { return false; }
+
  private:
   // No copying allowed
   DB(const DB&);
diff --git a/src/leveldb/include/leveldb/dfs.h b/src/leveldb/include/leveldb/dfs.h
index b5874848d..b5df4b0b2 100644
--- a/src/leveldb/include/leveldb/dfs.h
+++ b/src/leveldb/include/leveldb/dfs.h
@@ -7,6 +7,7 @@
 
 #include <stdint.h>
 #include <string>
+#include <sys/stat.h>
 #include <vector>
 
 namespace leveldb {
@@ -70,8 +71,12 @@ class Dfs {
     static Dfs* NewDfs(const std::string& so_path, const std::string& conf);
     /// Returns 0 on success.
     virtual int32_t UnlockDirectory(const std::string& path) = 0;
+
+    virtual int32_t ClearDirOwner(const std::string& path) = 0;
     /// Returns DfsFile handler on success, NULL on error.WithTime
     virtual DfsFile* OpenFile(const std::string& filename, int32_t flags) = 0;
+
+    virtual int32_t Stat(const std::string& filename, struct stat* fstat) = 0;
 private:
     Dfs(const Dfs&);
     void operator=(const Dfs&);
diff --git a/src/leveldb/include/leveldb/env_dfs.h b/src/leveldb/include/leveldb/env_dfs.h
index d34a2c697..bc0e65d9a 100644
--- a/src/leveldb/include/leveldb/env_dfs.h
+++ b/src/leveldb/include/leveldb/env_dfs.h
@@ -17,7 +17,7 @@
 #include "leveldb/dfs.h"
 #include "leveldb/env.h"
 #include "leveldb/status.h"
-#include "../../../utils/counter.h"
+#include "../../../common/counter.h"
 
 namespace leveldb {
 
@@ -60,6 +60,8 @@ class DfsEnv : public EnvWrapper {
 
     virtual Status UnlockFile(FileLock* lock);
 
+    int32_t ClearDirOwner(const std::string& dir) {return dfs_->ClearDirOwner(dir);}
+
     virtual Env* CacheEnv() { return this; }
 
     static uint64_t gettid() {
diff --git a/src/leveldb/include/leveldb/options.h b/src/leveldb/include/leveldb/options.h
index be78d0d30..6793f0299 100644
--- a/src/leveldb/include/leveldb/options.h
+++ b/src/leveldb/include/leveldb/options.h
@@ -223,6 +223,8 @@ struct Options {
   std::set<uint32_t>* exist_lg_list;
   std::map<uint32_t, LG_info*>* lg_info_list;
 
+  std::set<uint32_t> ignore_corruption_in_open_lg_list;
+
   // compaction strategy to determine how to
   // drop the obsoleted kv records
   bool enable_strategy_when_get;
@@ -310,13 +312,24 @@ struct Options {
   bool ignore_corruption_in_open;
 
   // Statistic: By default, if 10% entry timeout, will trigger compaction
-  // Default: 10 %
+  // Default: 99 %
   uint64_t ttl_percentage;
 
   // Statistic: delete tag's percentage in sst
-  // Default: 10 %
+  // Default: 20 %
   uint64_t del_percentage;
 
+  // Max thread alloc for lg's compaction
+  // Default: 5
+  uint32_t max_background_compactions;
+
+  // if level0's file num >= limit, use sqrt slow down level score
+  // Default: 30
+  int slow_down_level0_score_limit;
+
+  // parallel compaction
+  int max_sub_parallel_compaction;
+
   // Create an Options object with default values for all fields.
   Options();
 };
diff --git a/src/leveldb/include/leveldb/status.h b/src/leveldb/include/leveldb/status.h
index 4bd364cca..0e062e6c1 100644
--- a/src/leveldb/include/leveldb/status.h
+++ b/src/leveldb/include/leveldb/status.h
@@ -55,6 +55,10 @@ class Status {
     return Status(kTimeOut, msg, msg2);
   }
 
+  static Status IOPermissionDenied(const Slice& msg, const Slice msg2 = Slice()) {
+    return Status(kIOPermissionDenied, msg, msg2);
+  }
+
   // Returns true iff the status indicates success.
   bool ok() const { return (state_ == NULL); }
 
@@ -69,6 +73,8 @@ class Status {
 
   // Returns true iff the status indicates an TimeOut.
   bool IsTimeOut() const { return code() == kTimeOut; }
+
+  bool IsIOPermissionDenied() const { return code() == kIOPermissionDenied; }
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
   std::string ToString() const;
@@ -88,7 +94,8 @@ class Status {
     kNotSupported = 3,
     kInvalidArgument = 4,
     kIOError = 5,
-    kTimeOut = 6
+    kTimeOut = 6,
+    kIOPermissionDenied = 13
   };
 
   Code code() const {
diff --git a/src/leveldb/port/port_posix.h b/src/leveldb/port/port_posix.h
index ed19e222f..65f4274a1 100644
--- a/src/leveldb/port/port_posix.h
+++ b/src/leveldb/port/port_posix.h
@@ -46,9 +46,7 @@
 #endif
 
 #include <pthread.h>
-#ifdef SNAPPY
 #include <snappy.h>
-#endif
 #include <stdint.h>
 #include <string>
 #include "port/atomic_pointer.h"
@@ -124,33 +122,21 @@ extern void InitOnce(OnceType* once, void (*initializer)());
 
 inline bool Snappy_Compress(const char* input, size_t length,
                             ::std::string* output) {
-#ifdef SNAPPY
   output->resize(snappy::MaxCompressedLength(length));
   size_t outlen;
   snappy::RawCompress(input, length, &(*output)[0], &outlen);
   output->resize(outlen);
   return true;
-#endif
-
-  return false;
 }
 
 inline bool Snappy_GetUncompressedLength(const char* input, size_t length,
                                          size_t* result) {
-#ifdef SNAPPY
   return snappy::GetUncompressedLength(input, length, result);
-#else
-  return false;
-#endif
 }
 
 inline bool Snappy_Uncompress(const char* input, size_t length,
                               char* output) {
-#ifdef SNAPPY
   return snappy::RawUncompress(input, length, output);
-#else
-  return false;
-#endif
 }
 
 /////////// Compression Ext ///////////
diff --git a/src/leveldb/table/table_builder.cc b/src/leveldb/table/table_builder.cc
index 9d6a7983b..63b70bb63 100644
--- a/src/leveldb/table/table_builder.cc
+++ b/src/leveldb/table/table_builder.cc
@@ -18,7 +18,7 @@
 #include "table/format.h"
 #include "util/coding.h"
 #include "util/crc32c.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 namespace leveldb {
 
diff --git a/src/leveldb/util/env_cache.cc b/src/leveldb/util/env_cache.cc
index 9d99fd168..51db78a27 100644
--- a/src/leveldb/util/env_cache.cc
+++ b/src/leveldb/util/env_cache.cc
@@ -33,6 +33,9 @@ const char* paths[] = {"./cache_dir_1/", "./cache_dir_2/"};
 std::vector<std::string> ThreeLevelCacheEnv::cache_paths_(paths, paths + 2);
 
 static Status IOError(const std::string& context, int err_number) {
+    if (err_number == EACCES) {
+        return Status::IOPermissionDenied(context, strerror(err_number));
+    }
     return Status::IOError(context, strerror(err_number));
 }
 
diff --git a/src/leveldb/util/env_dfs.cc b/src/leveldb/util/env_dfs.cc
index 53fde1804..f9f260b13 100644
--- a/src/leveldb/util/env_dfs.cc
+++ b/src/leveldb/util/env_dfs.cc
@@ -22,7 +22,7 @@
 #include "leveldb/table_utils.h"
 #include "nfs.h"
 #include "util/mutexlock.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 namespace leveldb {
 
@@ -95,6 +95,9 @@ char* get_time_str(char* p, size_t len)
 // Log error message
 static Status IOError(const std::string& context, int err_number)
 {
+    if (err_number == EACCES) {
+        return Status::IOPermissionDenied(context, strerror(err_number));
+    }
     return Status::IOError(context, strerror(err_number));
 }
 
diff --git a/src/leveldb/util/env_flash.cc b/src/leveldb/util/env_flash.cc
index fd0702388..c6c42a9cc 100644
--- a/src/leveldb/util/env_flash.cc
+++ b/src/leveldb/util/env_flash.cc
@@ -21,7 +21,7 @@
 #include "util/hash.h"
 #include "util/mutexlock.h"
 #include "helpers/memenv/memenv.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 #include "leveldb/env_flash.h"
 
@@ -38,6 +38,9 @@ const int64_t kUpdateFlashRetryIntervalMillis = 60 * 1000;
 
 // Log error message
 static Status IOError(const std::string& context, int err_number) {
+    if (err_number == EACCES) {
+        return Status::IOPermissionDenied(context, strerror(err_number));
+    }
     return Status::IOError(context, strerror(err_number));
 }
 
@@ -68,7 +71,7 @@ Status CopyToLocal(const std::string& local_fname, Env* env,
         if (!s.ok()) {
             Log("[env_flash] create dir: %s failed: %s, exit",
                 local_fname.substr(0, dir_pos).c_str(), s.ToString().c_str());
-            exit(-1);
+            _exit(-1);
         }
     }
 
@@ -79,7 +82,7 @@ Status CopyToLocal(const std::string& local_fname, Env* env,
         if (!vanish_allowed) {
             Log("[env_flash] create file: %s failed: %s, exit",
                 local_fname.c_str(), s.ToString().c_str());
-            exit(-1);
+            _exit(-1);
         }
         delete dfs_file;
         return s;
@@ -501,7 +504,7 @@ void FlashEnv::SetFlashPath(const std::string& path, bool vanish_allowed) {
                 && !Env::Default()->CreateDir(flash_paths_.back()).ok()) {
                 Log("[env_flash] cannot access cache dir: %s\n",
                     flash_paths_.back().c_str());
-                exit(-1);
+                _exit(-1);
             }
         }
     }
diff --git a/src/leveldb/util/env_inmem.cc b/src/leveldb/util/env_inmem.cc
index 4e9855269..a587eacac 100644
--- a/src/leveldb/util/env_inmem.cc
+++ b/src/leveldb/util/env_inmem.cc
@@ -20,7 +20,7 @@
 #include "leveldb/table_utils.h"
 #include "util/mutexlock.h"
 #include "helpers/memenv/memenv.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 #include "leveldb/env_inmem.h"
 
diff --git a/src/leveldb/util/env_mock.cc b/src/leveldb/util/env_mock.cc
index 5265e58ea..abf13089e 100644
--- a/src/leveldb/util/env_mock.cc
+++ b/src/leveldb/util/env_mock.cc
@@ -51,6 +51,9 @@ void MockEnv::SetPrefix(const std::string& p)
 // Log error message
 static Status IOError(const std::string& context, int err_number)
 {
+    if (err_number == EACCES) {
+        return Status::IOPermissionDenied(context, strerror(err_number));
+    }
     return Status::IOError(context, strerror(err_number));
 }
 
diff --git a/src/leveldb/util/env_posix.cc b/src/leveldb/util/env_posix.cc
index fdc1d2ce4..6d495768e 100644
--- a/src/leveldb/util/env_posix.cc
+++ b/src/leveldb/util/env_posix.cc
@@ -36,7 +36,7 @@
 #include "util/posix_logger.h"
 #include "util/string_ext.h"
 #include "util/thread_pool.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 namespace leveldb {
 
@@ -59,6 +59,9 @@ tera::Counter posix_other_counter;
 namespace {
 
 static Status IOError(const std::string& context, int err_number) {
+  if (err_number == EACCES) {
+    return Status::IOPermissionDenied(context, strerror(err_number));
+  }
   return Status::IOError(context, strerror(err_number));
 }
 
@@ -132,9 +135,13 @@ class PosixRandomAccessFile: public RandomAccessFile {
 // problems for very large databases.
 class MmapLimiter {
  public:
-  // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
   MmapLimiter() {
-    SetAllowed(sizeof(void*) >= 8 ? 1000 : 0);
+    //Disable mmap in tera for reducing memory use.
+    SetAllowed(0);
+
+    // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes.
+    //SetAllowed(sizeof(void*) >= 8 ? 1000 : 0);
+    //If you want to enable mmap, uncomment the line above.
   }
 
   // If another mmap slot is available, acquire it and return true.
diff --git a/src/leveldb/util/hdfs.cc b/src/leveldb/util/hdfs.cc
index b90fea36e..4a9721bc2 100644
--- a/src/leveldb/util/hdfs.cc
+++ b/src/leveldb/util/hdfs.cc
@@ -6,10 +6,10 @@
 
 #include <assert.h>
 #include <dlfcn.h>
-
 #include "hdfs.h"
 #include "include/hdfs.h"
-#include "../utils/counter.h"
+#include "hdfs_util.h"
+#include "../common/counter.h"
 
 namespace leveldb {
 
@@ -233,6 +233,21 @@ int32_t Hdfs::UnlockDirectory(const std::string& path) {
   return -1;
 }
 
+
+int32_t Hdfs::Stat(const std::string& filename, struct stat* fstat) {
+  // Fetch the path's HDFS metadata; a NULL result is reported as failure (-1).
+  hdfsFileInfo* path_info = (*hdfsGetPathInfo)((hdfsFS)fs_, filename.c_str());
+  if (path_info == NULL) return -1;
+  // Convert to a POSIX stat, then release the info block before returning success.
+  HdfsFileInfo2PosixFileStat(path_info, fstat);
+  (*hdfsFreeFileInfo)(path_info, 1);
+  return 0;
+}
+
+int32_t Hdfs::ClearDirOwner(const std::string& path) {
+  // HDFS has no notion of a directory owner, so report success without doing anything.
+  return 0;
 }
 
+}
 /* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */
diff --git a/src/leveldb/util/hdfs.h b/src/leveldb/util/hdfs.h
index 81ed269ac..ebf464f6b 100644
--- a/src/leveldb/util/hdfs.h
+++ b/src/leveldb/util/hdfs.h
@@ -48,8 +48,9 @@ class Hdfs : public Dfs {
   int32_t ListDirectory(const std::string& path, std::vector<std::string>* result);
   int32_t LockDirectory(const std::string& path);
   int32_t UnlockDirectory(const std::string& path);
+  int32_t ClearDirOwner(const std::string& path);
   DfsFile* OpenFile(const std::string& filename, int32_t flags);
-
+  int32_t Stat(const std::string& filename, struct stat* fstat);
 private:
   void* fs_;
 
@@ -92,8 +93,10 @@ class Hdfs2 : public Dfs {
   int32_t ListDirectory(const std::string& path, std::vector<std::string>* result);
   int32_t LockDirectory(const std::string& path);
   int32_t UnlockDirectory(const std::string& path);
+  int32_t ClearDirOwner(const std::string& path);
   DfsFile* OpenFile(const std::string& filename, int32_t flags);
 
+  int32_t Stat(const std::string& filename, struct stat* fstat);
 private:
   void* GetFSHandle(const std::string& path);
   std::vector<void*> fs_list_;
diff --git a/src/leveldb/util/hdfs2.cc b/src/leveldb/util/hdfs2.cc
index fa3a8902c..0eac0ecea 100644
--- a/src/leveldb/util/hdfs2.cc
+++ b/src/leveldb/util/hdfs2.cc
@@ -7,8 +7,9 @@
 
 #include "hdfs.h"
 #include "include/hdfs2.h"
+#include "hdfs_util.h"
 #include "util/hash.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 namespace leveldb {
 
@@ -257,6 +258,21 @@ int32_t Hdfs2::UnlockDirectory(const std::string& path) {
   return -1;
 }
 
+int32_t Hdfs2::ClearDirOwner(const std::string& path) {
+  // HDFS has no notion of a directory owner, so report success without doing anything.
+  return 0;
+}
+
+int32_t Hdfs2::Stat(const std::string& filepath, struct stat* st) {
+  // Fill *st from the path's HDFS metadata; returns 0 on success, -1 if the lookup fails.
+  hdfsFileInfo* pFileInfo = (*hdfsGetPathInfo)((hdfsFS)GetFSHandle(filepath), filepath.c_str());
+  if (pFileInfo == NULL) return -1;
+  HdfsFileInfo2PosixFileStat(pFileInfo, st);
+  // Release the info block once converted, as Hdfs::Stat does; it was leaked before.
+  (*hdfsFreeFileInfo)(pFileInfo, 1);
+  return 0;
+}
+
 
 } // namespace leveldb
 
diff --git a/src/leveldb/util/hdfs_util.h b/src/leveldb/util/hdfs_util.h
new file mode 100644
index 000000000..ba2eb720b
--- /dev/null
+++ b/src/leveldb/util/hdfs_util.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+
+#ifndef TERA_LEVELDB_HDFS_UTIL_H
+#define TERA_LEVELDB_HDFS_UTIL_H
+#include <sys/types.h>
+#include <pwd.h>
+#include <grp.h>
+#include <unistd.h>
+#include <stdlib.h>
+namespace leveldb {
+
+// Fill *st from an hdfsFileInfo. Owner/group names are resolved through the local passwd/group
+// databases; names that cannot be resolved (or are NULL) fall back to id 99.
+static inline void HdfsFileInfo2PosixFileStat(hdfsFileInfo* info, struct stat* st) {
+  const bool is_dir = (info->mKind == kObjectKindDirectory);
+  memset(st, 0, sizeof(struct stat));
+  // Link count is not available for directories, so 0 marks it as unsupported.
+  st->st_nlink = is_dir ? 0 : 1;
+  uid_t owner_id = 99; // fallback uid ("nobody" on Linux)
+  if (info->mOwner != NULL) {
+    struct passwd passwd_info;
+    struct passwd* result = NULL;
+    ssize_t buf_size = sysconf(_SC_GETPW_R_SIZE_MAX);
+    buf_size = buf_size == -1 ? 16384 : buf_size;
+    char* pwbuf = new char[buf_size];
+    if (0 == getpwnam_r(info->mOwner, &passwd_info, pwbuf, buf_size, &result) && result != NULL) {
+      owner_id = passwd_info.pw_uid;
+    }
+    delete [] pwbuf;
+  }
+  gid_t group_id = 99; // same fallback id for the group
+  if (info->mGroup != NULL) {
+    struct group result;
+    struct group* resultp = NULL;
+    ssize_t len = sysconf(_SC_GETGR_R_SIZE_MAX);
+    len = len == -1 ? 16384 : len;
+    char* group_buf = new char[len];
+    if (0 == getgrnam_r(info->mGroup, &result, group_buf, len, &resultp) && resultp != NULL) {
+      group_id = result.gr_gid;
+    }
+    delete [] group_buf;
+  }
+  // mode_t, not short: S_IFREG (0100000) does not fit in a 16-bit short, which made
+  // st_mode pick up sign-extended high bits for regular files.
+  mode_t file_mode = is_dir ? (S_IFDIR | 0777) : (S_IFREG | 0666);
+  if (info->mPermissions > 0) {
+    file_mode = (is_dir ? S_IFDIR : S_IFREG) | info->mPermissions;
+  }
+  // Directories report a nominal size of 4096 bytes.
+  st->st_size = is_dir ? 4096 : info->mSize;
+  st->st_blksize = 512; // posix default block size
+  st->st_blocks = (st->st_size + st->st_blksize - 1)/st->st_blksize;
+  st->st_mode = file_mode;
+  st->st_uid = owner_id;
+  st->st_gid = group_id;
+  st->st_atime = info->mLastAccess;
+  st->st_ctime = info->mLastMod;
+  st->st_mtime = info->mLastMod;
+}
+}
+#endif
diff --git a/src/leveldb/util/nfs.cc b/src/leveldb/util/nfs.cc
index cb07a1797..37f0f0666 100644
--- a/src/leveldb/util/nfs.cc
+++ b/src/leveldb/util/nfs.cc
@@ -13,7 +13,7 @@
 #include "util/mutexlock.h"
 #include "util/string_ext.h"
 #include "../common/timer.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 namespace leveldb {
 
@@ -29,6 +29,7 @@ static struct ::dirent* (*nfsReaddir)(nfs::NFSDIR* dir);
 static int (*nfsClosedir)(nfs::NFSDIR* dir);
 static int (*nfsSetDirOwner)(const char* path);
 static int (*nfsClearDirOwner)(const char* path);
+static int (*nfsForceClearDirOwner)(const char* path);
 
 static int (*nfsStat)(const char* path, struct ::stat* stat);
 static int (*nfsUnlink)(const char* path);
@@ -90,7 +91,7 @@ void Nfs::LoadSymbol() {
   }
 
   *(void**)(&printVersion) = ResolveSymbol(dl, "PrintNfsVersion");
-  fprintf(stderr, "libnfs.so version: \n%s\n\n", (*printVersion)());
+  //fprintf(stderr, "libnfs.so version: \n%s\n\n", (*printVersion)());
 
   *(void**)(&nfsInit) = ResolveSymbol(dl, "Init");
   *(void**)(&nfsSetComlogLevel) = ResolveSymbol(dl, "SetComlogLevel");
@@ -102,6 +103,7 @@ void Nfs::LoadSymbol() {
   *(void**)(&nfsClosedir) = ResolveSymbol(dl, "Closedir");
   *(void**)(&nfsSetDirOwner) = ResolveSymbol(dl, "SetDirOwner");
   *(void**)(&nfsClearDirOwner) = ResolveSymbol(dl, "ClearDirOwner");
+  *(void**)(&nfsForceClearDirOwner) = ResolveSymbol(dl, "ForceClearDirOwner");
   *(void**)(&nfsStat) = ResolveSymbol(dl, "Stat");
   *(void**)(&nfsUnlink) = ResolveSymbol(dl, "Unlink");
   *(void**)(&nfsAccess) = ResolveSymbol(dl, "Access");
@@ -256,7 +258,7 @@ int32_t Nfs::CreateDirectory(const std::string& name) {
     if (0 != (*nfsAccess)(path.c_str(), F_OK) && (*nfsGetErrno)() == ENOENT) {
       if (0 != (*nfsMkdir)(path.c_str()) && (*nfsGetErrno)() != EEXIST) {
         errno = (*nfsGetErrno)();
-        fprintf(stderr, "[%s] Createdir %s fail: %d\n", common::timer::get_curtime_str().c_str(), name.c_str(), errno);
+        fprintf(stderr, "[%s] Createdir %s fail: %d\n", tera::get_curtime_str().c_str(), name.c_str(), errno);
         return -1;
       }
     }
@@ -268,7 +270,7 @@ int32_t Nfs::DeleteDirectory(const std::string& name) {
   int32_t retval = (*nfsRmdir)(name.c_str());
   if (retval != 0) {
     errno = (*nfsGetErrno)();
-    fprintf(stderr, "[%s] DeleteDirectory %s fail: %d\n", common::timer::get_curtime_str().c_str(), name.c_str(), errno);
+    fprintf(stderr, "[%s] DeleteDirectory %s fail: %d\n", tera::get_curtime_str().c_str(), name.c_str(), errno);
   }
   return retval;
 }
@@ -277,7 +279,7 @@ int32_t Nfs::Exists(const std::string& filename) {
   if (retval != 0) {
     errno = (*nfsGetErrno)();
     int errno_saved = errno;
-    fprintf(stderr, "[%s] Exists %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno);
+    fprintf(stderr, "[%s] Exists %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno);
     errno = errno_saved;
   }
   return retval;
@@ -286,7 +288,7 @@ int32_t Nfs::Delete(const std::string& filename) {
   int32_t retval = (*nfsUnlink)(filename.c_str());
   if (retval != 0) {
     errno = (*nfsGetErrno)();
-    fprintf(stderr, "[%s] Delete %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno);
+    fprintf(stderr, "[%s] Delete %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno);
   }
   return retval;
 }
@@ -297,7 +299,7 @@ int32_t Nfs::GetFileSize(const std::string& filename, uint64_t* size) {
     *size = fileinfo.st_size;
   } else {
     errno = (*nfsGetErrno)();
-    fprintf(stderr, "[%s] Getfilesize %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno);
+    fprintf(stderr, "[%s] Getfilesize %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno);
   }
   return retval;
 }
@@ -305,7 +307,7 @@ int32_t Nfs::Rename(const std::string& from, const std::string& to) {
   int32_t retval = (*nfsRename)(from.c_str(), to.c_str());
   if (retval != 0) {
     errno = (*nfsGetErrno)();
-    fprintf(stderr, "[%s] Rename %s to %s fail: %d\n", common::timer::get_curtime_str().c_str(), from.c_str(), to.c_str(), errno);
+    fprintf(stderr, "[%s] Rename %s to %s fail: %d\n", tera::get_curtime_str().c_str(), from.c_str(), to.c_str(), errno);
   }
   return retval;
 }
@@ -322,10 +324,19 @@ DfsFile* Nfs::OpenFile(const std::string& filename, int32_t flags) {
     return new NFile(file, filename);
   }
   errno = (*nfsGetErrno)();
-  fprintf(stderr, "[%s] Openfile %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno);
+  fprintf(stderr, "[%s] Openfile %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno);
   return NULL;
 }
 
+int32_t Nfs::Stat(const std::string& filename, struct stat* fstat) {
+  const int32_t rc = (*nfsStat)(filename.c_str(), fstat);
+  if (rc != 0) {
+    // Publish libnfs's error code through errno; this wrapper does not log failures.
+    errno = (*nfsGetErrno)();
+  }
+  return rc;
+}
+
 int32_t Nfs::Copy(const std::string& from, const std::string& to) {
   // not support
   return -1;
@@ -336,7 +347,7 @@ int32_t Nfs::ListDirectory(const std::string& path,
   if (NULL == dir) {
     errno = (*nfsGetErrno)();
     int errno_saved = errno;
-    fprintf(stderr, "[%s] Opendir %s fail: %d\n", common::timer::get_curtime_str().c_str(), path.c_str(), errno);
+    fprintf(stderr, "[%s] Opendir %s fail: %d\n", tera::get_curtime_str().c_str(), path.c_str(), errno);
     errno = errno_saved;
     return -1;
   }
@@ -350,7 +361,7 @@ int32_t Nfs::ListDirectory(const std::string& path,
   errno = (*nfsGetErrno)();
   int errno_saved = errno;
   if (0 != errno) {
-    fprintf(stderr, "[%s] List %s error: %d\n", common::timer::get_curtime_str().c_str(), path.c_str(), errno);
+    fprintf(stderr, "[%s] List %s error: %d\n", tera::get_curtime_str().c_str(), path.c_str(), errno);
     (*nfsClosedir)(dir);
     errno = errno_saved;
     return -1;
@@ -394,5 +405,9 @@ int32_t Nfs::UnlockDirectory(const std::string& path) {
   return (*nfsClearDirOwner)(path.c_str());
 }
 
+int32_t Nfs::ClearDirOwner(const std::string& path) {
+  return (*nfsForceClearDirOwner)(path.c_str());  // libnfs symbol "ForceClearDirOwner"
+}
+
 }
 /* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */
diff --git a/src/leveldb/util/nfs.h b/src/leveldb/util/nfs.h
index b80dd0316..ab286d82b 100644
--- a/src/leveldb/util/nfs.h
+++ b/src/leveldb/util/nfs.h
@@ -50,7 +50,10 @@ class Nfs : public Dfs {
   int32_t ListDirectory(const std::string& path, std::vector<std::string>* result);
   int32_t LockDirectory(const std::string& path);
   int32_t UnlockDirectory(const std::string& path);
+  int32_t ClearDirOwner(const std::string& path);
+
   DfsFile* OpenFile(const std::string& filename, int32_t flags);
+  int32_t Stat(const std::string& filename, struct stat* fstat);
 private:
   Nfs();
   static port::Mutex mu_;
diff --git a/src/leveldb/util/options.cc b/src/leveldb/util/options.cc
index ecd11b57e..e64512908 100644
--- a/src/leveldb/util/options.cc
+++ b/src/leveldb/util/options.cc
@@ -53,7 +53,10 @@ Options::Options()
       disable_wal(false),
       ignore_corruption_in_open(false),
       ttl_percentage(99),
-      del_percentage(20) {
+      del_percentage(20),
+      max_background_compactions(5),
+      slow_down_level0_score_limit(30),
+      max_sub_parallel_compaction(10) {
 }
 
 }  // namespace leveldb
diff --git a/src/leveldb/util/raw_key_operator.cc b/src/leveldb/util/raw_key_operator.cc
index 9d5b5d3dc..8ce699c5b 100644
--- a/src/leveldb/util/raw_key_operator.cc
+++ b/src/leveldb/util/raw_key_operator.cc
@@ -7,7 +7,7 @@
 #include <pthread.h>
 
 #include "coding.h"
-#include "../utils/counter.h"
+#include "../common/counter.h"
 
 namespace leveldb {
 
diff --git a/src/leveldb/util/status.cc b/src/leveldb/util/status.cc
index 871a34872..14b22f82e 100644
--- a/src/leveldb/util/status.cc
+++ b/src/leveldb/util/status.cc
@@ -65,6 +65,9 @@ std::string Status::ToString() const {
       case kTimeOut:
         type = "Timeout error: ";
         break;
+      case kIOPermissionDenied:
+        type = "IO Permission Denied: ";
+        break;
       default:
         snprintf(tmp, sizeof(tmp), "Unknown code(%d): ",
                  static_cast<int>(code()));
diff --git a/src/load_balancer/action.h b/src/load_balancer/action.h
new file mode 100644
index 000000000..754382916
--- /dev/null
+++ b/src/load_balancer/action.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_ACTION_H_
+#define TERA_LOAD_BALANCER_ACTION_H_
+
+#include <memory>
+#include <string>
+
+namespace tera {
+namespace load_balancer {
+
+class Action {
+public:
+    // Kinds of balancing action a concrete subclass can represent.
+    enum class Type {
+        ASSIGN,
+        MOVE,
+        SWAP,
+        EMPTY,
+    };
+
+    Action(Type t) : type_(t) {}
+
+    virtual ~Action() {}
+
+    // Returns the kind this action was constructed with.
+    Type GetType() const { return type_; }
+
+    // Implemented by subclasses; presumably yields the inverse action (ownership
+    // of the returned pointer is not visible here -- confirm with callers).
+    virtual Action* UndoAction() = 0;
+
+    // Implemented by subclasses to describe the action as text.
+    virtual std::string ToString() const = 0;
+
+private:
+    Type type_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_ACTION_H_
diff --git a/src/load_balancer/action_generator.h b/src/load_balancer/action_generator.h
new file mode 100644
index 000000000..77403bfe1
--- /dev/null
+++ b/src/load_balancer/action_generator.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_ACTION_GENERATOR_H_
+#define TERA_LOAD_BALANCER_ACTION_GENERATOR_H_
+
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "load_balancer/action.h"
+#include "load_balancer/cluster.h"
+#include "load_balancer/random.h"
+
+namespace tera {
+namespace load_balancer {
+
+const uint32_t kInvalidNodeIndex = std::numeric_limits<uint32_t>::max();
+const uint32_t kInvalidTabletIndex = std::numeric_limits<uint32_t>::max();
+
+class ActionGenerator {
+public:
+    virtual ~ActionGenerator() {}
+
+    // Implemented by each concrete generator: build an action for the given cluster.
+    virtual Action* Generate(const std::shared_ptr<Cluster>& cluster) = 0;
+
+    virtual std::string Name() = 0;
+
+    // Random node index from Random::Rand, or kInvalidNodeIndex when there are no nodes.
+    virtual uint32_t PickRandomNode(const std::shared_ptr<Cluster>& cluster) {
+        if (cluster->tablet_node_num_ > 0) {
+            return Random::Rand(0, cluster->tablet_node_num_);
+        }
+        return kInvalidNodeIndex;
+    }
+
+    // Pick a node different from picked_index; kInvalidNodeIndex if there is no other node.
+    virtual uint32_t PickOtherRandomNode(const std::shared_ptr<Cluster>& cluster,
+                                         const uint32_t picked_index) {
+        // With fewer than two nodes the retry loop below could never terminate.
+        if (cluster->tablet_node_num_ < 2) {
+            return kInvalidNodeIndex;
+        }
+        while (true) {
+            uint32_t node_index = PickRandomNode(cluster);
+            if (node_index != picked_index) {
+                return node_index;
+            }
+        }
+    }
+
+    // Random entry of tablets_per_node_[node_index], or kInvalidTabletIndex if it is empty.
+    virtual uint32_t PickRandomTabletOfNode(const std::shared_ptr<Cluster>& cluster,
+                                            const uint32_t node_index) {
+        uint32_t tablet_num = cluster->tablets_per_node_[node_index].size();
+        if (tablet_num == 0) return kInvalidTabletIndex;
+        uint32_t rand = Random::Rand(0, tablet_num);
+        return cluster->tablets_per_node_[node_index][rand];
+    }
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_ACTION_GENERATOR_H_
diff --git a/src/load_balancer/action_generators.cc b/src/load_balancer/action_generators.cc
new file mode 100644
index 000000000..f0cfe53d1
--- /dev/null
+++ b/src/load_balancer/action_generators.cc
@@ -0,0 +1,344 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <assert.h>
+
+#include <limits>
+
+#include "glog/logging.h"
+#include "load_balancer/action_generators.h"
+#include "load_balancer/actions.h"
+#include "load_balancer/random.h"
+
+namespace tera {
+namespace load_balancer {
+
+RandomActionGenerator::RandomActionGenerator() :
+        name_("RandomActionGenerator") {
+}
+
+RandomActionGenerator::~RandomActionGenerator() {
+}
+
+// Builds a MoveAction for a random tablet between two distinct random nodes, else an EmptyAction.
+Action* RandomActionGenerator::Generate(const std::shared_ptr<Cluster>& cluster) {
+    VLOG(20) << "[lb] RandomActionGenerator worked";
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    const uint32_t from_node = PickRandomNode(cluster);
+    const uint32_t to_node = PickOtherRandomNode(cluster, from_node);
+    const uint32_t tablet = PickRandomTabletOfNode(cluster, from_node);
+
+    const bool all_valid = tablet != kInvalidTabletIndex &&
+            from_node != kInvalidNodeIndex &&
+            to_node != kInvalidNodeIndex;
+    if (!all_valid) {
+        return new EmptyAction();
+    }
+    return new MoveAction(tablet, from_node, to_node);
+}
+
+std::string RandomActionGenerator::Name() {
+    return name_;
+}
+
+TabletCountActionGenerator::TabletCountActionGenerator() :
+        name_("TabletCountActionGenerator") {
+}
+
+TabletCountActionGenerator::~TabletCountActionGenerator() {
+}
+
+// Propose a MoveAction from PickMostTabletsNode()'s node to PickLeastTabletsNode()'s; else EmptyAction.
+Action* TabletCountActionGenerator::Generate(const std::shared_ptr<Cluster>& cluster) {
+    VLOG(20) << "[lb] TabletCountActionGenerator worked";
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    cluster->SortNodesByTabletCount();
+    uint32_t source_node_index = PickMostTabletsNode(cluster);
+    uint32_t dest_node_index = PickLeastTabletsNode(cluster);
+    // Reject invalid nodes before source_node_index is used to index tablets_per_node_.
+    if (source_node_index == kInvalidNodeIndex || dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+    if (tablet_index == kInvalidTabletIndex) {
+        return new EmptyAction();
+    }
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Last entry of node_index_sorted_by_tablet_count_, or kInvalidNodeIndex when the list is empty.
+uint32_t TabletCountActionGenerator::PickMostTabletsNode(const std::shared_ptr<Cluster>& cluster) {
+    if (cluster->node_index_sorted_by_tablet_count_.empty()) {
+        return kInvalidNodeIndex;  // node sentinel; callers compare against kInvalidNodeIndex
+    }
+    return cluster->node_index_sorted_by_tablet_count_.back();
+}
+
+// First entry of node_index_sorted_by_tablet_count_; with meta-table isolation enabled the
+// meta table's node is skipped. Returns kInvalidNodeIndex when no eligible node exists.
+uint32_t TabletCountActionGenerator::PickLeastTabletsNode(const std::shared_ptr<Cluster>& cluster) {
+    const auto& sorted_nodes = cluster->node_index_sorted_by_tablet_count_;
+    const bool skip_meta_node = cluster->lb_options_.meta_table_isolate_enabled;
+
+    for (uint32_t i = 0; i < sorted_nodes.size(); ++i) {
+        // Generate() uses the result as the move destination, so the meta node is passed over.
+        if (skip_meta_node && sorted_nodes[i] == cluster->meta_table_node_index_) {
+            continue;
+        }
+        return sorted_nodes[i];
+    }
+    // Empty list, or every entry was the meta table's node.
+    return kInvalidNodeIndex;
+}
+
+std::string TabletCountActionGenerator::Name() {
+    return name_;
+}
+
+SizeActionGenerator::SizeActionGenerator() :
+        name_("SizeActionGenerator") {
+}
+
+SizeActionGenerator::~SizeActionGenerator() {
+}
+
+// Propose a MoveAction from PickLargestSizeNode()'s node to PickSmallestSizeNode()'s; else EmptyAction.
+Action* SizeActionGenerator::Generate(const std::shared_ptr<Cluster>& cluster) {
+    VLOG(20) << "[lb] SizeActionGenerator worked";
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    cluster->SortNodesBySize();
+    uint32_t source_node_index = PickLargestSizeNode(cluster);
+    uint32_t dest_node_index = PickSmallestSizeNode(cluster);
+    // Reject invalid nodes before source_node_index is used to index tablets_per_node_.
+    if (source_node_index == kInvalidNodeIndex || dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+    if (tablet_index == kInvalidTabletIndex) {
+        return new EmptyAction();
+    }
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Last entry of node_index_sorted_by_size_, or kInvalidNodeIndex when the list is empty.
+uint32_t SizeActionGenerator::PickLargestSizeNode(const std::shared_ptr<Cluster>& cluster) {
+    if (cluster->node_index_sorted_by_size_.empty()) {
+        return kInvalidNodeIndex;  // node sentinel; callers compare against kInvalidNodeIndex
+    }
+    return cluster->node_index_sorted_by_size_.back();
+}
+
+// First entry of node_index_sorted_by_size_; with meta-table isolation enabled the
+// meta table's node is skipped. Returns kInvalidNodeIndex when no eligible node exists.
+uint32_t SizeActionGenerator::PickSmallestSizeNode(const std::shared_ptr<Cluster>& cluster) {
+    const auto& sorted_nodes = cluster->node_index_sorted_by_size_;
+    const bool skip_meta_node = cluster->lb_options_.meta_table_isolate_enabled;
+
+    for (uint32_t i = 0; i < sorted_nodes.size(); ++i) {
+        // Generate() uses the result as the move destination, so the meta node is passed over.
+        if (skip_meta_node && sorted_nodes[i] == cluster->meta_table_node_index_) {
+            continue;
+        }
+        return sorted_nodes[i];
+    }
+    // Empty list, or every entry was the meta table's node.
+    return kInvalidNodeIndex;
+}
+
+std::string SizeActionGenerator::Name() {
+    return name_;
+}
+
+ReadLoadActionGenerator::ReadLoadActionGenerator() :
+        name_("ReadLoadActionGenerator") {
+}
+
+ReadLoadActionGenerator::~ReadLoadActionGenerator() {
+}
+
+// Propose a MoveAction from PickMostReadNode()'s node to PickLeastReadNode()'s; else EmptyAction.
+Action* ReadLoadActionGenerator::Generate(const std::shared_ptr<Cluster>& cluster) {
+    VLOG(20) << "[lb] ReadLoadActionGenerator worked";
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    cluster->SortNodesByReadLoad();
+    uint32_t source_node_index = PickMostReadNode(cluster);
+    uint32_t dest_node_index = PickLeastReadNode(cluster);
+    // Reject invalid nodes before source_node_index is used to index tablets_per_node_.
+    if (source_node_index == kInvalidNodeIndex || dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+    if (tablet_index == kInvalidTabletIndex) {
+        return new EmptyAction();
+    }
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Last entry of node_index_sorted_by_read_load_, or kInvalidNodeIndex when the list is empty.
+uint32_t ReadLoadActionGenerator::PickMostReadNode(const std::shared_ptr<Cluster>& cluster) {
+    if (cluster->node_index_sorted_by_read_load_.empty()) {
+        return kInvalidNodeIndex;  // node sentinel; callers compare against kInvalidNodeIndex
+    }
+    return cluster->node_index_sorted_by_read_load_.back();
+}
+
+// First entry of node_index_sorted_by_read_load_; with meta-table isolation enabled the
+// meta table's node is skipped. Returns kInvalidNodeIndex when no eligible node exists.
+uint32_t ReadLoadActionGenerator::PickLeastReadNode(const std::shared_ptr<Cluster>& cluster) {
+    const auto& sorted_nodes = cluster->node_index_sorted_by_read_load_;
+    const bool skip_meta_node = cluster->lb_options_.meta_table_isolate_enabled;
+
+    for (uint32_t i = 0; i < sorted_nodes.size(); ++i) {
+        // Generate() uses the result as the move destination, so the meta node is passed over.
+        if (skip_meta_node && sorted_nodes[i] == cluster->meta_table_node_index_) {
+            continue;
+        }
+        return sorted_nodes[i];
+    }
+    // Empty list, or every entry was the meta table's node.
+    return kInvalidNodeIndex;
+}
+
+std::string ReadLoadActionGenerator::Name() {
+    return name_;
+}
+
+WriteLoadActionGenerator::WriteLoadActionGenerator() :
+        name_("WriteLoadActionGenerator") {
+}
+
+WriteLoadActionGenerator::~WriteLoadActionGenerator() {
+}
+
+// Propose a MoveAction from PickMostWriteNode()'s node to PickLeastWriteNode()'s; else EmptyAction.
+Action* WriteLoadActionGenerator::Generate(const std::shared_ptr<Cluster>& cluster) {
+    VLOG(20) << "[lb] WriteLoadActionGenerator worked";
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    cluster->SortNodesByWriteLoad();
+    uint32_t source_node_index = PickMostWriteNode(cluster);
+    uint32_t dest_node_index = PickLeastWriteNode(cluster);
+    // Reject invalid nodes before source_node_index is used to index tablets_per_node_.
+    if (source_node_index == kInvalidNodeIndex || dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+    if (tablet_index == kInvalidTabletIndex) {
+        return new EmptyAction();
+    }
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Last entry of node_index_sorted_by_write_load_, or kInvalidNodeIndex when the list is empty.
+uint32_t WriteLoadActionGenerator::PickMostWriteNode(const std::shared_ptr<Cluster>& cluster) {
+    if (cluster->node_index_sorted_by_write_load_.empty()) {
+        return kInvalidNodeIndex;  // node sentinel; callers compare against kInvalidNodeIndex
+    }
+    return cluster->node_index_sorted_by_write_load_.back();
+}
+
+// First entry of node_index_sorted_by_write_load_; with meta-table isolation enabled the
+// meta table's node is skipped. Returns kInvalidNodeIndex when no eligible node exists.
+uint32_t WriteLoadActionGenerator::PickLeastWriteNode(const std::shared_ptr<Cluster>& cluster) {
+    const auto& sorted_nodes = cluster->node_index_sorted_by_write_load_;
+    const bool skip_meta_node = cluster->lb_options_.meta_table_isolate_enabled;
+
+    for (uint32_t i = 0; i < sorted_nodes.size(); ++i) {
+        // Generate() uses the result as the move destination, so the meta node is passed over.
+        if (skip_meta_node && sorted_nodes[i] == cluster->meta_table_node_index_) {
+            continue;
+        }
+        return sorted_nodes[i];
+    }
+    // Empty list, or every entry was the meta table's node.
+    return kInvalidNodeIndex;
+}
+
+std::string WriteLoadActionGenerator::Name() {
+    return name_;
+}
+
+ScanLoadActionGenerator::ScanLoadActionGenerator() :
+        name_("ScanLoadActionGenerator") {
+}
+
+ScanLoadActionGenerator::~ScanLoadActionGenerator() {
+}
+
+// Propose a MoveAction from PickMostScanNode()'s node to PickLeastScanNode()'s; else EmptyAction.
+Action* ScanLoadActionGenerator::Generate(const std::shared_ptr<Cluster>& cluster) {
+    VLOG(20) << "[lb] ScanLoadActionGenerator worked";
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    cluster->SortNodesByScanLoad();
+    uint32_t source_node_index = PickMostScanNode(cluster);
+    uint32_t dest_node_index = PickLeastScanNode(cluster);
+    // Reject invalid nodes before source_node_index is used to index tablets_per_node_.
+    if (source_node_index == kInvalidNodeIndex || dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+    if (tablet_index == kInvalidTabletIndex) {
+        return new EmptyAction();
+    }
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Last entry of node_index_sorted_by_scan_load_, or kInvalidNodeIndex when the list is empty.
+uint32_t ScanLoadActionGenerator::PickMostScanNode(const std::shared_ptr<Cluster>& cluster) {
+    if (cluster->node_index_sorted_by_scan_load_.empty()) {
+        return kInvalidNodeIndex;  // node sentinel; callers compare against kInvalidNodeIndex
+    }
+    return cluster->node_index_sorted_by_scan_load_.back();
+}
+
+// First entry of node_index_sorted_by_scan_load_; with meta-table isolation enabled the
+// meta table's node is skipped. Returns kInvalidNodeIndex when no eligible node exists.
+uint32_t ScanLoadActionGenerator::PickLeastScanNode(const std::shared_ptr<Cluster>& cluster) {
+    const auto& sorted_nodes = cluster->node_index_sorted_by_scan_load_;
+    const bool skip_meta_node = cluster->lb_options_.meta_table_isolate_enabled;
+
+    for (uint32_t i = 0; i < sorted_nodes.size(); ++i) {
+        // Generate() uses the result as the move destination, so the meta node is passed over.
+        if (skip_meta_node && sorted_nodes[i] == cluster->meta_table_node_index_) {
+            continue;
+        }
+        return sorted_nodes[i];
+    }
+    // Empty list, or every entry was the meta table's node.
+    return kInvalidNodeIndex;
+}
+
+std::string ScanLoadActionGenerator::Name() {
+    return name_;
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/action_generators.h b/src/load_balancer/action_generators.h
new file mode 100644
index 000000000..16c663ae7
--- /dev/null
+++ b/src/load_balancer/action_generators.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_ACTION_GENERATORS_H_
+#define TERA_LOAD_BALANCER_ACTION_GENERATORS_H_
+
+#include <memory>
+
+#include "load_balancer/action_generator.h"
+#include "load_balancer/actions.h"
+
+namespace tera {
+namespace load_balancer {
+
+// Generator whose action moves a random tablet of a random node to another random node.
+class RandomActionGenerator : public ActionGenerator {
+public:
+    RandomActionGenerator();
+    virtual ~RandomActionGenerator();
+
+    // generate a random move action for the given cluster
+    virtual Action* Generate(const std::shared_ptr<Cluster>& cluster) override;
+
+    virtual std::string Name() override;  // presumably returns name_; the definition is not in this header
+
+private:
+    std::string name_;
+};
+
+// Generator whose action moves a tablet
+// from the node holding the most tablets
+// to the node holding the fewest tablets.
+class TabletCountActionGenerator : public ActionGenerator {
+public:
+    TabletCountActionGenerator();
+    virtual ~TabletCountActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr<Cluster>& cluster) override;
+
+    virtual std::string Name() override;  // presumably returns name_; the definition is not in this header
+
+private:
+    uint32_t PickMostTabletsNode(const std::shared_ptr<Cluster>& cluster);   // presumably the source node of the move
+    uint32_t PickLeastTabletsNode(const std::shared_ptr<Cluster>& cluster);  // presumably the destination node of the move
+
+private:
+    std::string name_;
+};
+
+// Generator whose action moves a tablet
+// from the node holding the largest data size
+// to the node holding the smallest data size.
+class SizeActionGenerator : public ActionGenerator {
+public:
+    SizeActionGenerator();
+    virtual ~SizeActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr<Cluster>& cluster) override;
+
+    virtual std::string Name() override;  // presumably returns name_; the definition is not in this header
+
+private:
+    uint32_t PickLargestSizeNode(const std::shared_ptr<Cluster>& cluster);   // presumably the source node of the move
+    uint32_t PickSmallestSizeNode(const std::shared_ptr<Cluster>& cluster);  // presumably the destination node of the move
+
+private:
+    std::string name_;
+};
+
+// Generator whose action moves a tablet
+// from the node with the most read load
+// to the node with the least read load.
+class ReadLoadActionGenerator : public ActionGenerator {
+public:
+    ReadLoadActionGenerator();
+    virtual ~ReadLoadActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr<Cluster>& cluster) override;
+
+    virtual std::string Name() override;  // presumably returns name_; the definition is not in this header
+
+private:
+    uint32_t PickMostReadNode(const std::shared_ptr<Cluster>& cluster);   // presumably the source node of the move
+    uint32_t PickLeastReadNode(const std::shared_ptr<Cluster>& cluster);  // presumably the destination node of the move
+
+private:
+    std::string name_;
+};
+
+// Generator whose action moves a tablet
+// from the node with the most write load
+// to the node with the least write load.
+class WriteLoadActionGenerator : public ActionGenerator {
+public:
+    WriteLoadActionGenerator();
+    virtual ~WriteLoadActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr<Cluster>& cluster) override;
+
+    virtual std::string Name() override;  // presumably returns name_; the definition is not in this header
+
+private:
+    uint32_t PickMostWriteNode(const std::shared_ptr<Cluster>& cluster);   // presumably the source node of the move
+    uint32_t PickLeastWriteNode(const std::shared_ptr<Cluster>& cluster);  // presumably the destination node of the move
+
+private:
+    std::string name_;
+};
+
+// Generator whose action moves a tablet
+// from the node with the most scan load
+// to the node with the least scan load.
+class ScanLoadActionGenerator : public ActionGenerator {
+public:
+    ScanLoadActionGenerator();
+    virtual ~ScanLoadActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr<Cluster>& cluster) override;  // returns a new EmptyAction or MoveAction
+
+    virtual std::string Name() override;  // returns name_
+
+private:
+    uint32_t PickMostScanNode(const std::shared_ptr<Cluster>& cluster);   // index of the node the tablet is moved from
+    uint32_t PickLeastScanNode(const std::shared_ptr<Cluster>& cluster);  // index of the node the tablet is moved to
+
+private:
+    std::string name_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_ACTION_GENERATORS_H_
diff --git a/src/load_balancer/actions.cc b/src/load_balancer/actions.cc
new file mode 100644
index 000000000..0be2d9d5e
--- /dev/null
+++ b/src/load_balancer/actions.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+
+#include "load_balancer/actions.h"
+
+namespace tera {
+namespace load_balancer {
+
+EmptyAction::EmptyAction() :
+    Action(Action::Type::EMPTY) {  // tags the base Action with type EMPTY
+}
+
+EmptyAction::~EmptyAction() {  // intentionally empty
+}
+
+Action* EmptyAction::UndoAction() {
+    return new EmptyAction();  // the inverse of a no-op is another no-op; allocated with new, the caller presumably owns it
+}
+
+std::string EmptyAction::ToString() const {
+    return "EmptyAction";  // fixed label; there is no per-instance data to print
+}
+
+MoveAction::MoveAction(uint32_t tablet_index, uint32_t source_node_index, uint32_t dest_node_index) :
+    Action(Action::Type::MOVE),  // tags the base Action with type MOVE
+    tablet_index_(tablet_index),
+    source_node_index_(source_node_index),
+    dest_node_index_(dest_node_index) {  // the three indices are stored as given, without validation
+}
+
+MoveAction::~MoveAction() {  // intentionally empty: the members are plain integers
+}
+
+Action* MoveAction::UndoAction() {
+    return new MoveAction(tablet_index_, dest_node_index_, source_node_index_);  // reverse move: same tablet, source and destination swapped
+}
+
+std::string MoveAction::ToString() const {
+    const std::string route = std::to_string(source_node_index_) + " to " + std::to_string(dest_node_index_);
+    return "move " + std::to_string(tablet_index_) + " from " + route;  // "move <tablet> from <source> to <dest>"
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/actions.h b/src/load_balancer/actions.h
new file mode 100644
index 000000000..f4751ea9c
--- /dev/null
+++ b/src/load_balancer/actions.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_ACTIONS_H_
+#define TERA_LOAD_BALANCER_ACTIONS_H_
+
+#include <string>
+
+#include "load_balancer/action.h"
+
+namespace tera {
+namespace load_balancer {
+
+class EmptyAction : public Action {  // an Action of type EMPTY that carries no data of its own
+public:
+    EmptyAction();
+    virtual ~EmptyAction();
+
+    virtual Action* UndoAction() override;  // returns a new EmptyAction
+
+    virtual std::string ToString() const override;  // returns "EmptyAction"
+};
+
+class MoveAction : public Action {  // an Action of type MOVE: one tablet goes from a source node to a destination node
+public:
+    MoveAction(uint32_t tablet_index, uint32_t source_node_index, uint32_t dest_node_index);
+    virtual ~MoveAction();
+
+    virtual Action* UndoAction() override;  // returns a new MoveAction with source and destination swapped
+
+    virtual std::string ToString() const override;  // "move <tablet> from <source> to <dest>"
+
+public:
+    uint32_t tablet_index_;       // index of the tablet being moved
+    uint32_t source_node_index_;  // index of the node the tablet leaves
+    uint32_t dest_node_index_;    // index of the node the tablet goes to
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_ACTIONS_H_
diff --git a/src/load_balancer/balancer.h b/src/load_balancer/balancer.h
new file mode 100644
index 000000000..2ad1727ea
--- /dev/null
+++ b/src/load_balancer/balancer.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_BALANCER_H_
+#define TERA_LOAD_BALANCER_BALANCER_H_
+
+#include <string>
+#include <vector>
+
+#include "load_balancer/lb_node.h"
+#include "load_balancer/options.h"
+#include "load_balancer/plan.h"
+#include "master/tablet_manager.h"
+#include "master/tabletnode_manager.h"
+
+namespace tera {
+namespace load_balancer {
+
+class Balancer {  // abstract interface: both BalanceCluster overloads are pure virtual
+public:
+    virtual ~Balancer() {}
+
+    // balance the whole cluster; presumably fills |plans| and reports success via the return value (confirm in implementations)
+    virtual bool BalanceCluster(
+            const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes,
+            std::vector<Plan>* plans) = 0;
+
+    // balance only the table named |table_name|; same parameters otherwise
+    virtual bool BalanceCluster(
+            const std::string& table_name,
+            const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes,
+            std::vector<Plan>* plans) = 0;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_BALANCER_H_
diff --git a/src/load_balancer/cluster.cc b/src/load_balancer/cluster.cc
new file mode 100644
index 000000000..72a3f740e
--- /dev/null
+++ b/src/load_balancer/cluster.cc
@@ -0,0 +1,537 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <assert.h>
+
+#include <algorithm>
+#include <limits>
+
+#include "glog/logging.h"
+#include "load_balancer/actions.h"
+#include "load_balancer/cluster.h"
+#include "common/timer.h"
+
+namespace tera {
+namespace load_balancer {
+
+Cluster::Cluster(const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes,
+                 const LBOptions& options) :
+        meta_table_node_index_(std::numeric_limits<uint32_t>::max()),
+        lb_options_(options) {
+    int64_t start_time_us = get_micros();  // microseconds; only used by the timing log at the end
+    // keep every node except, when isolation is enabled, the one serving the meta table
+    for (const auto& node : lb_nodes) {
+        if (lb_options_.meta_table_isolate_enabled &&
+                node->tablet_node_ptr->GetAddr() == lb_options_.meta_table_node_addr) {
+            VLOG(5) << "skip meta table node:" << lb_options_.meta_table_node_addr;
+        } else {
+            lb_nodes_.emplace_back(node);
+        }
+    }
+
+    table_num_ = 0;
+    tablet_node_num_ = 0;
+    tablet_num_ = 0;
+    tablet_moved_num_ = 0;
+    // assign dense indices (0, 1, 2, ...) to nodes and tablets and build every lookup table
+    for (const auto& node : lb_nodes_) {
+        uint32_t node_index = nodes_.size();
+        nodes_[node_index] = node;
+
+        std::string addr = node->tablet_node_ptr->GetAddr();
+        assert(nodes_to_index_.find(addr) == nodes_to_index_.end());  // node addresses must be unique
+        nodes_to_index_[addr] = node_index;
+        // create the per-node entries so they exist even for a node without tablets
+        tablets_per_node_[node_index].clear();
+        initial_tablets_not_ready_per_node_[node_index].clear();
+        size_per_node_[node_index] = 0;
+        read_load_per_node_[node_index] = 0;
+        write_load_per_node_[node_index] = 0;
+        scan_load_per_node_[node_index] = 0;
+        // unsorted for now; the SortNodesBy*() methods order these on demand
+        node_index_sorted_by_tablet_count_.emplace_back(node_index);
+        node_index_sorted_by_size_.emplace_back(node_index);
+        node_index_sorted_by_read_load_.emplace_back(node_index);
+        node_index_sorted_by_write_load_.emplace_back(node_index);
+        node_index_sorted_by_scan_load_.emplace_back(node_index);
+        // remember nodes that report pending read / write / scan work
+        if (node->tablet_node_ptr->GetReadPending() > 0) {
+            read_pending_nodes_index_.insert(node_index);
+        }
+        if (node->tablet_node_ptr->GetWritePending() > 0) {
+            write_pending_nodes_index_.insert(node_index);
+        }
+        if (node->tablet_node_ptr->GetScanPending() > 0) {
+            scan_pending_nodes_index_.insert(node_index);
+        }
+
+        for (const auto& tablet : node->tablets) {
+            uint32_t tablet_index = tablets_.size();
+            // RegisterTablet fills tablets_, so the next tablets_.size() is the next free index
+            RegisterTablet(tablet, tablet_index, node_index);
+
+            tablets_per_node_[node_index].emplace_back(tablet_index);
+            if (tablets_[tablet_index]->tablet_ptr->GetStatus() != kTableReady) {
+                initial_tablets_not_ready_per_node_[node_index].emplace_back(tablet_index);
+            }
+            size_per_node_[node_index] += static_cast<uint64_t>(tablet->tablet_ptr->GetDataSize());
+            read_load_per_node_[node_index] += static_cast<uint64_t>(tablet->tablet_ptr->GetReadQps());
+            write_load_per_node_[node_index] += static_cast<uint64_t>(tablet->tablet_ptr->GetWriteQps());
+            scan_load_per_node_[node_index] += static_cast<uint64_t>(tablet->tablet_ptr->GetScanQps());
+
+            ++tablet_num_;
+        }
+
+        ++ tablet_node_num_;
+    }
+
+    // a node whose ratio of not-ready tablets reaches abnormal_node_ratio is considered abnormal
+    for (uint32_t i = 0; i < tablets_per_node_.size(); ++i) {
+        if (tablets_per_node_[i].size() != 0) {
+            double not_ready_num = static_cast<double>(initial_tablets_not_ready_per_node_[i].size());
+            double total_num = static_cast<double>(tablets_per_node_[i].size());
+            if (not_ready_num / total_num >= lb_options_.abnormal_node_ratio) {
+                abnormal_nodes_index_.insert(i);
+            }
+        }
+    }
+    // consistency checks: every index structure must agree with the counters
+    assert(table_num_ == tables_.size());
+    assert(tablet_node_num_ == nodes_.size());
+    assert(tablet_num_ == tablets_.size());
+
+    assert(table_num_ == tables_to_index_.size());
+    assert(tablet_node_num_ == nodes_to_index_.size());
+    assert(tablet_num_ == tablets_to_index_.size());
+
+    assert(tablet_num_ == tablet_index_to_node_index_.size());
+    assert(tablet_num_ == initial_tablet_index_to_node_index_.size());
+    assert(tablet_num_ == tablet_index_to_table_index_.size());
+
+    assert(tablet_node_num_ == tablets_per_node_.size());
+    assert(tablet_node_num_ == initial_tablets_not_ready_per_node_.size());
+    assert(tablet_node_num_ == size_per_node_.size());
+    assert(tablet_node_num_ == read_load_per_node_.size());
+    assert(tablet_node_num_ == write_load_per_node_.size());
+    assert(tablet_node_num_ == scan_load_per_node_.size());
+    assert(abnormal_nodes_index_.size() <= tablet_node_num_);
+    assert(read_pending_nodes_index_.size() <= tablet_node_num_);
+    assert(write_pending_nodes_index_.size() <= tablet_node_num_);
+    assert(scan_pending_nodes_index_.size() <= tablet_node_num_);
+
+    assert(tablet_node_num_ == node_index_sorted_by_tablet_count_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_size_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_read_load_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_write_load_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_scan_load_.size());
+
+    VLOG(20) << "[lb] construct Cluster cost time(ms):" << (get_micros() - start_time_us) / 1000;
+}
+
+Cluster::~Cluster() {  // intentionally empty
+}
+
+void Cluster::DebugCluster() {  // dumps the counters and every index structure of this cluster model to the INFO log
+    LOG(INFO) << "";
+    LOG(INFO) << "DebugCluster begin -----";
+
+    LOG(INFO) << "table_num_:" << table_num_;
+    LOG(INFO) << "tablet_node_num_:" << tablet_node_num_;
+    LOG(INFO) << "tablet_num_:" << tablet_num_;
+    LOG(INFO) << "tablet_moved_num_:" << tablet_moved_num_;
+
+    LOG(INFO) << "[table_index -> table]:";
+    for (const auto& table : tables_) {
+        LOG(INFO) << table.first << " -> " << table.second;
+    }
+
+    LOG(INFO) << "[node_index -> node]:";
+    for (const auto& node : nodes_) {
+        LOG(INFO) << node.first << " -> " << node.second->tablet_node_ptr->GetAddr();
+    }
+    LOG(INFO) << "meta_table_node_index_:" << meta_table_node_index_;
+
+    LOG(INFO) << "[tablet_index -> tablet]:";
+    for (const auto& tablet : tablets_) {
+        LOG(INFO) << tablet.first << " -> " << tablet.second->tablet_ptr->GetPath();
+    }
+
+    LOG(INFO) << "[table -> table_index]:";
+    for (const auto& table : tables_to_index_) {
+        LOG(INFO) << table.first << " -> " << table.second;
+    }
+
+    LOG(INFO) << "[node -> node_index]:";
+    for (const auto& node : nodes_to_index_) {
+        LOG(INFO) << node.first << " -> " << node.second;
+    }
+
+    LOG(INFO) << "[tablet -> tablet_index]:";
+    for (const auto& tablet : tablets_to_index_) {
+        LOG(INFO) << tablet.first << " -> " << tablet.second;
+    }
+
+    LOG(INFO) << "[tablet_index -> node_index]:";
+    for (const auto& it : tablet_index_to_node_index_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[initial tablet_index -> node_index]:";
+    for (const auto& it : initial_tablet_index_to_node_index_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[tablet_index -> table_index]:";
+    for (const auto& it : tablet_index_to_table_index_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[node_index -> tablets index]:";
+    for (const auto& it : tablets_per_node_) {
+        std::string line = std::to_string(it.first) + " ->";  // one log line per node: "<node> -> <tablet> <tablet> ..."
+        for (const auto tablet : it.second) {
+            line += " ";
+            line += std::to_string(tablet);
+        }
+        LOG(INFO) << line;
+    }
+
+    LOG(INFO) << "[node_index -> data size]:";
+    for (const auto& it : size_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second << "B";
+    }
+
+    LOG(INFO) << "[node_index -> read load]:";
+    for (const auto& it : read_load_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[node_index -> write load]:";
+    for (const auto& it : write_load_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[node_index -> scan load]:";
+    for (const auto& it : scan_load_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[tablets index of moved too frequently]:";
+    for (const auto& tablet : tablets_moved_too_frequently_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[node_index -> not ready tablets index]:";
+    for (const auto& it : initial_tablets_not_ready_per_node_) {
+        std::string line = std::to_string(it.first) + " ->";  // same one-line-per-node layout as above
+        for (const auto tablet : it.second) {
+            line += " ";
+            line += std::to_string(tablet);
+        }
+        LOG(INFO) << line;
+    }
+
+    LOG(INFO) << "[abnormal nodes index]:";
+    for (const auto& node: abnormal_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to abnormal nodes]:";
+    for (const auto& tablet : tablets_moved_to_abnormal_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[read pending nodes index]:";
+    for (const auto& node: read_pending_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to read pending nodes]:";
+    for (const auto& tablet : tablets_moved_to_read_pending_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[write pending nodes index]:";
+    for (const auto& node: write_pending_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to write pending nodes]:";
+    for (const auto& tablet : tablets_moved_to_write_pending_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[scan pending nodes index]:";
+    for (const auto& node: scan_pending_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to scan pending nodes]:";
+    for (const auto& tablet : tablets_moved_to_scan_pending_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "DebugCluster end -----";
+    LOG(INFO) << "";
+}
+
+bool Cluster::ValidAction(const std::shared_ptr<Action>& action) {
+    // Returns whether |action| may be applied to this cluster; a rejected MOVE logs its reason at VLOG(20).
+    switch (action->GetType()) {
+        case Action::Type::ASSIGN:
+        case Action::Type::SWAP:
+            return true;
+        case Action::Type::MOVE: {
+            MoveAction* move_action = dynamic_cast<MoveAction*>(action.get());
+            if (move_action == nullptr) {
+                // tagged MOVE but not actually a MoveAction: reject rather than dereference null
+                return false;
+            }
+            if (tablets_[move_action->tablet_index_]->tablet_ptr->GetStatus() != kTableReady) {
+                VLOG(20) << "[lb] invalid action, reason:tablet not ready, tablet status:"
+                        << StatusCodeToString(tablets_[move_action->tablet_index_]->tablet_ptr->GetStatus());
+                return false;
+            }
+            if (tables_[tablet_index_to_table_index_[move_action->tablet_index_]] ==
+                    lb_options_.meta_table_name) {
+                VLOG(20) << "[lb] invalid action, reason:move meta table";
+                return false;
+            }
+            if (lb_options_.meta_table_isolate_enabled &&
+                    move_action->dest_node_index_ == meta_table_node_index_) {
+                VLOG(20) << "[lb] invalid action, reason:move tablet to meta table node";
+                return false;
+            }
+            return true;
+        }
+        case Action::Type::EMPTY:
+        default:
+            return false;
+    }
+}
+
+void Cluster::DoAction(const std::shared_ptr<Action>& action) {
+    // Applies |action| to the in-memory cluster model; only MOVE changes any state.
+    switch (action->GetType()) {
+        case Action::Type::MOVE: {
+            MoveAction* move_action = dynamic_cast<MoveAction*>(action.get());
+            if (move_action == nullptr) {
+                break;  // tagged MOVE but not actually a MoveAction: nothing that can safely be applied
+            }
+            VLOG(20) << "[lb] DoAction: " << move_action->ToString();
+            assert(move_action->source_node_index_ != move_action->dest_node_index_);
+            // update the per-node tablet lists and load totals first, then the move bookkeeping
+            RemoveTablet(move_action->tablet_index_, move_action->source_node_index_);
+            AddTablet(move_action->tablet_index_, move_action->dest_node_index_);
+            MoveTablet(move_action->tablet_index_, move_action->source_node_index_, move_action->dest_node_index_);
+            break;
+        }
+        case Action::Type::EMPTY:
+        case Action::Type::ASSIGN:
+        case Action::Type::SWAP:
+        default:
+            break;
+    }
+}
+
+void Cluster::SortNodesByTabletCount() {
+    // ascending by number of hosted tablets: fewest first, most last
+    auto fewer_tablets = [this](int lhs, int rhs) {
+        return tablets_per_node_[lhs].size() < tablets_per_node_[rhs].size();
+    };
+    std::sort(node_index_sorted_by_tablet_count_.begin(),
+              node_index_sorted_by_tablet_count_.end(), fewer_tablets);
+}
+
+void Cluster::SortNodesBySize() {
+    // ascending by total data size per node: smallest first, largest last
+    auto smaller_size = [this](int lhs, int rhs) {
+        return size_per_node_[lhs] < size_per_node_[rhs];
+    };
+    std::sort(node_index_sorted_by_size_.begin(),
+              node_index_sorted_by_size_.end(), smaller_size);
+}
+
+void Cluster::SortNodesByReadLoad() {
+    // ascending by summed read load per node: least loaded first, most loaded last
+    auto less_read_load = [this](int lhs, int rhs) {
+        return read_load_per_node_[lhs] < read_load_per_node_[rhs];
+    };
+    std::sort(node_index_sorted_by_read_load_.begin(),
+              node_index_sorted_by_read_load_.end(), less_read_load);
+}
+
+void Cluster::SortNodesByWriteLoad() {
+    // ascending by summed write load per node: least loaded first, most loaded last
+    auto less_write_load = [this](int lhs, int rhs) {
+        return write_load_per_node_[lhs] < write_load_per_node_[rhs];
+    };
+    std::sort(node_index_sorted_by_write_load_.begin(),
+              node_index_sorted_by_write_load_.end(), less_write_load);
+}
+
+void Cluster::SortNodesByScanLoad() {
+    // ascending by summed scan load per node: least loaded first, most loaded last
+    auto less_scan_load = [this](int lhs, int rhs) {
+        return scan_load_per_node_[lhs] < scan_load_per_node_[rhs];
+    };
+    std::sort(node_index_sorted_by_scan_load_.begin(),
+              node_index_sorted_by_scan_load_.end(), less_scan_load);
+}
+
+void Cluster::RegisterTablet(const std::shared_ptr<LBTablet>& tablet, uint32_t tablet_index, uint32_t node_index) {
+    // Records |tablet| under |tablet_index| as hosted by |node_index|, indexing its table on first sight.
+    const std::string table_name = tablet->tablet_ptr->GetTableName();
+    auto table_it = tables_to_index_.find(table_name);
+    if (table_it == tables_to_index_.end()) {
+        // first tablet seen for this table: hand out the next free table index
+        const uint32_t new_table_index = tables_.size();
+        tables_[new_table_index] = table_name;
+        table_it = tables_to_index_.insert(std::make_pair(table_name, new_table_index)).first;
+        ++table_num_;
+        if (table_name == lb_options_.meta_table_name) {
+            meta_table_node_index_ = node_index;  // remember the node this meta table tablet is on
+        }
+    }
+    // forward and reverse lookups for the tablet itself
+    tablets_to_index_[tablet->tablet_ptr->GetPath()] = tablet_index;
+    tablets_[tablet_index] = tablet;
+    tablet_index_to_node_index_[tablet_index] = node_index;
+    initial_tablet_index_to_node_index_[tablet_index] = node_index;  // initial placement equals the current one
+    tablet_index_to_table_index_[tablet_index] = table_it->second;
+}
+
+void Cluster::AddTablet(uint32_t tablet_index, uint32_t to_node_index) {
+    // Appends the tablet to the list of |to_node_index| and adds its data size
+    // and read / write / scan qps to that node's totals.
+    // Does not touch the tablet -> node mapping; MoveTablet() handles that.
+    tablets_per_node_[to_node_index].emplace_back(tablet_index);
+
+    const auto& tablet_ptr = tablets_[tablet_index]->tablet_ptr;
+    size_per_node_[to_node_index] += static_cast<uint64_t>(tablet_ptr->GetDataSize());
+    read_load_per_node_[to_node_index] += static_cast<uint64_t>(tablet_ptr->GetReadQps());
+    write_load_per_node_[to_node_index] += static_cast<uint64_t>(tablet_ptr->GetWriteQps());
+    scan_load_per_node_[to_node_index] += static_cast<uint64_t>(tablet_ptr->GetScanQps());
+}
+
+void Cluster::RemoveTablet(uint32_t tablet_index, uint32_t from_node_index) {
+    // Detaches |tablet_index| from |from_node_index| and subtracts its data size and
+    // read / write / scan qps from that node's totals. Does nothing when the node
+    // is unknown or does not currently list the tablet.
+    auto node_it = tablets_per_node_.find(from_node_index);
+    if (node_it == tablets_per_node_.end()) {
+        return;
+    }
+    auto& tablets = node_it->second;
+    auto tablet_it = std::find(tablets.begin(), tablets.end(), tablet_index);
+    if (tablet_it == tablets.end()) {
+        // The tablet is not listed on this node, so its load is not part of the node's
+        // totals; subtracting it anyway would corrupt (or wrap, if unsigned) the sums.
+        return;
+    }
+    tablets.erase(tablet_it);
+
+    const auto& tablet_ptr = tablets_[tablet_index]->tablet_ptr;
+    size_per_node_[from_node_index] -= static_cast<uint64_t>(tablet_ptr->GetDataSize());
+    read_load_per_node_[from_node_index] -= static_cast<uint64_t>(tablet_ptr->GetReadQps());
+    write_load_per_node_[from_node_index] -= static_cast<uint64_t>(tablet_ptr->GetWriteQps());
+    scan_load_per_node_[from_node_index] -= static_cast<uint64_t>(tablet_ptr->GetScanQps());
+    // NOTE(review): these checks only mean something if the totals are signed - confirm the map value types
+    assert(size_per_node_[from_node_index] >= 0);
+    assert(read_load_per_node_[from_node_index] >= 0);
+    assert(write_load_per_node_[from_node_index] >= 0);
+    assert(scan_load_per_node_[from_node_index] >= 0);
+}
+
+void Cluster::MoveTablet(uint32_t tablet_index, uint32_t source_node_index, uint32_t dest_node_index) {
+    tablet_index_to_node_index_[tablet_index] = dest_node_index;  // record the new placement
+    // tablet_moved_num_ counts tablets currently away from their initial node
+    if (initial_tablet_index_to_node_index_[tablet_index] == source_node_index) {
+        ++tablet_moved_num_;
+
+        int64_t last_move_time_us = tablets_[tablet_index]->tablet_ptr->LastMoveTime();
+        int64_t current_time_us = get_micros();
+        if (current_time_us - last_move_time_us <
+                1000000 * static_cast<int64_t>(lb_options_.tablet_move_too_frequently_threshold_s)) {
+            tablets_moved_too_frequently_.insert(tablet_index);
+            VLOG(20) << "[lb] add tablet moved too frequently, tablet index: " << tablet_index
+                    << ", last_move_time: " << last_move_time_us << ", current time: " << current_time_us
+                    << ", tablets_moved_too_frequently_ size: " << tablets_moved_too_frequently_.size();
+        }
+    } else if (initial_tablet_index_to_node_index_[tablet_index] == dest_node_index) {
+        // tablet moved back to its initial node
+        assert(tablet_moved_num_ > 0);  // unsigned counter: check before decrementing, a ">= 0" check afterwards never fires
+        --tablet_moved_num_;
+
+        if (tablets_moved_too_frequently_.find(tablet_index) != tablets_moved_too_frequently_.end()) {
+            tablets_moved_too_frequently_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved too frequently, tablet index: " << tablet_index
+                    << ", tablets_moved_too_frequently_ size: " << tablets_moved_too_frequently_.size();
+        }
+    } else {
+    }
+    // abnormal nodes: track tablets that now sit on an abnormal node other than their initial one
+    if (abnormal_nodes_index_.find(dest_node_index) != abnormal_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_abnormal_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to abnormal node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_abnormal_nodes_ size: " << tablets_moved_to_abnormal_nodes_.size();
+    } else if (abnormal_nodes_index_.find(source_node_index) != abnormal_nodes_index_.end()) {
+        if (tablets_moved_to_abnormal_nodes_.find(tablet_index) != tablets_moved_to_abnormal_nodes_.end()) {
+            tablets_moved_to_abnormal_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to abnormal nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_abnormal_nodes_ size: " << tablets_moved_to_abnormal_nodes_.size();
+        }
+    } else {
+    }
+    // the same bookkeeping for nodes with pending reads
+    if (read_pending_nodes_index_.find(dest_node_index) != read_pending_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_read_pending_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to read pending node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_read_pending_nodes_ size: " << tablets_moved_to_read_pending_nodes_.size();
+    } else if (read_pending_nodes_index_.find(source_node_index) != read_pending_nodes_index_.end()) {
+        if (tablets_moved_to_read_pending_nodes_.find(tablet_index) != tablets_moved_to_read_pending_nodes_.end()) {
+            tablets_moved_to_read_pending_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to read pending nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_read_pending_nodes_ size: " << tablets_moved_to_read_pending_nodes_.size();
+        }
+    } else {
+    }
+    // ... for nodes with pending writes
+    if (write_pending_nodes_index_.find(dest_node_index) != write_pending_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_write_pending_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to write pending node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_write_pending_nodes_ size: " << tablets_moved_to_write_pending_nodes_.size();
+    } else if (write_pending_nodes_index_.find(source_node_index) != write_pending_nodes_index_.end()) {
+        if (tablets_moved_to_write_pending_nodes_.find(tablet_index) != tablets_moved_to_write_pending_nodes_.end()) {
+            tablets_moved_to_write_pending_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to write pending nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_write_pending_nodes_ size: " << tablets_moved_to_write_pending_nodes_.size();
+        }
+    } else {
+    }
+    // ... and for nodes with pending scans
+    if (scan_pending_nodes_index_.find(dest_node_index) != scan_pending_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_scan_pending_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to scan pending node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_scan_pending_nodes_ size: " << tablets_moved_to_scan_pending_nodes_.size();
+    } else if (scan_pending_nodes_index_.find(source_node_index) != scan_pending_nodes_index_.end()) {
+        if (tablets_moved_to_scan_pending_nodes_.find(tablet_index) != tablets_moved_to_scan_pending_nodes_.end()) {
+            tablets_moved_to_scan_pending_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to scan pending nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_scan_pending_nodes_ size: " << tablets_moved_to_scan_pending_nodes_.size();
+        }
+    } else {
+    }
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/cluster.h b/src/load_balancer/cluster.h
new file mode 100644
index 000000000..8a22acd7c
--- /dev/null
+++ b/src/load_balancer/cluster.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_CLUSTER_H_
+#define TERA_LOAD_BALANCER_CLUSTER_H_
+
+#include <map>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "load_balancer/action.h"
+#include "load_balancer/lb_node.h"
+#include "load_balancer/options.h"
+#include "master/tablet_manager.h"
+#include "master/tabletnode_manager.h"
+
+namespace tera {
+namespace load_balancer {
+
+class Cluster {  // index-based model of the tablet nodes and tablets being balanced
+public:
+    Cluster(const std::vector<std::shared_ptr<LBTabletNode>>& tablet_nodes,
+            const LBOptions& options);
+
+    virtual ~Cluster();
+    // NOTE(review): method bodies are in cluster.cc; the notes here are inferred from names
+    void DebugCluster();
+    // presumably tells whether |action| may be applied to the current state - confirm
+    bool ValidAction(const std::shared_ptr<Action>& action);
+    // presumably applies |action| to the modelled state - confirm
+    void DoAction(const std::shared_ptr<Action>& action);
+    // the Sort* helpers presumably refresh the node_index_sorted_by_* vectors below - confirm
+    void SortNodesByTabletCount();
+
+    void SortNodesBySize();
+
+    void SortNodesByReadLoad();
+
+    void SortNodesByWriteLoad();
+
+    void SortNodesByScanLoad();
+
+private:
+    void RegisterTablet(const std::shared_ptr<LBTablet>& tablet, uint32_t tablet_index, uint32_t node_index);
+    void AddTablet(uint32_t tablet_index, uint32_t to_node_index);
+    void RemoveTablet(uint32_t tablet_index, uint32_t from_node_index);
+    void MoveTablet(uint32_t tablet_index, uint32_t source_node_index, uint32_t dest_node_index);
+
+// cluster info; indexes are used instead of objects to speed up the calculation
+// these members are public, also for speed
+public:
+    uint32_t table_num_;
+    uint32_t tablet_node_num_;
+    uint32_t tablet_num_;
+    uint32_t tablet_moved_num_;
+
+    // table_index -> table
+    std::map<uint32_t, std::string> tables_;
+    // node_index -> node
+    std::map<uint32_t, std::shared_ptr<LBTabletNode>> nodes_;
+    // tablet_index -> tablet
+    std::map<uint32_t, std::shared_ptr<LBTablet>> tablets_;
+
+    // table -> table_index
+    std::map<std::string, uint32_t> tables_to_index_;
+    // node -> node_index
+    std::map<std::string, uint32_t> nodes_to_index_;
+    // tablet -> tablet_index
+    std::map<std::string, uint32_t> tablets_to_index_;
+
+    // tablet_index -> node_index
+    std::map<uint32_t, uint32_t> tablet_index_to_node_index_;
+    // tablet_index -> node_index in the initial cluster state
+    std::map<uint32_t, uint32_t> initial_tablet_index_to_node_index_;
+    // tablet_index -> table_index
+    std::map<uint32_t, uint32_t> tablet_index_to_table_index_;
+
+    // node_index -> indexes of the tablets on the node
+    std::map<uint32_t, std::vector<uint32_t>> tablets_per_node_;
+    // node_index -> indexes of the not-ready tablets on the node
+    std::map<uint32_t, std::vector<uint32_t>> initial_tablets_not_ready_per_node_;
+    // indexes of the abnormal nodes
+    std::unordered_set<uint32_t> abnormal_nodes_index_;
+    // indexes of the tablets moved to abnormal nodes
+    std::unordered_set<uint32_t> tablets_moved_to_abnormal_nodes_;
+    // indexes of the read pending nodes
+    std::unordered_set<uint32_t> read_pending_nodes_index_;
+    // indexes of the tablets moved to read pending nodes
+    std::unordered_set<uint32_t> tablets_moved_to_read_pending_nodes_;
+    // indexes of the write pending nodes
+    std::unordered_set<uint32_t> write_pending_nodes_index_;
+    // indexes of the tablets moved to write pending nodes
+    std::unordered_set<uint32_t> tablets_moved_to_write_pending_nodes_;
+    // indexes of the scan pending nodes
+    std::unordered_set<uint32_t> scan_pending_nodes_index_;
+    // indexes of the tablets moved to scan pending nodes
+    std::unordered_set<uint32_t> tablets_moved_to_scan_pending_nodes_;
+    // node_index -> data size on the node
+    std::map<uint32_t, uint64_t> size_per_node_;
+    // node_index -> read load on the node
+    std::map<uint32_t, uint64_t> read_load_per_node_;
+    // node_index -> write load on the node
+    std::map<uint32_t, uint64_t> write_load_per_node_;
+    // node_index -> scan load on the node
+    std::map<uint32_t, uint64_t> scan_load_per_node_;
+    // indexes of the tablets that are moved too frequently
+    std::unordered_set<uint32_t> tablets_moved_too_frequently_;
+
+    // index of the node holding the meta table
+    uint32_t meta_table_node_index_;
+
+    // for ActionGenerator
+    std::vector<uint32_t> node_index_sorted_by_tablet_count_;
+    std::vector<uint32_t> node_index_sorted_by_size_;
+    std::vector<uint32_t> node_index_sorted_by_read_load_;
+    std::vector<uint32_t> node_index_sorted_by_write_load_;
+    std::vector<uint32_t> node_index_sorted_by_scan_load_;
+
+    LBOptions lb_options_;
+
+private:
+    std::vector<std::shared_ptr<LBTabletNode>> lb_nodes_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_CLUSTER_H_
diff --git a/src/load_balancer/cost_function.h b/src/load_balancer/cost_function.h
new file mode 100644
index 000000000..862b09285
--- /dev/null
+++ b/src/load_balancer/cost_function.h
@@ -0,0 +1,125 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_COST_FUNCTION_H_
+#define TERA_LOAD_BALANCER_COST_FUNCTION_H_
+
+#include <math.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <memory>
+#include <string>
+
+#include "glog/logging.h"
+#include "load_balancer/cluster.h"
+#include "load_balancer/options.h"
+
+namespace tera {
+namespace load_balancer {
+
+class CostFunction {
+public:
+    // Copies |options|; |name| is the label reported by Name().
+    CostFunction(const LBOptions& options, const std::string& name) :
+            weight_(0.0),  // defined value until a subclass calls SetWeight()
+            lb_options_(options),
+            name_(name) {
+    }
+
+    virtual ~CostFunction() {}
+
+    // Cost of the cluster passed to Init(); every subclass must implement it.
+    virtual double Cost() = 0;
+
+    // Stores |cluster| in cluster_ for later Cost() calls.
+    virtual void Init(const std::shared_ptr<Cluster>& cluster) {
+        cluster_ = cluster;
+    }
+
+    double GetWeight() const {
+        return weight_;
+    }
+
+    void SetWeight(double w) {
+        weight_ = w;
+    }
+
+    std::string Name() const {
+        return name_;
+    }
+
+protected:
+    // Maps |value| from [min, max] onto [0, 1] (clamped); 0 if max <= min or value <= min.
+    double Scale(double min, double max, double value) {
+        VLOG(20) << "[lb] Scale begin, min:" << min << " max:" << max << " value:" << value;
+        if (max <= min || value <= min) {
+            return 0.0;  // also guarantees max - min > 0 for the division below
+        }
+
+        double scaled = std::max(0.0, std::min(1.0, (value - min) / (max - min)));
+        VLOG(20) << "[lb] Scale end, scaled:" << scaled;
+        return scaled;
+    }
+
+    // Scores how unevenly |stats| is spread: the summed absolute deviation from the
+    // mean, scaled between the best and the worst achievable spread. Empty input -> 0.
+    double ScaleFromArray(const std::vector<double>& stats) {
+        if (lb_options_.debug_mode_enabled) {
+            std::string line;
+            for (const auto& s : stats) {
+                line += std::to_string(s);
+                line += " ";
+            }
+            LOG(INFO) << "[lb] stats:" << line;
+        }
+        if (stats.empty()) {
+            return 0.0;  // no nodes: avoids the 0/0 mean and the NaN-to-int cast below
+        }
+
+        double total_cost = 0;
+        double total = GetSum(stats);
+        double count = stats.size();
+        double mean = total/count;
+        // worst case: the whole total sits on a single node
+        double max = ((count - 1) * mean) + (total - mean);
+        // best case: whole units spread over the nodes as evenly as possible
+        double min;
+        if (count > total) {
+            min = ((count - total) * mean) + ((1 - mean) * total);
+        } else {
+            int num_high = (int) (total - (floor(mean) * count));
+            int num_low = (int) (count - num_high);
+            min = (num_high * (ceil(mean) - mean)) + (num_low * (mean - floor(mean)));
+        }
+        min = std::max(0.0, min);
+        for (size_t i = 0; i < stats.size(); i++) {
+            total_cost += std::abs(mean - stats[i]);
+        }
+        return Scale(min, max, total_cost);
+    }
+
+private:
+    double GetSum(const std::vector<double>& stats) {
+        double total = 0;
+        for (const auto& s : stats) {
+            total += s;
+        }
+        return total;
+    }
+
+protected:
+    std::shared_ptr<Cluster> cluster_;
+
+private:
+    double weight_;
+    LBOptions lb_options_;
+    std::string name_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_COST_FUNCTION_H_
diff --git a/src/load_balancer/cost_functions.cc b/src/load_balancer/cost_functions.cc
new file mode 100644
index 000000000..e459b4337
--- /dev/null
+++ b/src/load_balancer/cost_functions.cc
@@ -0,0 +1,222 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "load_balancer/cost_functions.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace tera {
+namespace load_balancer {
+
+MoveCountCostFunction::MoveCountCostFunction(const LBOptions& options)
+        : CostFunction(options, "MoveCountCostFunction"),
+          kExpensiveCost(1000000),
+          tablet_max_move_num_(options.tablet_max_move_num),
+          tablet_max_move_percent_(options.tablet_max_move_percent) {
+    SetWeight(options.move_count_cost_weight);
+}
+
+MoveCountCostFunction::~MoveCountCostFunction() {}
+
+// kExpensiveCost once more tablets moved than the limit allows, else the moved count scaled to [0, 1].
+double MoveCountCostFunction::Cost() {
+    const uint32_t percent_limit =
+            static_cast<uint32_t>(cluster_->tablet_num_ * tablet_max_move_percent_);
+    const uint32_t move_limit = std::max(tablet_max_move_num_, percent_limit);
+    const double moved = cluster_->tablet_moved_num_;
+    if (moved > static_cast<double>(move_limit)) {
+        VLOG(20) << "[lb] reach max move num limit, max_move_num:" << move_limit;
+        return kExpensiveCost;
+    }
+    return Scale(0, std::min(cluster_->tablet_num_, move_limit), moved);
+}
+
+MoveFrequencyCostFunction::MoveFrequencyCostFunction(const LBOptions& options)
+        : CostFunction(options, "MoveFrequencyCostFunction"),
+          kExpensiveCost(100000) {
+    SetWeight(options.move_frequency_cost_weight);
+}
+
+MoveFrequencyCostFunction::~MoveFrequencyCostFunction() {}
+
+// Returns kExpensiveCost while any tablet is recorded as moved too frequently, else 0.
+double MoveFrequencyCostFunction::Cost() {
+    const auto frequent_num = cluster_->tablets_moved_too_frequently_.size();
+    if (frequent_num == 0) {
+        return 0;
+    }
+
+    VLOG(20) << "[lb] there are " << frequent_num
+            << " tablets moved too frequently";
+    return kExpensiveCost;
+}
+
+AbnormalNodeCostFunction::AbnormalNodeCostFunction(const LBOptions& options)
+        : CostFunction(options, "AbnormalNodeCostFunction"),
+          kExpensiveCost(100000) {
+    SetWeight(options.abnormal_node_cost_weight);
+}
+
+AbnormalNodeCostFunction::~AbnormalNodeCostFunction() {}
+
+// Returns kExpensiveCost while any tablet has been moved to an abnormal node, else 0.
+double AbnormalNodeCostFunction::Cost() {
+    const auto moved_num = cluster_->tablets_moved_to_abnormal_nodes_.size();
+    if (moved_num == 0) {
+        return 0;
+    }
+
+    VLOG(20) << "[lb] there are " << moved_num
+            << " tablets moved to abnormal nodes";
+    return kExpensiveCost;
+}
+
+ReadPendingNodeCostFunction::ReadPendingNodeCostFunction(const LBOptions& options)
+        : CostFunction(options, "ReadPendingNodeCostFunction"),
+          kExpensiveCost(10000) {
+    SetWeight(options.read_pending_node_cost_weight);
+}
+
+ReadPendingNodeCostFunction::~ReadPendingNodeCostFunction() {}
+
+// Returns kExpensiveCost while any tablet has been moved to a read pending node, else 0.
+double ReadPendingNodeCostFunction::Cost() {
+    const auto moved_num = cluster_->tablets_moved_to_read_pending_nodes_.size();
+    if (moved_num == 0) {
+        return 0;
+    }
+
+    VLOG(20) << "[lb] there are " << moved_num
+            << " tablets moved to read pending nodes";
+    return kExpensiveCost;
+}
+
+WritePendingNodeCostFunction::WritePendingNodeCostFunction(const LBOptions& options)
+        : CostFunction(options, "WritePendingNodeCostFunction"),
+          kExpensiveCost(10000) {
+    SetWeight(options.write_pending_node_cost_weight);
+}
+
+WritePendingNodeCostFunction::~WritePendingNodeCostFunction() {}
+
+// Returns kExpensiveCost while any tablet has been moved to a write pending node, else 0.
+double WritePendingNodeCostFunction::Cost() {
+    const auto moved_num = cluster_->tablets_moved_to_write_pending_nodes_.size();
+    if (moved_num == 0) {
+        return 0;
+    }
+
+    VLOG(20) << "[lb] there are " << moved_num
+            << " tablets moved to write pending nodes";
+    return kExpensiveCost;
+}
+
+ScanPendingNodeCostFunction::ScanPendingNodeCostFunction(const LBOptions& options)
+        : CostFunction(options, "ScanPendingNodeCostFunction"),
+          kExpensiveCost(10000) {
+    SetWeight(options.scan_pending_node_cost_weight);
+}
+
+ScanPendingNodeCostFunction::~ScanPendingNodeCostFunction() {}
+
+// Returns kExpensiveCost while any tablet has been moved to a scan pending node, else 0.
+double ScanPendingNodeCostFunction::Cost() {
+    const auto moved_num = cluster_->tablets_moved_to_scan_pending_nodes_.size();
+    if (moved_num == 0) {
+        return 0;
+    }
+
+    VLOG(20) << "[lb] there are " << moved_num
+            << " tablets moved to scan pending nodes";
+    return kExpensiveCost;
+}
+
+TabletCountCostFunction::TabletCountCostFunction(const LBOptions& options)
+        : CostFunction(options, "TabletCountCostFunction") {
+    SetWeight(options.tablet_count_cost_weight);
+}
+
+TabletCountCostFunction::~TabletCountCostFunction() {}
+
+// Scores how unevenly the tablet count is spread over the tablet nodes.
+double TabletCountCostFunction::Cost() {
+    std::vector<double> counts;
+    counts.reserve(cluster_->tablet_node_num_);
+    for (uint32_t node = 0; node < cluster_->tablet_node_num_; ++node) {
+        counts.push_back(static_cast<double>(cluster_->tablets_per_node_[node].size()));
+    }
+    return ScaleFromArray(counts);
+}
+
+SizeCostFunction::SizeCostFunction(const LBOptions& options)
+        : CostFunction(options, "SizeCostFunction") {
+    SetWeight(options.size_cost_weight);
+}
+
+SizeCostFunction::~SizeCostFunction() {}
+
+// Scores how unevenly the data size is spread over the tablet nodes.
+double SizeCostFunction::Cost() {
+    std::vector<double> sizes;
+    sizes.reserve(cluster_->tablet_node_num_);
+    for (uint32_t node = 0; node < cluster_->tablet_node_num_; ++node) {
+        sizes.push_back(static_cast<double>(cluster_->size_per_node_[node]));
+    }
+    return ScaleFromArray(sizes);
+}
+
+ReadLoadCostFunction::ReadLoadCostFunction(const LBOptions& options)
+        : CostFunction(options, "ReadLoadCostFunction") {
+    SetWeight(options.read_load_cost_weight);
+}
+
+ReadLoadCostFunction::~ReadLoadCostFunction() {}
+
+// Scores how unevenly the read load is spread over the tablet nodes.
+double ReadLoadCostFunction::Cost() {
+    std::vector<double> loads;
+    loads.reserve(cluster_->tablet_node_num_);
+    for (uint32_t node = 0; node < cluster_->tablet_node_num_; ++node) {
+        loads.push_back(static_cast<double>(cluster_->read_load_per_node_[node]));
+    }
+    return ScaleFromArray(loads);
+}
+
+WriteLoadCostFunction::WriteLoadCostFunction(const LBOptions& options)
+        : CostFunction(options, "WriteLoadCostFunction") {
+    SetWeight(options.write_load_cost_weight);
+}
+
+WriteLoadCostFunction::~WriteLoadCostFunction() {}
+
+// Scores how unevenly the write load is spread over the tablet nodes.
+double WriteLoadCostFunction::Cost() {
+    std::vector<double> loads;
+    loads.reserve(cluster_->tablet_node_num_);
+    for (uint32_t node = 0; node < cluster_->tablet_node_num_; ++node) {
+        loads.push_back(static_cast<double>(cluster_->write_load_per_node_[node]));
+    }
+    return ScaleFromArray(loads);
+}
+
+ScanLoadCostFunction::ScanLoadCostFunction(const LBOptions& options)
+        : CostFunction(options, "ScanLoadCostFunction") {
+    SetWeight(options.scan_load_cost_weight);
+}
+
+ScanLoadCostFunction::~ScanLoadCostFunction() {}
+
+// Scores how unevenly the scan load is spread over the tablet nodes.
+double ScanLoadCostFunction::Cost() {
+    std::vector<double> loads;
+    loads.reserve(cluster_->tablet_node_num_);
+    for (uint32_t node = 0; node < cluster_->tablet_node_num_; ++node) {
+        loads.push_back(static_cast<double>(cluster_->scan_load_per_node_[node]));
+    }
+    return ScaleFromArray(loads);
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/cost_functions.h b/src/load_balancer/cost_functions.h
new file mode 100644
index 000000000..5f977275a
--- /dev/null
+++ b/src/load_balancer/cost_functions.h
@@ -0,0 +1,135 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_COST_FUNCTIONS_H_
+#define TERA_LOAD_BALANCER_COST_FUNCTIONS_H_
+
+#include "load_balancer/cost_function.h"
+
+namespace tera {
+namespace load_balancer {
+
+// Penalizes plans that move too many tablets.
+class MoveCountCostFunction : public CostFunction {
+public:
+    MoveCountCostFunction(const LBOptions& options);
+    ~MoveCountCostFunction() override;
+
+    double Cost() override;
+
+private:
+    const double kExpensiveCost;       // returned once the move limit is exceeded
+    uint32_t tablet_max_move_num_;     // absolute limit on moved tablets
+    double tablet_max_move_percent_;   // limit as a fraction of all tablets
+};
+
+// Penalizes plans while some tablets are moved too frequently.
+class MoveFrequencyCostFunction : public CostFunction {
+public:
+    MoveFrequencyCostFunction(const LBOptions& options);
+    ~MoveFrequencyCostFunction() override;
+
+    double Cost() override;
+
+private:
+    const double kExpensiveCost;  // cost returned while such tablets exist
+};
+
+// Penalizes plans that move a tablet to an abnormal node.
+class AbnormalNodeCostFunction : public CostFunction {
+public:
+    AbnormalNodeCostFunction(const LBOptions& options);
+    ~AbnormalNodeCostFunction() override;
+
+    double Cost() override;
+
+private:
+    const double kExpensiveCost;  // cost returned while such tablets exist
+};
+
+// Penalizes plans that move a tablet to a read pending node.
+class ReadPendingNodeCostFunction : public CostFunction {
+public:
+    ReadPendingNodeCostFunction(const LBOptions& options);
+    ~ReadPendingNodeCostFunction() override;
+
+    double Cost() override;
+
+private:
+    const double kExpensiveCost;  // cost returned while such tablets exist
+};
+
+// Penalizes plans that move a tablet to a write pending node.
+class WritePendingNodeCostFunction : public CostFunction {
+public:
+    WritePendingNodeCostFunction(const LBOptions& options);
+    ~WritePendingNodeCostFunction() override;
+
+    double Cost() override;
+
+private:
+    const double kExpensiveCost;  // cost returned while such tablets exist
+};
+
+// Penalizes plans that move a tablet to a scan pending node.
+class ScanPendingNodeCostFunction : public CostFunction {
+public:
+    ScanPendingNodeCostFunction(const LBOptions& options);
+    ~ScanPendingNodeCostFunction() override;
+
+    double Cost() override;
+
+private:
+    const double kExpensiveCost;  // cost returned while such tablets exist
+};
+
+// Balances the number of tablets across the tablet nodes.
+class TabletCountCostFunction : public CostFunction {
+public:
+    TabletCountCostFunction(const LBOptions& options);
+    ~TabletCountCostFunction() override;
+
+    double Cost() override;
+};
+
+// Balances the data size across the tablet nodes.
+class SizeCostFunction : public CostFunction {
+public:
+    SizeCostFunction(const LBOptions& options);
+    ~SizeCostFunction() override;
+
+    double Cost() override;
+};
+
+// Balances the read load across the tablet nodes.
+class ReadLoadCostFunction : public CostFunction {
+public:
+    ReadLoadCostFunction(const LBOptions& options);
+    ~ReadLoadCostFunction() override;
+
+    double Cost() override;
+};
+
+// Balances the write load across the tablet nodes.
+class WriteLoadCostFunction : public CostFunction {
+public:
+    WriteLoadCostFunction(const LBOptions& options);
+    ~WriteLoadCostFunction() override;
+
+    double Cost() override;
+};
+
+// Balances the scan load across the tablet nodes.
+class ScanLoadCostFunction : public CostFunction {
+public:
+    ScanLoadCostFunction(const LBOptions& options);
+    ~ScanLoadCostFunction() override;
+
+    double Cost() override;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_COST_FUNCTIONS_H_
diff --git a/src/load_balancer/lb_entry.cc b/src/load_balancer/lb_entry.cc
new file mode 100644
index 000000000..abf0b3ad6
--- /dev/null
+++ b/src/load_balancer/lb_entry.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "load_balancer/lb_entry.h"
+
+#include <string>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "common/net/ip_address.h"
+#include "common/this_thread.h"
+#include "load_balancer/lb_impl.h"
+#include "load_balancer/lb_service_impl.h"
+
+DECLARE_string(tera_lb_server_addr);
+DECLARE_string(tera_lb_server_port);
+
+std::string GetTeraEntryName() {
+    return std::string("load_balancer");  // fixed entry name of the load balancer
+}
+
+tera::TeraEntry* GetTeraEntry() {  // NOTE(review): caller presumably owns the returned entry
+    return new tera::load_balancer::LBEntry();
+}
+
+namespace tera {
+namespace load_balancer {
+
+// Creates the RPC server with default options; service and impl are built in StartServer().
+LBEntry::LBEntry()
+        : rpc_server_(nullptr),
+          lb_service_impl_(nullptr),
+          lb_impl_(nullptr) {
+    sofa::pbrpc::RpcServerOptions default_options;
+    rpc_server_.reset(new sofa::pbrpc::RpcServer(default_options));
+}
+
+LBEntry::~LBEntry() {}
+
+// Builds the balancer and its RPC service, then starts serving on the configured address.
+bool LBEntry::StartServer() {
+    IpAddress lb_addr(FLAGS_tera_lb_server_addr, FLAGS_tera_lb_server_port);
+    LOG(INFO) << "Start load balancer RPC server at: " << lb_addr.ToString();
+
+    lb_impl_.reset(new LBImpl());
+    lb_service_impl_ = new LBServiceImpl(lb_impl_);
+    if (!lb_impl_->Init()) {
+        delete lb_service_impl_;  // not registered yet, so nobody else would free it
+        lb_service_impl_ = nullptr;
+        return false;
+    }
+    rpc_server_->RegisterService(lb_service_impl_);
+    if (!rpc_server_->Start(lb_addr.ToString())) {
+        LOG(ERROR) << "start RPC server error";
+        return false;
+    }
+    LOG(INFO) << "finish starting load balancer server";
+    return true;
+}
+
+bool LBEntry::Run() {  // one iteration: sleep, then report success
+    ThisThread::Sleep(1000);  // presumably milliseconds - confirm against ThisThread
+    return true;
+}
+
+void LBEntry::ShutdownServer() {  // only stops the RPC server created in the constructor
+    rpc_server_->Stop();
+}
+
+} // namespace load_balancer
+} // namespace tera
+
diff --git a/src/load_balancer/lb_entry.h b/src/load_balancer/lb_entry.h
new file mode 100644
index 000000000..03399bc00
--- /dev/null
+++ b/src/load_balancer/lb_entry.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_LB_ENTRY_H_
+#define TERA_LOAD_BALANCER_LB_ENTRY_H_
+
+#include <memory>
+
+#include "sofa/pbrpc/pbrpc.h"
+
+#include "tera_entry.h"
+
+namespace tera {
+namespace load_balancer {
+
+class LBServiceImpl;
+class LBImpl;
+
+class LBEntry : public TeraEntry {  // TeraEntry implementation hosting the load balancer RPC server
+public:
+    LBEntry();
+    virtual ~LBEntry();
+    // NOTE(review): the three methods below presumably override TeraEntry hooks - confirm
+    virtual bool StartServer();
+    virtual bool Run();
+    virtual void ShutdownServer();
+
+private:
+    std::unique_ptr<sofa::pbrpc::RpcServer> rpc_server_;
+    LBServiceImpl* lb_service_impl_;  // raw pointer; never deleted by ~LBEntry()
+    std::shared_ptr<LBImpl> lb_impl_;  // also passed to the LBServiceImpl built in StartServer()
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_LB_ENTRY_H_
diff --git a/src/load_balancer/lb_impl.cc b/src/load_balancer/lb_impl.cc
new file mode 100644
index 000000000..690528531
--- /dev/null
+++ b/src/load_balancer/lb_impl.cc
@@ -0,0 +1,531 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "load_balancer/lb_impl.h"
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "load_balancer/unity_balancer.h"
+#include "proto/tabletnode.pb.h"
+#include "tera.h"
+#include "common/timer.h"
+
+DECLARE_bool(tera_master_meta_isolate_enabled);
+DECLARE_string(tera_master_meta_table_name);
+DECLARE_int32(tera_lb_impl_thread_num);
+DECLARE_int32(tera_lb_load_balance_period_s);
+DECLARE_int32(tera_lb_max_compute_steps);
+DECLARE_int32(tera_lb_max_compute_steps_per_tablet);
+DECLARE_int32(tera_lb_max_compute_time_ms);
+DECLARE_double(tera_lb_min_cost_need_balance);
+DECLARE_double(tera_lb_move_count_cost_weight);
+DECLARE_int32(tera_lb_tablet_max_move_num);
+DECLARE_double(tera_lb_tablet_max_move_percent);
+DECLARE_double(tera_lb_move_frequency_cost_weight);
+DECLARE_int32(tera_lb_tablet_move_too_frequently_threshold_s);
+DECLARE_double(tera_lb_abnormal_node_cost_weight);
+DECLARE_double(tera_lb_abnormal_node_ratio);
+DECLARE_double(tera_lb_read_pending_node_cost_weight);
+DECLARE_double(tera_lb_write_pending_node_cost_weight);
+DECLARE_double(tera_lb_scan_pending_node_cost_weight);
+DECLARE_double(tera_lb_tablet_count_cost_weight);
+DECLARE_double(tera_lb_size_cost_weight);
+DECLARE_double(tera_lb_read_load_cost_weight);
+DECLARE_double(tera_lb_write_load_cost_weight);
+DECLARE_double(tera_lb_scan_load_cost_weight);
+DECLARE_bool(tera_lb_debug_mode_enabled);
+
+using tera::master::NodeState;
+using tera::master::Table;
+using tera::master::TablePtr;
+using tera::master::Tablet;
+using tera::master::TabletPtr;
+using tera::master::TabletNode;
+using tera::master::TabletNodePtr;
+
+namespace tera {
+namespace load_balancer {
+
+// Starts in safe mode with a thread pool sized by tera_lb_impl_thread_num; the sdk
+// client is created later by Init().
+LBImpl::LBImpl()
+        : thread_pool_(new ThreadPool(FLAGS_tera_lb_impl_thread_num)),
+          sdk_client_(nullptr),
+          safemode_(true),
+          round_(0),
+          lb_debug_mode_(FLAGS_tera_lb_debug_mode_enabled) {}
+
+LBImpl::~LBImpl() {}
+
+// Opens the sdk client and schedules the first balance round; false if no client.
+bool LBImpl::Init() {
+    if (lb_debug_mode_) {
+        LOG(INFO) << "[lb] debug mode enabled";
+    }
+    // glog was already initialized by tera_entry; record that in the sdk
+    Client::SetGlogIsInitialized();
+    sdk_client_.reset(Client::NewClient());
+    const bool client_opened = (sdk_client_.get() != nullptr);
+    if (!client_opened) {
+        LOG(ERROR) << "[lb] open sdk client fail";
+        return false;
+    }
+
+    // the client is usable: start the periodic balancing
+    ScheduleLoadBalance();
+    return true;
+}
+
+// Queues one balance round after tera_lb_load_balance_period_s; the task re-arms itself.
+void LBImpl::ScheduleLoadBalance() {
+    // widened to 64 bits before multiplying so that a large period cannot overflow int
+    const int64_t schedule_period_ms = static_cast<int64_t>(FLAGS_tera_lb_load_balance_period_s) * 1000;
+    VLOG(5) << "[lb] LoadBalance will be scheduled in: " << FLAGS_tera_lb_load_balance_period_s << "s";
+    thread_pool_->DelayTask(schedule_period_ms, [this](int64_t) {
+        DoLoadBalance();
+        ScheduleLoadBalance();
+    });
+}
+
+// Runs one balancing round: collect cluster state, compute plans, execute them if allowed.
+void LBImpl::DoLoadBalance() {
+    ++round_;
+    VLOG(5) << "[lb] LoadBalance begin round: " << round_;
+    const int64_t round_begin_us = get_micros();
+
+    // step 1: snapshot nodes, tables and tablets; give up on this round if that fails
+    std::vector<TabletNodePtr> tablet_nodes;
+    std::vector<TablePtr> tables;
+    std::vector<TabletPtr> tablets;
+    if (!Collect(&tablet_nodes, &tables, &tablets)) {
+        return;
+    }
+    if (lb_debug_mode_) {
+        DebugCollect(tablet_nodes, tables, tablets);
+    }
+
+    // step 2: convert the snapshot into the balancer input
+    std::vector<std::shared_ptr<LBTabletNode>> lb_nodes;
+    CreateLBInput(tables, tablet_nodes, tablets, &lb_nodes);
+    if (lb_debug_mode_) {
+        DebugLBNode(lb_nodes);
+    }
+
+    // step 3: build the balancer options from the current flag values
+    LBOptions options;
+    options.max_compute_steps = FLAGS_tera_lb_max_compute_steps;
+    options.max_compute_steps_per_tablet = FLAGS_tera_lb_max_compute_steps_per_tablet;
+    options.max_compute_time_ms = FLAGS_tera_lb_max_compute_time_ms;
+    options.min_cost_need_balance = FLAGS_tera_lb_min_cost_need_balance;
+    options.move_count_cost_weight = FLAGS_tera_lb_move_count_cost_weight;
+    options.tablet_max_move_num = FLAGS_tera_lb_tablet_max_move_num;
+    options.tablet_max_move_percent = FLAGS_tera_lb_tablet_max_move_percent;
+    options.move_frequency_cost_weight = FLAGS_tera_lb_move_frequency_cost_weight;
+    options.tablet_move_too_frequently_threshold_s = FLAGS_tera_lb_tablet_move_too_frequently_threshold_s;
+    options.abnormal_node_cost_weight = FLAGS_tera_lb_abnormal_node_cost_weight;
+    options.abnormal_node_ratio = FLAGS_tera_lb_abnormal_node_ratio;
+    options.read_pending_node_cost_weight = FLAGS_tera_lb_read_pending_node_cost_weight;
+    options.write_pending_node_cost_weight = FLAGS_tera_lb_write_pending_node_cost_weight;
+    options.scan_pending_node_cost_weight = FLAGS_tera_lb_scan_pending_node_cost_weight;
+    options.tablet_count_cost_weight = FLAGS_tera_lb_tablet_count_cost_weight;
+    options.size_cost_weight = FLAGS_tera_lb_size_cost_weight;
+    options.read_load_cost_weight = FLAGS_tera_lb_read_load_cost_weight;
+    options.write_load_cost_weight = FLAGS_tera_lb_write_load_cost_weight;
+    options.scan_load_cost_weight = FLAGS_tera_lb_scan_load_cost_weight;
+    options.meta_table_isolate_enabled = FLAGS_tera_master_meta_isolate_enabled;
+    options.meta_table_name = FLAGS_tera_master_meta_table_name;
+    options.meta_table_node_addr = GetMetaNodeAddr();
+    options.debug_mode_enabled = lb_debug_mode_;
+
+    // step 4: compute the plans
+    std::unique_ptr<Balancer> balancer(new UnityBalancer(options));
+    std::vector<Plan> plans;
+    if (!balancer->BalanceCluster(lb_nodes, &plans)) {
+        LOG(WARNING) << "[lb] LoadBalance failed";
+        return;
+    }
+    DebugPlan(plans);
+
+    // step 5: execute only when neither this balancer nor the master is in safe mode
+    if (IsSafemode()) {
+        VLOG(5) << "[lb] skip execute plan in safe mode";
+    } else {
+        bool master_safe_mode = true;
+        if (!GetMasterSafemode(&master_safe_mode)) {
+            VLOG(5) << "[lb] skip execute plan due to fail to get master safe mode";
+        } else if (master_safe_mode) {
+            VLOG(5) << "[lb] skip execute plan due to master is in safe mode";
+        } else {
+            ExecutePlan(plans);
+        }
+    }
+
+    const int64_t elapsed_us = get_micros() - round_begin_us;
+    VLOG(5) << "[lb] LoadBalance end round: " << round_
+              <<", cost: " << elapsed_us / 1000.0 << "ms";
+}
+
+// Groups |tablets| under the node serving them (matched by address) and appends one
+// LBTabletNode per distinct node address to |lb_nodes|. |tables| is unused. Always true.
+bool LBImpl::CreateLBInput(
+        const std::vector<TablePtr>& tables,
+        const std::vector<TabletNodePtr>& nodes,
+        const std::vector<TabletPtr>& tablets,
+        std::vector<std::shared_ptr<LBTabletNode>>* lb_nodes) {
+    lb_nodes->clear();
+
+    // addr -> balancer node; a repeated addr keeps the last node seen
+    std::map<std::string, std::shared_ptr<LBTabletNode>> addr_to_lb_node;
+    for (const auto& node : nodes) {
+        std::shared_ptr<LBTabletNode> lb_node(new LBTabletNode());
+        lb_node->tablet_node_ptr = node;
+        addr_to_lb_node[node->GetAddr()] = lb_node;
+    }
+
+    for (const auto& tablet : tablets) {
+        auto owner = addr_to_lb_node.find(tablet->GetServerAddr());
+        if (owner == addr_to_lb_node.end()) {
+            // TODO: unassigned tablet, skipped for now
+            continue;
+        }
+        std::shared_ptr<LBTablet> lb_tablet(new LBTablet());
+        lb_tablet->tablet_ptr = tablet;
+        owner->second->tablets.emplace_back(lb_tablet);
+    }
+
+    for (const auto& entry : addr_to_lb_node) {
+        lb_nodes->emplace_back(entry.second);
+    }
+    return true;
+}
+
+// Fills the three output vectors with the current nodes, tables and tablets.
+// Returns false on a null argument or when either collection step fails.
+bool LBImpl::Collect(std::vector<TabletNodePtr>* nodes,
+                     std::vector<TablePtr>* tables,
+                     std::vector<TabletPtr>* tablets) {
+    if (!nodes || !tables || !tablets) {
+        return false;
+    }
+    // start from empty outputs
+    nodes->clear();
+    tables->clear();
+    tablets->clear();
+
+    const int64_t begin_us = get_micros();
+    if (!CollectNodes(nodes)) {
+        LOG(ERROR) << "[lb] collect nodes fail";
+        return false;
+    }
+    if (!CollectTablets(tables, tablets)) {
+        LOG(ERROR) << "[lb] collect tablets fail";
+        return false;
+    }
+
+    const int64_t elapsed_us = get_micros() - begin_us;
+    VLOG(5) << "[lb] Collect cost: " << elapsed_us / 1000.0 << "ms";
+    return true;
+}
+
+// Asks the sdk for every tablet node's info and appends one TabletNode per info to |nodes|.
+bool LBImpl::CollectNodes(std::vector<TabletNodePtr>* nodes) {
+    auto* impl = static_cast<tera::ClientImpl*>(sdk_client_.get());
+    std::vector<TabletNodeInfo> node_infos;
+    ErrorCode err;
+    const bool fetched = impl->ShowTabletNodesInfo(&node_infos, &err);
+    if (!fetched) {
+        LOG(ERROR) << "[lb] fail to get TabletNodeInfo, err: " << err.ToString();
+        return false;
+    }
+
+    for (size_t i = 0; i < node_infos.size(); ++i) {
+        TabletNodePtr node(new TabletNode());
+        NodeInfoToNode(node_infos[i], node);
+        nodes->push_back(node);
+    }
+    VLOG(5) << "[lb] collected node size: " << nodes->size();
+    return true;
+}
+
+// Copies |info| into |node| and mirrors addr, state, load and pending counters. Always true.
+bool LBImpl::NodeInfoToNode(const TabletNodeInfo& info, TabletNodePtr node) {
+    node->info_ = info;
+    node->addr_ = info.addr();
+    node->state_ = StringToNodeState(info.status_m());
+    node->data_size_ = info.load();
+    auto& pending = node->average_counter_;
+    pending.read_pending_ = info.read_pending();
+    pending.write_pending_ = info.write_pending();
+    pending.scan_pending_ = info.scan_pending();
+    return true;
+}
+
+// Maps a node status name to a NodeState; any unrecognized name falls back to kOffLine.
+NodeState LBImpl::StringToNodeState(const std::string& str) {
+    if (str == "kReady") {
+        return tera::master::kReady;
+    }
+    if (str == "kOnKick") {
+        return tera::master::kOnKick;
+    }
+    if (str == "kWaitKick") {
+        return tera::master::kWaitKick;
+    }
+    return tera::master::kOffLine;
+}
+
+// Fetches table and tablet metadata through the SDK client and appends the converted
+// Table/Tablet objects to *tables and *tablets. Tablets whose table is missing from the
+// table list are skipped with a warning; the meta table's server address is recorded via
+// SetMetaNodeAddr(). Returns false if the SDK call fails or meta/counter sizes differ.
+bool LBImpl::CollectTablets(std::vector<TablePtr>* tables,
+                            std::vector<TabletPtr>* tablets) {
+    tera::ClientImpl* client_impl = static_cast<tera::ClientImpl*>(sdk_client_.get());
+    TableMetaList table_list;
+    TabletMetaList tablet_list;
+    bool is_brief = false;
+    ErrorCode err;
+    if (!client_impl->ShowTablesInfo(&table_list, &tablet_list, is_brief, &err)) {
+        LOG(ERROR) << "[lb] fail to get tablets, err: " << err.ToString();
+        return false;
+    }
+    std::map<std::string, TablePtr> table_name_to_ptr;
+    for (int i = 0; i < table_list.meta_size(); ++i) {
+        const std::string& table_name = table_list.meta(i).table_name();
+        TablePtr table(new tera::master::Table(table_name));
+        TableMetaToTable(table_list.meta(i), table);
+        tables->push_back(table);
+        table_name_to_ptr.insert(std::make_pair(table_name, table));  // first entry per name wins
+    }
+
+    // counter(i) is read next to meta(i) below, so both lists must have the same length
+    if (tablet_list.meta_size() != tablet_list.counter_size()) {
+        LOG(ERROR) << "[lb] invalid TabletMetaList, meta size: " << tablet_list.meta_size()
+                   << " counter size: " << tablet_list.counter_size();
+        return false;
+    }
+    for (int i = 0; i < tablet_list.meta_size(); ++i) {
+        const auto& tablet_meta = tablet_list.meta(i);
+        const std::string& table_name = tablet_meta.table_name();
+        auto table_it = table_name_to_ptr.find(table_name);  // one lookup serves both uses below
+        if (table_it == table_name_to_ptr.end()) {
+            LOG(WARNING) << "[lb] tablet's table not exist " << "tablet path: "
+                         << tablet_meta.path() << " table: " << table_name;
+            continue;
+        }
+        TabletPtr tablet(new tera::master::Tablet(tablet_meta, table_it->second));
+        tablet->SetCounter(tablet_list.counter(i));
+        if (tablet_meta.has_last_move_time_us()) {
+            tablet->SetLastMoveTime(tablet_meta.last_move_time_us());
+        } else {
+            // !!! compatible with old master
+            // !!! set last move time to 0 will disable the MoveFrequencyCostFunction strategy
+            tablet->SetLastMoveTime(0);
+        }
+        tablets->push_back(tablet);
+
+        if (table_name == FLAGS_tera_master_meta_table_name) {
+            SetMetaNodeAddr(tablet->GetServerAddr());
+            VLOG(5) << "[lb] meta table node addr: " << GetMetaNodeAddr();
+        }
+    }
+    VLOG(5) << "[lb] collected table size: " << tables->size();
+    VLOG(5) << "[lb] collected tablet size: " << tablets->size();
+    return true;
+}
+
+bool LBImpl::TableMetaToTable(const TableMeta& meta, TablePtr table) {  // copies status and schema from meta
+    table->SetStatus(meta.status());
+    table->SetSchema(meta.schema());
+    // Neither setter's result is checked here, so the function always reports success.
+    return true;
+}
+
+// Dumps every collected table, node and tablet to the INFO log, one entry per line.
+void LBImpl::DebugCollect(const std::vector<TabletNodePtr>& nodes,
+                          const std::vector<TablePtr>& tables,
+                          const std::vector<TabletPtr>& tablets) {
+    LOG(INFO) << "";
+    LOG(INFO) << "[lb] DebugCollect begin -----";
+    // tables: name and status
+    LOG(INFO) << "[lb] " << tables.size() << " table:" ;
+    for (const auto& each_table : tables) {
+        LOG(INFO) << "table:" << each_table->GetTableName()
+                << " status:" << StatusCodeToString(each_table->GetStatus());
+    }
+    // nodes: address, state, size and pending read/write/scan counts
+    LOG(INFO) << "[lb] " << nodes.size()  << " node:";
+    for (const auto& each_node : nodes) {
+        LOG(INFO) << "addr:" << each_node->GetAddr()
+                << " state:" << tera::master::NodeStateToString(each_node->GetState())
+                << " size:" << each_node->GetSize() << "B"
+                << " r_pending:" << each_node->GetReadPending()
+                << " w_pending:" << each_node->GetWritePending()
+                << " s_pending:" << each_node->GetScanPending();
+    }
+    // tablets: path, status, hosting server, owning table and last move time
+    LOG(INFO) << "[lb] " << tablets.size() << " tablet:";
+    for (const auto& each_tablet : tablets) {
+        LOG(INFO) << "path:" << each_tablet->GetPath()
+                 << " status:" << StatusCodeToString(each_tablet->GetStatus())
+                 << " server:" << each_tablet->GetServerAddr()
+                 << " table:" << each_tablet->GetTableName()
+                 << " last_move_time_us:" << each_tablet->LastMoveTime();
+    }
+    LOG(INFO) << "[lb] DebugCollect end -----";
+    LOG(INFO) << "";
+}
+
+// Logs, at INFO level, each balancer node's address followed by the paths of its tablets.
+void LBImpl::DebugLBNode(const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes) {
+    LOG(INFO) << "";
+    LOG(INFO) << "[lb] DebugLBNode begin -----";
+    LOG(INFO) << "[lb] " << lb_nodes.size() << " lb_nodes:" ;
+    for (size_t n = 0; n < lb_nodes.size(); ++n) {
+        const LBTabletNode& entry = *lb_nodes[n];
+        LOG(INFO) << "[lb] " << entry.tablet_node_ptr->GetAddr() << ":";
+        for (size_t t = 0; t < entry.tablets.size(); ++t) {
+            LOG(INFO) << "[lb] " << entry.tablets[t]->tablet_ptr->GetPath();
+        }
+    }
+    LOG(INFO) << "[lb] DebugLBNode end -----";
+    LOG(INFO) << "";
+}
+
+// Writes every plan to the verbose log (level 5), bracketed by begin/end marker lines.
+void LBImpl::DebugPlan(const std::vector<Plan>& plans) {
+    VLOG(5) << "";
+    VLOG(5) << "[lb] DebugPlan begin ----";
+    // carry the same "[lb] " tag as the other lines so the count survives filtering on that tag
+    VLOG(5) << "[lb] " << plans.size() << " plans:";
+    for (const auto& plan : plans) {
+        VLOG(5) << "[lb] " << plan.ToString();
+    }
+    VLOG(5) << "[lb] DebugPlan end ----";
+    VLOG(5) << "";
+}
+
+// Issues one "tablet" control command ("move <tablet path> <dest addr>") through the SDK
+// client for each plan, in order. A failed command is logged and the loop carries on.
+void LBImpl::ExecutePlan(const std::vector<Plan>& plans) {
+    tera::ClientImpl* client_impl = static_cast<tera::ClientImpl*>(sdk_client_.get());
+    for (const auto& plan : plans) {
+        // command arguments: move <tablet path> <destination node addr>
+        std::vector<std::string> arg_list;
+        arg_list.emplace_back("move");
+        arg_list.emplace_back(plan.TabletPath());
+        arg_list.emplace_back(plan.DestAddr());
+        ErrorCode err;
+        if (!client_impl->CmdCtrl("tablet", arg_list, nullptr, nullptr, &err)) {
+            // keep the plan text and the error text apart in the message
+            LOG(ERROR) << "[lb] fail to execute plan:" << plan.ToString() << " err: " << err.ToString();
+        } else {
+            VLOG(5) << "[lb] execute plan success:" << plan.ToString();
+        }
+    }
+}
+
+bool LBImpl::IsSafemode() const {  // returns this balancer's safemode_ flag
+    MutexLock lock(&mutex_);  // the read happens under mutex_, the same lock SetSafemode() takes
+    return safemode_;
+}
+
+// Sets this balancer's safemode flag under mutex_ and logs which way it switched.
+// Always returns true.
+bool LBImpl::SetSafemode(bool value) {
+    MutexLock lock(&mutex_);
+    safemode_ = value;
+    if (value) {
+        LOG(INFO) << "[lb] LoadBalancer enter safemode";
+    } else {
+        LOG(INFO) << "[lb] LoadBalancer leave safemode";
+    }
+    return true;
+}
+
+// Fetches the master safemode state through the SDK "safemode" control command ("get").
+// On success the state is written to *safe_mode and true is returned; returns false when
+// safe_mode is null or the command fails.
+bool LBImpl::GetMasterSafemode(bool* safe_mode) {
+    if (safe_mode == nullptr) {
+        return false;
+    }
+    std::string op = "get";
+    std::vector<std::string> arg_list;
+    arg_list.push_back(op);
+    tera::ClientImpl* client_impl = static_cast<tera::ClientImpl*>(sdk_client_.get());
+    ErrorCode err;
+    if (!client_impl->CmdCtrl("safemode", arg_list, safe_mode, nullptr, &err)) {
+        LOG(ERROR) << "[lb] fail to " << op << " master safemode, err: " << err.ToString();
+        return false;
+    }
+    VLOG(20) << "[lb] master safemode: " << *safe_mode;
+    return true;
+}
+
+std::string LBImpl::GetMetaNodeAddr() const {  // returns a copy of meta_node_addr_
+    MutexLock lock(&mutex_);  // copy is taken under mutex_, the same lock SetMetaNodeAddr() takes
+    return meta_node_addr_;
+}
+
+bool LBImpl::SetMetaNodeAddr(const std::string& addr) {  // stores addr as meta_node_addr_
+    MutexLock lock(&mutex_);  // write happens under mutex_
+    meta_node_addr_ = addr;
+    return true;  // no failure path: always true
+}
+
+// Handles a CmdCtrl request: logs it, echoes the sequence id, dispatches "safemode"
+// to SafeModeCmdCtrl (any other command gets kInvalidArgument), then runs `done`.
+void LBImpl::CmdCtrl(const CmdCtrlRequest* request,
+                     CmdCtrlResponse* response,
+                     google::protobuf::Closure* done) {
+    // Join the arguments with single spaces for the log line.
+    std::string joined_args;
+    const int32_t arg_count = request->arg_list_size();
+    for (int32_t idx = 0; idx < arg_count; ++idx) {
+        if (idx > 0) {
+            joined_args += " ";
+        }
+        joined_args += request->arg_list(idx);
+    }
+    LOG(INFO) << "[lb] receive cmd: " << request->command() << " " << joined_args;
+    response->set_sequence_id(request->sequence_id());
+    if (request->command() != "safemode") {
+        response->set_status(kInvalidArgument);
+    } else {
+        SafeModeCmdCtrl(request, response);
+    }
+    done->Run();
+}
+
+// Executes a "safemode" command. Exactly one argument is accepted: "enter", "leave" or
+// "get" (which reports the current flag in bool_result); anything else is kInvalidArgument.
+void LBImpl::SafeModeCmdCtrl(const CmdCtrlRequest* request,
+                             CmdCtrlResponse* response) {
+    if (request->arg_list_size() != 1) {
+        response->set_status(kInvalidArgument);
+        return;
+    }
+    const std::string& action = request->arg_list(0);
+    const bool entering = (action == "enter");
+    if (entering || action == "leave") {
+        SetSafemode(entering);
+    } else if (action == "get") {
+        response->set_bool_result(IsSafemode());
+    } else {
+        response->set_status(kInvalidArgument);
+        return;
+    }
+    response->set_status(kLoadBalancerOk);
+}
+
+} // namespace load_balancer
+} // namespace tera
+
diff --git a/src/load_balancer/lb_impl.h b/src/load_balancer/lb_impl.h
new file mode 100644
index 000000000..2e2abe88d
--- /dev/null
+++ b/src/load_balancer/lb_impl.h
@@ -0,0 +1,93 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_LB_IMPL_H_
+#define TERA_LOAD_BALANCER_LB_IMPL_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/mutex.h"
+#include "common/thread_pool.h"
+#include "load_balancer/lb_node.h"
+#include "load_balancer/plan.h"
+#include "master/tablet_manager.h"
+#include "master/tabletnode_manager.h"
+#include "proto/load_balancer_rpc.pb.h"
+#include "sdk/client_impl.h"
+
+namespace tera {
+namespace load_balancer {
+
+class LBImpl {  // load balancer core: snapshots cluster state through the SDK client and issues tablet moves
+public:
+    LBImpl();
+    virtual ~LBImpl();
+
+    bool Init();
+    // handles a control command and then runs `done`; only "safemode" is recognized
+    void CmdCtrl(const CmdCtrlRequest* request,
+                 CmdCtrlResponse* response,
+                 google::protobuf::Closure* done);
+
+private:
+    void ScheduleLoadBalance();
+    void DoLoadBalance();
+    // fills *lb_nodes with LBTabletNode entries carrying their tablets (NOTE(review): confirm details in the .cc)
+    bool CreateLBInput(const std::vector<tera::master::TablePtr>& tables,
+                       const std::vector<tera::master::TabletNodePtr>& nodes,
+                       const std::vector<tera::master::TabletPtr>& tablets,
+                       std::vector<std::shared_ptr<LBTabletNode>>* lb_nodes);
+    // clears the outputs, then fills them via CollectNodes() and CollectTablets()
+    bool Collect(std::vector<tera::master::TabletNodePtr>* nodes,
+                 std::vector<tera::master::TablePtr>* tables,
+                 std::vector<tera::master::TabletPtr>* tablets);
+    // node snapshot: SDK ShowTabletNodesInfo results converted to TabletNode objects
+    bool CollectNodes(std::vector<tera::master::TabletNodePtr>* nodes);
+    bool NodeInfoToNode(const TabletNodeInfo& info,
+                        tera::master::TabletNodePtr node);
+    tera::master::NodeState StringToNodeState(const std::string& str);
+    // table/tablet snapshot: SDK ShowTablesInfo results converted to Table/Tablet objects
+    bool CollectTablets(std::vector<tera::master::TablePtr>* tables,
+                        std::vector<tera::master::TabletPtr>* tablets);
+    bool TableMetaToTable(const TableMeta& meta, tera::master::TablePtr table);
+    // sends one "tablet move" control command per plan
+    void ExecutePlan(const std::vector<Plan>& plans);
+    // this balancer's own safemode flag, accessed under mutex_
+    bool IsSafemode() const;
+    bool SetSafemode(bool value);
+    // safemode state obtained through the SDK "safemode get" control command
+    bool GetMasterSafemode(bool* safe_mode);
+    // server address of the meta table's tablet, accessed under mutex_
+    std::string GetMetaNodeAddr() const;
+    bool SetMetaNodeAddr(const std::string& addr);
+    // logging helpers
+    void DebugCollect(const std::vector<tera::master::TabletNodePtr>& nodes,
+                      const std::vector<tera::master::TablePtr>& tables,
+                      const std::vector<tera::master::TabletPtr>& tablets);
+    void DebugLBNode(const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes);
+    void DebugPlan(const std::vector<Plan>& plans);
+    // "safemode" sub-commands: enter / leave / get
+    void SafeModeCmdCtrl(const CmdCtrlRequest* request,
+                         CmdCtrlResponse* response);
+
+private:
+    mutable Mutex mutex_;  // taken by the safemode_ and meta_node_addr_ accessors
+
+    std::unique_ptr<ThreadPool> thread_pool_;
+    std::unique_ptr<tera::Client> sdk_client_;  // the .cc casts this to tera::ClientImpl before use
+
+    bool safemode_;
+    uint64_t round_;
+    std::string meta_node_addr_;
+
+    bool lb_debug_mode_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_LB_IMPL_H_
diff --git a/src/load_balancer/lb_node.h b/src/load_balancer/lb_node.h
new file mode 100644
index 000000000..b3b4430e2
--- /dev/null
+++ b/src/load_balancer/lb_node.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_LB_NODE_H_
+#define TERA_LOAD_BALANCER_LB_NODE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "master/tablet_manager.h"
+#include "master/tabletnode_manager.h"
+
+namespace tera {
+namespace load_balancer {
+
+struct LBTablet {  // balancer-side wrapper around one master tablet
+    tera::master::TabletPtr tablet_ptr;  // the wrapped tablet
+};
+
+struct LBTabletNode {  // balancer-side view of one tablet node together with its tablets
+    tera::master::TabletNodePtr tablet_node_ptr;  // the wrapped node
+    std::vector<std::shared_ptr<LBTablet>> tablets;  // tablets grouped under this node
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_LB_NODE_H_
diff --git a/src/load_balancer/lb_service_impl.cc b/src/load_balancer/lb_service_impl.cc
new file mode 100644
index 000000000..e67759c1c
--- /dev/null
+++ b/src/load_balancer/lb_service_impl.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "load_balancer/lb_service_impl.h"
+
+#include <functional>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "load_balancer/lb_impl.h"
+#include "utils/network_utils.h"
+
+DECLARE_int32(tera_lb_server_thread_num);
+
+namespace tera {
+namespace load_balancer {
+
+LBServiceImpl::LBServiceImpl(const std::shared_ptr<LBImpl>& lb_impl) :  // shares ownership of lb_impl
+    lb_impl_(lb_impl),
+    thread_pool_(new ThreadPool(FLAGS_tera_lb_server_thread_num)) {  // pool size comes from the gflag
+}
+
+LBServiceImpl::~LBServiceImpl() {  // empty: both members are smart pointers
+}
+
+// RPC handler: logs the caller's address and queues DoCmdCtrl for this request on thread_pool_.
+void LBServiceImpl::CmdCtrl(google::protobuf::RpcController* controller,
+                            const CmdCtrlRequest* request,
+                            CmdCtrlResponse* response,
+                            google::protobuf::Closure* done) {
+    VLOG(20) << "accept RPC (CmdCtrl) from: " << tera::utils::GetRemoteAddress(controller);
+    thread_pool_->AddTask(ThreadPool::Task(
+        std::bind(&LBServiceImpl::DoCmdCtrl, this, controller, request, response, done)));
+}
+
+void LBServiceImpl::DoCmdCtrl(google::protobuf::RpcController* controller,  // controller is not used here
+                              const CmdCtrlRequest* request,
+                              CmdCtrlResponse* response,
+                              google::protobuf::Closure* done) {
+    VLOG(20) << "run RPC (CmdCtrl)";
+    lb_impl_->CmdCtrl(request, response, done);  // LBImpl::CmdCtrl fills the response and runs done
+    VLOG(20) << "finish RPC (CmdCtrl)";
+}
+
+} // namespace load_balancer
+} // namespace tera
+
diff --git a/src/load_balancer/lb_service_impl.h b/src/load_balancer/lb_service_impl.h
new file mode 100644
index 000000000..f0754bb6e
--- /dev/null
+++ b/src/load_balancer/lb_service_impl.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_LB_SERVICE_IMPL_H_
+#define TERA_LOAD_BALANCER_LB_SERVICE_IMPL_H_
+
+#include <memory>
+
+#include "common/thread_pool.h"
+#include "proto/load_balancer_rpc.pb.h"
+
+namespace tera {
+namespace load_balancer {
+
+class LBImpl;
+
+class LBServiceImpl: public LoadBalancerService {  // RPC service front end that forwards to LBImpl
+public:
+    explicit LBServiceImpl(const std::shared_ptr<LBImpl>& lb_impl);
+    virtual ~LBServiceImpl();
+    // RPC entry: queues DoCmdCtrl on thread_pool_
+    void CmdCtrl(google::protobuf::RpcController* controller,
+                 const CmdCtrlRequest* request,
+                 CmdCtrlResponse* response,
+                 google::protobuf::Closure* done);
+
+private:  // task body queued by CmdCtrl; forwards to LBImpl::CmdCtrl
+    void DoCmdCtrl(google::protobuf::RpcController* controller,
+                   const CmdCtrlRequest* request,
+                   CmdCtrlResponse* response,
+                   google::protobuf::Closure* done);
+
+private:
+    std::shared_ptr<LBImpl> lb_impl_;  // shared with whoever constructed the service
+    std::unique_ptr<ThreadPool> thread_pool_;  // sized by FLAGS_tera_lb_server_thread_num in the constructor
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_LB_SERVICE_IMPL_H_
diff --git a/src/load_balancer/options.h b/src/load_balancer/options.h
new file mode 100644
index 000000000..4d280c6ce
--- /dev/null
+++ b/src/load_balancer/options.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_OPTIONS_H_
+#define TERA_LOAD_BALANCER_OPTIONS_H_
+
+#include <string>
+
+namespace tera {
+namespace load_balancer {
+
+struct LBOptions {  // load balancer tunables; the constructor below supplies the defaults
+    // calculate
+    uint64_t max_compute_steps;  // NOTE(review): fixed-width ints are used but only <string> is included here -- confirm <stdint.h> arrives transitively
+    uint32_t max_compute_steps_per_tablet;
+    uint64_t max_compute_time_ms;
+    double min_cost_need_balance;
+
+    // MoveCountCostFunction
+    double move_count_cost_weight;
+    uint32_t tablet_max_move_num;
+    double tablet_max_move_percent;
+
+    // MoveFrequencyCostFunction
+    double move_frequency_cost_weight;
+    uint32_t tablet_move_too_frequently_threshold_s;
+
+    // AbnormalNodeCostFunction
+    double abnormal_node_cost_weight;
+    // if the ratio of not-ready tablets is higher than this value,
+    // the node is considered abnormal
+    double abnormal_node_ratio;
+
+    // ReadPendingNodeCostFunction
+    double read_pending_node_cost_weight;
+
+    // WritePendingNodeCostFunction
+    double write_pending_node_cost_weight;
+
+    // ScanPendingNodeCostFunction
+    double scan_pending_node_cost_weight;
+
+    // CountCostFunction
+    double tablet_count_cost_weight;
+
+    // SizeCostFunction
+    double size_cost_weight;
+
+    // LoadCostFunction
+    double read_load_cost_weight;
+    double write_load_cost_weight;
+    double scan_load_cost_weight;
+
+    // meta table
+    bool meta_table_isolate_enabled;
+    std::string meta_table_name;
+    std::string meta_table_node_addr;
+
+    // debug
+    bool debug_mode_enabled;
+
+    LBOptions() :  // defaults, listed in member declaration order
+            max_compute_steps(1000000),
+            max_compute_steps_per_tablet(1000),
+            max_compute_time_ms(30 * 1000),  // 30 seconds
+            min_cost_need_balance(0.1),
+
+            move_count_cost_weight(10),
+            tablet_max_move_num(10),
+            tablet_max_move_percent(0.001),
+
+            move_frequency_cost_weight(10),
+            tablet_move_too_frequently_threshold_s(600),  // 10 minutes
+
+            abnormal_node_cost_weight(10),
+            abnormal_node_ratio(0.5),
+
+            read_pending_node_cost_weight(10),
+            write_pending_node_cost_weight(10),
+            scan_pending_node_cost_weight(10),
+
+            tablet_count_cost_weight(0),
+            size_cost_weight(100),
+            read_load_cost_weight(0),
+            write_load_cost_weight(0),
+            scan_load_cost_weight(0),
+
+            meta_table_isolate_enabled(true),
+            meta_table_name("meta_table"),
+            meta_table_node_addr(""),
+
+            debug_mode_enabled(false) {
+    }
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_OPTIONS_H_
diff --git a/src/load_balancer/plan.h b/src/load_balancer/plan.h
new file mode 100644
index 000000000..6e4ca41ae
--- /dev/null
+++ b/src/load_balancer/plan.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_PLAN_H_
+#define TERA_LOAD_BALANCER_PLAN_H_
+
+#include <string>
+
+#include "master/tablet_manager.h"
+#include "master/tabletnode_manager.h"
+
+namespace tera {
+namespace load_balancer {
+
+// A single proposed move: one tablet, the node it currently sits on, and the node it
+// should go to. Any of the three may be unset, in which case its accessor yields "".
+class Plan {
+public:
+    // Empty plan: all three parts are left unset.
+    Plan() {}
+
+    // Plan for moving `tablet` from `source` to `dest`.
+    Plan(const tera::master::TabletPtr& tablet,
+         const tera::master::TabletNodePtr& source,
+         const tera::master::TabletNodePtr& dest)
+        : tablet_(tablet),
+          source_(source),
+          dest_(dest) {
+    }
+
+    virtual ~Plan() {}
+
+    // Path of the tablet to move, or "" when no tablet is set.
+    virtual std::string TabletPath() const {
+        return tablet_ ? tablet_->GetPath() : "";
+    }
+
+    // Address of the source node, or "" when unset.
+    virtual std::string SourceAddr() const {
+        return source_ ? source_->GetAddr() : "";
+    }
+
+    // Address of the destination node, or "" when unset.
+    virtual std::string DestAddr() const {
+        return dest_ ? dest_->GetAddr() : "";
+    }
+
+    // One-line description: "tablet:<path> source:<addr> dest:<addr>".
+    virtual std::string ToString() const {
+        std::string text("tablet:");
+        text += tablet_ ? tablet_->GetPath() : "";
+        text += " source:";
+        text += source_ ? source_->GetAddr() : "";
+        text += " dest:";
+        text += dest_ ? dest_->GetAddr() : "";
+        return text;
+    }
+
+private:
+    // Shared handles; a null handle means "unset".
+    tera::master::TabletPtr tablet_;
+    tera::master::TabletNodePtr source_;
+    tera::master::TabletNodePtr dest_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_PLAN_H_
diff --git a/src/load_balancer/random.h b/src/load_balancer/random.h
new file mode 100644
index 000000000..46a43008f
--- /dev/null
+++ b/src/load_balancer/random.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_RANDOM_H_
+#define TERA_LOAD_BALANCER_RANDOM_H_
+
+#include <assert.h>
+
+#include <ctime>
+#include <random>
+
+#include "common/timer.h"
+
+namespace tera {
+namespace load_balancer {
+
+class Random {
+public:
+    // Uniform random integer in [a, b); a < b is required (asserted).
+    // Negative bounds are allowed.
+    // Builds a fresh std::mt19937 seeded from std::random_device on every call.
+    // avg time cost: 25us
+    static int RandStd(int a, int b) {
+        assert(a < b);
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_int_distribution<> dis(a, b - 1);
+
+        return dis(gen);
+    }
+
+    // Value in [a, b) derived from the current get_micros() reading; a < b is required (asserted).
+    // Cannot produce negative numbers.
+    // NOTE(review): calls landing on the same get_micros() value return the same result.
+    // avg time cost: 150ns
+    static uint32_t RandTime(uint32_t a, uint32_t b) {
+        assert(a < b);
+
+        int64_t time_us = get_micros();
+        return time_us % (b - a) + a;
+    }
+
+    // Pseudo-random value in [a, b) from the shared xorshift32 state; a < b is required (asserted).
+    // Cannot produce negative numbers.
+    // NOTE(review): the state is a plain function-local static updated without any locking.
+    // avg time cost: 15ns
+    static uint32_t Rand(uint32_t a, uint32_t b) {
+        assert(a < b);
+
+        uint32_t rand = xorshift32();
+        return rand % (b - a) + a;
+    }
+
+private:
+    /* The state word must be non-zero: xorshift maps 0 to 0, so a zero state never changes */
+    static uint32_t xorshift32() {
+        /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
+        // OR-ing in the low bit guarantees a non-zero seed whatever time() returns
+        static uint32_t state = static_cast<uint32_t>(time(NULL)) | 1U;
+        uint32_t x = state;
+        x ^= x << 13;
+        x ^= x >> 17;
+        x ^= x << 5;
+        state = x;
+        return x;
+    }
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_RANDOM_H_
diff --git a/src/load_balancer/test/action_generators_test.cc b/src/load_balancer/test/action_generators_test.cc
new file mode 100644
index 000000000..6cbe65e4d
--- /dev/null
+++ b/src/load_balancer/test/action_generators_test.cc
@@ -0,0 +1,311 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <limits>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "load_balancer/action_generators.h"
+
+namespace tera {
+namespace load_balancer {
+
+class RandomActionGeneratorTest : public ::testing::Test {  // fixture: a RandomActionGenerator plus an empty Cluster
+public:
+    virtual void SetUp() {
+        random_action_generator_.reset(new RandomActionGenerator());
+        // each test starts from a Cluster built from no nodes and default LBOptions
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+    }
+
+    virtual void TearDown() {
+    }
+
+private:  // NOTE(review): TEST_F bodies use these directly; presumably the test build opens up private access -- confirm
+    std::shared_ptr<RandomActionGenerator> random_action_generator_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+class TabletCountActionGeneratorTest : public ::testing::Test {  // fixture: a TabletCountActionGenerator plus an empty Cluster
+public:
+    virtual void SetUp() {
+        tablet_count_action_generator_.reset(new TabletCountActionGenerator());
+        // each test starts from a Cluster built from no nodes and default LBOptions
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+    }
+
+    virtual void TearDown() {
+    }
+
+private:  // NOTE(review): TEST_F bodies use these directly; presumably the test build opens up private access -- confirm
+    std::shared_ptr<TabletCountActionGenerator> tablet_count_action_generator_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+class SizeActionGeneratorTest : public ::testing::Test {  // fixture: a SizeActionGenerator plus an empty Cluster
+public:
+    virtual void SetUp() {
+        size_action_generator_.reset(new SizeActionGenerator());
+        // each test starts from a Cluster built from no nodes and default LBOptions
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+    }
+
+    virtual void TearDown() {
+    }
+
+private:  // NOTE(review): TEST_F bodies use these directly; presumably the test build opens up private access -- confirm
+    std::shared_ptr<SizeActionGenerator> size_action_generator_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+class ReadLoadActionGeneratorTest : public ::testing::Test {  // fixture: a ReadLoadActionGenerator plus an empty Cluster
+public:
+    virtual void SetUp() {
+        read_load_action_generator_.reset(new ReadLoadActionGenerator());
+        // each test starts from a Cluster built from no nodes and default LBOptions
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+    }
+
+    virtual void TearDown() {
+    }
+
+private:  // NOTE(review): TEST_F bodies use these directly; presumably the test build opens up private access -- confirm
+    std::shared_ptr<ReadLoadActionGenerator> read_load_action_generator_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+class WriteLoadActionGeneratorTest : public ::testing::Test {  // fixture: a WriteLoadActionGenerator plus an empty Cluster
+public:
+    virtual void SetUp() {
+        write_load_action_generator_.reset(new WriteLoadActionGenerator());
+        // each test starts from a Cluster built from no nodes and default LBOptions
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+    }
+
+    virtual void TearDown() {
+    }
+
+private:  // NOTE(review): TEST_F bodies use these directly; presumably the test build opens up private access -- confirm
+    std::shared_ptr<WriteLoadActionGenerator> write_load_action_generator_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+class ScanLoadActionGeneratorTest : public ::testing::Test {  // fixture: a ScanLoadActionGenerator plus an empty Cluster
+public:
+    virtual void SetUp() {
+        scan_load_action_generator_.reset(new ScanLoadActionGenerator());
+        // each test starts from a Cluster built from no nodes and default LBOptions
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+    }
+
+    virtual void TearDown() {
+    }
+
+private:  // NOTE(review): TEST_F bodies use these directly; presumably the test build opens up private access -- confirm
+    std::shared_ptr<ScanLoadActionGenerator> scan_load_action_generator_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+// With 10 nodes, a random pick must be an in-range index and the "other" pick must differ from it.
+TEST_F(RandomActionGeneratorTest, PickNodeTest) {
+    cluster_->tablet_node_num_ = 10;
+
+    uint32_t index = random_action_generator_->PickRandomNode(cluster_);
+    // indices are unsigned, so compare against 0U rather than a signed literal
+    ASSERT_GE(index, 0U);
+    ASSERT_LT(index, cluster_->tablet_node_num_);
+    uint32_t other_index = random_action_generator_->PickOtherRandomNode(cluster_, index);
+    ASSERT_GE(other_index, 0U);
+    ASSERT_LT(other_index, cluster_->tablet_node_num_);
+    ASSERT_NE(index, other_index);
+}
+
+TEST_F(RandomActionGeneratorTest, PickRandomTabletOfNodeTest) {  // node without tablets vs. node with one tablet
+    cluster_->tablet_node_num_ = 1;  // node 0 has no tablets yet, so uint32 max is expected back
+    ASSERT_EQ(random_action_generator_->PickRandomTabletOfNode(cluster_, 0), std::numeric_limits<uint32_t>::max());
+    // with tablet index 0 registered on node 0, that tablet is the only possible pick
+    cluster_->tablets_per_node_[0].emplace_back(0);
+    ASSERT_EQ(random_action_generator_->PickRandomTabletOfNode(cluster_, 0), 0);
+}
+
+TEST_F(RandomActionGeneratorTest, GenerateTest) {  // EMPTY with a single node, MOVE with two populated nodes
+    cluster_->tablet_node_num_ = 1;
+    std::shared_ptr<Action> action(random_action_generator_->Generate(cluster_));
+    ASSERT_EQ(Action::Type::EMPTY, action->GetType());
+    // give the cluster two nodes holding one tablet each; a MOVE action is now expected
+    cluster_->tablet_node_num_ = 2;
+    cluster_->tablets_per_node_[0].emplace_back(0);
+    cluster_->tablets_per_node_[1].emplace_back(1);
+    std::shared_ptr<Action> action_0(random_action_generator_->Generate(cluster_));
+    ASSERT_EQ(Action::Type::MOVE, action_0->GetType());
+}
+
+TEST_F(TabletCountActionGeneratorTest, GenerateTest) {  // two nodes: node 0 holds two tablets, node 1 holds one
+    uint32_t more_tablets_node_index = 0;
+    uint32_t less_tablets_node_index = 1;
+    cluster_->tablets_per_node_[more_tablets_node_index].emplace_back(0);
+    cluster_->tablets_per_node_[more_tablets_node_index].emplace_back(1);
+    cluster_->tablets_per_node_[less_tablets_node_index].emplace_back(2);
+
+    cluster_->tablet_node_num_ = 2;
+
+    cluster_->node_index_sorted_by_tablet_count_.emplace_back(more_tablets_node_index);
+    cluster_->node_index_sorted_by_tablet_count_.emplace_back(less_tablets_node_index);
+    // after sorting, the most/least picks must match the tablet counts set up above
+    cluster_->SortNodesByTabletCount();
+    ASSERT_EQ(more_tablets_node_index, tablet_count_action_generator_->PickMostTabletsNode(cluster_));
+    ASSERT_EQ(less_tablets_node_index, tablet_count_action_generator_->PickLeastTabletsNode(cluster_));
+    // the generated action is expected to move a tablet from the fuller node to the emptier one
+    std::shared_ptr<Action> action(tablet_count_action_generator_->Generate(cluster_));
+    ASSERT_EQ(Action::Type::MOVE, action->GetType());
+    MoveAction* move_action = dynamic_cast<MoveAction*>(action.get());
+    ASSERT_EQ(more_tablets_node_index, move_action->source_node_index_);
+    ASSERT_EQ(less_tablets_node_index, move_action->dest_node_index_);
+    // with node 1 marked as the meta-table node, the least-tablets pick is expected to be node 0 (presumably the meta node is excluded)
+    cluster_->meta_table_node_index_ = less_tablets_node_index;
+    ASSERT_EQ(more_tablets_node_index, tablet_count_action_generator_->PickMostTabletsNode(cluster_));
+    ASSERT_EQ(more_tablets_node_index, tablet_count_action_generator_->PickLeastTabletsNode(cluster_));
+}
+
+TEST_F(SizeActionGeneratorTest, GenerateTest) {  // two nodes: node 0 has size 20, node 1 has size 10
+    uint32_t larger_size_node_index = 0;
+    uint32_t smaller_size_node_index = 1;
+    cluster_->size_per_node_[larger_size_node_index] = 20;
+    cluster_->size_per_node_[smaller_size_node_index] = 10;
+    // one tablet on each node
+    uint32_t tablet_index_on_larger_size_node = 0;
+    uint32_t tablet_index_on_smaller_size_node = 1;
+    cluster_->tablet_node_num_ = 2;
+    cluster_->tablets_per_node_[larger_size_node_index].emplace_back(tablet_index_on_larger_size_node);
+    cluster_->tablets_per_node_[smaller_size_node_index].emplace_back(tablet_index_on_smaller_size_node);
+
+    cluster_->node_index_sorted_by_size_.emplace_back(larger_size_node_index);
+    cluster_->node_index_sorted_by_size_.emplace_back(smaller_size_node_index);
+    // after sorting, the largest/smallest picks must match the sizes set up above
+    cluster_->SortNodesBySize();
+    ASSERT_EQ(larger_size_node_index, size_action_generator_->PickLargestSizeNode(cluster_));
+    ASSERT_EQ(smaller_size_node_index, size_action_generator_->PickSmallestSizeNode(cluster_));
+    // the generated action is expected to move the larger node's tablet to the smaller node
+    std::shared_ptr<Action> action(size_action_generator_->Generate(cluster_));
+    ASSERT_EQ(Action::Type::MOVE, action->GetType());
+    MoveAction* move_action = dynamic_cast<MoveAction*>(action.get());
+    ASSERT_EQ(tablet_index_on_larger_size_node, move_action->tablet_index_);
+    ASSERT_EQ(larger_size_node_index, move_action->source_node_index_);
+    ASSERT_EQ(smaller_size_node_index, move_action->dest_node_index_);
+    // with node 1 marked as the meta-table node, the smallest-size pick is expected to be node 0 (presumably the meta node is excluded)
+    cluster_->meta_table_node_index_ = smaller_size_node_index;
+    ASSERT_EQ(larger_size_node_index, size_action_generator_->PickLargestSizeNode(cluster_));
+    ASSERT_EQ(larger_size_node_index, size_action_generator_->PickSmallestSizeNode(cluster_));
+}
+
+TEST_F(ReadLoadActionGeneratorTest, GenerateTest) {  // two nodes: node 0 has read load 20, node 1 has read load 10
+    uint32_t more_read_node_index = 0;
+    uint32_t less_read_node_index = 1;
+    cluster_->read_load_per_node_[more_read_node_index] = 20;
+    cluster_->read_load_per_node_[less_read_node_index] = 10;
+    // one tablet on each node
+    uint32_t tablet_index_on_more_read_node = 0;
+    uint32_t tablet_index_on_less_read_node = 1;
+    cluster_->tablet_node_num_ = 2;
+    cluster_->tablets_per_node_[more_read_node_index].emplace_back(tablet_index_on_more_read_node);
+    cluster_->tablets_per_node_[less_read_node_index].emplace_back(tablet_index_on_less_read_node);
+
+    cluster_->node_index_sorted_by_read_load_.emplace_back(more_read_node_index);
+    cluster_->node_index_sorted_by_read_load_.emplace_back(less_read_node_index);
+    // after sorting, the most/least read picks must match the loads set up above
+    cluster_->SortNodesByReadLoad();
+    ASSERT_EQ(more_read_node_index, read_load_action_generator_->PickMostReadNode(cluster_));
+    ASSERT_EQ(less_read_node_index, read_load_action_generator_->PickLeastReadNode(cluster_));
+    // the generated action is expected to move the busier node's tablet to the quieter node
+    std::shared_ptr<Action> action(read_load_action_generator_->Generate(cluster_));
+    ASSERT_EQ(Action::Type::MOVE, action->GetType());
+    MoveAction* move_action = dynamic_cast<MoveAction*>(action.get());
+    ASSERT_EQ(tablet_index_on_more_read_node, move_action->tablet_index_);
+    ASSERT_EQ(more_read_node_index, move_action->source_node_index_);
+    ASSERT_EQ(less_read_node_index, move_action->dest_node_index_);
+    // with node 1 marked as the meta-table node, the least-read pick is expected to be node 0 (presumably the meta node is excluded)
+    cluster_->meta_table_node_index_ = less_read_node_index;
+    ASSERT_EQ(more_read_node_index, read_load_action_generator_->PickMostReadNode(cluster_));
+    ASSERT_EQ(more_read_node_index, read_load_action_generator_->PickLeastReadNode(cluster_));
+}
+
+TEST_F(WriteLoadActionGeneratorTest, GenerateTest) {
+    uint32_t more_write_node_index = 0;
+    uint32_t less_write_node_index = 1;
+    cluster_->write_load_per_node_[more_write_node_index] = 20;
+    cluster_->write_load_per_node_[less_write_node_index] = 10;
+    // Place one tablet on each of the two nodes.
+    uint32_t tablet_index_on_more_write_node = 0;
+    uint32_t tablet_index_on_less_write_node = 1;
+    cluster_->tablet_node_num_ = 2;
+    cluster_->tablets_per_node_[more_write_node_index].emplace_back(tablet_index_on_more_write_node);
+    cluster_->tablets_per_node_[less_write_node_index].emplace_back(tablet_index_on_less_write_node);
+    cluster_->node_index_sorted_by_write_load_.emplace_back(more_write_node_index);
+    cluster_->node_index_sorted_by_write_load_.emplace_back(less_write_node_index);
+    cluster_->SortNodesByWriteLoad();
+    ASSERT_EQ(more_write_node_index, write_load_action_generator_->PickMostWriteNode(cluster_));
+    ASSERT_EQ(less_write_node_index, write_load_action_generator_->PickLeastWriteNode(cluster_));
+    // Null-check the action and the downcast so a bad result fails the test instead of crashing.
+    std::shared_ptr<Action> action(write_load_action_generator_->Generate(cluster_));
+    ASSERT_TRUE(action != nullptr);
+    ASSERT_EQ(Action::Type::MOVE, action->GetType());
+    MoveAction* move_action = dynamic_cast<MoveAction*>(action.get());
+    ASSERT_TRUE(move_action != nullptr);
+    ASSERT_EQ(tablet_index_on_more_write_node, move_action->tablet_index_);
+    ASSERT_EQ(more_write_node_index, move_action->source_node_index_);
+    ASSERT_EQ(less_write_node_index, move_action->dest_node_index_);
+    // With the less-written node hosting the meta table, the other node is picked in both roles.
+    cluster_->meta_table_node_index_ = less_write_node_index;
+    ASSERT_EQ(more_write_node_index, write_load_action_generator_->PickMostWriteNode(cluster_));
+    ASSERT_EQ(more_write_node_index, write_load_action_generator_->PickLeastWriteNode(cluster_));
+}
+
+TEST_F(ScanLoadActionGeneratorTest, GenerateTest) {
+    uint32_t more_scan_node_index = 0;
+    uint32_t less_scan_node_index = 1;
+    cluster_->scan_load_per_node_[more_scan_node_index] = 20;
+    cluster_->scan_load_per_node_[less_scan_node_index] = 10;
+    // Place one tablet on each of the two nodes.
+    uint32_t tablet_index_on_more_scan_node = 0;
+    uint32_t tablet_index_on_less_scan_node = 1;
+    cluster_->tablet_node_num_ = 2;
+    cluster_->tablets_per_node_[more_scan_node_index].emplace_back(tablet_index_on_more_scan_node);
+    cluster_->tablets_per_node_[less_scan_node_index].emplace_back(tablet_index_on_less_scan_node);
+    cluster_->node_index_sorted_by_scan_load_.emplace_back(more_scan_node_index);
+    cluster_->node_index_sorted_by_scan_load_.emplace_back(less_scan_node_index);
+    cluster_->SortNodesByScanLoad();
+    ASSERT_EQ(more_scan_node_index, scan_load_action_generator_->PickMostScanNode(cluster_));
+    ASSERT_EQ(less_scan_node_index, scan_load_action_generator_->PickLeastScanNode(cluster_));
+    // Null-check the action and the downcast so a bad result fails the test instead of crashing.
+    std::shared_ptr<Action> action(scan_load_action_generator_->Generate(cluster_));
+    ASSERT_TRUE(action != nullptr);
+    ASSERT_EQ(Action::Type::MOVE, action->GetType());
+    MoveAction* move_action = dynamic_cast<MoveAction*>(action.get());
+    ASSERT_TRUE(move_action != nullptr);
+    ASSERT_EQ(tablet_index_on_more_scan_node, move_action->tablet_index_);
+    ASSERT_EQ(more_scan_node_index, move_action->source_node_index_);
+    ASSERT_EQ(less_scan_node_index, move_action->dest_node_index_);
+    // With the less-scanned node hosting the meta table, the other node is picked in both roles.
+    cluster_->meta_table_node_index_ = less_scan_node_index;
+    ASSERT_EQ(more_scan_node_index, scan_load_action_generator_->PickMostScanNode(cluster_));
+    ASSERT_EQ(more_scan_node_index, scan_load_action_generator_->PickLeastScanNode(cluster_));
+}
+
+} // namespace load_balancer
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/load_balancer/test/actions_test.cc b/src/load_balancer/test/actions_test.cc
new file mode 100644
index 000000000..28096efa3
--- /dev/null
+++ b/src/load_balancer/test/actions_test.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "load_balancer/actions.h"
+
+namespace tera {
+namespace load_balancer {
+
+class ActionsTest : public ::testing::Test {  // empty fixture: declares no shared state
+};
+
+TEST_F(ActionsTest, MoveActionTest) {
+    MoveAction move_action(0, 0, 1);
+    std::shared_ptr<MoveAction> undo_action(dynamic_cast<MoveAction*>(move_action.UndoAction()));
+    ASSERT_TRUE(undo_action != nullptr);  // fail cleanly if the undo action is not a MoveAction
+    ASSERT_EQ(move_action.tablet_index_, undo_action->tablet_index_);
+    ASSERT_EQ(move_action.source_node_index_, undo_action->dest_node_index_);
+    ASSERT_EQ(move_action.dest_node_index_, undo_action->source_node_index_);
+}
+
+} // namespace load_balancer
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/load_balancer/test/balancer_test_main.cc b/src/load_balancer/test/balancer_test_main.cc
new file mode 100644
index 000000000..c08b2451d
--- /dev/null
+++ b/src/load_balancer/test/balancer_test_main.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "utils/utils_cmd.h"
+
+int main(int argc, char** argv) {
+    ::google::InitGoogleLogging(argv[0]);
+    // Logging defaults for this test binary; assigned before the command line is parsed below.
+    FLAGS_log_dir = "./log";
+    FLAGS_minloglevel = 0;
+    FLAGS_v = 16;
+    const bool log_dir_missing = access(FLAGS_log_dir.c_str(), F_OK) != 0;
+    if (log_dir_missing) { mkdir(FLAGS_log_dir.c_str(), 0777); }
+    std::string log_name("load balancer");
+    tera::utils::SetupLog(log_name);
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    ::testing::InitGoogleTest(&argc, argv);
+
+    return RUN_ALL_TESTS();
+}
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/load_balancer/test/cluster_test.cc b/src/load_balancer/test/cluster_test.cc
new file mode 100644
index 000000000..026ad1b78
--- /dev/null
+++ b/src/load_balancer/test/cluster_test.cc
@@ -0,0 +1,391 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "load_balancer/actions.h"
+#include "load_balancer/cluster.h"
+#include "load_balancer/lb_node.h"
+#include "common/timer.h"
+
+namespace tera {
+namespace load_balancer {
+
+// Fixture: every test starts with a fresh Cluster built from no nodes and default LBOptions.
+class ClusterTest : public ::testing::Test {
+public:
+    virtual void SetUp() {
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+    }
+
+    virtual void TearDown() {}
+
+protected:  // TEST_F bodies are subclasses of the fixture, so its members must not be private
+    std::shared_ptr<Cluster> cluster_;
+};
+
+TEST_F(ClusterTest, ValidActionTest) {
+    TabletMeta tablet_meta_meta;
+    TabletMeta tablet_meta_other;
+    tera::master::TabletPtr tablet_ptr_meta(new tera::master::Tablet(tablet_meta_meta));
+    tera::master::TabletPtr tablet_ptr_other(new tera::master::Tablet(tablet_meta_other));
+    std::shared_ptr<LBTablet> lb_tablet_meta = std::make_shared<LBTablet>();
+    std::shared_ptr<LBTablet> lb_tablet_other = std::make_shared<LBTablet>();
+    lb_tablet_meta->tablet_ptr = tablet_ptr_meta;
+    lb_tablet_other->tablet_ptr = tablet_ptr_other;
+
+    // tablet 0 belongs to the table named as the meta table, tablet 1 to an ordinary table
+    cluster_->lb_options_.meta_table_name = "meta_table";
+    uint32_t table_index_meta = 0;
+    uint32_t table_index_other= 1;
+    cluster_->tables_[table_index_meta] = "meta_table";
+    cluster_->tables_[table_index_other] = "other_table";
+    uint32_t tablet_index_meta = 0;
+    uint32_t tablet_index_other = 1;
+    cluster_->tablet_index_to_table_index_[tablet_index_meta] = table_index_meta;
+    cluster_->tablet_index_to_table_index_[tablet_index_other] = table_index_other;
+    cluster_->tablets_[tablet_index_meta] = lb_tablet_meta;
+    cluster_->tablets_[tablet_index_other] = lb_tablet_other;
+
+    uint32_t meta_table_node_index = 0;
+    uint32_t other_node_index = 1;
+    cluster_->meta_table_node_index_ = meta_table_node_index;
+
+    // an empty action is invalid
+    std::shared_ptr<Action> empty_action(new EmptyAction());
+    ASSERT_FALSE(cluster_->ValidAction(empty_action));
+
+    std::shared_ptr<Action> normal_move_action(new MoveAction(tablet_index_meta, 0, 1));
+    // moving a tablet that is not ready is invalid
+    ASSERT_TRUE(cluster_->tablets_[tablet_index_meta]->tablet_ptr->SetStatus(kTableOffLine));
+    ASSERT_FALSE(cluster_->ValidAction(normal_move_action));
+
+    // moving a tablet of the meta table is invalid, even when it is ready
+    std::shared_ptr<Action> move_meta_table_action(new MoveAction(tablet_index_meta, 0, 1));
+    ASSERT_TRUE(cluster_->tablets_[tablet_index_meta]->tablet_ptr->SetStatus(kTableReady));
+    ASSERT_FALSE(cluster_->ValidAction(move_meta_table_action));
+    // moving a normal (non-meta) tablet that is ready is valid
+    std::shared_ptr<Action> move_other_table_action(new MoveAction(tablet_index_other, 0, 1));
+    ASSERT_TRUE(cluster_->tablets_[tablet_index_other]->tablet_ptr->SetStatus(kTableReady));
+    ASSERT_TRUE(cluster_->ValidAction(move_other_table_action));
+
+    std::shared_ptr<Action> move_to_meta_table_node_action(new MoveAction(tablet_index_other, 0, meta_table_node_index));
+    std::shared_ptr<Action> move_to_other_node_action(new MoveAction(tablet_index_other, 0, other_node_index));
+    cluster_->lb_options_.meta_table_isolate_enabled = true;
+    // moving a tablet to the meta node is invalid if meta_table_isolate_enabled is true
+    ASSERT_FALSE(cluster_->ValidAction(move_to_meta_table_node_action));
+    // moving a tablet to a normal node is valid even if meta_table_isolate_enabled is true
+    ASSERT_TRUE(cluster_->ValidAction(move_to_other_node_action));
+    cluster_->lb_options_.meta_table_isolate_enabled = false;
+    // moving a tablet to any node is valid if meta_table_isolate_enabled is false
+    ASSERT_TRUE(cluster_->ValidAction(move_to_meta_table_node_action));
+    ASSERT_TRUE(cluster_->ValidAction(move_to_other_node_action));
+}
+
+TEST_F(ClusterTest, RegisterTabletTest) {
+    // A tablet of table "meta_table" whose path is "path/meta_table".
+    TabletMeta meta;
+    meta.set_path("path/meta_table");
+    meta.set_table_name("meta_table");
+    tera::master::TabletPtr tablet(new tera::master::Tablet(meta));
+    auto lb_tablet = std::make_shared<LBTablet>();
+    lb_tablet->tablet_ptr = tablet;
+    uint32_t tablet_idx = 0;
+    uint32_t node_idx = 0;
+    cluster_->RegisterTablet(lb_tablet, tablet_idx, node_idx);
+    // Exactly one table is recorded, under table index 0.
+    ASSERT_EQ(1, cluster_->table_num_);
+    ASSERT_EQ(1, cluster_->tables_.size());
+    ASSERT_STREQ("meta_table", cluster_->tables_[0].c_str());
+    ASSERT_EQ(0, cluster_->tables_to_index_["meta_table"]);
+    // The tablet index is looked up by the tablet's path.
+    ASSERT_EQ(tablet_idx, cluster_->tablets_to_index_["path/meta_table"]);
+    // Current and initial placement both name the registering node; the table index is 0.
+    ASSERT_EQ(node_idx, cluster_->tablet_index_to_node_index_[tablet_idx]);
+    ASSERT_EQ(node_idx, cluster_->initial_tablet_index_to_node_index_[tablet_idx]);
+    ASSERT_EQ(0, cluster_->tablet_index_to_table_index_[tablet_idx]);
+}
+
+TEST_F(ClusterTest, AddTabletTest) {
+    // Tablet with size 10 and average read/write/scan row counters of 20/30/40.
+    TabletMeta meta;
+    meta.set_size(10);
+    tera::master::TabletPtr tablet(new tera::master::Tablet(meta));
+    tablet->average_counter_.set_scan_rows(40);
+    tablet->average_counter_.set_write_rows(30);
+    tablet->average_counter_.set_read_rows(20);
+    auto lb_tablet = std::make_shared<LBTablet>();
+    lb_tablet->tablet_ptr = tablet;
+    uint32_t tablet_idx = 0;
+    cluster_->tablets_[tablet_idx] = lb_tablet;
+    // The target node starts with every per-node total at zero.
+    uint32_t node_idx = 0;
+    cluster_->scan_load_per_node_[node_idx] = 0;
+    cluster_->write_load_per_node_[node_idx] = 0;
+    cluster_->read_load_per_node_[node_idx] = 0;
+    cluster_->size_per_node_[node_idx] = 0;
+
+    cluster_->AddTablet(tablet_idx, node_idx);
+    // After the add, the node's totals equal the tablet's own figures.
+    ASSERT_EQ(1, cluster_->tablets_per_node_.size());
+    ASSERT_EQ(10, cluster_->size_per_node_[node_idx]);
+    ASSERT_EQ(20, cluster_->read_load_per_node_[node_idx]);
+    ASSERT_EQ(30, cluster_->write_load_per_node_[node_idx]);
+    ASSERT_EQ(40, cluster_->scan_load_per_node_[node_idx]);
+}
+
+TEST_F(ClusterTest, RemoveTabletTest) {
+    // Tablet with size 10 and average read/write/scan row counters of 20/30/40.
+    TabletMeta meta;
+    meta.set_size(10);
+    tera::master::TabletPtr tablet(new tera::master::Tablet(meta));
+    tablet->average_counter_.set_scan_rows(40);
+    tablet->average_counter_.set_write_rows(30);
+    tablet->average_counter_.set_read_rows(20);
+    auto lb_tablet = std::make_shared<LBTablet>();
+    lb_tablet->tablet_ptr = tablet;
+    uint32_t tablet_idx = 0;
+    cluster_->tablets_[tablet_idx] = lb_tablet;
+    // The node already lists that tablet and its totals match the tablet's figures.
+    uint32_t node_idx = 0;
+    cluster_->tablets_per_node_[node_idx].emplace_back(tablet_idx);
+    cluster_->scan_load_per_node_[node_idx] = 40;
+    cluster_->write_load_per_node_[node_idx] = 30;
+    cluster_->read_load_per_node_[node_idx] = 20;
+    cluster_->size_per_node_[node_idx] = 10;
+
+    cluster_->RemoveTablet(tablet_idx, node_idx);
+
+    // Removing the tablet empties the node's tablet list and zeroes every total.
+    ASSERT_EQ(0, cluster_->tablets_per_node_[node_idx].size());
+    ASSERT_EQ(0, cluster_->size_per_node_[node_idx]);
+    ASSERT_EQ(0, cluster_->read_load_per_node_[node_idx]);
+    ASSERT_EQ(0, cluster_->write_load_per_node_[node_idx]);
+    ASSERT_EQ(0, cluster_->scan_load_per_node_[node_idx]);
+}
+
+TEST_F(ClusterTest, MoveTabletTest) {
+    TabletMeta tablet_meta_meta;
+    tablet_meta_meta.set_size(10);
+    tera::master::TabletPtr tablet_ptr_meta(new tera::master::Tablet(tablet_meta_meta));
+    tablet_ptr_meta->average_counter_.set_read_rows(20);
+    tablet_ptr_meta->average_counter_.set_write_rows(30);
+    tablet_ptr_meta->average_counter_.set_scan_rows(40);
+    std::shared_ptr<LBTablet> lb_tablet_meta = std::make_shared<LBTablet>();
+    lb_tablet_meta->tablet_ptr = tablet_ptr_meta;
+
+    uint32_t tablet_index = 0;
+    uint32_t first_node_index = 0;
+    uint32_t second_node_index = 1;
+    uint32_t third_node_index = 2;
+    // The tablet starts on the first node; the second node is flagged abnormal and r/w/scan pending.
+    cluster_->tablets_[tablet_index] = lb_tablet_meta;
+    cluster_->tablet_moved_num_ = 0;
+    cluster_->initial_tablet_index_to_node_index_[tablet_index] = first_node_index;
+    cluster_->tablet_index_to_node_index_[tablet_index] = first_node_index;
+    cluster_->abnormal_nodes_index_.insert(second_node_index);
+    cluster_->read_pending_nodes_index_.insert(second_node_index);
+    cluster_->write_pending_nodes_index_.insert(second_node_index);
+    cluster_->scan_pending_nodes_index_.insert(second_node_index);
+
+    ASSERT_EQ(0, cluster_->tablets_moved_too_frequently_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_abnormal_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_read_pending_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_write_pending_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_scan_pending_nodes_.size());
+    ASSERT_TRUE(cluster_->tablets_[tablet_index]->tablet_ptr->SetStatus(kTableReady));
+    int64_t current_time_us = tera::get_micros();
+    cluster_->lb_options_.tablet_move_too_frequently_threshold_s = 600;
+    cluster_->tablets_[tablet_index]->tablet_ptr->last_move_time_us_ = current_time_us;
+    // Move 1: onto the flagged node with last_move_time_us_ set to "now" -> every counter is 1.
+    cluster_->MoveTablet(tablet_index, first_node_index, second_node_index);
+    ASSERT_EQ(first_node_index, cluster_->initial_tablet_index_to_node_index_[tablet_index]);
+    ASSERT_EQ(second_node_index, cluster_->tablet_index_to_node_index_[tablet_index]);
+    ASSERT_EQ(1, cluster_->tablet_moved_num_);
+    ASSERT_EQ(1, cluster_->tablets_moved_too_frequently_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_abnormal_nodes_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_read_pending_nodes_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_write_pending_nodes_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_scan_pending_nodes_.size());
+    // Move 2: on to the unflagged third node -> flagged-node counters drop to 0, the rest stay 1.
+    cluster_->MoveTablet(tablet_index, second_node_index, third_node_index);
+    ASSERT_EQ(first_node_index, cluster_->initial_tablet_index_to_node_index_[tablet_index]);
+    ASSERT_EQ(third_node_index, cluster_->tablet_index_to_node_index_[tablet_index]);
+    ASSERT_EQ(1, cluster_->tablet_moved_num_);
+    ASSERT_EQ(1, cluster_->tablets_moved_too_frequently_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_abnormal_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_read_pending_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_write_pending_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_scan_pending_nodes_.size());
+    // Move 3: back to the initial node -> all counters return to 0.
+    cluster_->MoveTablet(tablet_index, third_node_index, first_node_index);
+    ASSERT_EQ(first_node_index, cluster_->initial_tablet_index_to_node_index_[tablet_index]);
+    ASSERT_EQ(first_node_index, cluster_->tablet_index_to_node_index_[tablet_index]);
+    ASSERT_EQ(0, cluster_->tablet_moved_num_);
+    ASSERT_EQ(0, cluster_->tablets_moved_too_frequently_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_abnormal_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_read_pending_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_write_pending_nodes_.size());
+    ASSERT_EQ(0, cluster_->tablets_moved_to_scan_pending_nodes_.size());
+    // Move 4: last move dated twice the threshold in the past -> not counted as too frequent.
+    cluster_->tablets_[tablet_index]->tablet_ptr->last_move_time_us_ = current_time_us - 2 * cluster_->lb_options_.tablet_move_too_frequently_threshold_s * 1000000;
+    cluster_->MoveTablet(tablet_index, first_node_index, second_node_index);
+    ASSERT_EQ(0, cluster_->tablets_moved_too_frequently_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_abnormal_nodes_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_read_pending_nodes_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_write_pending_nodes_.size());
+    ASSERT_EQ(1, cluster_->tablets_moved_to_scan_pending_nodes_.size());
+}
+
+TEST_F(ClusterTest, AbnormalNodeConstructTest) {
+    // Three tablets that differ only in their paths.
+    TabletMeta meta_a;
+    meta_a.set_path("path/meta_0");
+    tera::master::TabletPtr tablet_a(new tera::master::Tablet(meta_a));
+    auto lb_tablet_a = std::make_shared<LBTablet>();
+    lb_tablet_a->tablet_ptr = tablet_a;
+    TabletMeta meta_b;
+    meta_b.set_path("path/meta_1");
+    tera::master::TabletPtr tablet_b(new tera::master::Tablet(meta_b));
+    auto lb_tablet_b = std::make_shared<LBTablet>();
+    lb_tablet_b->tablet_ptr = tablet_b;
+    TabletMeta meta_c;
+    meta_c.set_path("path/meta_2");
+    tera::master::TabletPtr tablet_c(new tera::master::Tablet(meta_c));
+    auto lb_tablet_c = std::make_shared<LBTablet>();
+    lb_tablet_c->tablet_ptr = tablet_c;
+    // A single node, addressed 127.0.0.1:2200, that lists all three tablets.
+    tera::master::TabletNodePtr node(new tera::master::TabletNode());
+    node->addr_ = "127.0.0.1:2200";
+    auto lb_node = std::make_shared<LBTabletNode>();
+    lb_node->tablet_node_ptr = node;
+    lb_node->tablets.emplace_back(lb_tablet_a);
+    lb_node->tablets.emplace_back(lb_tablet_b);
+    lb_node->tablets.emplace_back(lb_tablet_c);
+    std::vector<std::shared_ptr<LBTabletNode>> lb_nodes;
+    lb_nodes.emplace_back(lb_node);
+    LBOptions options;
+    options.abnormal_node_ratio = 0.5;
+
+    // All three tablets ready: nothing is recorded as not ready and no node is abnormal.
+    tablet_a->SetStatus(kTableReady);
+    tablet_b->SetStatus(kTableReady);
+    tablet_c->SetStatus(kTableReady);
+    cluster_.reset(new Cluster(lb_nodes, options));
+    ASSERT_EQ(0, cluster_->initial_tablets_not_ready_per_node_[0].size());
+    ASSERT_EQ(0, cluster_->abnormal_nodes_index_.size());
+
+    // One of three tablets offline: it is recorded, but the node is still not abnormal.
+    tablet_a->SetStatus(kTableOffLine);
+    cluster_.reset(new Cluster(lb_nodes, options));
+    ASSERT_EQ(1, cluster_->initial_tablets_not_ready_per_node_[0].size());
+    ASSERT_EQ(0, cluster_->abnormal_nodes_index_.size());
+
+    // Two of three tablets offline: the node is now counted as abnormal.
+    tablet_b->SetStatus(kTableOffLine);
+    cluster_.reset(new Cluster(lb_nodes, options));
+    ASSERT_EQ(2, cluster_->initial_tablets_not_ready_per_node_[0].size());
+    ASSERT_EQ(1, cluster_->abnormal_nodes_index_.size());
+}
+
+TEST_F(ClusterTest, SortNodesByTabletCount) {
+    // Node 0 lists two tablets, node 1 one tablet and node 2 three tablets.
+    cluster_->tablets_per_node_[0].emplace_back(0);
+    cluster_->tablets_per_node_[0].emplace_back(1);
+    cluster_->tablets_per_node_[1].emplace_back(2);
+    cluster_->tablets_per_node_[2].emplace_back(3);
+    cluster_->tablets_per_node_[2].emplace_back(4);
+    cluster_->tablets_per_node_[2].emplace_back(5);
+    for (uint32_t node = 0; node < 3; ++node) {
+        cluster_->node_index_sorted_by_tablet_count_.emplace_back(node);
+    }
+    for (uint32_t pos = 0; pos < 3; ++pos) {
+        ASSERT_EQ(pos, cluster_->node_index_sorted_by_tablet_count_[pos]);
+    }
+    // After sorting, nodes appear in ascending tablet count: 1, 0, 2.
+    cluster_->SortNodesByTabletCount();
+    ASSERT_EQ(1, cluster_->node_index_sorted_by_tablet_count_[0]);
+    ASSERT_EQ(0, cluster_->node_index_sorted_by_tablet_count_[1]);
+    ASSERT_EQ(2, cluster_->node_index_sorted_by_tablet_count_[2]);
+}
+
+TEST_F(ClusterTest, SortNodesBySizeTest) {
+    // Sizes: node 0 -> 20, node 1 -> 10, node 2 -> 30; the index starts in node order.
+    cluster_->size_per_node_[2] = 30;
+    cluster_->size_per_node_[1] = 10;
+    cluster_->size_per_node_[0] = 20;
+    for (uint32_t node = 0; node < 3; ++node) {
+        cluster_->node_index_sorted_by_size_.emplace_back(node);
+    }
+    for (uint32_t pos = 0; pos < 3; ++pos) {
+        ASSERT_EQ(pos, cluster_->node_index_sorted_by_size_[pos]);
+    }
+    // After sorting, nodes appear in ascending size: 1, 0, 2.
+    cluster_->SortNodesBySize();
+    ASSERT_EQ(1, cluster_->node_index_sorted_by_size_[0]);
+    ASSERT_EQ(0, cluster_->node_index_sorted_by_size_[1]);
+    ASSERT_EQ(2, cluster_->node_index_sorted_by_size_[2]);
+}
+
+TEST_F(ClusterTest, SortNodesByReadLoad) {
+    // Read loads: node 0 -> 20, node 1 -> 10, node 2 -> 30; the index starts in node order.
+    cluster_->read_load_per_node_[2] = 30;
+    cluster_->read_load_per_node_[1] = 10;
+    cluster_->read_load_per_node_[0] = 20;
+    for (uint32_t node = 0; node < 3; ++node) {
+        cluster_->node_index_sorted_by_read_load_.emplace_back(node);
+    }
+    for (uint32_t pos = 0; pos < 3; ++pos) {
+        ASSERT_EQ(pos, cluster_->node_index_sorted_by_read_load_[pos]);
+    }
+    // After sorting, nodes appear in ascending read load: 1, 0, 2.
+    cluster_->SortNodesByReadLoad();
+    ASSERT_EQ(1, cluster_->node_index_sorted_by_read_load_[0]);
+    ASSERT_EQ(0, cluster_->node_index_sorted_by_read_load_[1]);
+    ASSERT_EQ(2, cluster_->node_index_sorted_by_read_load_[2]);
+}
+
+TEST_F(ClusterTest, SortNodesByWriteLoad) {
+    // Write loads: node 0 -> 20, node 1 -> 10, node 2 -> 30; the index starts in node order.
+    cluster_->write_load_per_node_[2] = 30;
+    cluster_->write_load_per_node_[1] = 10;
+    cluster_->write_load_per_node_[0] = 20;
+    for (uint32_t node = 0; node < 3; ++node) {
+        cluster_->node_index_sorted_by_write_load_.emplace_back(node);
+    }
+    for (uint32_t pos = 0; pos < 3; ++pos) {
+        ASSERT_EQ(pos, cluster_->node_index_sorted_by_write_load_[pos]);
+    }
+    // After sorting, nodes appear in ascending write load: 1, 0, 2.
+    cluster_->SortNodesByWriteLoad();
+    ASSERT_EQ(1, cluster_->node_index_sorted_by_write_load_[0]);
+    ASSERT_EQ(0, cluster_->node_index_sorted_by_write_load_[1]);
+    ASSERT_EQ(2, cluster_->node_index_sorted_by_write_load_[2]);
+}
+
+TEST_F(ClusterTest, SortNodesByScanLoad) {
+    // Scan loads: node 0 -> 20, node 1 -> 10, node 2 -> 30; the index starts in node order.
+    cluster_->scan_load_per_node_[2] = 30;
+    cluster_->scan_load_per_node_[1] = 10;
+    cluster_->scan_load_per_node_[0] = 20;
+    for (uint32_t node = 0; node < 3; ++node) {
+        cluster_->node_index_sorted_by_scan_load_.emplace_back(node);
+    }
+    for (uint32_t pos = 0; pos < 3; ++pos) {
+        ASSERT_EQ(pos, cluster_->node_index_sorted_by_scan_load_[pos]);
+    }
+    // After sorting, nodes appear in ascending scan load: 1, 0, 2.
+    cluster_->SortNodesByScanLoad();
+    ASSERT_EQ(1, cluster_->node_index_sorted_by_scan_load_[0]);
+    ASSERT_EQ(0, cluster_->node_index_sorted_by_scan_load_[1]);
+    ASSERT_EQ(2, cluster_->node_index_sorted_by_scan_load_[2]);
+}
+
+} // namespace load_balancer
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/load_balancer/test/cost_functions_test.cc b/src/load_balancer/test/cost_functions_test.cc
new file mode 100644
index 000000000..84f546fba
--- /dev/null
+++ b/src/load_balancer/test/cost_functions_test.cc
@@ -0,0 +1,176 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "load_balancer/cost_functions.h"
+#include "load_balancer/random.h"
+
+namespace tera {
+namespace load_balancer {
+
+// Fixture for the helpers reachable through a MoveCountCostFunction; no Cluster is attached.
+class CostFunctionTest : public ::testing::Test {
+public:
+    virtual void SetUp() {
+        move_cost_function_.reset(new MoveCountCostFunction(lb_options_));
+    }
+
+    virtual void TearDown() {}
+
+protected:  // TEST_F bodies are subclasses of the fixture, so its members must not be private
+    LBOptions lb_options_;
+    std::shared_ptr<MoveCountCostFunction> move_cost_function_;
+};
+
+// Fixture: a MoveCountCostFunction initialised with a Cluster built from no nodes.
+class MoveCountCostFunctionTest : public ::testing::Test {
+public:
+    virtual void SetUp() {
+        move_cost_function_.reset(new MoveCountCostFunction(lb_options_));
+
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+
+        move_cost_function_->Init(cluster_);
+    }
+
+    virtual void TearDown() {}
+
+protected:  // TEST_F bodies are subclasses of the fixture, so its members must not be private
+    LBOptions lb_options_;
+    std::shared_ptr<MoveCountCostFunction> move_cost_function_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+// Fixture: a TabletCountCostFunction initialised with a Cluster built from no nodes.
+class TabletCountCostFunctionTest : public ::testing::Test {
+public:
+    virtual void SetUp() {
+        tablet_count_cost_function_.reset(new TabletCountCostFunction(lb_options_));
+
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+
+        tablet_count_cost_function_->Init(cluster_);
+    }
+
+    virtual void TearDown() {}
+
+protected:  // TEST_F bodies are subclasses of the fixture, so its members must not be private
+    LBOptions lb_options_;
+    std::shared_ptr<TabletCountCostFunction> tablet_count_cost_function_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+// Fixture: a SizeCostFunction initialised with a Cluster built from no nodes.
+class SizeCostFunctionTest : public ::testing::Test {
+public:
+    virtual void SetUp() {
+        size_cost_function_.reset(new SizeCostFunction(lb_options_));
+
+        std::vector<std::shared_ptr<LBTabletNode>> empty_lb_nodes;
+        LBOptions options;
+        cluster_.reset(new Cluster(empty_lb_nodes, options));
+
+        size_cost_function_->Init(cluster_);
+    }
+
+    virtual void TearDown() {}
+
+protected:  // TEST_F bodies are subclasses of the fixture, so its members must not be private
+    LBOptions lb_options_;
+    std::shared_ptr<SizeCostFunction> size_cost_function_;
+    std::shared_ptr<Cluster> cluster_;
+};
+
+TEST_F(CostFunctionTest, WeightTest) {
+    const double weight = 3.14;  // arbitrary value that GetWeight must hand back after SetWeight
+    move_cost_function_->SetWeight(weight);
+    ASSERT_DOUBLE_EQ(weight, move_cost_function_->GetWeight());
+}
+
+TEST_F(CostFunctionTest, SumTest) {
+    std::vector<double> values{1, 2, 3};  // expected sum: 1 + 2 + 3
+    ASSERT_DOUBLE_EQ(6, move_cost_function_->GetSum(values));
+}
+
+TEST_F(CostFunctionTest, ScaleTest) {
+    // A value at or below the minimum scales to 0.
+    ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 10, -1));
+    ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 10, 0));
+
+    // A range whose maximum is not above its minimum scales to 0 as well.
+    ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 0, 5));
+    ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, -1, 5));
+
+    // Minimum, midpoint and maximum of the range [0, 10] map to 0, 0.5 and 1.
+    ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 10, 0));
+    ASSERT_DOUBLE_EQ(0.5, move_cost_function_->Scale(0, 10, 5));
+    ASSERT_DOUBLE_EQ(1, move_cost_function_->Scale(0, 10, 10));
+
+    // Values drawn with Random::Rand(0, 11) must always scale into [0, 1].
+    const int lower = 0;
+    const int upper = 10;
+    const size_t rounds = 100;
+    for (size_t round = 0; round < rounds; ++round) {
+        const int sample = Random::Rand(lower, upper + 1);
+        ASSERT_GE(move_cost_function_->Scale(lower, upper, sample), 0);
+        ASSERT_LE(move_cost_function_->Scale(lower, upper, sample), 1);
+    }
+}
+
+TEST_F(CostFunctionTest, ScaleFromArrayTest) {
+    // Arrays whose entries are all equal are expected to scale to 0.
+    std::vector<double> stats_0 = {0, 0};
+    ASSERT_DOUBLE_EQ(0, move_cost_function_->ScaleFromArray(stats_0));
+    std::vector<double> stats_1 = {10, 10};
+    ASSERT_DOUBLE_EQ(0, move_cost_function_->ScaleFromArray(stats_1));  // was re-checking stats_0
+    // Any pair of values drawn with Random::Rand(0, 100) must scale into [0, 1].
+    int begin = 0;
+    int end = 100;
+    size_t times = 100;
+    std::vector<double> stats_2;
+    for (size_t i = 0; i < times; ++i) {
+        stats_2.clear();
+        stats_2.emplace_back(Random::Rand(begin, end));
+        stats_2.emplace_back(Random::Rand(begin, end));
+
+        ASSERT_TRUE(move_cost_function_->ScaleFromArray(stats_2) >= 0);
+        ASSERT_TRUE(move_cost_function_->ScaleFromArray(stats_2) <= 1);
+    }
+}
+
+TEST_F(MoveCountCostFunctionTest, CostTest) {
+    // Limits under test: at most 10 moved tablets, 5 percent, in a cluster of 100 tablets.
+    cluster_->tablet_num_ = 100;
+    move_cost_function_->tablet_max_move_percent_ = 0.05;
+    move_cost_function_->tablet_max_move_num_ = 10;
+
+    // Up to ten moved tablets, the expected cost is the moved count divided by ten.
+    const struct { int moved; double cost; } cases[] = {{1, 0.1}, {6, 0.6}, {10, 1.0}};
+    for (const auto& c : cases) {
+        cluster_->tablet_moved_num_ = c.moved;
+        ASSERT_DOUBLE_EQ(c.cost, move_cost_function_->Cost());
+    }
+
+    // One move beyond the limit must yield kExpensiveCost.
+    cluster_->tablet_moved_num_ = 11;
+    ASSERT_DOUBLE_EQ(move_cost_function_->kExpensiveCost, move_cost_function_->Cost());
+}
+
+TEST_F(TabletCountCostFunctionTest, CostTest) {  // placeholder: no assertions yet, only the fixture's SetUp/TearDown run
+}
+
+TEST_F(SizeCostFunctionTest, CostTest) {  // placeholder: no assertions yet, only the fixture's SetUp/TearDown run
+}
+
+} // namespace load_balancer
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/load_balancer/test/random_test.cc b/src/load_balancer/test/random_test.cc
new file mode 100644
index 000000000..385b76877
--- /dev/null
+++ b/src/load_balancer/test/random_test.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "load_balancer/random.h"
+
+namespace tera {
+namespace load_balancer {
+
+class RandomTest : public ::testing::Test {  // empty fixture used by the TEST_F cases below
+};
+
+TEST_F(RandomTest, CommonTest) {  // Random::Rand(start, end) must return values in [start, end)
+    int start = 0;
+    int end = 3;
+    size_t times = 100;
+
+    for (size_t i  = 0; i < times; ++i) {
+        int rand = Random::Rand(start, end);
+        ASSERT_TRUE(rand >= start);  // lower bound is inclusive
+        ASSERT_TRUE(rand < end);     // upper bound is exclusive
+    }
+}
+
+TEST_F(RandomTest, NegativeTest) {  // same [start, end) range check with a negative lower bound
+    int start = -10;
+    int end = 10;
+    size_t times = 100;
+
+    for (size_t i  = 0; i < times; ++i) {
+        int rand = Random::RandStd(start, end);  // NOTE(review): RandStd here, Rand in CommonTest — confirm intended
+        ASSERT_TRUE(rand >= start);
+        ASSERT_TRUE(rand < end);
+    }
+}
+
+} // namespace load_balancer
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/load_balancer/unity_balancer.cc b/src/load_balancer/unity_balancer.cc
new file mode 100644
index 000000000..a6279d16f
--- /dev/null
+++ b/src/load_balancer/unity_balancer.cc
@@ -0,0 +1,264 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "load_balancer/unity_balancer.h"
+
+#include <algorithm>
+#include <limits>
+
+#include "glog/logging.h"
+#include "load_balancer/random.h"
+#include "common/timer.h"
+
+namespace tera {
+namespace load_balancer {
+
+using tera::master::TabletNodePtr;
+using tera::master::TabletPtr;
+
+UnityBalancer::UnityBalancer(const LBOptions& options) :
+        lb_options_(options) {
+    // cost functions: each one is registered only when its weight option is positive
+    if (lb_options_.move_count_cost_weight > 0) {
+        cost_functions_.emplace_back(new MoveCountCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.move_frequency_cost_weight > 0) {
+        cost_functions_.emplace_back(new MoveFrequencyCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.abnormal_node_cost_weight > 0) {
+        cost_functions_.emplace_back(new AbnormalNodeCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.read_pending_node_cost_weight > 0) {
+        cost_functions_.emplace_back(new ReadPendingNodeCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.write_pending_node_cost_weight > 0) {
+        cost_functions_.emplace_back(new WritePendingNodeCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.scan_pending_node_cost_weight > 0) {
+        cost_functions_.emplace_back(new ScanPendingNodeCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.tablet_count_cost_weight > 0) {
+        cost_functions_.emplace_back(new TabletCountCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.size_cost_weight > 0) {
+        cost_functions_.emplace_back(new SizeCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.read_load_cost_weight > 0) {
+        cost_functions_.emplace_back(new ReadLoadCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.write_load_cost_weight > 0) {
+        cost_functions_.emplace_back(new WriteLoadCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+    if (lb_options_.scan_load_cost_weight > 0) {
+        cost_functions_.emplace_back(new ScanLoadCostFunction(options));
+        VLOG(20) << "[lb] " << cost_functions_.back()->Name() << " enabled";
+    }
+
+    // action generators: the random one is always on, the rest follow their cost weight
+    action_generators_.emplace_back(new RandomActionGenerator());
+    VLOG(20) << "[lb] " << action_generators_.back()->Name() << " enabled";
+    if (lb_options_.tablet_count_cost_weight > 0) {
+        action_generators_.emplace_back(new TabletCountActionGenerator());
+        VLOG(20) << "[lb] " << action_generators_.back()->Name() << " enabled";
+    }
+    if (lb_options_.size_cost_weight > 0) {
+        action_generators_.emplace_back(new SizeActionGenerator());
+        VLOG(20) << "[lb] " << action_generators_.back()->Name() << " enabled";
+    }
+    if (lb_options_.read_load_cost_weight > 0) {
+        action_generators_.emplace_back(new ReadLoadActionGenerator());
+        VLOG(20) << "[lb] " << action_generators_.back()->Name() << " enabled";
+    }
+    if (lb_options_.write_load_cost_weight > 0) {
+        action_generators_.emplace_back(new WriteLoadActionGenerator());
+        VLOG(20) << "[lb] " << action_generators_.back()->Name() << " enabled";
+    }
+    if (lb_options_.scan_load_cost_weight > 0) {
+        action_generators_.emplace_back(new ScanLoadActionGenerator());
+        VLOG(20) << "[lb] " << action_generators_.back()->Name() << " enabled";
+    }
+}
+
+UnityBalancer::~UnityBalancer() {  // empty body: no explicit cleanup is performed here
+}
+
+bool UnityBalancer::BalanceCluster(
+        const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes,
+        std::vector<Plan>* plans) {
+    return BalanceCluster("", lb_nodes, plans);  // delegates with an empty table name
+}
+
+// Searches for a lower-cost tablet layout; fills *plans only when the cost was lowered.
+bool UnityBalancer::BalanceCluster(const std::string& table_name,
+        const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes,
+        std::vector<Plan>* plans) {
+    if (lb_nodes.size() <= 1 || plans == nullptr) {
+        return false;  // fewer than two nodes, or no output vector to fill
+    }
+
+    VLOG(5) << "[lb] BalanceCluster for table:" << table_name << " begin";
+
+    auto cluster = std::make_shared<Cluster>(lb_nodes, lb_options_);
+
+    if (lb_options_.debug_mode_enabled) {
+        cluster->DebugCluster();
+    }
+
+    InitCostFunctions(cluster);
+
+    if (!NeedBalance(cluster)) {
+        return true;
+    }
+
+    uint64_t max_steps = std::min(lb_options_.max_compute_steps,
+            static_cast<uint64_t>(lb_options_.max_compute_steps_per_tablet * cluster->tablet_num_));
+    double init_cost = ComputeCost(std::numeric_limits<double>::max());
+    double current_cost = init_cost;
+
+    VLOG(5) << "[lb] compute begin, max_steps:" << max_steps << " init_cost:" << init_cost;
+    int64_t start_time_us = get_micros();  // divided by 1000 below to get milliseconds
+    int64_t cost_time_ms = 0;
+    uint64_t step = 0;
+    for (step = 0; step < max_steps; ++step) {
+        std::shared_ptr<Action> action(NextAction(cluster));
+        VLOG(20) << "[lb] step:" << step << " action:" << action->ToString();
+
+        if (!cluster->ValidAction(action)) {
+            continue;  // NOTE(review): this also skips the time-limit check below — confirm intended
+        }
+
+        cluster->DoAction(action);
+
+        if (lb_options_.debug_mode_enabled) {
+            cluster->DebugCluster();
+        }
+
+        double candidate_cost = ComputeCost(current_cost);
+        if (candidate_cost < current_cost) {
+            VLOG(20) << "[lb] got lower cost!";
+            current_cost = candidate_cost;
+        } else {
+            std::shared_ptr<Action> rollback(action->UndoAction());
+            VLOG(20) << "[lb] undo action:" << rollback->ToString();
+            cluster->DoAction(rollback);
+
+            if (lb_options_.debug_mode_enabled) {
+                cluster->DebugCluster();
+            }
+        }
+
+        cost_time_ms = (get_micros() - start_time_us) / 1000;
+        if (static_cast<uint64_t>(cost_time_ms) > lb_options_.max_compute_time_ms) {
+            VLOG(5) << "[lb] stop computing since time reach to max_compute_time_ms_:"
+                    << lb_options_.max_compute_time_ms;
+            break;
+        }
+    }
+
+    VLOG(5) << "[lb] compute end, cost time(ms):" << cost_time_ms
+            << " cost steps:" << step
+            << " init cost:" << init_cost
+            << " new cost:" << current_cost;
+
+    if (current_cost < init_cost) {
+        CreatePlans(cluster, plans);
+        VLOG(5) << "[lb] balance plan size:" << plans->size();
+    } else {
+        VLOG(5) << "[lb] no better balance plan";
+    }
+
+    VLOG(5) << "[lb] BalanceCluster for table:" << table_name << " end";
+
+    return true;
+}
+
+// True when the weight-normalized cost reaches min_cost_need_balance; `cluster` is not read here.
+bool UnityBalancer::NeedBalance(const std::shared_ptr<Cluster>& cluster) {
+    double weighted_sum = 0.0;
+    double weight_sum = 0.0;
+
+    for (const auto& func : cost_functions_) {
+        const double w = func->GetWeight();
+        if (w <= 0) {
+            continue;  // cost functions without a positive weight do not contribute
+        }
+        weight_sum += w;
+        weighted_sum += func->Cost() * w;
+    }
+    double avg_cost = weight_sum == 0 ? 0 : weighted_sum / weight_sum;
+
+    VLOG(5) << "[lb] NeedBalance compute, total_cost:" << weighted_sum
+            << " total_weight:" << weight_sum
+            << " cost:" << avg_cost
+            << " min_cost_need_balance:" << lb_options_.min_cost_need_balance;
+
+    const bool balanced_enough = weighted_sum <= 0 || weight_sum <= 0 ||
+                                 avg_cost < lb_options_.min_cost_need_balance;
+    if (balanced_enough) {
+        LOG(INFO) << "[lb] no need to balance";
+    }
+    return !balanced_enough;
+}
+
+void UnityBalancer::InitCostFunctions(const std::shared_ptr<Cluster>& cluster) {
+    for (size_t i = 0; i < cost_functions_.size(); ++i) {  // in registration order
+        cost_functions_[i]->Init(cluster);
+    }
+}
+
+// Sums Cost() * weight over cost functions with positive weight. Stops early once the
+// running total exceeds previous_cost, so the result may be a partial sum in that case.
+double UnityBalancer::ComputeCost(double previous_cost) {
+    VLOG(20) << "[lb] ComputeCost begin, previous cost:" << previous_cost;
+    double running_total = 0.0;
+    for (const auto& func : cost_functions_) {
+        const double w = func->GetWeight();
+        if (w <= 0) {
+            continue;
+        }
+        const double c = func->Cost();
+        running_total += c * w;
+        VLOG(20) << "[lb] " << func->Name() << " cost:" << c << " weight:" << w;
+        if (running_total > previous_cost) {
+            break;  // already above previous_cost; remaining functions are not evaluated
+        }
+    }
+    VLOG(20) << "[lb] ComputeCost end, new cost:" << running_total;
+    return running_total;
+}
+
+Action* UnityBalancer::NextAction(const std::shared_ptr<Cluster>& cluster) {
+    const uint32_t generator_index = Random::Rand(0, action_generators_.size());  // random pick
+    return action_generators_[generator_index]->Generate(cluster);
+}
+
+// Clears *plans, then emits one Plan per tablet whose node index differs from its initial one.
+void UnityBalancer::CreatePlans(const std::shared_ptr<Cluster>& cluster, std::vector<Plan>* plans) {
+    plans->clear();
+    for (uint32_t tablet = 0; tablet < cluster->tablet_index_to_node_index_.size(); ++tablet) {
+        const uint32_t from = cluster->initial_tablet_index_to_node_index_[tablet];
+        const uint32_t to = cluster->tablet_index_to_node_index_[tablet];
+        if (from == to) {
+            continue;  // tablet is still on its initial node
+        }
+        // tablet has been moved to another tablet node
+        Plan plan(cluster->tablets_[tablet]->tablet_ptr,
+                  cluster->nodes_[from]->tablet_node_ptr,
+                  cluster->nodes_[to]->tablet_node_ptr);
+        plans->emplace_back(plan);
+    }
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/unity_balancer.h b/src/load_balancer/unity_balancer.h
new file mode 100644
index 000000000..522acabff
--- /dev/null
+++ b/src/load_balancer/unity_balancer.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_UNITY_BALANCER_H_
+#define TERA_LOAD_BALANCER_UNITY_BALANCER_H_
+
+#include <memory>
+#include <vector>
+
+#include "load_balancer/action_generators.h"
+#include "load_balancer/actions.h"
+#include "load_balancer/balancer.h"
+#include "load_balancer/cluster.h"
+#include "load_balancer/cost_functions.h"
+
+namespace tera {
+namespace load_balancer {
+
+class UnityBalancer : public Balancer {  // Balancer driven by cost functions and action generators
+public:
+    explicit UnityBalancer(const LBOptions& options);
+    virtual ~UnityBalancer();
+
+    virtual bool BalanceCluster(
+            const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes,
+            std::vector<Plan>* plans) override;
+
+    // if table_name is empty, balance whole cluster,
+    // otherwise balance the specified table of table_name
+    virtual bool BalanceCluster(
+            const std::string& table_name,
+            const std::vector<std::shared_ptr<LBTabletNode>>& lb_nodes,
+            std::vector<Plan>* plans) override;
+    // compares the weight-normalized cost against LBOptions::min_cost_need_balance
+    virtual bool NeedBalance(const std::shared_ptr<Cluster>& cluster);
+
+protected:
+    virtual void InitCostFunctions(const std::shared_ptr<Cluster>& cluster);
+    // weighted sum of the cost functions; may stop early once it exceeds previous_cost
+    virtual double ComputeCost(double previous_cost);
+    // asks a randomly chosen action generator for the next action (raw pointer)
+    virtual Action* NextAction(const std::shared_ptr<Cluster>& cluster);
+
+    // diff the initial cluster state with the current cluster state, then create plans
+    virtual void CreatePlans(const std::shared_ptr<Cluster>& cluster, std::vector<Plan>* plans);
+
+private:
+    std::vector<std::shared_ptr<CostFunction>> cost_functions_;  // filled by the constructor
+    std::vector<std::shared_ptr<ActionGenerator>> action_generators_;  // filled by the constructor
+
+    LBOptions lb_options_;  // copy of the options passed to the constructor
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_UNITY_BALANCER_H_
diff --git a/src/master/availability.cc b/src/master/availability.cc
index 998c14a8e..1b5c85b71 100644
--- a/src/master/availability.cc
+++ b/src/master/availability.cc
@@ -24,15 +24,50 @@ DECLARE_string(tera_master_meta_table_path);
 namespace tera {
 namespace master {
 
+// Maps a tablet path to its table name: the meta table path maps to the meta table
+// name flag; any other path yields its first "/"-separated component, or "" when
+// SplitString produces no components.
+static std::string GetNameFromPath(const std::string& path) {
+    if (path == FLAGS_tera_master_meta_table_path) {
+        return FLAGS_tera_master_meta_table_name;
+    }
+
+    std::vector<std::string> parts;
+    SplitString(path, "/", &parts); // table_name/tablet00...001
+    return parts.empty() ? std::string() : parts[0];
+}
+
+
 TabletAvailability::TabletAvailability(std::shared_ptr<TabletManager> t)
     : tablet_manager_(t) {
     start_ts_ = get_micros();
 }
 
-void TabletAvailability::AddNotReadyTablet(const std::string& path) {
+void TabletAvailability::AddNotReadyTablet(const std::string& path,
+                                           const TabletStatus& tablet_status,
+                                           const TableStatus& table_status) {
+    if (tablet_status == kTableReady || table_status == kTableDisable) {
+        return;
+    }
+
     MutexLock lock(&mutex_);
     int64_t ts = get_micros();
     tablets_.insert(std::pair<std::string, int64_t>(path, ts));
+    auto iter = not_ready_tablet_metrics_.emplace(
+        path,
+        MetricCounter{
+            metric_name_,
+            "table:" + GetNameFromPath(path) + ",tablet:" + path,
+            {SubscriberType::LATEST},
+            false
+        });
+
+    if (iter.second) {
+        VLOG(12) << "[Add NotReady To Metric]: " << static_cast<int64_t>(TabletErrorStatus::kNotReady);
+        iter.first->second.Set(static_cast<int64_t>(TabletErrorStatus::kNotReady));
+    } else {
+        VLOG(12) << "[Add NotReady To Metric Failed]: " << static_cast<int64_t>(TabletErrorStatus::kNotReady);
+    }
 
     if (tablets_hist_cost_[path].start_ts > 0) {
         VLOG(10) << "notready again " << path;
@@ -51,6 +86,7 @@ void TabletAvailability::AddNotReadyTablet(const std::string& path) {
 void TabletAvailability::EraseNotReadyTablet(const std::string& path) {
     MutexLock lock(&mutex_);
     tablets_.erase(path);
+    not_ready_tablet_metrics_.erase(path);
 
     if (tablets_hist_cost_.find(path) == tablets_hist_cost_.end() ||
         tablets_hist_cost_[path].start_ts == 0) {
@@ -71,22 +107,13 @@ void TabletAvailability::EraseNotReadyTablet(const std::string& path) {
         << ", reready " << tablets_hist_cost_[path].reready_num;
 }
 
-static std::string GetNameFromPath(const std::string& path) {
-    if (path == FLAGS_tera_master_meta_table_path) {
-        return FLAGS_tera_master_meta_table_name;
-    }
-    std::vector<std::string> t;
-    SplitString(path, "/", &t); // table_name/tablet00...001
-    return t[0];
-}
-
 void TabletAvailability::LogAvailability() {
     MutexLock lock(&mutex_);
     int64_t not_avai_count = 0;
     int64_t not_avai_warning = 0;
     int64_t not_avai_error = 0;
     int64_t not_avai_fatal = 0;
-    int64_t start = ::common::timer::get_micros();
+    int64_t start = get_micros();
     std::map<std::string, int64_t>::iterator it;
     for (it = tablets_.begin(); it != tablets_.end(); ++it) {
         std::string table_name = GetNameFromPath(it->first);
@@ -99,22 +126,28 @@ void TabletAvailability::LogAvailability() {
             continue;
         }
 
+        auto metric_iter = not_ready_tablet_metrics_.find(it->first);
+        assert(metric_iter != not_ready_tablet_metrics_.end());
+
         if ((start - it->second) > FLAGS_tera_master_not_available_threshold * 1000 * 1000LL) {
             VLOG(12) << "[availability] not available:" << it->first;
             not_avai_count++;
         }
         if ((start - it->second) > FLAGS_tera_master_availability_fatal_threshold * 1000 * 1000LL) {
             not_avai_fatal++;
+            metric_iter->second.Set(static_cast<int64_t>(TabletErrorStatus::kFatal));
             if (FLAGS_tera_master_availability_show_details_enabled) {
                 LOG(INFO) << "[availability] fatal-tablet:" << it->first;
             }
         } else if ((start - it->second) > FLAGS_tera_master_availability_error_threshold * 1000 * 1000LL) {
             not_avai_error++;
+            metric_iter->second.Set(static_cast<int64_t>(TabletErrorStatus::kError));
             if (FLAGS_tera_master_availability_show_details_enabled) {
                 LOG(INFO) << "[availability] error-tablet:" << it->first;
             }
         } else if ((start - it->second) > FLAGS_tera_master_availability_warning_threshold * 1000 * 1000LL) {
             not_avai_warning++;
+            metric_iter->second.Set(static_cast<int64_t>(TabletErrorStatus::kWarning));
         }
     }
 
@@ -155,6 +188,9 @@ void TabletAvailability::LogAvailability() {
         }
     }
     int64_t nr_notready_tablets = tablets_hist_cost_.size();
+    double time_percent = 1.0 - (double)total_time / (all_time * all_tablets + 1);
+    ready_time_percent.Set(static_cast<int64_t>(time_percent * 100));
+
     LOG(INFO) << "[availability][tablet_staticstic] time_interval: " << all_time / 1000
       << ", notready_time: " << total_time / 1000
       << ", total_time: " << (all_time * all_tablets) / 1000
@@ -165,7 +201,7 @@ void TabletAvailability::LogAvailability() {
       << ", notready_count: " << total_notready
       << ", reready_count: " << total_reready;
 
-    int64_t cost = ::common::timer::get_micros() - start;
+    int64_t cost = get_micros() - start;
     LOG(INFO) << "[availability] cost time:" << cost/1000 << " ms";
 }
 
diff --git a/src/master/availability.h b/src/master/availability.h
index ddbe6a5f4..d6c133c72 100644
--- a/src/master/availability.h
+++ b/src/master/availability.h
@@ -6,10 +6,12 @@
 #define TERA_MASTER_TABLET_AVAILABILITY_H_
 
 #include <string>
+#include <map>
 
 #include "master/tablet_manager.h"
 
 #include "common/mutex.h"
+#include "common/metric/metric_counter.h"
 
 namespace tera {
 namespace master {
@@ -25,16 +27,32 @@ class TabletAvailability {
 public:
     TabletAvailability(std::shared_ptr<TabletManager> t);
     void LogAvailability();
-    void AddNotReadyTablet(const std::string& id);
+    void AddNotReadyTablet(const std::string& path,
+                           const TabletStatus& tablet_status,
+                           const TableStatus& table_status);
     void EraseNotReadyTablet(const std::string& id);
 
 private:
+
+    enum class TabletErrorStatus {  // values written to the per-tablet availability MetricCounter
+        kNotReady = 1,  // set when a tablet is first recorded as not ready
+        kFatal = 2,     // not-ready time exceeded the fatal threshold flag
+        kError = 3,     // not-ready time exceeded the error threshold flag
+        kWarning = 4    // not-ready time exceeded the warning threshold flag
+    };
+
     Mutex mutex_;
     std::shared_ptr<TabletManager> tablet_manager_;
+
     std::map<std::string, int64_t> tablets_;
+    std::map<std::string, MetricCounter> not_ready_tablet_metrics_;
+    MetricCounter ready_time_percent{"tera_master_tablet_ready_time_percent",
+                                     {SubscriberType::LATEST},
+                                     false};
 
     int64_t start_ts_;
     std::map<std::string, TimeStatistic> tablets_hist_cost_;
+    const std::string metric_name_{"tera_master_tablet_availability"};
 };
 
 } // master
diff --git a/src/master/gc_strategy.cc b/src/master/gc_strategy.cc
index d87c96848..b87d113b1 100644
--- a/src/master/gc_strategy.cc
+++ b/src/master/gc_strategy.cc
@@ -8,12 +8,12 @@
 
 #include "db/filename.h"
 #include "io/utils_leveldb.h"
-
+#include "leveldb/env_dfs.h"
 
 DECLARE_string(tera_tabletnode_path_prefix);
 DECLARE_string(tera_master_meta_table_name);
 DECLARE_int32(tera_garbage_collect_debug_log);
-
+DECLARE_string(tera_leveldb_env_type);
 namespace tera {
 namespace master {
 
@@ -147,7 +147,15 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint
     env->GetChildren(tablet_path, &children);
     list_count_.Inc();
     if (children.size() == 0) {
-        LOG(INFO) << "[gc] delete empty tablet dir: " << tablet_path;
+        leveldb::FileLock* file_lock = nullptr;
+        // NEVER remove the trailing character '/', otherwise you will lock the parent directory!
+        leveldb::Status s = env->LockFile(tablet_path + "/", &file_lock);
+        if (!s.ok()) {
+            LOG(WARNING) << "lock path failed, path: " << tablet_path << ", status: " << s.ToString();
+        }
+ 
+        delete file_lock;
+
         env->DeleteDir(tablet_path);
         return false;
     }
@@ -157,6 +165,14 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint
         uint64_t number = 0;
         if (ParseFileName(children[lg], &number, &type)) {
             LOG(INFO) << "[gc] delete: " << lg_path;
+
+            leveldb::FileLock* file_lock = nullptr;
+            // NEVER remove the trailing character '/', otherwise you will lock the parent directory!
+            leveldb::Status s = env->LockFile(tablet_path + "/", &file_lock);
+            if (!s.ok()) {
+                LOG(WARNING) << "lock path failed, path: " << tablet_path << ", status: " << s.ToString();
+            }
+            delete file_lock;  // was leaked on this path; the other lock sites in this file delete it
             env->DeleteFile(lg_path);
             continue;
         }
@@ -173,6 +189,13 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint
         list_count_.Inc();
         if (files.size() == 0) {
             LOG(INFO) << "[gc] delete empty lg dir: " << lg_path;
+            leveldb::FileLock* file_lock = nullptr;
+            // NEVER remove the trailing character '/', otherwise you will lock the parent directory!
+            leveldb::Status s = env->LockFile(tablet_path + "/", &file_lock);
+            if (!s.ok()) {
+                LOG(WARNING) << "lock path failed, path: " << tablet_path << ", status: " << s.ToString();
+            }
+            delete file_lock;
             env->DeleteDir(lg_path);
             continue;
         }
@@ -184,6 +207,13 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint
             if (!ParseFileName(files[f], &number, &type) ||
                 type != leveldb::kTableFile) {
                 // only keep sst, delete rest files
+                leveldb::FileLock* file_lock = nullptr;
+                // NEVER remove the trailing character '/', otherwise you will lock the parent directory!
+                leveldb::Status s = env->LockFile(lg_path + "/", &file_lock);
+                if (!s.ok()) {
+                    LOG(WARNING) << "lock path failed, path: " << lg_path << ", status: " << s.ToString();
+                }
+                delete file_lock;
                 io::DeleteEnvDir(file_path);
                 continue;
             }
@@ -214,7 +244,20 @@ void BatchGcStrategy::DeleteObsoleteFiles() {
         for (size_t lg = 0; lg < file_set.size(); ++lg) {
             std::set<uint64_t>::iterator it = file_set[lg].begin();
             for (; it != file_set[lg].end(); ++it) {
-                std::string file_path = leveldb::BuildTableFilePath(tablepath, lg, *it);
+                uint64_t tablet = 0;
+                uint64_t number = 0;
+                leveldb::ParseFullFileNumber(*it, &tablet, &number);
+                std::string file_path = leveldb::BuildTableFilePath(tablepath, tablet, lg, number);
+                std::string lg_path = leveldb::BuildTabletLgPath(tablepath, tablet, lg);
+
+                leveldb::FileLock* file_lock = nullptr;
+                // NEVER remove the trailing character '/', otherwise you will lock the parent directory!
+                leveldb::Status s = env->LockFile(lg_path + "/", &file_lock);
+                if (!s.ok()) {
+                    LOG(WARNING) << "lock path failed, path: " << lg_path << ", status: " << s.ToString();
+                }
+                delete file_lock;
+
                 LOG(INFO) << "[gc] delete: " << file_path;
                 env->DeleteFile(file_path);
                 file_delete_num_++;
@@ -223,390 +266,5 @@ void BatchGcStrategy::DeleteObsoleteFiles() {
     }
 }
 
-IncrementalGcStrategy::IncrementalGcStrategy(std::shared_ptr<TabletManager> tablet_manager)
-    :   tablet_manager_(tablet_manager),
-        last_gc_time_(std::numeric_limits<int64_t>::max()),
-        max_ts_(std::numeric_limits<int64_t>::max()) {}
-
-bool IncrementalGcStrategy::PreQuery () {
-    int64_t start_ts = get_micros();
-    std::vector<TablePtr> tables;
-    tablet_manager_->ShowTable(&tables, NULL);
-
-    for (size_t i = 0; i < tables.size(); ++i) {
-        TabletFiles tablet_files;
-        std::string table_name = tables[i]->GetTableName();
-        if (table_name == FLAGS_tera_master_meta_table_name) continue;
-        dead_tablet_files_.insert(std::make_pair(table_name, tablet_files));
-        live_tablet_files_.insert(std::make_pair(table_name, tablet_files));
-
-        std::set<uint64_t> live_tablets, dead_tablets;
-        if (!tables[i]->GetTabletsForGc(&live_tablets, &dead_tablets, true)) {
-            continue;
-        }
-        std::set<uint64_t>::iterator it;
-        // update dead tablets
-        for (it = dead_tablets.begin(); it != dead_tablets.end(); ++it) {
-            TabletFiles& temp_tablet_files = dead_tablet_files_[table_name];
-            TabletFileSet tablet_file_set(get_micros() / 1000000, 0);
-            bool ret = temp_tablet_files.insert(std::make_pair(*it, tablet_file_set)).second;
-            if (ret) {
-                VLOG(10) << "[gc] newly dead talbet: " << leveldb::GetTabletPathFromNum(table_name, *it);
-                if (!CollectSingleDeadTablet(table_name, *it)) {
-                    // collect from DFS fails, so rollback memory status, retry in the next time
-                    assert(dead_tablet_files_[table_name].erase(*it) == 1);
-                }
-            } else {
-                VLOG(20) << "[gc] old dead talbet: " << leveldb::GetTabletPathFromNum(table_name, *it);
-            }
-        }
-
-        // erase newly dead tablets from live tablets
-        for (TabletFiles::iterator it = live_tablet_files_[table_name].begin();
-             it != live_tablet_files_[table_name].end();) {
-            if (dead_tablet_files_[table_name].find(static_cast<uint64_t>(it->first)) != dead_tablet_files_[table_name].end()) {
-                live_tablet_files_[table_name].erase(it++);
-            } else {
-                ++it;
-            }
-        }
-
-        // add new live tablets
-        for (it = live_tablets.begin(); it != live_tablets.end(); ++it) {
-            TabletFiles& temp_tablet_files = live_tablet_files_[table_name];
-            TabletFileSet tablet_file_set;
-            temp_tablet_files.insert(std::make_pair(*it, tablet_file_set));
-        }
-    }
-    if (FLAGS_tera_garbage_collect_debug_log) {
-        DEBUG_print_files(true);
-        DEBUG_print_files(false);
-    }
-    LOG(INFO) << "[gc] Gather dead tablets, cost: " << (get_micros() - start_ts) / 1000 << "ms.";
-
-    // do not need gc if there is no new dead tablet
-    if (dead_tablet_files_.size() == 0) {
-        LOG(INFO) << "[gc] Do not need gc this time";
-    }
-    return dead_tablet_files_.size() != 0;
-}
-
-void IncrementalGcStrategy::ProcessQueryCallbackForGc(QueryResponse* response) {
-    LOG(INFO) << "[gc] ProcessQueryCallbackForGc";
-    MutexLock lock(&gc_mutex_);
-
-    std::set<std::string> ready_tables;
-    for (int table = 0; table < response->inh_live_files_size(); ++table) {
-        ready_tables.insert(response->inh_live_files(table).table_name());
-    }
-
-    // update tablet ready time
-    for (int i = 0; i < response->tabletmeta_list().meta_size(); ++i) {
-        const TabletMeta& meta = response->tabletmeta_list().meta(i);
-        std::string table_name = meta.table_name();
-        if (table_name == FLAGS_tera_master_meta_table_name) continue;
-        if (live_tablet_files_.find(table_name) == live_tablet_files_.end() ||
-            ready_tables.find(table_name) == ready_tables.end()) {
-            continue;
-        }
-        int64_t tablet_number = static_cast<int64_t>(leveldb::GetTabletNumFromPath(meta.path()));
-        VLOG(15) << "[gc] see live tablet " << leveldb::GetTabletPathFromNum(table_name, tablet_number);
-        if (live_tablet_files_[table_name].find(tablet_number) == live_tablet_files_[table_name].end()) continue;
-        live_tablet_files_[table_name][tablet_number].ready_time_ = get_micros() / 1000000;
-    }
-
-    // insert live files
-    for (int table = 0; table < response->inh_live_files_size(); ++table) {
-        InheritedLiveFiles live_files = response->inh_live_files(table);
-        std::string table_name = live_files.table_name();
-        if (table_name == FLAGS_tera_master_meta_table_name) continue;
-        VLOG(12) << "[gc] inh pb: " << response->inh_live_files(table).ShortDebugString();
-        if (live_tablet_files_.find(table_name) == live_tablet_files_.end()) continue;
-        // collect live files
-        TabletFiles temp_tablet_files;
-        for (int lg = 0; lg < live_files.lg_live_files_size(); ++lg) {
-            LgInheritedLiveFiles lg_live_files = live_files.lg_live_files(lg);
-            uint32_t lg_no = lg_live_files.lg_no();
-            for (int i = 0; i < lg_live_files.file_number_size(); ++i) {
-                uint64_t tablet_number, file;
-                uint64_t file_number = lg_live_files.file_number(i);
-                leveldb::ParseFullFileNumber(file_number, &tablet_number, &file);
-                if (dead_tablet_files_[table_name].find(tablet_number) ==
-                    dead_tablet_files_[table_name].end()) {
-                    VLOG(12) << "[gc] skip live tablet " << tablet_number;
-                    continue;
-                }
-                TabletFileSet tablet_file_set;
-                temp_tablet_files.insert(std::make_pair(tablet_number, tablet_file_set));
-                TabletFileSet& temp_tablet_file_set = temp_tablet_files[tablet_number];
-                LgFileSet lg_files;
-                temp_tablet_file_set.files_.insert(std::make_pair(lg_no, lg_files));
-                temp_tablet_file_set.files_[lg_no].live_files_.insert(file_number);
-                VLOG(12) << "[gc] insert live file " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no << "/" << file;
-                const LgFileSet& check = ((dead_tablet_files_[table_name][tablet_number]).files_)[lg_no];
-                if (check.storage_files_.find(file_number) == check.storage_files_.end()) {
-                    LOG(WARNING) << "[gc] insert error " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no << "/" << file;
-                }
-            }
-        }
-        // update live files in dead tablets
-        TabletFiles::iterator tablet_it = temp_tablet_files.begin();
-        TabletFiles& dead_tablets = dead_tablet_files_[table_name];
-        for (; tablet_it != temp_tablet_files.end(); ++tablet_it) {
-            uint64_t tablet_number = tablet_it->first;
-            if (dead_tablets.find(tablet_number) == dead_tablets.end()) {
-                VLOG(12) << "[gc] skip live tablet " << table_name << "/" << tablet_number;
-                continue;
-            }
-            std::map<int64_t, LgFileSet>& live_lg = (tablet_it->second).files_;
-            std::map<int64_t, LgFileSet>& dead_lg = dead_tablets[tablet_number].files_;
-            std::map<int64_t, LgFileSet>::iterator lg_it = live_lg.begin();
-            for (; lg_it != live_lg.end(); ++lg_it) {
-                uint32_t lg_no = lg_it->first;
-                LgFileSet lg_file_set;
-                dead_lg.insert(std::make_pair(lg_no, lg_file_set));
-                for (std::set<uint64_t>::iterator it = live_lg[lg_no].live_files_.begin(); it != live_lg[lg_no].live_files_.end(); ++it) {
-                    dead_lg[lg_no].live_files_.insert(*it);
-                }
-                VLOG(12) << "[gc] dead tablet's live lg: " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no;
-            }
-        }
-    }
-    if (FLAGS_tera_garbage_collect_debug_log) {
-        DEBUG_print_files(true);
-    }
-}
-
-void IncrementalGcStrategy::PostQuery () {
-    LOG(INFO) << "[gc] PostQuery";
-    if (FLAGS_tera_garbage_collect_debug_log) {
-        DEBUG_print_files(true);
-        DEBUG_print_files(false);
-    }
-    int64_t start_ts = get_micros();
-    TableFiles::iterator table_it = dead_tablet_files_.begin();
-    for (; table_it != dead_tablet_files_.end(); ++table_it) {
-        DeleteTableFiles(table_it->first);
-    }
-    if (FLAGS_tera_garbage_collect_debug_log) {
-        DEBUG_print_files(true);
-        DEBUG_print_files(false);
-    }
-    LOG(INFO) << "[gc] Delete useless sst, cost: " << (get_micros() - start_ts) / 1000 << "ms. list_times " << list_count_.Get();
-    list_count_.Clear();
-}
-
-void IncrementalGcStrategy::Clear(std::string tablename) {
-    LOG(INFO) << "[gc] Clear " << tablename;
-    MutexLock lock(&gc_mutex_);
-    dead_tablet_files_.erase(tablename);
-    live_tablet_files_.erase(tablename);
-}
-
-void IncrementalGcStrategy::DeleteTableFiles(const std::string& table_name) {
-    std::string table_path = FLAGS_tera_tabletnode_path_prefix + table_name;
-    leveldb::Env* env = io::LeveldbBaseEnv();
-    TabletFiles& dead_tablets = dead_tablet_files_[table_name];
-    TabletFiles& live_tablets = live_tablet_files_[table_name];
-    int64_t earliest_ready_time = max_ts_;
-    TabletFiles::iterator tablet_it = live_tablets.begin();
-    for (; tablet_it != live_tablets.end(); ++tablet_it) {
-        if (tablet_it->second.ready_time_ < earliest_ready_time) {
-            earliest_ready_time = tablet_it->second.ready_time_;
-        }
-    }
-
-    if (earliest_ready_time != max_ts_) {
-        VLOG(12) << "[gc] earliest ready time " << earliest_ready_time << " : " << common::timer::get_time_str(earliest_ready_time);
-    } else {
-        VLOG(12) << "[gc] " << table_name << "'s tablets not ready";
-    }
-    std::set<int64_t> gc_tablets;
-    for (tablet_it = dead_tablets.begin(); tablet_it != dead_tablets.end(); ++tablet_it) {
-        if (tablet_it->second.dead_time_ < earliest_ready_time) {
-            gc_tablets.insert(tablet_it->first);
-            VLOG(12) << "[gc] will gc tablet: " << leveldb::GetTabletPathFromNum(table_name, tablet_it->first);
-        }
-    }
-
-    for (std::set<int64_t>::iterator gc_it = gc_tablets.begin(); gc_it != gc_tablets.end();) {
-        std::map<int64_t, LgFileSet>& lg_files = dead_tablets[*gc_it].files_;
-        std::map<int64_t, LgFileSet>::iterator lg_it = lg_files.begin();
-        std::string tablet_path = leveldb::GetTabletPathFromNum(table_path, *gc_it);
-        for (; lg_it != lg_files.end();) {
-            VLOG(12) << "[gc] entry lg gc lg=" << lg_it->first;
-            LgFileSet& lg_file_set = lg_it->second;
-            std::set<uint64_t>::iterator file_it = lg_file_set.storage_files_.begin();
-            for (; file_it != lg_file_set.storage_files_.end();) {
-                if (lg_file_set.live_files_.find(*file_it) == lg_file_set.live_files_.end()) {
-                    std::string file_path =
-                        leveldb::BuildTableFilePath(table_path, lg_it->first, *file_it);
-
-                    std::string debug_str;
-                    for (std::set<uint64_t>::iterator it = lg_file_set.live_files_.begin(); it != lg_file_set.live_files_.end(); ++it) {
-                        uint64_t file_no;
-                        leveldb::ParseFullFileNumber(*it, NULL, &file_no);
-                        debug_str += " " + std::to_string(file_no);
-                    }
-                    // VLOG(12) << "[gc] live = " << debug_str;
-                    LOG(INFO) << "[gc] delete: " << file_path;
-                    if (env->DeleteFile(file_path).ok()) {
-                        lg_file_set.storage_files_.erase(file_it++);
-                    } else {
-                        ++file_it;
-                        // do nothing, try to delete next time
-                        // TODO: if retry times > MAX ?
-                        // TODO: if failed due to timeout but delete ok in DFS, it will always retry
-                    }
-                } else {
-                    ++file_it;
-                }
-            }
-            if (lg_file_set.storage_files_.size() == 0) {
-                if (lg_file_set.live_files_.size() != 0) {
-                    uint64_t full_number = *(lg_file_set.live_files_.begin());
-                    uint64_t tablet_number, file_number;
-                    leveldb::ParseFullFileNumber(full_number, &tablet_number, &file_number);
-                    LOG(ERROR) << "[gc] empty tablet still has live files: " << tablet_number << "/" << lg_it->first << "/" << file_number;
-                } else {
-                    std::string lg_str = std::to_string(lg_it->first);
-                    std::string lg_path = tablet_path + "/" + lg_str;
-                    LOG(INFO) << "[gc] delete empty lg dir: " << lg_path;
-                    if (io::DeleteEnvDir(lg_path).ok()) {
-                        lg_files.erase(lg_it++);
-                    } else {
-                        ++lg_it;
-                        // do nothing, try to delete next time
-                        // TODO: iff retry times > MAX ?
-                        // TODO: if failed due to timeout but delete ok in DFS, it will always retry
-                    }
-                }
-            } else {
-                ++lg_it;
-            }
-        }
-
-        if (lg_files.size() == 0) {
-            LOG(INFO) << "[gc] delete empty tablet dir: " << tablet_path;
-            if (env->DeleteDir(tablet_path).ok()) {
-                dead_tablets.erase(*gc_it);
-            } else {
-                LOG(ERROR) << "[gc] rm dir fail: " << tablet_path;
-                // do nothing, try to delete next time
-                // TODO: iff retry times > MAX ?
-                // TODO: if failed due to timeout but delete ok in DFS, it will always retry
-            }
-        } else {
-            // clear live_files_ in dead_tablets for next round of gc
-            for (lg_it = lg_files.begin(); lg_it != lg_files.end(); ++lg_it) {
-                VLOG(12) << "[gc] clear live_files_(lg_no/file_no): " << *gc_it << "/" << lg_it->first;
-                lg_it->second.live_files_.clear();
-            }
-            dead_tablets[*gc_it].dead_time_ = get_micros() / 1000000;
-            VLOG(12) << "[gc] update dead_time_ " << dead_tablets[*gc_it].dead_time_ << " " << common::timer::get_time_str(dead_tablets[*gc_it].dead_time_);
-        }
-        gc_it++;
-    }
-}
-
-bool IncrementalGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum) {
-    std::string tablepath = FLAGS_tera_tabletnode_path_prefix + tablename;
-    std::string tablet_path = leveldb::GetTabletPathFromNum(tablepath, tabletnum);
-    leveldb::Env* env = io::LeveldbBaseEnv();
-    std::vector<std::string> children;
-    leveldb::Status s = env->GetChildren(tablet_path, &children);
-    if (!s.ok()) {
-        LOG(ERROR) << "[gc] list directory fail: " << tablet_path;
-        return false;
-    }
-    list_count_.Inc();
-
-    for (size_t lg = 0; lg < children.size(); ++lg) {
-        std::string lg_path = tablet_path + "/" + children[lg];
-        leveldb::FileType type = leveldb::kUnknown;
-        uint64_t number = 0;
-        if (ParseFileName(children[lg], &number, &type)) {
-            LOG(INFO) << "[gc] delete: " << lg_path;
-            env->DeleteFile(lg_path);
-            continue;
-        }
-
-        leveldb::Slice rest(children[lg]);
-        uint64_t lg_num = 0;
-        if (!leveldb::ConsumeDecimalNumber(&rest, &lg_num)) {
-            LOG(INFO) << "[gc] skip unknown dir: " << lg_path;
-            continue;
-        }
-
-        std::vector<std::string> files;
-        env->GetChildren(lg_path, &files);
-        list_count_.Inc();
-
-        int64_t lg_no = std::stoll(children[lg]);
-        std::map<int64_t, LgFileSet>& tablet_files = dead_tablet_files_[tablename][tabletnum].files_;
-        LgFileSet lg_file_set;
-        tablet_files.insert(std::make_pair(lg_no, lg_file_set));
-        LgFileSet& temp_lg_files_set = tablet_files[lg_no];
-        for (size_t f = 0; f < files.size(); ++f) {
-            std::string file_path = lg_path + "/" + files[f];
-            type = leveldb::kUnknown;
-            number = 0;
-            if (!ParseFileName(files[f], &number, &type) ||
-                type != leveldb::kTableFile) {
-                // skip manifest/CURRENT
-                continue;
-            }
-
-            uint64_t full_number = leveldb::BuildFullFileNumber(lg_path, number);
-            temp_lg_files_set.storage_files_.insert(full_number);
-        }
-    }
-    return true;
-}
-
-void IncrementalGcStrategy::DEBUG_print_files(bool print_dead) {
-    TableFiles all_tablet_files;
-    if (print_dead == true) {
-        LOG(INFO) << "----------------------------[gc] Test print DEAD";
-        all_tablet_files = dead_tablet_files_;
-    } else {
-        LOG(INFO) << "----------------------------[gc] Test print LIVE";
-        all_tablet_files = live_tablet_files_;
-    }
-    TableFiles::iterator table_it;
-    for (table_it = all_tablet_files.begin(); table_it != all_tablet_files.end(); ++table_it) {
-        LOG(INFO) << "[gc] table=" << table_it->first;
-        TabletFiles& tablet_files = table_it->second;
-        TabletFiles::iterator tablet_it;
-        for (tablet_it = tablet_files.begin(); tablet_it != tablet_files.end(); ++tablet_it) {
-            LOG(INFO) << "[gc]   tablet -- " << tablet_it->first;
-            TabletFileSet tablet_file_set = tablet_it->second;
-            LOG(INFO) << "[gc]   ready -- " << tablet_file_set.ready_time_;
-            LOG(INFO) << "[gc]   dead  -- " << tablet_file_set.dead_time_;
-            std::map<int64_t, LgFileSet>& files = tablet_file_set.files_;
-            std::map<int64_t, LgFileSet>::iterator lg_it;
-            for (lg_it = files.begin(); lg_it != files.end(); ++lg_it) {
-                std::set<uint64_t>& f = (lg_it->second).storage_files_;
-                std::string debug_str = "";
-                for (std::set<uint64_t>::iterator it = f.begin(); it != f.end(); ++it) {
-                    uint64_t file_no;
-                    leveldb::ParseFullFileNumber(*it, NULL, &file_no);
-                    debug_str += " " + std::to_string(file_no);
-                }
-                LOG(INFO) << "[gc]     lg stor -- " << lg_it->first << "-" << (lg_it->second).storage_files_.size() << debug_str;
-                f = (lg_it->second).live_files_;
-                debug_str = "";
-                for (std::set<uint64_t>::iterator it = f.begin(); it != f.end(); ++it) {
-                    uint64_t file_no;
-                    leveldb::ParseFullFileNumber(*it, NULL, &file_no);
-                    debug_str += " " + std::to_string(file_no);
-                }
-                LOG(INFO) << "[gc]     lg live -- " << lg_it->first << "-" << (lg_it->second).live_files_.size() << debug_str;
-            }
-        }
-    }
-    LOG(INFO) << "----------------------------[gc] Done Test print";
-}
-
 } // namespace master
 } // namespace tera
diff --git a/src/master/gc_strategy.h b/src/master/gc_strategy.h
index cccbd91b0..c68364502 100644
--- a/src/master/gc_strategy.h
+++ b/src/master/gc_strategy.h
@@ -7,7 +7,7 @@
 #include "master/tablet_manager.h"
 #include "proto/tabletnode_client.h"
 #include "types.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 
 namespace tera {
 namespace master {
@@ -68,58 +68,6 @@ class BatchGcStrategy : public GcStrategy {
     tera::Counter list_count_;
 };
 
-class IncrementalGcStrategy : public GcStrategy{
-public:
-    IncrementalGcStrategy(std::shared_ptr<TabletManager> tablet_manager);
-    virtual ~IncrementalGcStrategy() {}
-
-    // get dead tablets
-    virtual bool PreQuery ();
-
-    // gather live files
-    virtual void ProcessQueryCallbackForGc(QueryResponse* response);
-
-    // delete dead files
-    virtual void PostQuery ();
-
-    // clear memory when table is deleted
-    virtual void Clear(std::string tablename);
-
-private:
-    void DEBUG_print_files(bool print_dead);
-    bool CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum);
-    void DeleteTableFiles(const std::string& table_name);
-
-    struct LgFileSet {
-        std::set<uint64_t> storage_files_;
-        std::set<uint64_t> live_files_;
-    };
-
-    struct TabletFileSet {
-        int64_t dead_time_;
-        int64_t ready_time_;
-        std::map<int64_t, LgFileSet> files_; // lg_no -> files
-        TabletFileSet() {
-            dead_time_ = std::numeric_limits<int64_t>::max();
-            ready_time_ = 0;
-        };
-        TabletFileSet(int64_t dead_time, int64_t ready_time) {
-            dead_time_ = dead_time;
-            ready_time_ = ready_time;
-        }
-    };
-
-    typedef std::map<int64_t, TabletFileSet> TabletFiles;  // tablet_number -> files
-    typedef std::map<std::string, TabletFiles> TableFiles; // table_name -> files
-    mutable Mutex gc_mutex_;
-    std::shared_ptr<TabletManager> tablet_manager_;
-    int64_t last_gc_time_;
-    TableFiles dead_tablet_files_;
-    TableFiles live_tablet_files_;
-    int64_t max_ts_;
-    tera::Counter list_count_;
-};
-
 } // namespace master
 } // namespace tera
 
diff --git a/src/master/master_entry.cc b/src/master/master_entry.cc
index 1e958c028..13c9d276d 100644
--- a/src/master/master_entry.cc
+++ b/src/master/master_entry.cc
@@ -7,6 +7,7 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 
+#include "common/metric/collector_report.h"
 #include "common/net/ip_address.h"
 #include "master/master_impl.h"
 #include "master/remote_master.h"
@@ -15,6 +16,8 @@
 DECLARE_string(tera_master_port);
 DECLARE_int32(tera_master_rpc_server_max_inflow);
 DECLARE_int32(tera_master_rpc_server_max_outflow);
+DECLARE_bool(tera_metric_http_server_enable);
+DECLARE_int32(tera_metric_http_server_listen_port);
 
 std::string GetTeraEntryName() {
     return "master";
@@ -30,7 +33,8 @@ namespace master {
 MasterEntry::MasterEntry()
     : master_impl_(NULL),
       remote_master_(NULL),
-      rpc_server_(NULL) {
+      rpc_server_(NULL),
+      metric_http_server_(new tera::MetricHttpServer()) {
     sofa::pbrpc::RpcServerOptions rpc_options;
     rpc_options.max_throughput_in = FLAGS_tera_master_rpc_server_max_inflow;
     rpc_options.max_throughput_out = FLAGS_tera_master_rpc_server_max_outflow;
@@ -57,10 +61,20 @@ bool MasterEntry::StartServer() {
     }
 
     LOG(INFO) << "finish starting master server";
+
+    // Start the metric HTTP server when enabled by flag; a failed start is only logged (non-fatal).
+    if (FLAGS_tera_metric_http_server_enable) {
+        if (!metric_http_server_->Start(FLAGS_tera_metric_http_server_listen_port)) {
+            LOG(WARNING) << "Start metric http server failed. Ignore";
+        }
+    } else {
+        LOG(INFO) << "Metric http server is disabled.";
+    }
     return true;
 }
 
 bool MasterEntry::Run() {
+    CollectorReportPublisher::GetInstance().Refresh();
     static int64_t timer_ticks = 0;
     ++timer_ticks;
 
@@ -73,6 +87,7 @@ bool MasterEntry::Run() {
 }
 
 void MasterEntry::ShutdownServer() {
+    metric_http_server_->Stop();
     rpc_server_->Stop();
     master_impl_.reset();
 }
diff --git a/src/master/master_entry.h b/src/master/master_entry.h
index c8f738916..919da4928 100644
--- a/src/master/master_entry.h
+++ b/src/master/master_entry.h
@@ -8,6 +8,7 @@
 #include <sofa/pbrpc/pbrpc.h>
 
 #include "common/base/scoped_ptr.h"
+#include "common/metric/metric_http_server.h"
 #include "tera_entry.h"
 
 namespace tera {
@@ -33,6 +34,7 @@ class MasterEntry : public TeraEntry {
     // scoped_ptr<RemoteMaster> remote_master_;
     RemoteMaster* remote_master_;
     scoped_ptr<sofa::pbrpc::RpcServer> rpc_server_;
+    scoped_ptr<tera::MetricHttpServer> metric_http_server_;
 };
 
 } // namespace master
diff --git a/src/master/master_impl.cc b/src/master/master_impl.cc
index 598faa4ec..f667c7d0f 100644
--- a/src/master/master_impl.cc
+++ b/src/master/master_impl.cc
@@ -26,7 +26,7 @@
 #include "utils/config_utils.h"
 #include "utils/schema_utils.h"
 #include "utils/string_util.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "utils/utils_cmd.h"
 
 DECLARE_string(tera_master_port);
@@ -48,11 +48,15 @@ DECLARE_string(tera_master_meta_table_name);
 DECLARE_string(tera_master_meta_table_path);
 DECLARE_int32(tera_master_meta_retry_times);
 
+DECLARE_string(tera_coord_type);
 DECLARE_bool(tera_zk_enabled);
 DECLARE_bool(tera_mock_zk_enabled);
 
 DECLARE_double(tera_master_workload_split_threshold);
+DECLARE_double(tera_master_workload_merge_threshold);
 DECLARE_int64(tera_master_split_tablet_size);
+DECLARE_int64(tera_master_min_split_size);
+DECLARE_double(tera_master_min_split_ratio);
 DECLARE_int64(tera_master_merge_tablet_size);
 DECLARE_bool(tera_master_kick_tabletnode_enabled);
 DECLARE_int32(tera_master_kick_tabletnode_query_fail_times);
@@ -84,6 +88,8 @@ DECLARE_bool(tera_master_stat_table_enabled);
 DECLARE_int64(tera_master_stat_table_splitsize);
 
 DECLARE_int32(tera_master_gc_period);
+DECLARE_bool(tera_master_gc_trash_enabled);
+DECLARE_int64(tera_master_gc_trash_clean_period_s);
 
 DECLARE_string(tera_tabletnode_path_prefix);
 DECLARE_string(tera_leveldb_env_type);
@@ -108,6 +114,7 @@ DECLARE_int32(tera_master_schema_update_retry_times);
 DECLARE_int64(tera_master_availability_check_period);
 DECLARE_bool(tera_master_availability_check_enabled);
 
+DECLARE_bool(tera_master_update_split_meta);
 using namespace std::placeholders;
 
 namespace tera {
@@ -131,6 +138,8 @@ MasterImpl::MasterImpl()
       thread_pool_(new ThreadPool(FLAGS_tera_master_impl_thread_max_num)),
       is_stat_table_(false),
       stat_table_(NULL),
+      gc_trash_clean_enabled_(false),
+      gc_trash_clean_timer_id_(kInvalidTimerId),
       gc_enabled_(false),
       gc_timer_id_(kInvalidTimerId),
       gc_query_enable_(false),
@@ -152,15 +161,12 @@ MasterImpl::MasterImpl()
     if (FLAGS_tera_master_gc_strategy == "default") {
         LOG(INFO) << "[gc] gc strategy is BatchGcStrategy";
         gc_strategy_ = std::shared_ptr<GcStrategy>(new BatchGcStrategy(tablet_manager_));
-    } else if (FLAGS_tera_master_gc_strategy == "incremental") {
-        LOG(INFO) << "[gc] gc strategy is IncrementalGcStrategy";
-        gc_strategy_ = std::shared_ptr<GcStrategy>(new IncrementalGcStrategy(tablet_manager_));
     } else if (FLAGS_tera_master_gc_strategy == "trackable") {
         LOG(INFO) << "[gc] gc strategy is Trackable";
     } else {
-        LOG(WARNING) << "Unknown gc strategy: " << FLAGS_tera_master_gc_strategy
-            << ", default gc strategy: BatchGcStrategy will take effect";
-        gc_strategy_ = std::shared_ptr<GcStrategy>(new BatchGcStrategy(tablet_manager_));
+        LOG(ERROR) << "Unknown gc strategy: " << FLAGS_tera_master_gc_strategy
+            << ", exit";
+        exit(EXIT_FAILURE);
     }
 }
 
@@ -171,18 +177,29 @@ MasterImpl::~MasterImpl() {
 }
 
 bool MasterImpl::Init() {
-    if (FLAGS_tera_zk_enabled) {
+    if (FLAGS_tera_coord_type.empty()) {
+        LOG(ERROR) << "Note: We don't recommend that use '"
+                   << "--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord"
+                   << " replace by '--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]'"
+                   << " flag is usually recommended.";
+    }
+    if (FLAGS_tera_coord_type == "zk"  // explicit --tera_coord_type wins; legacy *_enabled flags are consulted only when it is empty
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) {
         zk_adapter_.reset(new MasterZkAdapter(this, local_addr_));
-    } else if (FLAGS_tera_ins_enabled) {
+    } else if (FLAGS_tera_coord_type == "ins"
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) {
         LOG(INFO) << "ins mode" ;
         zk_adapter_.reset(new InsMasterZkAdapter(this, local_addr_));
-    } else if (FLAGS_tera_mock_zk_enabled) {
+    } else if (FLAGS_tera_coord_type == "mock_zk"
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) {
         LOG(INFO) << "mock zk mode" ;
         zk_adapter_.reset(new MockMasterZkAdapter(this, local_addr_));
-    } else if (FLAGS_tera_mock_ins_enabled) {
+    } else if (FLAGS_tera_coord_type == "mock_ins"
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) {
         LOG(INFO) << "mock ins mode" ;
         zk_adapter_.reset(new MockInsMasterZkAdapter(this, local_addr_));
-    } else {
+    } else if (FLAGS_tera_coord_type == "fake_zk"
+            || FLAGS_tera_coord_type.empty()) {  // NOTE(review): any other non-empty coord type matches no branch, so zk_adapter_ is not reset by this chain - confirm this is handled
         LOG(INFO) << "fake zk mode!";
         zk_adapter_.reset(new FakeMasterZkAdapter(this, local_addr_));
     }
@@ -406,18 +423,22 @@ void MasterImpl::RestoreUserTablet(const std::vector<TabletMeta>& report_meta_li
             VLOG(8) << "READY Tablet, " << tablet;
             continue;
         }
-        tablet_availability_->AddNotReadyTablet(tablet->GetPath());
         CHECK(tablet->GetStatus() == kTableNotInit);
+        tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(),
+                                                tablet->GetTable()->GetStatus());
 
         TabletNodePtr node;
         if (server_addr.empty()) {
             tablet->SetStatus(kTableOffLine);
+            ProcessOffLineTablet(tablet);
             VLOG(8) << "OFFLINE Tablet with empty addr, " << tablet;
         } else if (!tabletnode_manager_->FindTabletNode(server_addr, &node)) {
             tablet->SetStatus(kTableOffLine);
+            ProcessOffLineTablet(tablet);
             VLOG(8) << "OFFLINE Tablet of Dead TS, " << tablet;
         } else if (node->state_ == kReady) {
             tablet->SetStatus(kTableOffLine);
+            ProcessOffLineTablet(tablet);
             VLOG(8) << "OFFLINE Tablet of Alive TS, " << tablet;
             TryLoadTablet(tablet, server_addr);
         } else {
@@ -1142,6 +1163,7 @@ void MasterImpl::ShowTables(const ShowTablesRequest* request,
             TabletPtr tablet = tablet_list[i];
             TabletMeta meta;
             tablet->ToMeta(&meta);
+            meta.set_last_move_time_us(tablet->LastMoveTime());
             tablet_meta_list->add_meta()->CopyFrom(meta);
             tablet_meta_list->add_counter()->CopyFrom(tablet->GetCounter());
             tablet_meta_list->add_timestamp(tablet->UpdateTime());
@@ -1266,6 +1288,8 @@ void MasterImpl::CmdCtrl(const CmdCtrlRequest* request,
         ReloadConfig(response);
     } else if (request->command() == "kick") {
         KickTabletNodeCmdCtrl(request, response);
+    } else if (request->command() == "table") {
+        TableCmdCtrl(request, response);
     } else {
         response->set_status(kInvalidArgument);
     }
@@ -1458,13 +1482,42 @@ void MasterImpl::ReloadConfig(CmdCtrlResponse* response) {
     }
 }
 
-void MasterImpl::TabletCmdCtrl(const CmdCtrlRequest* request,
-                               CmdCtrlResponse* response) {
+void MasterImpl::TableCmdCtrl(const CmdCtrlRequest* request,
+                              CmdCtrlResponse* response) {
     if (request->arg_list_size() < 2) {
         response->set_status(kInvalidArgument);
         return;
     }
 
+    if (request->arg_list(0) == "split") {
+        TabletPtr tablet;
+        StatusCode status;
+        for (int32_t i = 2; i < request->arg_list_size(); i++) {
+            if (!tablet_manager_->SearchTablet(request->arg_list(1),
+                                               request->arg_list(i),
+                                               &tablet, &status)) {
+                response->set_status(kInvalidArgument);
+                return;
+            }
+            VLOG(10) << "table split: key " << request->arg_list(i)
+                << ", " << tablet;
+            TrySplitTablet(tablet, request->arg_list(i));
+        }
+        response->set_status(kMasterOk);
+    } else {
+        response->set_status(kInvalidArgument);
+    }
+    return;
+}
+
+void MasterImpl::TabletCmdCtrl(const CmdCtrlRequest* request,
+                               CmdCtrlResponse* response) {
+    int32_t request_argc = request->arg_list_size();
+    if (request_argc < 2) {
+        response->set_status(kInvalidArgument);
+        return;
+    }
+    const std::string& op = request->arg_list(0);
     const std::string& tablet_id = request->arg_list(1);
     TabletPtr tablet;
     bool found = false;
@@ -1483,40 +1536,35 @@ void MasterImpl::TabletCmdCtrl(const CmdCtrlRequest* request,
         return;
     }
 
-    if (request->arg_list(0) == "reload") {
+    if (op == "reload" && request_argc == 2) {
         std::string current_server_addr = tablet->GetServerAddr();
         TryMoveTablet(tablet,
                       current_server_addr,
                       true);  // force to unload and load tablet even it on the same ts
-
-    } else if (request->arg_list(0) == "move") {
-        if (request->arg_list_size() > 3) {
-            response->set_status(kInvalidArgument);
-            return;
-        }
-        std::string expect_server_addr;
-        if (request->arg_list_size() == 3) {
-            expect_server_addr = request->arg_list(2);
-        }
+        response->set_status(kMasterOk);
+    } else if (op == "reloadx" && request_argc == 3
+               && tablet->SetErrorIgnoredLGs(request->arg_list(2))) {
+        std::string current_server_addr = tablet->GetServerAddr();
+        TryMoveTablet(tablet, current_server_addr, true);
+        response->set_status(kMasterOk);
+    } else if (op == "move" && request_argc == 3) {
+        std::string expect_server_addr = request->arg_list(2);
         TryMoveTablet(tablet, expect_server_addr);
         response->set_status(kMasterOk);
-    } else if (request->arg_list(0) == "split") {
-        if (request->arg_list_size() > 3) {
-            response->set_status(kInvalidArgument);
-            return;
-        }
+    } else if (op == "movex" && request_argc == 4
+            && tablet->SetErrorIgnoredLGs(request->arg_list(3))) {
+        std::string expect_server_addr = request->arg_list(2);
+        TryMoveTablet(tablet, expect_server_addr);
+        response->set_status(kMasterOk);
+    } else if (op == "split" && (request_argc == 2 || request_argc == 3)) {
         std::string split_key;
-        if (request->arg_list_size() == 3) {
+        if (request_argc == 3) {
             split_key = request->arg_list(2);
             LOG(INFO) << "User specified split key: " << split_key;
         }
         TrySplitTablet(tablet, split_key);
         response->set_status(kMasterOk);
-    } else if (request->arg_list(0) == "merge") {
-        if (request->arg_list_size() > 3) {
-            response->set_status(kInvalidArgument);
-            return;
-        }
+    } else if (op == "merge" && request_argc == 2) {
         TryMergeTablet(tablet);
         response->set_status(kMasterOk);
     } else {
@@ -1892,9 +1940,12 @@ bool MasterImpl::TabletNodeLoadBalance(TabletNodePtr tabletnode, Scheduler* sche
             split_size = tablet->GetSchema().split_size();
         }
         if (write_workload > FLAGS_tera_master_workload_split_threshold) {
-            split_size /= 2;
-            VLOG(6) << tablet->GetPath() << " write_workload too large, split it by size: "
-                << split_size;
+            if (split_size > FLAGS_tera_master_min_split_size) {
+                split_size = std::max(FLAGS_tera_master_min_split_size,
+                                      static_cast<int64_t>(split_size * FLAGS_tera_master_min_split_ratio));
+            }
+            VLOG(6) << tablet->GetPath() << ", trigger workload split, write_workload: " << write_workload
+                << ", split it by size(M): " << split_size;
         }
         int64_t merge_size = FLAGS_tera_master_merge_tablet_size;
         if (tablet->GetSchema().has_merge_size() && tablet->GetSchema().merge_size() > 0) {
@@ -1903,12 +1954,14 @@ bool MasterImpl::TabletNodeLoadBalance(TabletNodePtr tabletnode, Scheduler* sche
         if (tablet->GetDataSize() < 0) {
             // tablet size is error, skip it
             continue;
-        } else if (tablet->GetDataSize() > (split_size << 20)) {
+        } else if (tablet->GetDataSize() > (split_size << 20) &&
+                   tablet->TestAndSetSplitTimeStamp(get_micros())) {
             TrySplitTablet(tablet);
             any_tablet_split = true;
             continue;
         } else if (tablet->GetDataSize() < (merge_size << 20)) {
-            if (write_workload < 1) {
+            if (!tablet->IsBusy() &&
+                write_workload < FLAGS_tera_master_workload_merge_threshold) {
                 TryMergeTablet(tablet);
             } else {
                 VLOG(6) << "[merge] skip high workload tablet: "
@@ -2133,14 +2186,15 @@ void MasterImpl::DeleteTabletNode(const std::string& tabletnode_addr) {
     std::vector<TabletPtr>::iterator it;
     for (it = tablet_list.begin(); it != tablet_list.end(); ++it) {
         TabletPtr tablet = *it;
-        tablet_availability_->AddNotReadyTablet(tablet->GetPath());
 
         if (FLAGS_tera_master_tabletnode_timeout > 0
             && tablet->GetTableName() != FLAGS_tera_master_meta_table_name) {
-            tablet->SetStatusIf(kTabletPending, kTableReady);
-        } else if (tablet->SetStatusIf(kTableOffLine, kTableReady)) {
+            tablet->SetStatusIf(kTabletPending, kTableReady, tabletnode_addr);
+        } else if (tablet->SetStatusIf(kTableOffLine, kTableReady, tabletnode_addr)) {
             ProcessOffLineTablet(tablet);
         }
+        tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(),
+                                                tablet->GetTable()->GetStatus());
 
         if (tablet->GetStatus() == kTableUnLoadFail && tablet->GetMergeParam() != NULL) {
             MergeTabletUnloadCallback(tablet);
@@ -2238,6 +2292,7 @@ bool MasterImpl::EnterSafeMode(StatusCode* status) {
     tablet_manager_->Stop();
     DisableTabletNodeGcTimer();
     DisableLoadBalance();
+    DisableGcTrashCleanTimer();
     return true;
 }
 
@@ -2271,6 +2326,7 @@ bool MasterImpl::LeaveSafeMode(StatusCode* status) {
     EnableQueryTabletNodeTimer();
     EnableTabletNodeGcTimer();
     EnableLoadBalance();
+    EnableGcTrashCleanTimer();
 
     std::vector<TabletNodePtr> node_array;
     tabletnode_manager_->GetAllTabletNodeInfo(&node_array);
@@ -2481,6 +2537,14 @@ void MasterImpl::LoadTabletAsync(TabletPtr tablet, LoadClosure done, uint64_t) {
         request->add_parent_tablets(meta.parent_tablets(i));
     }
 
+    std::vector<std::string> ignore_err_lgs;
+    tablet->GetErrorIgnoredLGs(&ignore_err_lgs);
+    for (uint32_t i = 0; i < ignore_err_lgs.size(); ++i) {
+        VLOG(6) << "Add ignore err lg to request :" << ignore_err_lgs[i];
+        request->add_ignore_err_lgs(ignore_err_lgs[i]);
+    }
+    tablet->SetErrorIgnoredLGs(); // clean error lg, only for this request once
+
     LOG(INFO) << "LoadTabletAsync id: " << request->sequence_id() << ", "
         << tablet;
     node_client.LoadTablet(request, response, done);
@@ -3694,6 +3758,7 @@ void MasterImpl::SplitTabletAsync(TabletPtr tablet, const std::string& split_key
     request->add_child_tablets(tablet->GetTable()->GetNextTabletNo());
     request->add_child_tablets(tablet->GetTable()->GetNextTabletNo());
     request->set_split_key(split_key);
+    request->set_master_update_meta(FLAGS_tera_master_update_split_meta);
 
     tablet->ToMeta(request->mutable_tablet_meta());
     std::vector<uint64_t> snapshots;
@@ -3704,7 +3769,8 @@ void MasterImpl::SplitTabletAsync(TabletPtr tablet, const std::string& split_key
 
     LOG(INFO) << "SplitTabletAsync id: " << request->sequence_id() << ", "
         << tablet;
-    tablet_availability_->AddNotReadyTablet(tablet->GetPath());
+    tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(),
+                                            tablet->GetTable()->GetStatus());
     node_client.SplitTablet(request, response, done);
 }
 
@@ -3714,11 +3780,11 @@ void MasterImpl::SplitTabletCallback(TabletPtr tablet,
                                      bool failed, int error_code) {
     CHECK(tablet->GetStatus() == kTableOnSplit);
     StatusCode status = response->status();
-    delete request;
-    delete response;
+    std::unique_ptr<SplitTabletResponse> response_deleter(response);
+    std::unique_ptr<SplitTabletRequest> request_deleter(request);
     const std::string& server_addr = tablet->GetServerAddr();
 
-    // fail
+    // fail, RPC fail or unexpected return status
     if (failed || (status != kTabletNodeOk && status != kTableNotSupport
                    && status != kMetaTabletError)) {
         if (failed) {
@@ -3740,11 +3806,12 @@ void MasterImpl::SplitTabletCallback(TabletPtr tablet,
     if (status == kTabletNodeOk) {
         // tabletnode unloaded the tablet
         LOG(INFO) << "RPC SplitTablet success";
-    } else if (status == kTableNotSupport) {
+    } else if (status == kTableNotSupport) { // TODO: use TryLoadAsync will be more safe.
         // tabletnode refused to split and didn't unload the tablet
         tablet->SetStatusIf(kTableReady, kTableOnSplit);
         ProcessReadyTablet(tablet);
     } else {
+        // this will not be true once Master is responsible for write child tablets info, will be deleted
         CHECK(status == kMetaTabletError);
         // meta table is not ok
         LOG(ERROR) << "fail to split: " << StatusCodeToString(status) << ", "
@@ -3781,14 +3848,153 @@ void MasterImpl::SplitTabletCallback(TabletPtr tablet,
         tablet_availability_->EraseNotReadyTablet(tablet->GetPath());
         return;
     }
-
+    // old TS write child tablets info to meta table directly without sending back child tablets info
+    // we need scan MetaTable to get children meta info
+    if (response->split_keys_size() == 0) {
     // scan meta tablet
-    if (tablet->GetStatus() == kTableOnSplit) {
-        ScanClosure done =
-            std::bind(&MasterImpl::ScanMetaCallbackForSplit, this, tablet, _1, _2, _3, _4);
-        ScanMetaTableAsync(tablet->GetTableName(), tablet->GetKeyStart(),
-                           tablet->GetKeyEnd(), done);
+        if (tablet->GetStatus() == kTableOnSplit) {
+            ScanClosure done =
+                std::bind(&MasterImpl::ScanMetaCallbackForSplit, this, tablet, _1, _2, _3, _4);
+            ScanMetaTableAsync(tablet->GetTableName(), tablet->GetKeyStart(),
+                               tablet->GetKeyEnd(), done);
+        }
+    } else {
+        if (response->split_keys_size() > 1) {
+            LOG(INFO) << "currently we only support one split key, tablet "
+                << tablet << " will be split by key: " << response->split_keys(0);
+        }
+        SplitTabletWriteMetaAsync(tablet, response->split_keys(0));
+    }
+}
+
+void MasterImpl::SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key) {
+    const std::string& key_start = tablet->GetKeyStart();
+    const std::string& key_end = tablet->GetKeyEnd();
+    if (split_key <= key_start || (key_end != "" && split_key >= key_end)) {
+         LOG(ERROR) << kSms << "two splits are not successive, "
+             << tablet << ", split_key: " << split_key;
+        // the tablet has already been unloaded, so we just mark it as kTableOffLine and try to reload it
+        tablet->SetStatus(kTableOffLine);
+        ProcessOffLineTablet(tablet);
+        TryLoadTablet(tablet);
+        return;
+    }
+    std::string meta_addr;
+    if (!tablet_manager_->GetMetaTabletAddr(&meta_addr)) {
+        LOG(ERROR) << "[split] meta table is not ready, try to load parent tablet";
+         tablet->SetStatus(kTableOffLine);
+        ProcessOffLineTablet(tablet);
+        TryLoadTablet(tablet);
+        return;
+    }
+
+    WriteTabletRequest* meta_request = new WriteTabletRequest;
+    WriteTabletResponse* meta_response = new WriteTabletResponse;
+    meta_request->set_sequence_id(this_sequence_id_.Inc());
+    meta_request->set_tablet_name(FLAGS_tera_master_meta_table_name);
+    meta_request->set_is_sync(true);
+    meta_request->set_is_instant(true);
+
+    const std::string& parent_path = tablet->GetPath();
+    int64_t parent_size = tablet->GetDataSize();
+    TablePtr table = tablet->GetTable();
+
+    std::string child_start_key = key_start;
+    std::string child_end_key = split_key;
+    std::vector<TabletPtr> child_tablets;
+    for (int i = 0; i < 2; ++i) {
+        TabletMeta child_meta;
+        tablet->ToMeta(&child_meta);
+        child_meta.clear_parent_tablets();
+        child_meta.add_parent_tablets(leveldb::GetTabletNumFromPath(parent_path));
+        child_meta.set_path(leveldb::GetChildTabletPath(parent_path, table->GetNextTabletNo()));
+        child_meta.mutable_key_range()->set_key_start(child_start_key);
+        child_meta.mutable_key_range()->set_key_end(child_end_key);
+        child_meta.set_size(parent_size / 2);
+        std::string meta_key, meta_value;
+        MakeMetaTableKeyValue(child_meta, &meta_key, &meta_value);
+        RowMutationSequence* mu_seq = meta_request->add_row_list();
+        mu_seq->set_row_key(meta_key);
+        Mutation* mutation = mu_seq->add_mutation_sequence();
+        mutation->set_type(kPut);
+        mutation->set_value(meta_value);
+        child_tablets.emplace_back(new Tablet(child_meta, table));
+        child_start_key = child_end_key;
+        child_end_key = key_end;
     }
+
+    WriteClosure done = std::bind(&MasterImpl::SplitTabletWriteMetaCallback, this, tablet,
+            child_tablets, FLAGS_tera_master_meta_retry_times, _1, _2, _3, _4);
+
+    tabletnode::TabletNodeClient meta_node_client(meta_addr);
+    meta_node_client.WriteTablet(meta_request, meta_response, done);
+    return;
+}
+
+void MasterImpl::SplitTabletWriteMetaCallback(TabletPtr parent_tablet,  // completion of the meta write holding both child rows
+                                            std::vector<TabletPtr> child_tablets,
+                                            int32_t retry_times,  // remaining retries; <= 0 means give up and reload the parent
+                                            WriteTabletRequest* request,
+                                            WriteTabletResponse* response,
+                                            bool failed, int error_code) {
+    StatusCode status = response->status();
+    if (!failed && status == kTabletNodeOk) {
+        CHECK_EQ(response->row_status_list_size(), 2);
+        CHECK_EQ(child_tablets.size(), 2u);  // unsigned literal: size() is size_t
+        status = response->row_status_list(0);  // only the first row status is inspected
+    }
+    delete request;  // request/response are owned by this callback
+    delete response;
+    if (failed || status != kTabletNodeOk) {
+        if (failed) {
+            LOG(ERROR) << "[split] fail to add to meta tablet "
+                << sofa::pbrpc::RpcErrorCodeToString(error_code) << ","
+                << parent_tablet;
+        } else {
+            LOG(ERROR) << "[split] fail to add to meta tablet "
+                << StatusCodeToString(status) << "," << parent_tablet;
+        }
+        if (retry_times <= 0) {
+            LOG(ERROR) << kSms << "[split] fail to update meta tablet in max retry"
+                <<" times, parent_tablet: " << parent_tablet;
+            parent_tablet->SetStatus(kTableOffLine);  // give up the split and bring the parent back
+            ProcessOffLineTablet(parent_tablet);
+            TryLoadTablet(parent_tablet);
+        } else {
+            std::vector<ToMetaFunc> meta_entries;  // one meta writer per child tablet
+            for (std::size_t idx = 0; idx < child_tablets.size(); ++idx) {
+                meta_entries.push_back(std::bind(
+                            &Tablet::ToMetaTableKeyValue, child_tablets[idx], _1, _2));
+            }
+            WriteClosure done = std::bind(&MasterImpl::SplitTabletWriteMetaCallback, this,
+                    parent_tablet, child_tablets, retry_times - 1, _1, _2, _3, _4);
+            SuspendMetaOperation(meta_entries, false, done);  // retry with one fewer attempt left
+        }
+        return;
+    }
+
+    TabletMeta first_meta, second_meta;
+    child_tablets[0]->ToMeta(&first_meta);
+    first_meta.set_status(kTableOffLine);  // children start offline and are loaded below
+    child_tablets[1]->ToMeta(&second_meta);
+    second_meta.set_status(kTableOffLine);
+    TablePtr table = parent_tablet->GetTable();
+    table->SplitTablet(parent_tablet, first_meta, second_meta, &child_tablets[0], &child_tablets[1]);
+
+    tablet_availability_->EraseNotReadyTablet(parent_tablet->GetPath());
+    tablet_availability_->AddNotReadyTablet(child_tablets[0]->GetPath(), child_tablets[0]->GetStatus(),
+                                            table->GetStatus());
+    tablet_availability_->AddNotReadyTablet(child_tablets[1]->GetPath(), child_tablets[1]->GetStatus(),
+                                            table->GetStatus());
+    LOG(INFO) << "split finish," << parent_tablet << ", try load child tablets,"
+        << "\nfirst: " << first_meta.ShortDebugString()
+        << "\nsecond: " << second_meta.ShortDebugString();
+
+    ProcessOffLineTablet(child_tablets[0]);
+    TryLoadTablet(child_tablets[0]);
+    ProcessOffLineTablet(child_tablets[1]);
+    TryLoadTablet(child_tablets[1]);
+    return;
 }
 
 void MasterImpl::TryLoadTablet(TabletPtr tablet, std::string server_addr) {
@@ -3971,6 +4177,18 @@ bool MasterImpl::TrySplitTablet(TabletPtr tablet, const std::string& split_key)
     // abort if status switch to offline (server down / disable)
     if (!tablet->SetStatusIf(kTableOnSplit, kTableReady)) {
         LOG(ERROR) << "error state, abort split table " << tablet->GetPath();
+        node->FinishSplit();
+
+        TabletPtr next_tablet;
+        std::string split_key;
+        while (node->SplitNextWaitTablet(&next_tablet, &split_key)) {
+            if (next_tablet->SetStatusIf(kTableOnSplit, kTableReady)) {
+                next_tablet->SetServerId(node->uuid_);
+                SplitTabletAsync(next_tablet, split_key);
+                break;
+            }
+            node->FinishSplit();
+        }
         return false;
     }
 
@@ -4005,7 +4223,7 @@ bool MasterImpl::TryMergeTablet(TabletPtr tablet) {
 
     if (tablet2->GetStatus() != kTableReady ||
         tablet2->IsBusy() ||
-        tablet2->GetCounter().write_workload() >= 1) {
+        tablet2->GetCounter().write_workload() >= FLAGS_tera_master_workload_merge_threshold) {
         VLOG(20) << "[merge] merge failed, none proper tablet."
             << " status:" << tablet2->GetStatus()
             << " isbusy:" << tablet2->IsBusy()
@@ -4051,8 +4269,10 @@ void MasterImpl::MergeTabletAsync(TabletPtr tablet_p1, TabletPtr tablet_p2) {
         std::bind(&MasterImpl::UnloadTabletCallback, this, tablet_p2,
                    FLAGS_tera_master_impl_retry_times, _1, _2, _3, _4);
 
-    tablet_availability_->AddNotReadyTablet(tablet_p1->GetPath());
-    tablet_availability_->AddNotReadyTablet(tablet_p2->GetPath());
+    tablet_availability_->AddNotReadyTablet(tablet_p1->GetPath(), tablet_p1->GetStatus(),
+                                            tablet_p1->GetTable()->GetStatus());
+    tablet_availability_->AddNotReadyTablet(tablet_p2->GetPath(), tablet_p2->GetStatus(),
+                                            tablet_p2->GetTable()->GetStatus());
     UnloadTabletAsync(tablet_p1, done1);
     UnloadTabletAsync(tablet_p2, done2);
 }
@@ -4230,8 +4450,9 @@ void MasterImpl::MergeTabletWriteMetaCallback(TabletPtr tablet_c,
 
     tablet_availability_->EraseNotReadyTablet(tablet_p1->GetPath());
     tablet_availability_->EraseNotReadyTablet(tablet_p2->GetPath());
-    tablet_availability_->AddNotReadyTablet(tablet_c->GetPath());
     ProcessOffLineTablet(tablet_c);
+    tablet_availability_->AddNotReadyTablet(tablet_c->GetPath(), tablet_c->GetStatus(),
+                                            tablet_c->GetTable()->GetStatus());
     TryLoadTablet(tablet_c);
     delete request;
     delete response;
@@ -4475,6 +4696,8 @@ void MasterImpl::UpdateTableRecordForEnableCallback(TablePtr table, int32_t retr
             LOG(ERROR) << "fail to load tablet: " << tablet->GetPath()
                 << ", tablet status: " << StatusCodeToString(tablet->GetStatus());
         }
+        tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(),
+                                                tablet->GetTable()->GetStatus());
     }
 }
 
@@ -4871,8 +5094,10 @@ void MasterImpl::ScanMetaCallbackForSplit(TabletPtr tablet,
     table->SplitTablet(tablet, first_meta, second_meta, &first_tablet, &second_tablet);
 
     tablet_availability_->EraseNotReadyTablet(tablet->GetPath());
-    tablet_availability_->AddNotReadyTablet(first_tablet->GetPath());
-    tablet_availability_->AddNotReadyTablet(second_tablet->GetPath());
+    tablet_availability_->AddNotReadyTablet(first_tablet->GetPath(), first_tablet->GetStatus(),
+                                            first_tablet->GetTable()->GetStatus());
+    tablet_availability_->AddNotReadyTablet(second_tablet->GetPath(), second_tablet->GetStatus(),
+                                            second_tablet->GetTable()->GetStatus());
     LOG(INFO) << "split finish, " << tablet << ", try load child tablets, "
         << "\nfirst: " << first_meta.ShortDebugString()
         << "\nsecond: " << second_meta.ShortDebugString();
@@ -5074,12 +5299,14 @@ void MasterImpl::TryMoveTablet(TabletPtr tablet, const std::string& server_addr,
         << " to " << server_addr;
     if (tablet->SetStatusIf(kTableUnLoading, kTableReady)) {
         tablet->SetExpectServerAddr(server_addr);
+        tablet->SetLastMoveTime(get_micros());
         TabletNodePtr node;
         if (!server_addr.empty() &&
             tabletnode_manager_->FindTabletNode(server_addr, &node)) {
             node->PlanToMoveIn();
         }
-        tablet_availability_->AddNotReadyTablet(tablet->GetPath());
+        tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(),
+                                                tablet->GetTable()->GetStatus());
         UnloadClosure done =
             std::bind(&MasterImpl::UnloadTabletCallback, this, tablet,
                        FLAGS_tera_master_impl_retry_times, _1, _2, _3, _4);
@@ -5209,6 +5436,60 @@ void MasterImpl::EnableTabletNodeGcTimer() {
     gc_enabled_ = true;
 }
 
+void MasterImpl::DoGcTrashClean() {  // delayed task: run one trash-clean pass, then schedule the next one
+    {
+        MutexLock lock(&mutex_);
+        if (!gc_trash_clean_enabled_) {  // disabled since this task was scheduled: stop without cleaning
+            gc_trash_clean_timer_id_ = kInvalidTimerId;  // EnableGcTrashCleanTimer() reschedules only when the id is invalid
+            return;
+        }
+    }
+
+    int64_t start_ts = get_micros();  // the clean itself runs without holding mutex_
+    io::CleanTrackableGcTrash();
+    LOG(INFO) << "[gc] clean trackable gc trash, cost: "
+        << (get_micros() - start_ts) / 1000 << " ms";
+
+    MutexLock lock(&mutex_);  // ScheduleGcTrashClean() asserts that mutex_ is held
+    ScheduleGcTrashClean();
+}
+
+void MasterImpl::ScheduleGcTrashClean() {  // arms the next DoGcTrashClean run; caller must hold mutex_
+    mutex_.AssertHeld();
+    VLOG(10) << "[gc] ScheduleGcTrashClean";
+    ThreadPool::Task task =
+        std::bind(&MasterImpl::DoGcTrashClean, this);
+    gc_trash_clean_timer_id_ = thread_pool_->DelayTask(  // own timer id; gc_timer_id_ belongs to the tabletnode gc timer
+        FLAGS_tera_master_gc_trash_clean_period_s * 1000, task);  // flag is in seconds; presumably DelayTask takes ms — TODO confirm
+}
+
+void MasterImpl::EnableGcTrashCleanTimer() {  // no-op unless FLAGS_tera_master_gc_trash_enabled is set
+    if (!FLAGS_tera_master_gc_trash_enabled) {
+        return;
+    }
+
+    MutexLock lock(&mutex_);
+    if (gc_trash_clean_timer_id_ == kInvalidTimerId) {  // schedule only when no trash-clean task is pending
+        ScheduleGcTrashClean();
+    }
+    gc_trash_clean_enabled_ = true;
+}
+
+void MasterImpl::DisableGcTrashCleanTimer() {  // no-op unless FLAGS_tera_master_gc_trash_enabled is set
+    if (!FLAGS_tera_master_gc_trash_enabled) {
+        return;
+    }
+
+    MutexLock lock(&mutex_);
+    if (gc_trash_clean_timer_id_ != kInvalidTimerId) {
+        bool non_block = true;
+        if (thread_pool_->CancelTask(gc_trash_clean_timer_id_, non_block)) {  // cancel the trash-clean task, not the tabletnode gc task
+            gc_trash_clean_timer_id_ = kInvalidTimerId;
+        }  // if the cancel fails, DoGcTrashClean() resets the id once it sees the disabled flag
+    }
+    gc_trash_clean_enabled_ = false;
+}
+
 void MasterImpl::DoAvailableCheck() {
     MutexLock lock(&mutex_);
     if (FLAGS_tera_master_availability_check_enabled) {
@@ -5285,9 +5566,9 @@ void MasterImpl::DoTabletNodeGcPhase2() {
     }
 
     LOG(INFO) << "[gc] try clean trash dir.";
-    int64_t start = common::timer::get_micros();
+    int64_t start = get_micros();
     io::CleanTrashDir();
-    int64_t cost = (common::timer::get_micros() - start) / 1000;
+    int64_t cost = (get_micros() - start) / 1000;
     LOG(INFO) << "[gc] clean trash dir done, cost: " << cost << "ms.";
 
     MutexLock lock(&mutex_);
diff --git a/src/master/master_impl.h b/src/master/master_impl.h
index a8959c703..3a7a17b7e 100644
--- a/src/master/master_impl.h
+++ b/src/master/master_impl.h
@@ -52,7 +52,6 @@ class MetaTable;
 class Scheduler;
 class TabletManager;
 class TabletNodeManager;
-class MasterImplTest;
 
 class MasterImpl {
 public:
@@ -233,6 +232,8 @@ class MasterImpl {
                                CmdCtrlResponse* response);
     void TabletCmdCtrl(const CmdCtrlRequest* request,
                        CmdCtrlResponse* response);
+    void TableCmdCtrl(const CmdCtrlRequest* request,
+                      CmdCtrlResponse* response);
     void MetaCmdCtrl(const CmdCtrlRequest* request,
                      CmdCtrlResponse* response);
 
@@ -363,6 +364,15 @@ class MasterImpl {
                              SplitTabletResponse* response, bool failed,
                              int error_code);
 
+    virtual void SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key);
+
+    void SplitTabletWriteMetaCallback(TabletPtr parent_tablet,
+                                    std::vector<TabletPtr> child_tablets,
+                                    int32_t retry_times,
+                                    WriteTabletRequest* request,
+                                    WriteTabletResponse* response,
+                                    bool failed, int err_code);
+
     void MergeTabletAsync(TabletPtr tablet_p1, TabletPtr tablet_p2);
     virtual void MergeTabletAsyncPhase2(TabletPtr tablet_p1, TabletPtr tablet_p2);
     void MergeTabletUnloadCallback(TabletPtr tablet);
@@ -440,7 +450,7 @@ class MasterImpl {
                              WriteTabletResponse* response,
                              bool failed, int error_code);
 
-    void ScanMetaTableAsync(const std::string& table_name,
+    virtual void ScanMetaTableAsync(const std::string& table_name,
                             const std::string& tablet_key_start,
                             const std::string& tablet_key_end,
                             ScanClosure done);
@@ -535,6 +545,10 @@ class MasterImpl {
     void DumpStatToTable(const TabletNode& stat);
 
     // garbage clean
+    void EnableGcTrashCleanTimer();
+    void DisableGcTrashCleanTimer();
+    void ScheduleGcTrashClean();
+    void DoGcTrashClean();
     void EnableTabletNodeGcTimer();
     void DisableTabletNodeGcTimer();
     void ScheduleTabletNodeGc();
@@ -609,6 +623,8 @@ class MasterImpl {
     TableImpl* stat_table_;
 
     // tabletnode garbage clean
+    bool gc_trash_clean_enabled_;
+    int64_t gc_trash_clean_timer_id_;
     bool gc_enabled_;
     int64_t gc_timer_id_;
     bool gc_query_enable_;
diff --git a/src/master/master_zk_adapter.cc b/src/master/master_zk_adapter.cc
index 6f481f225..7227a43ae 100644
--- a/src/master/master_zk_adapter.cc
+++ b/src/master/master_zk_adapter.cc
@@ -387,14 +387,6 @@ void MasterZkAdapter::OnSafeModeMarkDeleted() {
     LOG(ERROR) << "safemode mark node is deleted";
 }
 
-void MasterZkAdapter::OnMasterLockLost() {
-    LOG(ERROR) << "master lock lost";
-    master_impl_->SetMasterStatus(MasterImpl::kIsSecondary);
-    master_impl_->DisableQueryTabletNodeTimer();
-    DeleteMasterNode();
-    Reset();
-}
-
 void MasterZkAdapter::OnTabletNodeListDeleted() {
     LOG(ERROR) << "ts dir node is deleted";
     if (!MarkSafeMode()) {
diff --git a/src/master/master_zk_adapter.h b/src/master/master_zk_adapter.h
index 618dbc984..7419a1246 100644
--- a/src/master/master_zk_adapter.h
+++ b/src/master/master_zk_adapter.h
@@ -66,7 +66,6 @@ class MasterZkAdapter : public MasterZkAdapterBase {
 
     virtual void OnSafeModeMarkCreated();
     virtual void OnSafeModeMarkDeleted();
-    virtual void OnMasterLockLost();
     virtual void OnTabletNodeListDeleted();
     virtual void OnRootTabletNodeDeleted();
     virtual void OnMasterNodeDeleted();
diff --git a/src/master/tablet_manager.cc b/src/master/tablet_manager.cc
index d8049e26c..e45f99bd8 100644
--- a/src/master/tablet_manager.cc
+++ b/src/master/tablet_manager.cc
@@ -31,15 +31,17 @@
 DECLARE_string(tera_working_dir);
 DECLARE_string(tera_master_meta_table_path);
 DECLARE_string(tera_master_meta_table_name);
-DECLARE_bool(tera_zk_enabled);
 
 DECLARE_string(tera_master_gc_strategy);
+DECLARE_bool(tera_master_gc_trash_enabled);
 DECLARE_int32(tera_master_impl_retry_times);
 DECLARE_int32(tera_tabletnode_connect_retry_period);
 
 DECLARE_bool(tera_delete_obsolete_tabledir_enabled);
 
 DECLARE_string(tera_tabletnode_path_prefix);
+DECLARE_int64(tera_master_split_history_time_interval);
+DECLARE_string(tera_leveldb_env_type);
 
 namespace tera {
 namespace master {
@@ -63,20 +65,22 @@ std::ostream& operator << (std::ostream& o, const TabletPtr& tablet) {
     return o;
 }
 
-Tablet::Tablet(const TabletMeta& meta)
-    : meta_(meta),
-      update_time_(common::timer::get_micros()),
-      ready_time_(std::numeric_limits<int64_t>::max()),
-      merge_param_(NULL),
-      gc_reported_(false) {}
+Tablet::Tablet(const TabletMeta& meta):
+    meta_(meta),
+    update_time_(get_micros()),
+    ready_time_(std::numeric_limits<int64_t>::max()),
+    last_move_time_us_(0),
+    merge_param_(NULL),
+    gc_reported_(false) { }
 
-Tablet::Tablet(const TabletMeta& meta, TablePtr table)
-    : meta_(meta),
-      table_(table),
-      update_time_(common::timer::get_micros()),
-      ready_time_(std::numeric_limits<int64_t>::max()),
-      merge_param_(NULL),
-      gc_reported_(false) {}
+Tablet::Tablet(const TabletMeta& meta, TablePtr table):
+    meta_(meta),
+    table_(table),
+    update_time_(get_micros()),
+    ready_time_(std::numeric_limits<int64_t>::max()),
+    last_move_time_us_(0),
+    merge_param_(NULL),
+    gc_reported_(false) { }
 
 Tablet::~Tablet() {
     table_.reset();
@@ -131,6 +135,21 @@ int64_t Tablet::GetQps() {
         + average_counter_.scan_rows();
 }
 
+int64_t Tablet::GetReadQps() {
+    MutexLock lock(&mutex_);
+    return average_counter_.read_rows();
+}
+
+int64_t Tablet::GetWriteQps() {
+    MutexLock lock(&mutex_);
+    return average_counter_.write_rows();
+}
+
+int64_t Tablet::GetScanQps() {
+    MutexLock lock(&mutex_);
+    return average_counter_.scan_rows();
+}
+
 const std::string& Tablet::GetKeyStart() {
     MutexLock lock(&mutex_);
     return meta_.key_range().key_start();
@@ -188,8 +207,47 @@ bool Tablet::IsBusy() {
     if (counter_list_.size() > 0) {
         return counter_list_.back().is_on_busy();
     } else {
-        return false;
+        return average_counter_.is_on_busy();
+    }
+}
+
+bool Tablet::TestAndSetSplitTimeStamp(int64_t ts) { // timestamp in us; returns true (and records it) only when the last recorded split is old enough
+    ts /= 1000; // translate into ms
+    //MutexLock lock(&mutex_);  NOTE(review): split_history_ is read and written below without holding mutex_ — confirm this is intended
+    if (split_history_.last_split_ts < (ts - FLAGS_tera_master_split_history_time_interval)) {  // flag is compared against ms values, so presumably in ms — TODO confirm
+        split_history_.last_split_ts = ts;  // stored in ms
+        return true;
+    }
+    return false;  // last recorded split is too recent; history unchanged
+}
+
+void Tablet::GetErrorIgnoredLGs(std::vector<std::string>* lgs) {
+    MutexLock lock(&mutex_);
+    *lgs = ignore_err_lgs_;
+}
+
+bool Tablet::SetErrorIgnoredLGs(const std::string& lg_list_str) {  // lg_list_str: locality group names, split on ":" via SplitString
+    if (lg_list_str.empty()) {  // an empty string clears the stored list
+        MutexLock lock(&mutex_);
+        ignore_err_lgs_.clear();
+        return true;
     }
+    std::vector<std::string> lgs;
+    SplitString(lg_list_str, ":", &lgs);
+    const TableSchema& schema = GetSchema();
+    std::set<std::string> lg_schema_set;  // names of all locality groups in the schema
+    for (int i = 0; i < schema.locality_groups_size(); ++i) {
+        lg_schema_set.insert(schema.locality_groups(i).name());
+    }
+    for (const auto& lg : lgs) {
+        if (lg_schema_set.find(lg) == lg_schema_set.end()) {  // one unknown name rejects the whole list; the stored list is left untouched
+            LOG(WARNING) << "set error ignored locality group ["<< lg << "] failed.";
+            return false;
+        }
+    }
+    MutexLock lock(&mutex_);  // taken only after every name has been validated
+    ignore_err_lgs_ = lgs;
+    return true;
 }
 
 std::string Tablet::DebugString() {
@@ -220,8 +278,8 @@ void Tablet::SetCounter(const TabletCounter& counter) {
     average_counter_.set_write_size(
         CounterWeightedSum(counter.write_size(), average_counter_.write_size()));
     average_counter_.set_write_workload(counter.write_workload());
-    average_counter_.set_is_on_busy(
-        CounterWeightedSum(counter.is_on_busy(), average_counter_.is_on_busy()));
+    average_counter_.set_is_on_busy(counter.is_on_busy());
+    average_counter_.set_db_status(counter.db_status());
 }
 
 void Tablet::UpdateSize(const TabletMeta& meta) {
@@ -282,6 +340,22 @@ bool Tablet::SetStatusIf(TabletStatus new_status, TabletStatus if_status,
     return false;
 }
 
+bool Tablet::SetStatusIf(TabletStatus new_status,
+                         TabletStatus if_status,
+                         const std::string& if_addr) {
+    MutexLock guard(&mutex_);
+    // Switch only if status and server address both match and CheckStatusSwitch accepts it.
+    const bool matches = meta_.status() == if_status && meta_.server_addr() == if_addr;
+    if (!matches || !CheckStatusSwitch(meta_.status(), new_status)) {
+        return false; // nothing was modified
+    }
+    meta_.set_status(new_status);
+    if (new_status == kTableReady) {
+        ready_time_ = get_micros(); // stamp the moment the tablet became ready
+    }
+    return true;
+}
+
 bool Tablet::SetStatusIf(TabletStatus new_status, TabletStatus if_status,
                          TableStatus if_table_status, TabletStatus* old_status) {
     if (!IsBound()) {
@@ -368,12 +442,22 @@ int64_t Tablet::SetUpdateTime(int64_t timestamp) {
 int64_t Tablet::ReadyTime() {
     MutexLock lock(&mutex_);
     if (meta_.status() != kTableReady) {
-        return std::numeric_limits<int>::max();
+        return std::numeric_limits<int64_t>::max();
     } else {
         return ready_time_;
     }
 }
 
+int64_t Tablet::LastMoveTime() const {
+    MutexLock guard(&mutex_); // read under mutex_
+    return last_move_time_us_; // value last stored by SetLastMoveTime()
+}
+
+void Tablet::SetLastMoveTime(int64_t time) {
+    MutexLock guard(&mutex_); // write under mutex_
+    last_move_time_us_ = time; // presumably microseconds, per the member name - confirm with callers
+}
+
 int32_t Tablet::AddSnapshot(uint64_t snapshot) {
     MutexLock lock(&mutex_);
     meta_.add_snapshot_list(snapshot);
@@ -582,6 +666,7 @@ Table::Table(const std::string& table_name)
       deleted_tablet_num_(0),
       max_tablet_no_(0),
       create_time_((int64_t)time(NULL)),
+      metric_(table_name),
       schema_is_syncing_(false),
       rangefragment_(NULL),
       update_rpc_response_(NULL),
@@ -936,6 +1021,10 @@ void Table::RefreshCounter() {
         sspeed += counter.scan_size();
     }
 
+    metric_.SetTableSize(size);
+    metric_.SetTabletNum(tablet_num);
+    metric_.SetNotReady(notready);
+
     counter_.set_size(size);
     counter_.set_tablet_num(tablet_num);
     counter_.set_notready_num(notready);
@@ -1175,9 +1264,14 @@ bool Table::TryCollectInheritedFile() {
         std::vector<TabletFile> tablet_files;
         CollectInheritedFileFromFilesystem(name_, *it, &tablet_files);
 
-        for (uint32_t i = 0; i < tablet_files.size(); i++) {
+        if (tablet_files.empty()) {
             MutexLock l(&mutex_);
-            AddInheritedFile(tablet_files[i], false);
+            AddEmptyDeadTablet(*it);
+        } else {
+            for (uint32_t i = 0; i < tablet_files.size(); i++) {
+                MutexLock l(&mutex_);
+                AddInheritedFile(tablet_files[i], false);
+            }
         }
     }
     return dead_tablets.size() > 0;
@@ -1269,6 +1363,10 @@ bool Table::GetTabletsForGc(std::set<uint64_t>* live_tablets,
             VLOG(10) << "[gc] add dead tablet: " << path;
             dead_tablets->insert(tabletnum);
         }
+
+        if (0 == tabletnum) {
+            LOG(WARNING) << "[gc] invalid tablet path found: <" << path << ">";
+        }
     }
     if (dead_tablets->size() == 0) {
         VLOG(10) << "[gc] there is none dead tablets: " << name_;
@@ -1300,6 +1398,17 @@ void Table::AddInheritedFile(const TabletFile& file, bool need_ref) {
     VLOG(10) << "[gc] [" << name_ << "] file " << file << " ref increment to " << file_info.ref;
 }
 
+void Table::AddEmptyDeadTablet(uint64_t tablet_id) {
+    mutex_.AssertHeld(); // the caller must already hold mutex_
+    if (useful_inh_files_.find(tablet_id) != useful_inh_files_.end()) {
+        return; // tablet already has an entry, nothing to add
+    }
+    LOG(INFO) << "[gc] [" << name_ << "] new empty dead tablet "
+        << tablet_id << ", gc disabled";
+    gc_disabled_dead_tablets_.insert(tablet_id);
+    useful_inh_files_[tablet_id]; // operator[] is used only to create the entry for tablet_id
+}
+
 uint64_t Table::CleanObsoleteFile() {
     leveldb::Env* env = io::LeveldbBaseEnv();
     std::string table_path = FLAGS_tera_tabletnode_path_prefix + name_;
@@ -1314,13 +1423,38 @@ uint64_t Table::CleanObsoleteFile() {
         leveldb::Status s;
         if (file.lg_id == 0 && file.file_id == 0) {
             std::string path = leveldb::BuildTabletPath(table_path, file.tablet_id);
+            leveldb::FileLock* file_lock = nullptr;
+            // NEVER remove the trailing character '/', otherwise you will lock the parent directory
+            s = env->LockFile(path + "/", &file_lock);
+            if (!s.ok()) {
+                LOG(WARNING) << "lock path failed, path: " << path << ", status: " << s.ToString();
+            }
+            delete file_lock;
+
             LOG(INFO) << "[gc] [" << name_ << "] delete dir " << path;
             s = io::DeleteEnvDir(path); //safely delete dir and all file in it
         } else {
+            std::string lg_path = leveldb::BuildTabletLgPath(table_path, file.tablet_id, file.lg_id);
+            leveldb::FileLock* file_lock = nullptr;
+            // NEVER remove the trailing character '/', otherwise you will lock the parent directory
+            s = env->LockFile(lg_path + "/", &file_lock);
+            if (!s.ok()) {
+                LOG(WARNING) << "lock path failed, path: " << lg_path << ", status: " << s.ToString();
+            }
+
+            delete file_lock;
+
             std::string path = leveldb::BuildTableFilePath(table_path, file.tablet_id,
                                                            file.lg_id, file.file_id);
-            LOG(INFO) << "[gc] [" << name_ << "] delete file " << file << " path " << path;
-            s = env->DeleteFile(path);
+            if (FLAGS_tera_master_gc_trash_enabled) {
+                LOG(INFO) << "[gc] [" << name_ << "] move file to trash, file: "
+                    << file << ", path: " << path;
+                // move sst to trackable gc trash instead of deleting it directly
+                s = io::MoveSstToTrackableGcTrash(name_, file.tablet_id, file.lg_id, file.file_id);
+            } else {
+                LOG(INFO) << "[gc] [" << name_ << "] delete file " << file << " path " << path;
+                s = env->DeleteFile(path);
+            }
         }
         mutex_.Lock();
         if (!s.ok()) {
@@ -1554,6 +1688,40 @@ bool TabletManager::FindOverlappedTablets(const std::string& table_name,
     return true;
 }
 
+bool TabletManager::SearchTablet(const std::string& table_name,
+                                 const std::string& key,
+                                 TabletPtr* tablet,
+                                 StatusCode* ret_status) { // stores in *tablet the entry with the greatest map key <= key
+    // lock table list
+    mutex_.Lock();
+
+    // search table
+    TableList::iterator it = all_tables_.find(table_name);
+    if (it == all_tables_.end()) {
+        mutex_.Unlock();
+        VLOG(5) << "table: " << table_name << " not exist";
+        SetStatusCode(kTableNotFound, ret_status);
+        return false;
+    }
+    Table& table = *it->second;
+
+    // lock table (taken before the table-list lock is released, so the two locks overlap briefly)
+    table.mutex_.Lock();
+    mutex_.Unlock();
+
+    // search tablet: walk from the largest map key down; the first entry with key <= `key` wins
+    Table::TabletList::reverse_iterator rit2 = table.tablets_list_.rbegin();
+    for (; rit2 != table.tablets_list_.rend(); ++rit2) {
+        if (rit2->first <= key) {
+            *tablet = rit2->second;
+            break;
+        }
+    }
+
+    table.mutex_.Unlock();
+    return true; // NOTE(review): also reached when nothing matched; *tablet and *ret_status are then left untouched
+}
+
 bool TabletManager::FindTable(const std::string& table_name,
                               std::vector<TabletPtr>* tablet_meta_list,
                               StatusCode* ret_status) {
diff --git a/src/master/tablet_manager.h b/src/master/tablet_manager.h
index 1e58d62cf..07e942ecb 100644
--- a/src/master/tablet_manager.h
+++ b/src/master/tablet_manager.h
@@ -16,11 +16,12 @@
 
 #include "common/mutex.h"
 #include "common/thread_pool.h"
+#include "common/metric/metric_counter.h"
 
 #include "proto/master_rpc.pb.h"
 #include "proto/table_meta.pb.h"
 #include "proto/tabletnode_rpc.pb.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 #include "utils/fragment.h"
 
 using namespace std::placeholders;
@@ -83,7 +84,9 @@ class Tablet {
     friend std::ostream& operator << (std::ostream& o, const Tablet& tablet);
 
 public:
-    Tablet();
+    Tablet() = delete;
+    Tablet(const Tablet&) = delete;
+    Tablet& operator=(const Tablet&) = delete;
     explicit Tablet(const TabletMeta& meta);
     Tablet(const TabletMeta& meta, TablePtr table);
     ~Tablet();
@@ -95,6 +98,9 @@ class Tablet {
     int64_t GetDataSize();
     void GetDataSize(int64_t* size, std::vector<int64_t>* lg_size);
     int64_t GetQps();
+    int64_t GetReadQps();
+    int64_t GetWriteQps();
+    int64_t GetScanQps();
 
     const std::string& GetKeyStart();
     const std::string& GetKeyEnd();
@@ -117,6 +123,9 @@ class Tablet {
     bool SetStatus(TabletStatus new_status, TabletStatus* old_status = NULL);
     bool SetStatusIf(TabletStatus new_status, TabletStatus if_status,
                      TabletStatus* old_status = NULL);
+    bool SetStatusIf(TabletStatus new_status,
+                     TabletStatus if_status,
+                     const std::string& if_addr);
     bool SetStatusIf(TabletStatus new_status, TabletStatus if_status,
                      TableStatus if_table_status, TabletStatus* old_status = NULL);
     bool SetAddrIf(const std::string& server_addr, TabletStatus if_status,
@@ -151,13 +160,20 @@ class Tablet {
     int64_t UpdateTime();
     int64_t SetUpdateTime(int64_t timestamp);
     int64_t ReadyTime();
+    int64_t LastMoveTime() const;
+    void SetLastMoveTime(int64_t time);
 
     void* GetMergeParam();
     void SetMergeParam(void* merge_param);
 
+    bool TestAndSetSplitTimeStamp(int64_t ts);
+
+    // Get/set the locality groups whose lost-file errors are ignored when a
+    // tabletserver loads this tablet. Only the listed locality groups are
+    // exempted, so that errors in the other locality groups are not missed.
+    void GetErrorIgnoredLGs(std::vector<std::string>* lgs);
+    bool SetErrorIgnoredLGs(const std::string& lg_list_str = "");
 private:
-    Tablet(const Tablet&) {}
-    Tablet& operator=(const Tablet&) {return *this;}
 
     static bool CheckStatusSwitch(TabletStatus old_status,
                                   TabletStatus new_status);
@@ -167,8 +183,10 @@ class Tablet {
     TablePtr table_;
     int64_t update_time_;
     int64_t ready_time_;
+    int64_t last_move_time_us_;
     std::string server_id_;
     std::string expect_server_addr_;
+    std::vector<std::string> ignore_err_lgs_; // lg array for ignore_err_
     std::list<TabletCounter> counter_list_;
     TabletCounter average_counter_;
     struct TabletAccumulateCounter {
@@ -189,6 +207,14 @@ class Tablet {
     } accumu_counter_;
     void* merge_param_;
 
+    // Tablet split history tracing (read and updated by TestAndSetSplitTimeStamp)
+    struct TabletSplitHistory {
+        int64_t last_split_ts; // last split timestamp recorded, in ms; 0 until the first one
+
+        TabletSplitHistory()
+        : last_split_ts(0) {}
+    } split_history_;
+
     // protected by Table::mutex_
     bool gc_reported_;
     std::multiset<TabletFile> inh_files_;
@@ -199,6 +225,42 @@ std::ostream& operator << (std::ostream& o, const TabletPtr& tablet);
 std::ostream& operator << (std::ostream& o, const TablePtr& table);
 
 class Table {
+
+    class TableMetric { // per-table master metrics: tablet count, not-ready tablet count, table size
+    public:
+        explicit TableMetric(const std::string& name): // explicit: no silent std::string -> TableMetric conversion
+            table_name_(name),
+            tablet_num_("tera_master_tablet_num", GetTableNameLabel(),
+                        {SubscriberType::LATEST}, false),
+            not_ready_("tera_master_tablet_not_ready_num", GetTableNameLabel(),
+                       {SubscriberType::LATEST}, false),
+            table_size_("tera_master_table_size", GetTableNameLabel(),
+                        {SubscriberType::LATEST}, false)
+            {}
+
+        void SetTabletNum(int64_t tablet_num) { // forwards to the tera_master_tablet_num counter
+            tablet_num_.Set(tablet_num);
+        }
+
+        void SetNotReady(int64_t not_ready) { // forwards to the tera_master_tablet_not_ready_num counter
+            not_ready_.Set(not_ready);
+        }
+
+        void SetTableSize(int64_t table_size) { // forwards to the tera_master_table_size counter
+            table_size_.Set(table_size);
+        }
+
+    private:
+        std::string GetTableNameLabel() const { // label passed to every counter: "table:<name>"
+            return "table:" + table_name_;
+        }
+        // Keep table_name_ declared first: the counter initializers read it via GetTableNameLabel().
+        const std::string table_name_;
+        tera::MetricCounter tablet_num_;
+        tera::MetricCounter not_ready_;
+        tera::MetricCounter table_size_;
+    };
+
     friend class Tablet;
     friend class TabletManager;
     friend std::ostream& operator << (std::ostream& o, const Table& tablet);
@@ -262,11 +324,12 @@ class Table {
     void EnableDeadTabletGarbageCollect(uint64_t tablet_id);
     void ReleaseInheritedFile(const TabletFile& file);
     void AddInheritedFile(const TabletFile& file, bool need_ref);
+    void AddEmptyDeadTablet(uint64_t tablet_id);
     uint64_t CleanObsoleteFile();
 
 private:
-    Table(const Table&) {}
-    Table& operator=(const Table&) {return *this;}
+    Table(const Table&) = delete;
+    Table& operator=(const Table&) = delete;
     typedef std::map<std::string, TabletPtr> TabletList;
     TabletList tablets_list_;
     mutable Mutex mutex_;
@@ -279,6 +342,7 @@ class Table {
     uint64_t max_tablet_no_;
     int64_t create_time_;
     TableCounter counter_;
+    TableMetric metric_;
     bool schema_is_syncing_; // is schema syncing to all ts(all tablets)
     RangeFragment* rangefragment_;
     UpdateTableResponse* update_rpc_response_;
@@ -348,6 +412,11 @@ class TabletManager {
                    std::vector<TabletPtr>* tablet_meta_list,
                    StatusCode* ret_status = NULL);
 
+    bool SearchTablet(const std::string& table_name,
+                      const std::string& key,
+                      TabletPtr* tablet,
+                      StatusCode* ret_status);
+
     bool FindTable(const std::string& table_name, TablePtr* tablet);
 
     int64_t SearchTable(std::vector<TabletPtr>* tablet_meta_list,
diff --git a/src/master/tabletnode_manager.cc b/src/master/tabletnode_manager.cc
index 383526a02..ff1767ee4 100644
--- a/src/master/tabletnode_manager.cc
+++ b/src/master/tabletnode_manager.cc
@@ -6,7 +6,7 @@
 
 #include "master/master_impl.h"
 #include "master/workload_scheduler.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 DECLARE_string(tera_master_meta_table_name);
 DECLARE_int32(tera_master_max_load_concurrency);
diff --git a/src/master/test/master_impl_test.cc b/src/master/test/master_impl_test.cc
index e9e130e33..9e0573f04 100644
--- a/src/master/test/master_impl_test.cc
+++ b/src/master/test/master_impl_test.cc
@@ -14,36 +14,19 @@
 #include "utils/utils_cmd.h"
 #include "version.h"
 
+DECLARE_string(tera_master_port);
+DECLARE_string(log_dir);
+DECLARE_string(tera_coord_type);
+DECLARE_string(tera_leveldb_env_type);
+
 namespace tera {
 namespace master {
 
 class MasterImplTest : public ::testing::Test, public MasterImpl {
 public:
     MasterImplTest() : merge_enter_phase2(false) {
-    }
-
-    void SplitTabletTest() {
-        SplitTabletRequest* request = NULL;
-        SplitTabletResponse* response = NULL;
-        bool failed;
-        int error_code;
-        TablePtr table;
-        TabletPtr tablet;
-        TabletMeta meta;
-
-        table.reset(new Table("splittest"));
-        tablet.reset(new Tablet(meta, table));
-        request = new SplitTabletRequest;
-        response = new SplitTabletResponse;
-
-        tablet->SetStatus(kTableReady);
-        tablet->SetStatus(kTableOnSplit);
-        response->set_status(kTableNotSupport);
-        failed = false;
-        error_code = 0;
-
-        MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code);
-        EXPECT_TRUE(tablet->GetStatus() == kTableOffLine);
+        FLAGS_tera_coord_type = "fake_zk";
+        FLAGS_tera_leveldb_env_type = "local";
     }
 
     bool merge_enter_phase2;
@@ -95,6 +78,55 @@ class MasterImplTest : public ::testing::Test, public MasterImpl {
         return tablet;
     }
 
+    void DeleteTabletNodeTest() { // a stale Ready->Pending switch for addr1 must not hit a tablet now on addr2
+        // add server
+        std::string addr1 = "127.0.0.1:22000";
+        std::string addr2 = "127.0.0.2:22000";
+        tabletnode_manager_->AddTabletNode(addr1, addr1);
+        tabletnode_manager_->AddTabletNode(addr2, addr2);
+
+        // add tablet
+        StatusCode s;
+        TabletMeta meta;
+        TablePtr table(new Table("table001"));
+        TabletPtr tablet = MakeTabletPtr("a", "z", table);
+        tablet->SetStatus(kTableReady);
+        tablet->SetAddr(addr1);
+        tablet->ToMeta(&meta);
+        tablet_manager_->AddTablet(meta, tablet->GetSchema(), &tablet, &s);
+        tablet->SetServerId(addr1);
+
+        // thread1: get tablet from addr1
+        std::vector<TabletPtr> tablet_list;
+        std::vector<TabletPtr>::iterator it;
+        tablet_manager_->FindTablet(addr1, &tablet_list, true);
+        EXPECT_FALSE(tablet_list.empty()); // `it` is still unassigned here, so it must not be compared
+        EXPECT_TRUE(tablet_list.size() == 1);
+
+        // thread2: load tablet into addr2
+        LoadTabletRequest* request = new LoadTabletRequest;
+        LoadTabletResponse* response = new LoadTabletResponse;
+        tablet->SetAddr(addr2);
+        tablet->SetServerId(addr2);
+
+        TabletNodePtr node;
+        tabletnode_manager_->FindTabletNode(addr2, &node);
+        node->TryLoad(tablet);
+        tablet->SetStatus(kTableOffLine);
+        tablet->SetStatus(kTableOnLoad);
+        response->set_status(kTabletNodeOk);
+        LoadTabletCallback(tablet, 10, request, response, 0, 0);
+        EXPECT_TRUE(tablet->GetStatus() == kTableReady);
+
+        // thread1: check addr1 and set status
+        for (it = tablet_list.begin(); it != tablet_list.end(); ++it) {
+            TabletPtr t = *it;
+            t->SetStatusIf(kTabletPending, kTableReady, addr1); // address no longer matches addr1
+        }
+        EXPECT_TRUE(tablet->GetStatus() == kTableReady);
+        EXPECT_STREQ(tablet->GetServerAddr().c_str(), addr2.c_str());
+    }
+
     // This unload function will not send unload request
     // Tablet will stay in kTableUnLoading status forever
     // It can be used to simulate a slow unload
@@ -119,9 +151,7 @@ class MasterImplTest : public ::testing::Test, public MasterImpl {
         LOG(ERROR) << t1->GetStatus() << ";" << t2->GetStatus() << ";" << t3->GetStatus();
         EXPECT_TRUE((t1->GetStatus() == kTableUnLoading)
                     && (t2->GetStatus() == kTableUnLoading)
-                    && (t3->GetStatus() == kTableReady));
-
-        // t2 & t3's merge should fail since t1 & t2 is merging
+                    && (t3->GetStatus() == kTableReady)); // t2 & t3's merge should fail since t1 & t2 is merging
         MergeTabletAsync(t2, t3);
         LOG(ERROR) << t1->GetStatus() << ";" << t2->GetStatus() << ";" << t3->GetStatus();
         EXPECT_TRUE((t1->GetStatus() == kTableUnLoading)
@@ -135,10 +165,49 @@ class MasterImplTest : public ::testing::Test, public MasterImpl {
                     && (t2->GetStatus() == kTableUnLoading)
                     && (t3->GetStatus() == kTableReady));
     }
+
+    virtual void ScanMetaTableAsync(const std::string& table_name,
+            const std::string& tablet_key_start,
+            const std::string& tablet_end_key,
+            ScanClosure done);
+
+    virtual void SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key);
 };
 
-TEST_F(MasterImplTest, SplitTest) {
-    SplitTabletTest();
+void MasterImplTest::ScanMetaTableAsync(const std::string& table_name,
+        const std::string& tablet_key_start,
+        const std::string& tablet_end_key,
+        ScanClosure done) {
+    // Test stub: performs no scan; it only checks which test reached this method.
+    const ::testing::TestInfo* test_info = ::testing::UnitTest::GetInstance()->current_test_info();
+    std::string test_name(test_info->name()); // name() is the test itself; test_case_name() is the fixture
+    if (test_name == "InteractWithOldTS") {
+        EXPECT_TRUE(true);
+    }
+    if (test_name.find("InteractWithNewTS") != std::string::npos) {
+        EXPECT_TRUE(false); // the InteractWithNewTS* tests are not expected to end up here
+    }
+}
+
+void MasterImplTest::SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key) {
+    const ::testing::TestInfo* test_info = ::testing::UnitTest::GetInstance()->current_test_info();
+    std::string test_name(test_info->name()); // name() is the test itself; test_case_name() is the fixture
+    if (test_name.find("InteractWithOldTS") != std::string::npos) {
+        EXPECT_TRUE(false); // InteractWithOldTS is not expected to end up here
+    }
+    if (test_name.find("InteractWithNewTS") != std::string::npos) {
+        EXPECT_TRUE(true);
+    }
+    EXPECT_EQ(tablet->GetStatus(), kTableOnSplit);
+    EXPECT_FALSE(split_key.empty());
+    EXPECT_GT(split_key, tablet->GetKeyStart()); // split key must sort strictly after the start key
+    if (!tablet->GetKeyEnd().empty()) { // an empty end key is not compared against
+        EXPECT_GT(tablet->GetKeyEnd(), split_key);
+    }
+}
+
+TEST_F(MasterImplTest, DeleteTabletNodeTest) {
+    DeleteTabletNodeTest();
 }
 
 TEST_F(MasterImplTest, MergeTest) {
@@ -149,6 +218,163 @@ TEST_F(MasterImplTest, MergeTabletBrokenTest) {
     MergeTabletBrokenTest();
 }
 
+TEST_F(MasterImplTest, SplitNotSupport) {
+    // A split response carrying kTableNotSupport must leave the tablet in kTableOffLine.
+    TabletMeta meta;
+    TablePtr table(new Table("splittest"));
+    TabletPtr tablet(new Tablet(meta, table));
+
+    // Walk the tablet Ready -> OnSplit before the callback runs.
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+
+    // Request and response are heap-allocated and handed over as raw pointers.
+    SplitTabletRequest* request = new SplitTabletRequest;
+    SplitTabletResponse* response = new SplitTabletResponse;
+    response->set_status(kTableNotSupport);
+
+    // failed=false / error_code=0: presumably "the rpc itself succeeded" - confirm against the callback.
+    bool failed = false;
+    int error_code = 0;
+
+    MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code);
+
+    EXPECT_TRUE(tablet->GetStatus() == kTableOffLine);
+}
+
+TEST_F(MasterImplTest, InteractWithOldTS) {
+    // Feeds the split callback an OK response that carries no split keys.
+    TabletMeta meta;
+    TablePtr table(new Table("splittest"));
+    TabletPtr tablet(new Tablet(meta, table));
+
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+
+    // Status is kTabletNodeOk; add_split_keys() is never called on this response.
+    SplitTabletRequest* request = new SplitTabletRequest;
+    SplitTabletResponse* response = new SplitTabletResponse;
+    response->set_status(kTabletNodeOk);
+
+    bool failed = false;
+    int error_code = 0;
+
+    // No EXPECT in this body: any checking has to happen in what the callback invokes.
+    MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code);
+}
+
+TEST_F(MasterImplTest, InteractWithNewTSOK){
+    TablePtr table;
+    TabletPtr tablet;
+    TabletMeta meta;
+    // Case 1: default meta (no key range set); the response carries split key "abc".
+    table.reset(new Table("splittest"));
+    tablet.reset(new Tablet(meta, table));
+    SplitTabletRequest* request = new SplitTabletRequest;
+    SplitTabletResponse* response = new SplitTabletResponse;
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    response->set_status(kTabletNodeOk);
+    response->add_split_keys("abc");
+    bool failed = false;
+    int error_code = 0;
+    MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code);
+    // Case 2: key_start "ab", key_end "bc"; the response carries split key "b".
+    meta.mutable_key_range()->set_key_start("ab");
+    meta.mutable_key_range()->set_key_end("bc");
+    tablet.reset(new Tablet(meta, table));
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    request = new SplitTabletRequest;
+    response = new SplitTabletResponse; // NOTE(review): set_status() is not called on this second response
+    response->add_split_keys("b");
+    MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code);
+    EXPECT_EQ(tablet->GetStatus(), kTableOnSplit); // the tablet is expected to still be in kTableOnSplit
+}
+
+TEST_F(MasterImplTest, NewTSReturnInvalidSplitKey){
+    TablePtr table;
+    TabletPtr tablet;
+    TabletMeta meta;
+    // Each case below expects the tablet to end up in kTableOffLine.
+    meta.mutable_key_range()->set_key_start("aa");
+    meta.mutable_key_range()->set_key_end("cc");
+    table.reset(new Table("splittest"));
+    tablet.reset(new Tablet(meta, table));
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    MasterImpl::SplitTabletWriteMetaAsync(tablet, ""); // empty split key
+    EXPECT_EQ(tablet->GetStatus(), kTableOffLine);
+
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    MasterImpl::SplitTabletWriteMetaAsync(tablet, "aa"); // split key equal to key_start
+    EXPECT_EQ(tablet->GetStatus(), kTableOffLine);
+
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    MasterImpl::SplitTabletWriteMetaAsync(tablet, "cc"); // split key equal to key_end
+    EXPECT_EQ(tablet->GetStatus(), kTableOffLine);
+
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    MasterImpl::SplitTabletWriteMetaAsync(tablet, "d"); // split key sorting after key_end "cc"
+    EXPECT_EQ(tablet->GetStatus(), kTableOffLine);
+
+    meta.mutable_key_range()->set_key_end(""); // key_start stays "aa", key_end becomes empty
+    tablet.reset(new Tablet(meta, table));
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    MasterImpl::SplitTabletWriteMetaAsync(tablet, ""); // empty split key again
+    EXPECT_EQ(tablet->GetStatus(), kTableOffLine);
+
+    meta.Clear(); // back to a default meta with no key range set
+    tablet.reset(new Tablet(meta, table));
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    MasterImpl::SplitTabletWriteMetaAsync(tablet, ""); // empty split key on the default meta
+    EXPECT_EQ(tablet->GetStatus(), kTableOffLine);
+
+}
+
+TEST_F(MasterImplTest, SplitTabletWriteMetaCallback) {
+    TablePtr table;
+    TabletPtr tablet;
+    TabletMeta meta;
+    // Parent tablet: key_start "a", key_end "c", put into kTableOnSplit.
+    meta.mutable_key_range()->set_key_start("a");
+    meta.mutable_key_range()->set_key_end("c");
+    table.reset(new Table("splittest"));
+    tablet.reset(new Tablet(meta, table));
+    tablet->SetStatus(kTableReady);
+    tablet->SetStatus(kTableOnSplit);
+    std::vector<TabletPtr> child_tablets;
+    meta.mutable_key_range()->set_key_end("b"); // meta is reused: now key_start "a", key_end "b"
+    child_tablets.emplace_back(new Tablet(meta)); // first child
+    meta.mutable_key_range()->set_key_start("b");
+    meta.mutable_key_range()->set_key_end("c");
+    child_tablets.emplace_back(new Tablet(meta)); // second child: key_start "b", key_end "c"
+    bool failed = false;
+    int error_code = 0;
+
+    WriteTabletRequest* request = new WriteTabletRequest;
+    WriteTabletResponse* response = new WriteTabletResponse;
+
+    response->set_status(kTabletNodeOk);
+    response->add_row_status_list(kTabletNodeOk); // two row statuses - presumably one per child; confirm
+    response->add_row_status_list(kTabletNodeOk);
+    // NOTE(review): the meaning of the literal 1 passed below is not visible from this test.
+    MasterImpl::SplitTabletWriteMetaCallback(tablet,
+            child_tablets,  1, request, response, failed, error_code);
+    EXPECT_EQ(table->tablets_list_.size(), 2); // the table is expected to hold exactly two tablets
+    TabletPtr t1, t2;
+    table->FindTablet("a", &t1);
+    table->FindTablet("b", &t2);
+    EXPECT_EQ(t1->GetStatus(), kTableOffLine);
+    EXPECT_EQ(t2->GetStatus(), kTableOffLine);
+    EXPECT_STREQ(t1->GetKeyEnd().c_str(), t2->GetKeyStart().c_str()); // the two ranges must be adjacent
+}
+
 } // master
 } // tera
 
diff --git a/src/master/test/master_test.cc b/src/master/test/master_test.cc
index 89b44c208..d0ecfb87f 100644
--- a/src/master/test/master_test.cc
+++ b/src/master/test/master_test.cc
@@ -8,20 +8,13 @@
 
 #include "utils/utils_cmd.h"
 
-DECLARE_string(tera_master_port);
-DECLARE_string(log_dir);
-DECLARE_bool(tera_zk_enabled);
 DECLARE_string(tera_leveldb_env_type);
-DECLARE_string(tera_fake_zk_path_prefix);
 
 int main(int argc, char** argv) {
     ::google::ParseCommandLineFlags(&argc, &argv, true);
     ::google::InitGoogleLogging(argv[0]);
-
-    FLAGS_tera_zk_enabled = false;
-    FLAGS_tera_leveldb_env_type = "local";
-
     tera::utils::SetupLog("master_test");
+    FLAGS_tera_leveldb_env_type = "local";
     ::testing::InitGoogleTest(&argc, argv);
 
     return RUN_ALL_TESTS();
diff --git a/src/master/test/trackable_gc_test.cc b/src/master/test/trackable_gc_test.cc
index 7d6c78dd6..09cee5dda 100644
--- a/src/master/test/trackable_gc_test.cc
+++ b/src/master/test/trackable_gc_test.cc
@@ -11,7 +11,7 @@
 #include "master/tablet_manager.h"
 #include "utils/utils_cmd.h"
 
-DECLARE_bool(tera_zk_enabled);
+DECLARE_string(tera_coord_type);
 DECLARE_string(tera_leveldb_env_type);
 DECLARE_string(tera_master_gc_strategy);
 DECLARE_string(tera_tabletnode_path_prefix);
@@ -500,7 +500,7 @@ class TrackableGcTest : public ::testing::Test {
 
     static void SetUpTestCase() {
         std::cout << "SetUpTestCase" << std::endl;
-        FLAGS_tera_zk_enabled = false;
+        FLAGS_tera_coord_type = "fake_zk";
         FLAGS_tera_leveldb_env_type = "local";
         FLAGS_tera_master_gc_strategy = "trackable";
         FLAGS_tera_tabletnode_path_prefix = "./";
diff --git a/src/master/workload_scheduler.cc b/src/master/workload_scheduler.cc
index f0f70540c..5933827cb 100644
--- a/src/master/workload_scheduler.cc
+++ b/src/master/workload_scheduler.cc
@@ -11,6 +11,7 @@
 
 DECLARE_double(tera_master_load_balance_size_ratio_trigger);
 DECLARE_int32(tera_master_load_balance_ts_load_threshold);
+DECLARE_int64(tera_master_load_balance_ts_size_threshold);
 DECLARE_int32(tera_master_load_balance_scan_weight);
 
 namespace tera {
@@ -76,8 +77,8 @@ bool SizeScheduler::MayMoveOut(const TabletNodePtr& node,
                                const std::string& table_name) {
     VLOG(16) << "[size-sched] MayMoveOut()";
     int64_t node_size = node->GetSize(table_name);
-    if (node_size <= 0) {
-        VLOG(16) << "[size-sched] node has no data";
+    if (node_size <= FLAGS_tera_master_load_balance_ts_size_threshold) {
+        VLOG(16) << "[size-sched] node do not need loadbalance";
         return false;
     }
     return true;
diff --git a/src/monitor/teramo_main.cc b/src/monitor/teramo_main.cc
index d2a5d6417..169437948 100644
--- a/src/monitor/teramo_main.cc
+++ b/src/monitor/teramo_main.cc
@@ -19,7 +19,7 @@
 #include "proto/tabletnode.pb.h"
 #include "tera.h"
 #include "utils/utils_cmd.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 DEFINE_string(tera_monitor_default_request_filename, "tera_monitor.request", "");
 DEFINE_string(tera_monitor_default_response_filename, "tera_monitor.response", "");
@@ -34,6 +34,7 @@ DECLARE_string(tera_ins_addr_list);
 DECLARE_string(tera_ins_root_path);
 DECLARE_bool(tera_zk_enabled);
 DECLARE_bool(tera_ins_enabled);
+DECLARE_string(tera_coord_type);
 DECLARE_int64(tera_master_stat_table_interval);
 
 using namespace tera;
@@ -296,9 +297,11 @@ void InitFlags(int32_t argc, char** argv, const MonitorRequest& request) {
         if (request.has_tera_zk_root()) {
             FLAGS_tera_ins_root_path = request.tera_zk_root();
         }
+        FLAGS_tera_coord_type = "ins";
         FLAGS_tera_ins_enabled = true;
         FLAGS_tera_zk_enabled = false;
     } else {
+        FLAGS_tera_coord_type = "zk";
         if (request.has_tera_zk_addr()) {
             FLAGS_tera_zk_addr_list = request.tera_zk_addr();
         }
diff --git a/src/observer/executor/key_selector.h b/src/observer/executor/key_selector.h
new file mode 100644
index 000000000..b6746b612
--- /dev/null
+++ b/src/observer/executor/key_selector.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_KEY_SELECTOR_H_
+#define TERA_OBSERVER_EXECUTOR_KEY_SELECTOR_H_
+
+#include <string>
+#include <vector>
+
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+
+class KeySelector { // abstract interface: both methods below are pure virtual
+public:
+    virtual ~KeySelector() {}
+
+    // Out-params: *table_name receives the selected table name, *start_key the selected start key.
+    virtual bool SelectStart(std::string* table_name,
+                             std::string* start_key) = 0;
+    virtual ErrorCode Observe(const std::string& table_name) = 0; // NOTE(review): contract not visible in this header
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_EXECUTOR_KEY_SELECTOR_H_
diff --git a/src/observer/executor/notification.h b/src/observer/executor/notification.h
new file mode 100644
index 000000000..a73cbb255
--- /dev/null
+++ b/src/observer/executor/notification.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_NOTIFICATION_H_
+#define TERA_OBSERVER_EXECUTOR_NOTIFICATION_H_
+
+#include <string>
+#include <vector>
+
+#include "tera.h"
+
+#pragma GCC visibility push(default)
+
+namespace tera {
+namespace observer {
+
+class Notification {  // Interface for acknowledging and raising per-cell notifications.
+public:
+    virtual ~Notification() {}
+    // Acknowledges the notification for cell (row_key, column_family:qualifier) of table t.
+    virtual void Ack(Table* t,
+                     const std::string& row_key,
+                     const std::string& column_family,
+                     const std::string& qualifier) = 0;
+    // Raises a notification for cell (row_key, column_family:qualifier) of table t.
+    virtual void Notify(Table* t,
+                        const std::string& row_key,
+                        const std::string& column_family,
+                        const std::string& qualifier) = 0;
+};
+
+} // namespace observer
+} // namespace tera
+
+#pragma GCC visibility pop
+
+#endif  // TERA_OBSERVER_EXECUTOR_NOTIFICATION_H_
diff --git a/src/observer/executor/notification_impl.cc b/src/observer/executor/notification_impl.cc
new file mode 100644
index 000000000..125509d79
--- /dev/null
+++ b/src/observer/executor/notification_impl.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/executor/notification_impl.h"
+
+#include <glog/logging.h>
+
+#include "common/timer.h"
+#include "common/base/string_number.h"
+#include "sdk/global_txn_internal.h"
+#include "types.h"
+
+namespace tera {
+namespace observer {
+
+Notification* GetNotification(Transaction* transaction) {
+    return new NotificationImpl(transaction);
+}
+
+NotificationImpl::NotificationImpl(Transaction* transaction)
+    : transaction_(transaction),
+    start_timestamp_(get_micros()),
+    notify_timestamp_(0) {}
+
+// Acks via the transaction if any; otherwise (kNoneTransaction) deletes the notify cell directly.
+void NotificationImpl::Ack(Table* t, const std::string& row_key,
+                           const std::string& column_family, const std::string& qualifier) {
+    if (transaction_ != NULL) {
+        transaction_->Ack(t, row_key, column_family, qualifier);
+        return;
+    }
+    tera::RowMutation* mutation = t->NewRowMutation(row_key);
+    mutation->DeleteColumns(kNotifyColumnFamily, PackNotifyName(column_family, qualifier), start_timestamp_);
+    t->ApplyMutation(mutation);
+    if (mutation->GetError().GetType() != tera::ErrorCode::kOK) {  // mirror Notify(): never fail silently
+        LOG(ERROR) << "Ack error. table: " << t->GetName() << " row "
+            << row_key << " pos: " << column_family << ":" << qualifier;
+    }
+    delete mutation;
+}
+
+// Raises a notification: through the transaction when one is attached,
+// otherwise by writing the notify cell straight into the table.
+void NotificationImpl::Notify(Table* t, const std::string& row_key,
+                              const std::string& column_family, const std::string& qualifier) {
+    if (NULL != transaction_) {
+        transaction_->Notify(t, row_key, column_family, qualifier);
+        return;
+    }
+
+    // kNoneTransaction: the first call fixes the timestamp reused by later calls
+    if (0 == notify_timestamp_) {
+        notify_timestamp_ = get_micros();
+    }
+    const int64_t ts = notify_timestamp_;
+    tera::ErrorCode status;
+    t->Put(row_key, kNotifyColumnFamily, PackNotifyName(column_family, qualifier),
+           NumberToString(ts), ts, &status);
+    if (tera::ErrorCode::kOK != status.GetType()) {
+        LOG(ERROR) << "Notify error. table: " << t->GetName() << " row "
+            << row_key << " pos: " << column_family << ":" << qualifier;
+    }
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/executor/notification_impl.h b/src/observer/executor/notification_impl.h
new file mode 100644
index 000000000..a88399d79
--- /dev/null
+++ b/src/observer/executor/notification_impl.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_NOTIFICATION_IMPL_H_
+#define TERA_OBSERVER_EXECUTOR_NOTIFICATION_IMPL_H_
+
+#include <string>
+#include <vector>
+
+#include "observer/executor/notification.h"
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+
+Notification* GetNotification(Transaction* transaction);
+
+class NotificationImpl : public Notification {
+public:
+    explicit NotificationImpl(Transaction* transaction);
+    virtual ~NotificationImpl() {}
+
+    virtual void Ack(Table* t,
+                     const std::string& row_key,
+                     const std::string& column_family,
+                     const std::string& qualifier);
+
+    virtual void Notify(Table* t,
+                        const std::string& row_key,
+                        const std::string& column_family,
+                        const std::string& qualifier);
+private:
+    Transaction* transaction_;
+    int64_t start_timestamp_;
+    int64_t notify_timestamp_;
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_EXECUTOR_NOTIFICATION_IMPL_H_
diff --git a/src/observer/executor/notify_cell.h b/src/observer/executor/notify_cell.h
new file mode 100644
index 000000000..9567c7231
--- /dev/null
+++ b/src/observer/executor/notify_cell.h
@@ -0,0 +1,110 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_NOTIFY_CELL_H_
+#define TERA_OBSERVER_EXECUTOR_NOTIFY_CELL_H_
+
+#include <set>
+#include <map>
+#include <vector>
+#include <memory>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "observer/executor/observer.h"
+#include "observer/rowlocknode/fake_rowlock_client.h"
+#include "sdk/rowlock_client.h"
+#include "tera.h"
+
+DECLARE_string(rowlock_server_port);
+DECLARE_string(rowlock_server_ip);
+DECLARE_bool(mock_rowlock_enable);
+
+
+namespace tera {
+namespace observer {
+
+// Identifies an observed column as (table, family, qualifier); ordered
+// lexicographically over those three fields so it can key std::set/std::map.
+struct Column {
+    std::string table_name;
+    std::string family;
+    std::string qualifier;
+
+    bool operator<(const Column& other) const {
+        if (table_name != other.table_name) {
+            return table_name < other.table_name;
+        }
+        if (family != other.family) {
+            return family < other.family;
+        }
+        return qualifier < other.qualifier;
+    }
+
+    // Field-wise equality over all three members.
+    bool operator==(const Column& other) const {
+        if (table_name != other.table_name || family != other.family) {
+            return false;
+        }
+        return qualifier == other.qualifier;
+    }
+};
+
+struct AutoRowUnlocker {
+    AutoRowUnlocker(const std::string& table, 
+                    const std::string& unlock_row)
+        : table_name(table),
+          row(unlock_row) {}
+    AutoRowUnlocker() {}
+
+    ~AutoRowUnlocker() {
+        // UnLockRow
+
+        if (FLAGS_mock_rowlock_enable == true) {
+            client.reset(new FakeRowlockClient());
+        } else {
+            client.reset(new RowlockClient());
+        }
+
+        RowlockRequest request;
+        RowlockResponse response;
+
+        request.set_row(row);
+        request.set_table_name(table_name);
+
+        client->UnLock(&request, &response);    
+        VLOG(12) <<"[time] Transaction finish. [row] " << row;
+    }
+
+    std::unique_ptr<RowlockClient> client;
+    std::string table_name;  
+    std::string row;  
+};
+
+// One pending notification found by the scanner; owns |transaction| and deletes it on destruction.
+struct NotifyCell {
+    // |t| may be NULL; timestamp starts at 0 instead of being left indeterminate.
+    NotifyCell(tera::Transaction* t) : timestamp(0),
+                                       transaction(t),
+                                       table(NULL) {}
+    ~NotifyCell() {
+        delete transaction;  // deleting NULL is a no-op, no guard needed
+    }
+
+    std::string row;
+    std::string value;
+    int64_t timestamp;
+
+    Column observed_column;
+    tera::Transaction* transaction;  // owned; may be NULL
+    tera::Table* table;              // not deleted by this struct
+
+    std::shared_ptr<AutoRowUnlocker> unlocker;  // shared by all cells of one row
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_EXECUTOR_NOTIFY_CELL_H_
diff --git a/src/observer/executor/observer.h b/src/observer/executor/observer.h
new file mode 100644
index 000000000..db1d912ae
--- /dev/null
+++ b/src/observer/executor/observer.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_H_
+#define TERA_OBSERVER_H_
+
+#include <string>
+
+#include "tera/client.h"
+#include "tera/error_code.h"
+#include "tera/transaction.h"
+#include "observer/executor/notification.h"
+
+#pragma GCC visibility push(default)
+namespace tera {
+namespace observer {
+
+enum TransactionType {
+    kGlobalTransaction = 0,
+    kSingleRowTransaction = 1,
+    kNoneTransaction = 2,
+};
+
+class Observer {
+public:
+    virtual ~Observer() {}
+    // Callback for one observed cell: row/family/qualifier of table_name with its value and timestamp.
+    // If Ack or Notify is needed during OnNotify, call notification->Ack and
+    // notification->Notify before the transaction commits.
+    // NOTE(review): t appears to be NULL for kNoneTransaction observers -- confirm against the caller.
+    virtual ErrorCode OnNotify(tera::Transaction* t,
+                               tera::Client* client,
+                               const std::string& table_name,
+                               const std::string& family,
+                               const std::string& qualifier,
+                               const std::string& row,
+                               const std::string& value,
+                               int64_t timestamp,
+                               Notification* notification) = 0;
+    // Returns the observer's name (it appears in the scanner's log messages).
+    virtual std::string GetObserverName() const = 0;
+    
+    // Returns the TransactionType; it is checked against the table's type when the observer is registered.
+    virtual TransactionType GetTransactionType() const = 0; 
+};
+
+} // namespace observer
+}
+#pragma GCC visibility pop
+
+#endif  // TERA_OBSERVER_H_
diff --git a/src/observer/executor/random_key_selector.cc b/src/observer/executor/random_key_selector.cc
new file mode 100644
index 000000000..75b0129ab
--- /dev/null
+++ b/src/observer/executor/random_key_selector.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/executor/random_key_selector.h"
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "types.h"
+
+DECLARE_string(flagfile);
+
+namespace tera {
+namespace observer {
+
+RandomKeySelector::RandomKeySelector()
+    : tables_(new std::map<std::string, std::vector<tera::TabletInfo>>),
+      quit_(false),
+      cond_(&quit_mutex_) {
+    tera::ErrorCode err;
+    client_ = tera::Client::NewClient(FLAGS_flagfile, &err);
+    update_thread_.Start(std::bind(&RandomKeySelector::Update, this));
+}
+
+RandomKeySelector::~RandomKeySelector() {
+    {
+        MutexLock locker(&quit_mutex_);
+        quit_ = true;
+        cond_.Broadcast();
+    }
+
+    update_thread_.Join();
+    if (client_ != NULL) {
+        delete client_;
+    }
+}
+
+// Picks a random observed table and the start key of one of its tablets.
+// Returns false when no tablet information is available for the pick.
+bool RandomKeySelector::SelectStart(std::string* table_name,
+                                    std::string* start_key) {
+    // Seed once (C++11 thread-safe static init). Re-seeding on every call made
+    // all calls within the same second produce the identical "random" pick.
+    static const bool seeded = (srand((unsigned)time(NULL)), true);
+    (void)seeded;
+
+    std::shared_ptr<std::map<std::string, std::vector<tera::TabletInfo>>> table_read_copy;
+    {
+        MutexLock locker(&table_mutex_);
+        table_read_copy = tables_;
+        if (table_read_copy->size() == 0 || observe_tables_.empty()) {
+            return false;
+        }
+        // observe_tables_ is appended to under table_mutex_, so read it here
+        *table_name = observe_tables_[rand() % observe_tables_.size()];
+    }
+
+    // find() instead of operator[]: the snapshot is shared with other threads
+    // and must not be mutated by inserting a missing table.
+    auto it = table_read_copy->find(*table_name);
+    if (it == table_read_copy->end() || it->second.empty()) {
+        LOG(ERROR) << "No tablet";
+        return false;
+    }
+
+    uint32_t tablet_no = rand() % it->second.size();
+    *start_key = it->second[tablet_no].start_key;
+    VLOG(25) << "Random StartKey=" << *start_key << " TabletNo=" << tablet_no;
+    return true;
+}
+
+// Registers |table_name| for random selection, fetching its tablet list on
+// first registration. Already-registered tables are left untouched.
+ErrorCode RandomKeySelector::Observe(const std::string& table_name) {
+    tera::ErrorCode status;
+    MutexLock locker(&table_mutex_);
+
+    if (!tables_.unique()) {
+        // Readers may still hold the current map; detach a private copy so
+        // they keep the old one and later requests see the new one.
+        tables_.reset(new std::map<std::string, std::vector<tera::TabletInfo>>(*tables_));
+    }
+    if (tables_->count(table_name) != 0) {
+        return status;
+    }
+    std::vector<tera::TabletInfo> tablet_list;
+    client_->GetTabletLocation(table_name, &tablet_list, &status);
+    if (status.GetType() != tera::ErrorCode::kOK) {
+        LOG(ERROR) << "Observe table failed, " << status.ToString();
+        return status;
+    }
+    observe_tables_.push_back(table_name);
+    (*tables_)[table_name] = tablet_list;
+    return status;
+}
+
+// Background loop: every kObserverWaitTime re-reads the tablets of all observed tables and publishes them.
+void RandomKeySelector::Update() {
+    while (true) {
+        {
+            MutexLock locker(&quit_mutex_);
+            if (!quit_) {
+                cond_.TimeWaitInUs(kObserverWaitTime);
+            }
+            if (quit_) {  // also re-checked after the wait so shutdown is prompt
+                return;
+            }
+        }
+        std::vector<std::string> table_names;
+        {
+            MutexLock locker(&table_mutex_);  // Observe() appends under this mutex
+            table_names = observe_tables_;
+        }
+        std::shared_ptr<std::map<std::string, std::vector<tera::TabletInfo>>> table_update_copy(
+            new std::map<std::string, std::vector<tera::TabletInfo>>);
+        for (uint32_t i = 0; i < table_names.size(); ++i) {
+            tera::ErrorCode err;  // fresh per table so an old failure cannot leak into the next
+            std::vector<tera::TabletInfo> tablets;
+            client_->GetTabletLocation(table_names[i], &tablets, &err);
+            if (tera::ErrorCode::kOK != err.GetType()) {
+                LOG(ERROR) << "Observe table failed, " << err.ToString();
+                continue;
+            }
+            table_update_copy->insert(std::pair<std::string, std::vector<tera::TabletInfo>>(table_names[i], tablets));
+        }
+        MutexLock locker(&table_mutex_);
+        table_update_copy->insert(tables_->begin(), tables_->end());  // keep entries not refreshed above
+        tables_.swap(table_update_copy);
+    }
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/executor/random_key_selector.h b/src/observer/executor/random_key_selector.h
new file mode 100644
index 000000000..5a20fb4f3
--- /dev/null
+++ b/src/observer/executor/random_key_selector.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_RANDOM_KEY_SELECTOR_H_
+#define TERA_OBSERVER_EXECUTOR_RANDOM_KEY_SELECTOR_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/mutex.h"
+#include "common/thread.h"
+#include "observer/executor/key_selector.h"
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+
+class RandomKeySelector : public KeySelector {
+public:
+	RandomKeySelector();
+	virtual ~RandomKeySelector();
+
+	virtual bool SelectStart(std::string* table_name,
+							 std::string* start_key);
+	virtual ErrorCode Observe(const std::string& table_name);
+private:
+	void Update();
+
+private:
+	tera::Client* client_;
+	mutable Mutex table_mutex_;
+	std::vector<std::string> observe_tables_;
+	std::shared_ptr<std::map<std::string, std::vector<tera::TabletInfo>>> tables_;
+	common::Thread update_thread_;
+
+	mutable Mutex quit_mutex_;
+	bool quit_;
+	common::CondVar cond_;
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_EXECUTOR_RANDOM_KEY_SELECTOR_H_
\ No newline at end of file
diff --git a/src/observer/executor/scanner.h b/src/observer/executor/scanner.h
new file mode 100644
index 000000000..a11a8646d
--- /dev/null
+++ b/src/observer/executor/scanner.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_SCANNER_H_
+#define TERA_SCANNER_H_
+
+#include <memory>
+
+#include "observer/executor/observer.h"
+#include "tera/error_code.h"
+
+#pragma GCC visibility push(default)
+namespace tera {
+namespace observer {
+
+class Scanner {
+public:
+    static Scanner* GetScanner();
+
+    virtual ~Scanner() {}
+
+    // register user define observers
+    // user should not destruct observers, which will be handled by scanner
+    virtual ErrorCode Observe(const std::string& table_name,
+                              const std::string& column_family,
+                              const std::string& qualifier,
+                              Observer* observer) = 0;
+
+    virtual bool Init() = 0;
+
+    virtual bool Start() = 0;
+
+    virtual void Exit() = 0;
+};
+
+} // namespace observer
+} // namespace tera
+#pragma GCC visibility pop
+
+#endif  // TERA_SCANNER_H_
diff --git a/src/observer/executor/scanner_entry.cc b/src/observer/executor/scanner_entry.cc
new file mode 100644
index 000000000..5b012b339
--- /dev/null
+++ b/src/observer/executor/scanner_entry.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/executor/scanner_entry.h"
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "observer/executor/scanner_impl.h"
+
+namespace tera {
+namespace observer {
+
+ScannerEntry::ScannerEntry() {}
+
+ScannerEntry::~ScannerEntry() {}
+
+// Obtains the scanner, initializes it, registers observers via Observe() and starts it; false on failure.
+bool ScannerEntry::StartServer() {
+    scanner_.reset(Scanner::GetScanner());
+    if (!scanner_->Init()) {
+        LOG(ERROR) << "fail to init scanner_impl";
+        return false;
+    }
+
+    // observers are registered after Init() and before Start()
+    ErrorCode observe_status = Observe();
+    if (observe_status.GetType() != tera::ErrorCode::kOK) {
+        LOG(ERROR) << "Observe failed, reason: " << observe_status.ToString();
+        return false;
+    }
+
+    const bool started = scanner_->Start();
+    if (!started) {
+        LOG(ERROR) << "fail to start scanner_impl";
+    }
+    return started;
+}
+
+void ScannerEntry::ShutdownServer() {  // Stops and releases the scanner, if one was created.
+    LOG(INFO) << "shut down scanner";
+    if (scanner_) scanner_->Exit();  // StartServer() may never have created it
+    scanner_.reset();
+    LOG(INFO) << "scanner stop done!";
+}
+
+bool ScannerEntry::Run() {
+    ThisThread::Sleep(1000);
+    return true;
+}
+
+ErrorCode ScannerEntry::Observe() {
+	ErrorCode err;
+	return err;
+}
+
+Scanner* ScannerEntry::GetScanner() const {
+	return scanner_.get();
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/executor/scanner_entry.h b/src/observer/executor/scanner_entry.h
new file mode 100644
index 000000000..ed5e5c325
--- /dev/null
+++ b/src/observer/executor/scanner_entry.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_SCANNER_ENTRY_H_
+#define TERA_OBSERVER_EXECUTOR_SCANNER_ENTRY_H_
+
+#include <memory>
+#include <string>
+
+#include "common/this_thread.h"
+#include "observer/executor/observer.h"
+#include "tera.h"
+#include "tera_entry.h"
+
+namespace tera {
+namespace observer {
+
+class Scanner;
+
+class ScannerEntry : public TeraEntry {
+public:
+	ScannerEntry();
+	virtual ~ScannerEntry();
+
+	virtual bool StartServer();
+    virtual bool Run();
+    virtual void ShutdownServer();
+
+    virtual ErrorCode Observe();
+    Scanner* GetScanner() const;
+private:
+	std::unique_ptr<Scanner> scanner_;
+};
+
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_EXECUTOR_SCANNER_ENTRY_H_
\ No newline at end of file
diff --git a/src/observer/executor/scanner_impl.cc b/src/observer/executor/scanner_impl.cc
new file mode 100644
index 000000000..f42ba6b05
--- /dev/null
+++ b/src/observer/executor/scanner_impl.cc
@@ -0,0 +1,657 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/executor/scanner_impl.h"
+
+#include <assert.h>
+#include <signal.h>
+#include <sys/time.h>
+
+#include <functional>
+
+#include "gflags/gflags.h"
+
+#include "common/base/string_number.h"
+#include "observer/executor/random_key_selector.h"
+#include "observer/executor/notification.h"
+#include "observer/executor/notification_impl.h"
+#include "observer/rowlocknode/fake_rowlock_client.h"
+#include "sdk/table_impl.h"
+#include "sdk/sdk_utils.h"
+#include "tera.h"
+#include "types.h"
+
+DECLARE_int32(observer_proc_thread_num);
+DECLARE_int32(observer_scanner_thread_num);
+DECLARE_int32(observer_ack_conflict_timeout);
+DECLARE_int64(observer_max_pending_task);
+DECLARE_int64(observer_ack_timeout_time);
+DECLARE_string(flagfile);
+DECLARE_string(rowlock_server_ip);
+DECLARE_string(rowlock_server_port);
+DECLARE_int32(observer_rowlock_client_thread_num);
+DECLARE_bool(mock_rowlock_enable);
+
+namespace tera {
+namespace observer {
+
+ScannerImpl* ScannerImpl::scanner_instance_ = NULL;  // created lazily by GetInstance()
+Scanner* Scanner::GetScanner() { return ScannerImpl::GetInstance(); }
+
+// Lazy (thread-safe C++11 static): the ctor reads gflags and starts a thread, too early before main().
+ScannerImpl* ScannerImpl::GetInstance() {
+    static ScannerImpl* const instance = (scanner_instance_ = new ScannerImpl());
+    return instance;
+}
+
+ScannerImpl::ScannerImpl()
+    : tera_client_(NULL),
+      table_observe_info_(new std::map<std::string, TableObserveInfo>),
+      scan_table_threads_(new common::ThreadPool(FLAGS_observer_scanner_thread_num)),
+      transaction_threads_(new common::ThreadPool(FLAGS_observer_proc_thread_num)),
+      quit_(false),
+      cond_(&quit_mutex_) {
+    profiling_thread_.Start(std::bind(&ScannerImpl::Profiling, this));
+}
+
+ScannerImpl::~ScannerImpl() {
+    Exit();
+
+    scan_table_threads_->Stop(true);
+    transaction_threads_->Stop(true);
+    profiling_thread_.Join();
+
+    MutexLock locker(&table_mutex_);
+    // close table
+    for (auto it = table_observe_info_->begin(); it != table_observe_info_->end(); ++it) {
+        if (it->second.table != NULL) {
+            delete it->second.table;
+        }
+    }
+
+    if (tera_client_ != NULL) {
+        delete tera_client_;
+    }
+
+    for (auto it = observers_.begin(); it != observers_.end(); ++it) {
+    	delete *it;
+    }
+}
+
+ErrorCode ScannerImpl::Observe(const std::string& table_name,
+                               const std::string& column_family,
+                               const std::string& qualifier,
+                               Observer* observer) {
+    // Observe before init
+    tera::ErrorCode err;
+    if (NULL == tera_client_) {
+        LOG(ERROR) << "Init scanner first!";
+        err.SetFailed(ErrorCode::kSystem, "observe before scanner init");
+        return err;
+    }
+
+    Column column = {table_name, column_family, qualifier};
+
+    {
+        MutexLock locker(&table_mutex_);
+        if (!table_observe_info_.unique()) {
+            // Shared_ptr construct a new copy from the original one.
+            // Former requests still reading the original shared_ptr
+            // Write operation executed on the new copy, so as the later requests
+            table_observe_info_.reset(new std::map<std::string, TableObserveInfo>(*table_observe_info_));
+        }
+
+        if ((*table_observe_info_)[table_name].table == NULL) {
+            // init table
+            tera::Table* table = tera_client_->OpenTable(table_name, &err);
+            if (tera::ErrorCode::kOK != err.GetType()) {
+                LOG(ERROR) << "open tera table [" << table_name << "] failed, " << err.ToString();
+                return err;
+            }
+            LOG(INFO) << "open tera table [" << table_name << "] succ";
+
+            // build map<table_name, table>
+            (*table_observe_info_)[table_name].table = table;
+            (*table_observe_info_)[table_name].type = GetTableTransactionType(table);
+        }
+
+        if (!CheckTransactionTypeLegalForTable(observer->GetTransactionType(),
+                (*table_observe_info_)[table_name].type)) {
+            LOG(ERROR) << "Transaction type does not match table. table_name: " << table_name
+                << " type: " << (*table_observe_info_)[table_name].type << "  , observer name: " <<
+                observer->GetObserverName() << " type: " << observer->GetTransactionType();
+            err.SetFailed(ErrorCode::kSystem, "Transaction type does not match table");
+            return err;
+        }
+
+        auto it = (*table_observe_info_)[table_name].observe_columns[column].insert(observer);
+        if (!it.second) {
+            LOG(ERROR) << "Observer " << observer->GetObserverName() << " observe " << table_name
+                << ":" << column_family << ":" << qualifier << " more than once!";
+            err.SetFailed(ErrorCode::kSystem, "the same observer observe the same column more than once");
+            return err;
+        }
+        observers_.insert(observer);
+    }
+
+    err = key_selector_->Observe(table_name);
+    LOG(INFO) << "Observer start. table: " << table_name << "  cf:qu " << column_family << ":" <<
+        qualifier << "  observer: " << observer->GetObserverName();
+
+    return err;
+}
+
+bool ScannerImpl::Init() {
+    tera::ErrorCode err;
+    if (NULL == tera_client_) {
+        tera_client_ = tera::Client::NewClient(FLAGS_flagfile, &err);
+
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "init tera client [" << FLAGS_flagfile << "] failed, " << err.ToString();
+            return false;
+        }
+    }
+
+    // init key_selector_
+    // different selector started by different flags
+    key_selector_.reset(new RandomKeySelector());
+
+    return true;
+}
+
+bool ScannerImpl::Start() {
+    for (int32_t idx = 0; idx < FLAGS_observer_scanner_thread_num; ++idx) {
+        scan_table_threads_->AddTask(std::bind(&ScannerImpl::ScanTable, this));
+    }
+    return true;
+}
+
+void ScannerImpl::Exit() {
+    // the scope of quit_mutex only covers cond_ broadcast
+    MutexLock locker(&quit_mutex_);
+    quit_ = true;
+    cond_.Broadcast();
+}
+
+tera::Client* ScannerImpl::GetTeraClient() const {
+    return tera_client_;
+}
+
+void ScannerImpl::ScanTable() {
+    std::string start_key;
+    std::string table_name;
+    std::set<Column> columns;
+    tera::Table* table = NULL;
+
+    // table and start key will be refreshed.
+    while (true) {
+        {
+            MutexLock locker(&quit_mutex_);
+            if (quit_) {
+	            break;
+	        }
+            cond_.TimeWaitInUs(kObserverWaitTime);
+        }
+
+        if (key_selector_->SelectStart(&table_name, &start_key)) {
+            GetObserveColumns(table_name, &columns);
+        } else {
+            continue;
+        }
+
+        table = GetTable(table_name);
+        if (DoScanTable(table, columns, start_key, "")) {
+            DoScanTable(table, columns, "", start_key);
+        }
+    }
+}
+
+// Scans the notify column family of |table| between start_key and end_key and hands every
+// observed cell to DoReadValue(), taking one row lock per row. Returns true only when the
+// whole range was scanned; false on NULL table, scan error, row-lock collision or quit.
+bool ScannerImpl::DoScanTable(tera::Table* table,
+                              const std::set<Column>& columns,
+                              const std::string& start_key,
+                              const std::string& end_key) {
+    if (table == NULL) {
+        return false;
+    }
+
+    LOG(INFO) << "Start scan table. Table name: [" << table->GetName()
+        << "]. Start key: [" << start_key << "]";
+
+    tera::ScanDescriptor desc(start_key);
+    desc.SetEnd(end_key);
+    // Notify stores in single lg
+    desc.AddColumnFamily(kNotifyColumnFamily);
+    tera::ErrorCode err;
+    std::unique_ptr<tera::ResultStream> result_stream(table->Scan(desc, &err));
+    if (tera::ErrorCode::kOK != err.GetType()) {
+        LOG(ERROR) << "table scan failed, " << err.ToString();
+        return false;
+    }
+
+    // Done() is also true for an empty range; it is a failure only when an error was reported.
+    if (result_stream->Done(&err) && tera::ErrorCode::kOK != err.GetType()) {
+        LOG(ERROR) << " ERR " << err.GetReason();
+        return false;
+    }
+
+    // The table's type is set once when the table is first observed: look it up once, not per cell.
+    TransactionType type;
+    {
+        MutexLock locker(&table_mutex_);
+        type = (*table_observe_info_)[table->GetName()].type;
+    }
+
+    bool finished = false;
+    std::string rowkey;
+    std::vector<Column> vec_col;
+    while (NextRow(columns, result_stream.get(), table->GetName(), &finished, &rowkey, &vec_col)) {
+        // lock row
+        if (!TryLockRow(table->GetName(), rowkey)) {
+            // collision
+            LOG(INFO) <<"[rowlock failed] table=" << table->GetName() << " row=" << rowkey;
+            return false;
+        }
+        VLOG(12) <<"[time] Transaction start. [row] " << rowkey;
+
+        // automatic unlock
+        std::shared_ptr<AutoRowUnlocker> unlocker(
+            new AutoRowUnlocker(table->GetName(), rowkey));
+
+        for (uint32_t i = 0; i < vec_col.size(); ++i ) {
+            tera::Transaction* t = NULL;
+            switch (type) {
+                case kGlobalTransaction:
+                    t = tera_client_->NewGlobalTransaction();
+                    if (t == NULL) {
+                        LOG(ERROR) << "NewGlobalTransaction failed. Notify cell ignored. table: " << table->GetName()
+                        << " row: " << rowkey << " family: " << vec_col[i].family
+                        << " qualifier: " << vec_col[i].qualifier;
+                        continue;
+                    }
+                    break;
+                case kSingleRowTransaction:
+                    t = table->StartRowTransaction(rowkey);
+                    if (t == NULL) {
+                        LOG(ERROR) << "StartRowTransaction failed. Notify cell ignored. table: " << table->GetName()
+                        << " row: " << rowkey << " family: " << vec_col[i].family
+                        << " qualifier: " << vec_col[i].qualifier;
+                        continue;
+                    }
+                    break;
+                default:
+                    break;
+            }
+            std::shared_ptr<NotifyCell> notify_cell(new NotifyCell(t));
+            notify_cell->table = table;
+            notify_cell->row = rowkey;
+            notify_cell->observed_column = vec_col[i];
+            notify_cell->unlocker = unlocker;
+            DoReadValue(notify_cell);
+        }
+
+        MutexLock locker(&quit_mutex_);
+        if (quit_) {
+            return false;
+        }
+    }
+    return finished;
+}
+
+// Reads the next row of |result_stream|: stores its key in |row| and its observed columns
+// (possibly none) in |vec_col|, then returns true. Returns false at end of scan (setting
+// |*finished| to true), on scan error, or when quit is requested while waiting for workers.
+bool ScannerImpl::NextRow(const std::set<Column>& columns, tera::ResultStream* result_stream,
+                          const std::string& table_name, bool* finished,
+                          std::string* row, std::vector<Column>* vec_col) {
+    tera::ErrorCode err;
+    // Done() may return true for a failed scan as well as a completed one, so check err
+    // before reporting "finished"; otherwise an aborted scan looks like a complete one.
+    const bool done = result_stream->Done(&err);
+    if (tera::ErrorCode::kOK != err.GetType()) {
+        LOG(ERROR) << "scanning failed" << err.ToString();
+        return false;
+    }
+    if (done) {
+        *finished = true;
+        return false;
+    }
+
+    vec_col->clear();
+    *row = result_stream->RowName();
+
+    // scan cell
+    while (!result_stream->Done(&err) && result_stream->RowName() == *row) {
+        // back-pressure: wait until the transaction pool has drained enough
+        while (transaction_threads_->PendingNum() > FLAGS_observer_max_pending_task) {
+            VLOG(12) << "transaction_threads pending: " << transaction_threads_->PendingNum();
+            MutexLock locker(&quit_mutex_);
+            if (quit_) {
+                return false;
+            }
+            cond_.TimeWaitInUs(kObserverWaitTime);
+        }
+        std::string ob_cf;
+        std::string ob_qu;
+        const std::string notify_qualifier = result_stream->Qualifier();
+        result_stream->Next();  // advance now; every path below moves on to the next cell
+        if (!ParseNotifyQualifier(notify_qualifier, &ob_cf, &ob_qu)) {
+            LOG(WARNING) << "parse notify qualifier failed: " << notify_qualifier;
+            continue;
+        }
+        Column ob_col = {table_name, ob_cf, ob_qu};
+        if (columns.end() == columns.find(ob_col)) {
+            LOG(WARNING) << "miss observed column, table_name" << table_name <<
+                " cf=" << ob_cf << " qu=" << ob_qu;
+            continue;
+        }
+        vec_col->push_back(ob_col);
+    }
+    return true;
+}
+
+// Splits a notify qualifier of the form "<family>:<qualifier>" at its first ':'.
+// Example: "C:url" yields family "C" and qualifier "url".
+//
+// Returns false, leaving both outputs untouched, when there is no ':' or when either
+// side of it is empty; the missing-':' case is additionally logged as an error.
+bool ScannerImpl::ParseNotifyQualifier(const std::string& notify_qualifier,
+                                       std::string* data_family,
+                                       std::string* data_qualifier) {
+    // Locate the separator between family and qualifier.
+    const std::size_t colon = notify_qualifier.find(':');
+    if (std::string::npos == colon) {
+        LOG(ERROR) << "Parse notify qualifier error: " << notify_qualifier;
+        return false;
+    }
+
+    // Everything before the first ':' is the family; everything after it is the
+    // qualifier, which may itself contain further ':' characters.
+    const std::string family_part = notify_qualifier.substr(0, colon);
+    const std::size_t qualifier_begin = colon + 1;
+    const std::string qualifier_part = notify_qualifier.substr(qualifier_begin);
+
+    // Reject an empty piece on either side, e.g. ":url" or "C:".
+    if (family_part.empty()) {
+        return false;
+    }
+    if (qualifier_part.empty()) {
+        return false;
+    }
+
+    // Publish the outputs only after the whole qualifier has been validated.
+    *data_family = family_part;
+    *data_qualifier = qualifier_part;
+
+    return true;
+}
+
+// Reads the value of the notified cell and queues one OnNotify task per observer registered
+// for its column. Returns false when the read fails, when the table or column has no
+// registered observer, or when the global-transaction ack check reports a conflict.
+bool ScannerImpl::DoReadValue(std::shared_ptr<NotifyCell> notify_cell) {
+    const Column& column = notify_cell->observed_column;
+    VLOG(12) <<"[time] do read value start. [row] " << notify_cell->row;
+    std::unique_ptr<tera::RowReader> row_reader(notify_cell->table->NewRowReader(notify_cell->row));
+    assert(row_reader.get() != NULL);
+    row_reader->AddColumn(column.family, column.qualifier);
+    // Read through the transaction when the cell carries one, otherwise straight from the table.
+    if (notify_cell->transaction != NULL) {
+        notify_cell->transaction->Get(row_reader.get());
+    } else {
+        notify_cell->table->Get(row_reader.get());
+    }
+    VLOG(12) <<"[time] do read value finish. [row] " << notify_cell->row;
+    if (tera::ErrorCode::kOK != row_reader->GetError().GetType()) {
+        LOG(WARNING) << "[read failed] table=" << notify_cell->table->GetName() << " cf=" << column.family <<
+            " qu=" << column.qualifier << " row=" << notify_cell->row <<
+            " err=" << row_reader->GetError().GetType() << row_reader->GetError().GetReason();
+        return false;
+    }
+    notify_cell->value = row_reader->Value();
+    notify_cell->timestamp = row_reader->Timestamp();
+
+    // Grab a reference to the current registration map; the lookups below run unlocked.
+    std::shared_ptr<std::map<std::string, TableObserveInfo>> table_observe_info_read_copy;
+    {
+        MutexLock locker(&table_mutex_);
+        table_observe_info_read_copy = table_observe_info_;
+    }
+    auto it = table_observe_info_read_copy->find(column.table_name);
+    if (it == table_observe_info_read_copy->end()) {
+        LOG(WARNING) << "table not found: " << column.table_name;
+        return false;
+    }
+    // Resolve the column once with find(); operator[] is a mutating call on the shared map.
+    auto column_it = it->second.observe_columns.find(column);
+    if (column_it == it->second.observe_columns.end()) {
+        LOG(WARNING) << "column not found. cf: " << column.family
+            << "  qu: " << column.qualifier;
+        return false;
+    }
+    const std::set<Observer*>& observer_set = column_it->second;
+    if (observer_set.empty()) {
+        LOG(WARNING) << "no match observers, table=" << column.table_name <<
+            " cf=" << column.family << " qu=" << column.qualifier;
+        return false;
+    }
+    // only gtxn check ack
+    if ((*observer_set.begin())->GetTransactionType() == kGlobalTransaction
+        && !CheckConflictOnAckColumn(notify_cell, observer_set)) {
+        LOG(WARNING) <<  "Ack failed ! row=" << notify_cell->row << " cf=" << column.family <<
+            " qu=" << column.qualifier;
+        return false;
+    }
+    // every column may have more than one observer; each one gets its own task
+    for (auto observer_it = observer_set.begin(); observer_it != observer_set.end(); ++observer_it) {
+        // Capture the Observer* itself, not the set iterator: the task can outlive this call
+        // and the local map reference, after which an iterator into the set may dangle.
+        Observer* observer = *observer_it;
+        transaction_threads_->AddTask( [=] (int64_t) {
+            total_counter_.Inc();
+            std::unique_ptr<Notification> notification(GetNotification(notify_cell->transaction));
+            tera::ErrorCode err = observer->OnNotify(notify_cell->transaction, tera_client_, notify_cell->observed_column.table_name,
+                                      notify_cell->observed_column.family, notify_cell->observed_column.qualifier,
+                                      notify_cell->row, notify_cell->value, notify_cell->timestamp, notification.get());
+            if (err.GetType() != tera::ErrorCode::kOK) {
+                LOG(WARNING) << "OnNotify failed! reason: " << err.GetReason();
+                fail_counter_.Inc();
+            }
+        });
+    }
+    return true;
+}
+
+// Fills *columns with every column that has observers registered on table_name;
+// *columns is left empty when the table is unknown.
+void ScannerImpl::GetObserveColumns(const std::string& table_name, std::set<Column>* columns) {
+    columns->clear();
+    std::shared_ptr<std::map<std::string, TableObserveInfo>> table_observe_info_read_copy;
+    {
+        MutexLock locker(&table_mutex_);
+        table_observe_info_read_copy = table_observe_info_;
+    }
+    // find() rather than operator[]: a miss must not insert into the shared map unlocked.
+    auto table_it = table_observe_info_read_copy->find(table_name);
+    if (table_it == table_observe_info_read_copy->end()) return;
+    for (const auto& entry : table_it->second.observe_columns) columns->insert(entry.first);
+}
+
+// Returns the table handle stored for table_name, or NULL when the table is unknown.
+tera::Table* ScannerImpl::GetTable(const std::string table_name) {
+    // find() under the lock instead of operator[]: a miss must never insert an entry.
+    MutexLock locker(&table_mutex_);
+    auto entry = table_observe_info_->find(table_name);
+    if (entry == table_observe_info_->end()) return NULL;
+    return entry->second.table;
+}
+
+// Reporting loop: waits on cond_ for up to kObserverWaitTime, logs the notification counters
+// and clears them. Returns as soon as quit_ is seen set at the start of a cycle.
+void ScannerImpl::Profiling() {
+    for (;;) {
+        {
+            MutexLock locker(&quit_mutex_);
+            if (quit_) return;
+            cond_.TimeWaitInUs(kObserverWaitTime);
+        }
+        LOG(INFO) << "[Observer Profiling Info]  total: " << total_counter_.Get()
+                  << " failed: " << fail_counter_.Get()
+                  << "  transaction pending: " << transaction_threads_->PendingNum();
+        total_counter_.Clear();
+        fail_counter_.Clear();
+    }
+}
+
+// Ack bookkeeping for the observers of one notified cell. Reads every observer's ack
+// column for the row, reports a conflict when a recent ack already covers this
+// notification, and otherwise writes fresh acks. Returns true when no conflict was found.
+bool ScannerImpl::CheckConflictOnAckColumn(std::shared_ptr<NotifyCell> notify_cell,
+                                           const std::set<Observer*>& observers) {
+    VLOG(12) <<"[time] Check ACK start. [cf:qu] " << notify_cell->observed_column.family
+             << notify_cell->observed_column.qualifier;
+    bool is_collision = false;
+    std::vector<std::string> ack_qualifier_list;
+    std::string ack_qualifier_prefix = GetAckQualifierPrefix(notify_cell->observed_column.family,
+                                                             notify_cell->observed_column.qualifier);
+
+    // the acks are read and written through one row transaction on the notified row
+    std::unique_ptr<tera::Transaction> row_transaction(notify_cell->table->StartRowTransaction(notify_cell->row));
+
+    // read Acks: one ack column per observer, all in the observed column family
+    std::unique_ptr<tera::RowReader> row_reader(notify_cell->table->NewRowReader(notify_cell->row));
+    for (auto it : observers) {
+        std::string ack_qualifier = GetAckQualifier(ack_qualifier_prefix, it->GetObserverName());
+        ack_qualifier_list.push_back(ack_qualifier);
+        row_reader->AddColumn(notify_cell->observed_column.family, ack_qualifier);
+    }
+    row_transaction->Get(row_reader.get());
+    if (tera::ErrorCode::kOK == row_reader->GetError().GetType()) {
+        while (!row_reader->Done()) {
+            int64_t latest_observer_start_ts = 0;
+            if (!StringToNumber(row_reader->Value(), &latest_observer_start_ts)) {
+                LOG(ERROR) << "Convert string to timestamp failed! String: " << row_reader->Value() <<
+                    " row=" << notify_cell->row << " cf=" << notify_cell->observed_column.family <<
+                    " qu=" << notify_cell->observed_column.qualifier;
+                is_collision = true;  // an unparsable ack is treated as a conflict
+                break;
+            }
+            // conflict: the ack is at least as new as the notified cell AND this transaction
+            // started less than FLAGS_observer_ack_conflict_timeout after the acking one
+            if (latest_observer_start_ts >= notify_cell->timestamp &&
+                notify_cell->transaction->GetStartTimestamp() - latest_observer_start_ts
+                < FLAGS_observer_ack_conflict_timeout) {
+                // too recent: treat as a collision and skip this notification
+                is_collision = true;
+                LOG(INFO) << "own collision. row=" << notify_cell->row <<
+                    " cf=" << notify_cell->observed_column.family << " qu=" <<
+                    notify_cell->observed_column.qualifier <<
+                    ", latest observer start_ts=" << latest_observer_start_ts <<
+                    ", observer start_ts=" << notify_cell->transaction->GetStartTimestamp() <<
+                    ", data commit_ts=" << notify_cell->timestamp;
+                break;
+            }
+            row_reader->Next();
+        }
+    } else {  // a failed ack read is only logged; the acks are still written below
+        LOG(INFO) << "read Acks failed, err=" << row_reader->GetError().GetReason() <<
+            " row=" << notify_cell->row << " cf=" << notify_cell->observed_column.family <<
+            " qu=" << notify_cell->observed_column.qualifier;
+    }
+
+    if (!is_collision) {
+        // set Acks: every ack column receives this transaction's start timestamp
+        std::unique_ptr<tera::RowMutation> mutation(notify_cell->table->NewRowMutation(notify_cell->row));
+        for (size_t idx = 0; idx < ack_qualifier_list.size(); ++idx) {
+            mutation->Put(notify_cell->observed_column.family, ack_qualifier_list[idx],
+                std::to_string(notify_cell->transaction->GetStartTimestamp()));
+        }
+        row_transaction->ApplyMutation(mutation.get());
+        notify_cell->table->CommitRowTransaction(row_transaction.get());
+        if (row_transaction->GetError().GetType() != tera::ErrorCode::kOK) {
+            LOG(INFO) << "write Ack failed, row=" << notify_cell->row << " err=" <<
+                row_transaction->GetError().GetReason() << " cf=" <<
+                notify_cell->observed_column.family << " qu=" <<
+                notify_cell->observed_column.qualifier;
+            is_collision = true;
+        }
+    }
+    VLOG(12) <<"[time] Check ACK finish. [cf:qu] " << notify_cell->observed_column.family
+             << notify_cell->observed_column.qualifier;
+
+    return !is_collision;
+}
+
+std::string ScannerImpl::GetAckQualifierPrefix(const std::string& family,
+                                               const std::string& qualifier) const {
+    return family + ":" + qualifier;  // e.g. family "C", qualifier "url" -> "C:url"
+}
+
+std::string ScannerImpl::GetAckQualifier(const std::string& prefix,
+                                         const std::string& observer_name) const {
+    return prefix + "+ack_" + observer_name;  // e.g. "C:url" + "DemoObserver" -> "C:url+ack_DemoObserver"
+}
+
+// Attempts to take the row lock for (table_name, row) through a rowlock client created for
+// this call; a FakeRowlockClient is used when FLAGS_mock_rowlock_enable is set.
+// Returns true only when the TryLock call succeeds and the response reports kLockSucc.
+bool ScannerImpl::TryLockRow(const std::string& table_name,
+                             const std::string& row) const {
+    VLOG(12) << "[time] trylock " << table_name << " " << row;
+    RowlockRequest lock_request;
+    lock_request.set_table_name(table_name);
+    lock_request.set_row(row);
+    RowlockResponse lock_response;
+
+    std::shared_ptr<RowlockClient> client;
+    if (FLAGS_mock_rowlock_enable) {
+        client.reset(new FakeRowlockClient());
+    } else {
+        client.reset(new RowlockClient());
+    }
+    if (!client->TryLock(&lock_request, &lock_response)) {
+        LOG(ERROR) << "TryLock rpc fail, row: " << row;
+        return false;
+    }
+    if (kLockSucc != lock_response.lock_status()) {
+        LOG(INFO) << "Lock row fail, row: " << row;
+        return false;
+    }
+    VLOG(12) << "[time] trylock finish " << table_name << " " << row;
+    return true;
+}
+
+// Reports whether transaction type `type` is acceptable for a table of type `table_type`:
+// equal types always match, and kNoneTransaction is additionally accepted on a
+// kSingleRowTransaction table. Every other combination is rejected.
+bool ScannerImpl::CheckTransactionTypeLegalForTable(TransactionType type,
+                                                    TransactionType table_type) {
+    const bool same_type = (type == table_type);
+    // The only permitted mismatch: no transaction on a single-row-transaction table.
+    const bool none_on_single_row =
+        (kNoneTransaction == type) && (kSingleRowTransaction == table_type);
+
+    return same_type || none_on_single_row;
+}
+
+// Classifies a table from its schema as kGlobalTransaction, kSingleRowTransaction or kNoneTransaction.
+TransactionType ScannerImpl::GetTableTransactionType(tera::Table* table) {
+    tera::ErrorCode err;  // NOTE(review): err and both pointers below are used unchecked - confirm they cannot fail
+    TableImpl* table_impl(dynamic_cast<ClientImpl*>(tera_client_)->OpenTableInternal(table->GetName(), &err));
+    TableSchema schema = table_impl->GetTableSchema();
+    if (IsTransactionTable(schema)) {
+        std::set<std::string> gtxn_cfs;
+        FindGlobalTransactionCfs(schema, &gtxn_cfs);
+        if (gtxn_cfs.size() > 0) {  // at least one global-transaction column family
+            return kGlobalTransaction;
+        }
+        return kSingleRowTransaction;
+    }
+    return kNoneTransaction;
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/executor/scanner_impl.h b/src/observer/executor/scanner_impl.h
new file mode 100644
index 000000000..833ff3fa4
--- /dev/null
+++ b/src/observer/executor/scanner_impl.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_SCANNER_IMPL_H_
+#define TERA_OBSERVER_EXECUTOR_SCANNER_IMPL_H_
+
+#include <mutex>
+#include <pthread.h>
+
+#include "common/counter.h"
+#include "common/mutex.h"
+#include "common/thread_pool.h"
+#include "common/thread.h"
+#include "common/this_thread.h"
+#include "observer/executor/notify_cell.h"
+#include "observer/executor/observer.h"
+#include "observer/executor/scanner.h"
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+
+class Observer;
+class KeySelector;
+
+// Scanner implementation: reads notified cells and dispatches them to the registered observers.
+class ScannerImpl : public Scanner {
+private:
+    struct TableObserveInfo {  // per-table registration record
+        std::map<Column, std::set<Observer*>> observe_columns;
+        tera::Table* table;
+        TransactionType type;
+    };
+public:
+    ScannerImpl();
+    virtual ~ScannerImpl();
+
+    virtual ErrorCode Observe(const std::string& table_name,
+                              const std::string& column_family,
+                              const std::string& qualifier,
+                              Observer* observer);
+
+    virtual bool Init();
+
+    virtual bool Start();
+
+    virtual void Exit();
+    
+    tera::Client* GetTeraClient() const;
+
+    static ScannerImpl* GetInstance();
+
+private:
+    void ScanTable();
+
+    bool DoScanTable(tera::Table* table,
+                     const std::set<Column>& column_set,
+                     const std::string& start_key,
+                     const std::string& end_key);
+    // Reads the notified cell's value and queues one OnNotify task per registered observer.
+    bool DoReadValue(std::shared_ptr<NotifyCell> notify_cell);
+    // Splits "<family>:<qualifier>" at the first ':'; returns false when malformed.
+    bool ParseNotifyQualifier(const std::string& notify_qualifier,
+                              std::string* data_family,
+                              std::string* data_qualfier);
+    // Fills column_set with the columns that have observers registered on table_name.
+    void GetObserveColumns(const std::string& table_name, 
+                           std::set<Column>* column_set);
+    // Returns the table handle stored for table_name.
+    tera::Table* GetTable(const std::string table_name);
+    // Collects the observed columns of the next row of result_stream.
+    bool NextRow(const std::set<Column>& columns, tera::ResultStream* result_stream, 
+                 const std::string& table_name, bool* finished, 
+                 std::string* row, std::vector<Column>* vec_col);
+    // Logs and clears total_counter_ / fail_counter_ in a loop until quit_ is set.
+    void Profiling();
+    // Ack bookkeeping for a notified cell; returns true when no conflict was found.
+    bool CheckConflictOnAckColumn(std::shared_ptr<NotifyCell> notify_cell, 
+                                  const std::set<Observer*>& observers);
+    std::string GetAckQualifierPrefix(const std::string& family, const std::string& qualifier) const;
+    std::string GetAckQualifier(const std::string& prefix, const std::string& observer_name) const;
+    bool TryLockRow(const std::string& table_name, 
+                    const std::string& row) const;
+    // Transaction-type helpers: compatibility check and classification of a table.
+    bool CheckTransactionTypeLegalForTable(TransactionType type, TransactionType table_type);
+    TransactionType GetTableTransactionType(tera::Table* table);
+
+private:
+    mutable Mutex table_mutex_;  // held while table_observe_info_ is copied
+    tera::Client* tera_client_;
+    std::unique_ptr<KeySelector> key_selector_;
+
+    // table name -> registration record (table pointer, observers per column, transaction type)
+    std::shared_ptr<std::map<std::string, TableObserveInfo>> table_observe_info_;
+    // Unique addresses of the user-defined observers;
+    // they are released when the scanner is destroyed.
+    std::set<Observer*> observers_;
+
+    std::unique_ptr<common::ThreadPool> scan_table_threads_;
+    std::unique_ptr<common::ThreadPool> transaction_threads_;
+
+    // Shutdown flag, read under quit_mutex_; cond_ is used for the timed waits.
+    bool quit_; 
+    mutable Mutex quit_mutex_;
+    common::CondVar cond_;
+    // Counters reported by Profiling(): notifications run and notifications failed.
+    common::Thread profiling_thread_;
+    Counter total_counter_;
+    Counter fail_counter_;
+
+    static ScannerImpl* scanner_instance_;
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_EXECUTOR_SCANNER_IMPL_H_
diff --git a/src/observer/observer_demo/demo_entry.cc b/src/observer/observer_demo/demo_entry.cc
new file mode 100644
index 000000000..7d6e3a361
--- /dev/null
+++ b/src/observer/observer_demo/demo_entry.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/observer_demo/demo_entry.h"
+
+#include "observer/executor/observer.h"
+#include "observer/executor/scanner.h"
+#include "observer/observer_demo/demo_observer.h"
+#include "tera.h"
+
+std::string GetTeraEntryName() {
+    return "DemoEntry";  // the demo main() falls back to this name as its log prefix
+}
+
+tera::TeraEntry* GetTeraEntry() {
+    return new tera::observer::DemoEntry();  // heap-allocated; the demo main() wraps it in a scoped_ptr
+}
+
+namespace tera {
+namespace observer {
+
+DemoEntry::DemoEntry() {}  // DemoEntry declares no data members of its own
+
+// Creates the demo observers and registers each of them with the scanner for its
+// (table, family, qualifier); stops at, and returns, the first registration error.
+ErrorCode DemoEntry::Observe() {
+    // new an observer ptr and do not delete it
+    Observer* demo = new DemoObserver();
+    Observer* parser = new ParseObserver();
+    Observer* single_row_observer = new SingleRowObserver();
+    Observer* none_txn_observer = new NoneTransactionObserver();
+    // Registrations are attempted strictly in the order listed below.
+    struct Registration {
+        const char* table;
+        const char* family;
+        const char* qualifier;
+        Observer* observer;
+    };
+    const Registration registrations[] = {
+        {"observer_test_table", "Data", "Page", demo},
+        {"observer_test_table", "Data", "Link", demo},
+        {"observer_test_table", "Data", "Link", parser},
+        {"single_row_test_table", "Data", "Link", single_row_observer},
+        {"none_txn_test_table", "Data", "Link", none_txn_observer},
+    };
+    Scanner* scanner = GetScanner();
+    ErrorCode err;
+    for (const Registration& entry : registrations) {
+        err = scanner->Observe(entry.table, entry.family, entry.qualifier, entry.observer);
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            break;
+        }
+    }
+    return err;
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/observer_demo/demo_entry.h b/src/observer/observer_demo/demo_entry.h
new file mode 100644
index 000000000..5f01ec840
--- /dev/null
+++ b/src/observer/observer_demo/demo_entry.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_OBSERVER_DEMO_DEMO_ENTRY_H_
+#define TERA_OBSERVER_OBSERVER_DEMO_DEMO_ENTRY_H_
+
+#include <memory>
+#include <string>
+
+#include "observer/executor/scanner_entry.h"
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+
+class DemoEntry : public ScannerEntry {  // demo entry created by GetTeraEntry()
+public:
+	DemoEntry();
+	virtual ~DemoEntry() {}
+    // Registers the demo observers with the scanner; returns the first error encountered.
+    virtual ErrorCode Observe();
+};
+
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_OBSERVER_DEMO_DEMO_ENTRY_H_
+
diff --git a/src/observer/observer_demo/demo_observer.cc b/src/observer/observer_demo/demo_observer.cc
new file mode 100644
index 000000000..07048af92
--- /dev/null
+++ b/src/observer/observer_demo/demo_observer.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/observer_demo/demo_observer.h"
+
+#include <glog/logging.h>
+
+namespace tera {
+namespace observer {
+
+// Demo handler: logs the notification, writes Data:ForwordIndex for the notified row,
+// acks the notification and commits t. Returns the OpenTable error if the table cannot be opened.
+ErrorCode DemoObserver::OnNotify(tera::Transaction* t,
+                                 tera::Client* client,
+                                 const std::string& table_name,
+                                 const std::string& family,
+                                 const std::string& qualifier,
+                                 const std::string& row,
+                                 const std::string& value,
+                                 int64_t timestamp,
+                                 Notification* notification) {
+    VLOG(12) <<"[time] OnNotify start. [row] " << row;
+    LOG(INFO) << "[Notify DemoObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+    tera::ErrorCode err;
+    tera::Table* table = client->OpenTable(table_name, &err);
+    if (table == NULL) return err;  // nothing can be written or acked without the table
+
+    // write ForwordIndex column
+    tera::RowMutation* mutation = table->NewRowMutation(row);
+    mutation->Put("Data", "ForwordIndex", "FIValue_" + row);
+    t->ApplyMutation(mutation);
+    notification->Ack(table, row, family, qualifier);
+    err = t->Commit();
+    delete mutation;
+    VLOG(12) <<"[time] OnNotify finish. [row] " << row;
+    return err;
+}
+
+std::string DemoObserver::GetObserverName() const {
+	return "DemoObserver";  // ScannerImpl appends this name when it builds the ack qualifier
+}
+
+TransactionType DemoObserver::GetTransactionType() const {
+    return kGlobalTransaction;  // the type for which ScannerImpl runs its ack-conflict check
+}
+
+// Demo handler that writes no data: it logs the notification, acks it and commits t.
+ErrorCode ParseObserver::OnNotify(tera::Transaction* t,
+                                 tera::Client* client,
+                                 const std::string& table_name,
+                                 const std::string& family,
+                                 const std::string& qualifier,
+                                 const std::string& row,
+                                 const std::string& value,
+                                 int64_t timestamp,
+                                 Notification* notification) {
+    LOG(INFO) << "[Notify ParseObserver] table:family:qualifer="
+              << table_name << ":" << family << ":" << qualifier
+              << " row=" << row
+              << " value=" << value << " timestamp=" << timestamp;
+
+    tera::ErrorCode open_status;
+    tera::Table* table = client->OpenTable(table_name, &open_status);
+    // do nothing with the data: only ack, then hand back the commit result
+    notification->Ack(table, row, family, qualifier);
+    return t->Commit();
+}
+
+std::string ParseObserver::GetObserverName() const {
+    return "ParseObserver";  // ScannerImpl appends this name when it builds the ack qualifier
+}
+
+TransactionType ParseObserver::GetTransactionType() const {
+    return kGlobalTransaction;  // the type for which ScannerImpl runs its ack-conflict check
+}
+
+// Demo handler for single-row transactions: writes <family>:another_qu on the notified row,
+// acks twice and commits t. Returns the OpenTable error if the notified table cannot be opened.
+ErrorCode SingleRowObserver::OnNotify(tera::Transaction* t,
+                                 tera::Client* client,
+                                 const std::string& table_name,
+                                 const std::string& family,
+                                 const std::string& qualifier,
+                                 const std::string& row,
+                                 const std::string& value,
+                                 int64_t timestamp,
+                                 Notification* notification) {
+    LOG(INFO) << "[Notify SingleRowObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+    tera::ErrorCode err;
+    tera::Table* table = client->OpenTable(table_name, &err);
+    if (table == NULL) return err;  // nothing can be written or acked without the table
+    // single row txn
+    tera::RowMutation* mutation = table->NewRowMutation(row);
+    mutation->Put(family, "another_qu", "value");
+    t->ApplyMutation(mutation);
+    notification->Ack(table, row, family, qualifier);
+    // NOTE(review): another_table is handed to Ack() unchecked, as before - confirm Ack() tolerates NULL
+    tera::Table* another_table = client->OpenTable("another_table", &err);
+    notification->Ack(another_table, "somerow", "family", "qualifier");
+    err = t->Commit();
+    delete mutation;
+    return err;
+}
+
+std::string SingleRowObserver::GetObserverName() const {
+    return "SingleRowObserver";  // ScannerImpl appends this name when it builds the ack qualifier
+}
+
+TransactionType SingleRowObserver::GetTransactionType() const {
+    return kSingleRowTransaction;  // ScannerImpl's ack-conflict check is gated on kGlobalTransaction only
+}
+
+// Demo handler that does not use t: it logs, acks the notified cell and notifies another table.
+ErrorCode NoneTransactionObserver::OnNotify(tera::Transaction* t,
+                                 tera::Client* client,
+                                 const std::string& table_name,
+                                 const std::string& family,
+                                 const std::string& qualifier,
+                                 const std::string& row,
+                                 const std::string& value,
+                                 int64_t timestamp,
+                                 Notification* notification) {
+    LOG(INFO) << "[Notify NoneTransactionObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+    tera::ErrorCode err;
+    tera::Table* table = client->OpenTable(table_name, &err);
+
+    // do something
+    // kNoneTransaction ack
+    notification->Ack(table, row, family, qualifier);
+
+    // kNoneTransaction notify
+    tera::Table* notify_table = client->OpenTable("notify_table", &err);
+    notification->Notify(notify_table, "notify_row", "family", "qualifier");
+    return err;  // presumably the status of the last OpenTable call only - TODO confirm
+}
+
+std::string NoneTransactionObserver::GetObserverName() const {
+    return "NoneTransactionObserver";  // ScannerImpl appends this name when it builds the ack qualifier
+}
+
+TransactionType NoneTransactionObserver::GetTransactionType() const {
+    return kNoneTransaction;  // ScannerImpl's ack-conflict check is gated on kGlobalTransaction only
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/observer_demo/demo_observer.h b/src/observer/observer_demo/demo_observer.h
new file mode 100644
index 000000000..201feebf2
--- /dev/null
+++ b/src/observer/observer_demo/demo_observer.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_OBSERVER_DEMO_DEMO_OBSERVER_H_
+#define TERA_OBSERVER_OBSERVER_DEMO_DEMO_OBSERVER_H_
+
+#include "observer/executor/observer.h"
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+
+class DemoObserver : public tera::observer::Observer {  // demo observer for a global-transaction table
+public:
+    DemoObserver() {}
+    virtual ~DemoObserver() {}
+    virtual ErrorCode OnNotify(tera::Transaction* t,  // logs, writes Data:ForwordIndex, acks, commits t
+                              tera::Client* client,
+                              const std::string& table_name,
+                              const std::string& family,
+                              const std::string& qualifier,
+                              const std::string& row,
+                              const std::string& value,
+                              int64_t timestamp,
+                              Notification* notification);
+    virtual std::string GetObserverName() const;  // returns "DemoObserver"
+    virtual TransactionType GetTransactionType() const;  // returns kGlobalTransaction
+};
+
+class ParseObserver : public tera::observer::Observer {  // demo observer that only acks and commits
+public:
+    ParseObserver() {}
+    virtual ~ParseObserver() {}
+    virtual ErrorCode OnNotify(tera::Transaction* t,  // logs, acks and commits t without writing data
+                              tera::Client* client,
+                              const std::string& table_name,
+                              const std::string& family,
+                              const std::string& qualifier,
+                              const std::string& row,
+                              const std::string& value,
+                              int64_t timestamp,
+                              Notification* notification);
+    virtual std::string GetObserverName() const;  // returns "ParseObserver"
+    virtual TransactionType GetTransactionType() const;  // returns kGlobalTransaction
+};
+
+class SingleRowObserver : public tera::observer::Observer {  // demo observer for a single-row-transaction table
+public:
+    SingleRowObserver() {}
+    virtual ~SingleRowObserver() {}
+    virtual ErrorCode OnNotify(tera::Transaction* t,  // logs, writes <family>:another_qu, acks twice, commits t
+                              tera::Client* client,
+                              const std::string& table_name,
+                              const std::string& family,
+                              const std::string& qualifier,
+                              const std::string& row,
+                              const std::string& value,
+                              int64_t timestamp,
+                              Notification* notification);
+    virtual std::string GetObserverName() const;  // returns "SingleRowObserver"
+    virtual TransactionType GetTransactionType() const;  // returns kSingleRowTransaction
+};
+
+class NoneTransactionObserver : public tera::observer::Observer {  // demo observer that works without a transaction
+public:
+    NoneTransactionObserver() {}
+    virtual ~NoneTransactionObserver() {}
+    virtual ErrorCode OnNotify(tera::Transaction* t,  // t is unused: logs, acks the cell, notifies notify_table
+                              tera::Client* client,
+                              const std::string& table_name,
+                              const std::string& family,
+                              const std::string& qualifier,
+                              const std::string& row,
+                              const std::string& value,
+                              int64_t timestamp,
+                              Notification* notification);
+    virtual std::string GetObserverName() const;  // returns "NoneTransactionObserver"
+    virtual TransactionType GetTransactionType() const;  // returns kNoneTransaction
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_OBSERVER_DEMO_DEMO_OBSERVER_H_
+
diff --git a/src/observer/observer_demo/observe_demo_main.cc b/src/observer/observer_demo/observe_demo_main.cc
new file mode 100644
index 000000000..af633255a
--- /dev/null
+++ b/src/observer/observer_demo/observe_demo_main.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <signal.h>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "common/base/scoped_ptr.h"
+#include "common/log/log_cleaner.h"
+#include "tera_entry.h"
+#include "utils/utils_cmd.h"
+#include "version.h"
+
+DECLARE_string(tera_log_prefix);
+DECLARE_string(tera_local_addr);
+DECLARE_bool(tera_info_log_clean_enable);
+
+extern std::string GetTeraEntryName();
+extern tera::TeraEntry* GetTeraEntry();
+
+volatile sig_atomic_t g_quit = 0;
+
+static void SignalIntHandler(int sig) {  // installed for SIGINT and SIGTERM in main()
+    g_quit = 1;  // only raises the flag; main()'s run loop polls it
+}
+
+// Entry point of the observer demo: sets up logging, starts the TeraEntry and keeps calling
+// Run() until it fails or SIGINT/SIGTERM arrives, then shuts the entry down.
+int main(int argc, char** argv) {
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    if (FLAGS_tera_log_prefix.empty()) {
+        FLAGS_tera_log_prefix = GetTeraEntryName();
+        if (FLAGS_tera_log_prefix.empty()) {
+            FLAGS_tera_log_prefix = "tera";
+        }
+    }
+    tera::utils::SetupLog(FLAGS_tera_log_prefix);
+    // A first remaining argument of "version" prints the system version and exits.
+    if (argc > 1) {
+        std::string ext_cmd = argv[1];
+        if (ext_cmd == "version") {
+            PrintSystemVersion();
+            return 0;
+        }
+    }
+    // SIGINT/SIGTERM only set g_quit; the run loop below polls it.
+    signal(SIGINT, SignalIntHandler);
+    signal(SIGTERM, SignalIntHandler);
+
+    scoped_ptr<tera::TeraEntry> entry(GetTeraEntry());
+    if (entry.get() == NULL) {
+        return -1;
+    }
+
+    if (!entry->Start()) {
+        return -1;
+    }
+
+	// start log cleaner
+	if (FLAGS_tera_info_log_clean_enable) {
+	    common::LogCleaner::StartCleaner();
+		LOG(INFO) << "start log cleaner";
+	} else {
+		LOG(INFO) << "log cleaner is disable";
+	}
+    // Keep calling Run() until it fails or a quit signal has been received.
+    while (!g_quit) {
+        if (!entry->Run()) {
+            LOG(ERROR) << "Server run error ,and then exit now ";
+            break;
+        }
+    }
+    if (g_quit) {
+        LOG(INFO) << "received interrupt signal from user, will stop";
+    }
+    // NOTE(review): StopCleaner() runs even when the cleaner was never started - confirm this is safe
+    common::LogCleaner::StopCleaner();
+    if (!entry->Shutdown()) {
+        return -1;
+    }
+    return 0;
+}
diff --git a/src/observer/rowlocknode/fake_rowlock_client.h b/src/observer/rowlocknode/fake_rowlock_client.h
new file mode 100644
index 000000000..d884d15e4
--- /dev/null
+++ b/src/observer/rowlocknode/fake_rowlock_client.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_FAKE_ROWLOCK_CLIENT_H_
+#define TERA_OBSERVER_EXECUTOR_FAKE_ROWLOCK_CLIENT_H_
+
+#include <gflags/gflags.h>
+#include <sofa/pbrpc/pbrpc.h>
+
+#include "proto/rpc_client.h"
+#include "sdk/rowlock_client.h"
+
+namespace tera {
+namespace observer {
+
+// RowlockClient stand-in whose lock and unlock calls always report success.
+// The request and the completion callback are ignored; `done` is never run.
+class FakeRowlockClient : public RowlockClient {
+public:
+    FakeRowlockClient() : RowlockClient("127.0.0.1:22222") {}
+    ~FakeRowlockClient() {}
+
+    // Sets the response status to kLockSucc and returns true.
+    virtual bool TryLock(const RowlockRequest* request,
+                         RowlockResponse* response,
+                         std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL) {
+        return ReportSuccess(response);
+    }
+
+    // Sets the response status to kLockSucc and returns true.
+    virtual bool UnLock(const RowlockRequest* request,
+                        RowlockResponse* response,
+                        std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL) {
+        return ReportSuccess(response);
+    }
+
+private:
+    // Shared body of TryLock/UnLock.
+    static bool ReportSuccess(RowlockResponse* response) {
+        response->set_lock_status(kLockSucc);
+        return true;
+    }
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_OBSERVER_EXECUTOR_FAKE_ROWLOCK_CLIENT_H_
+
+
diff --git a/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.cc b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.cc
new file mode 100644
index 000000000..2cf0d8974
--- /dev/null
+++ b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlocknode/fake_rowlocknode_zk_adapter.h"
+
+#include <stdlib.h>
+
+#include <gflags/gflags.h>
+
+#include "common/this_thread.h"
+#include "ins_sdk.h"
+#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h"
+#include "types.h"
+
+DECLARE_string(rowlock_ins_root_path);
+DECLARE_int32(rowlock_server_node_num);
+DECLARE_string(rowlock_fake_root_path);
+
+namespace tera {
+namespace observer {
+
+// Remembers the node implementation and this server's address; nothing is
+// written to the fake store until Init() is called.
+FakeRowlockNodeZkAdapter::FakeRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl,
+                                                   const std::string& server_addr)
+    : rowlocknode_impl_(rowlocknode_impl),
+      server_addr_(server_addr) {}
+
+// Releases nothing; rowlocknode_impl_ is not deleted here.
+FakeRowlockNodeZkAdapter::~FakeRowlockNodeZkAdapter() {}
+
+// Runs "mkdir -p <dir>" through the shell and logs when the command reports
+// failure. `dir` comes from a command-line flag, not from untrusted input.
+static void EnsureFakeZkDir(const std::string& dir) {
+    const std::string cmd = "mkdir -p " + dir;
+    if (system(cmd.c_str()) != 0) {
+        LOG(ERROR) << "[Fake rowlock zk]: \"" << cmd << "\" failed";
+    }
+}
+
+// Registers this rowlock node in the fake coordination store rooted at
+// FLAGS_rowlock_fake_root_path:
+//   1. creates the id-list directory (and therefore the root),
+//   2. writes the configured node count,
+//   3. claims the first id in [0, FLAGS_rowlock_server_node_num) whose node
+//      can be written, cycling through the ids until one write succeeds.
+// The call does not return until an id has been claimed.
+void FakeRowlockNodeZkAdapter::Init() {
+    const std::string root_path = FLAGS_rowlock_fake_root_path;
+    const std::string id_list_dir = root_path + kRowlockNodeIdListPath;
+
+    // Create the directory tree before the first write, so the node-count
+    // node below does not depend on the root already existing.
+    EnsureFakeZkDir(id_list_dir);
+
+    const std::string node_num_key = root_path + kRowlockNodeNumPath;
+    if (!zk::FakeZkUtil::WriteNode(node_num_key,
+                                   std::to_string(FLAGS_rowlock_server_node_num))) {
+        LOG(WARNING) << "[Fake rowlock zk]: write node " << node_num_key << " failed";
+    }
+
+    // create node
+    int id = 0;
+    std::string id_lock_key;
+    while (true) {
+        id_lock_key = id_list_dir + "/" + std::to_string(id);
+        if (zk::FakeZkUtil::WriteNode(id_lock_key, std::to_string(id))) {
+            break;
+        }
+        LOG(ERROR) << "[Fake rowlock zk]: write node " << id_lock_key << " failed";
+        if (++id >= FLAGS_rowlock_server_node_num) {
+            id = 0;
+        }
+        ThisThread::Sleep(1);
+        // The directory may have been removed in the meantime; recreate it
+        // before the next attempt, as the per-iteration mkdir used to do.
+        EnsureFakeZkDir(id_list_dir);
+    }
+
+    LOG(INFO) << "RowlockNode Id=" << id << " host=" << server_addr_
+        << " nodenum=" << FLAGS_rowlock_server_node_num;
+}
+
+// Lock-change notification: terminates the process immediately with
+// EXIT_FAILURE, whatever the arguments are.
+void FakeRowlockNodeZkAdapter::OnLockChange(std::string session_id, bool deleted) {
+    (void)session_id;
+    (void)deleted;
+    _Exit(EXIT_FAILURE);
+}
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.h b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.h
new file mode 100644
index 000000000..686b2cdef
--- /dev/null
+++ b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_FAKE_ROWLOCKNODE_ZK_ADAPTER_H_
+#define TERA_OBSERVER_ROWLOCKNODE_FAKE_ROWLOCKNODE_ZK_ADAPTER_H_
+
+#include <string>
+#include <vector>
+
+#include "observer/rowlocknode/rowlocknode_impl.h"
+#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h"
+#include "zk/zk_adapter.h"
+
+namespace galaxy {
+namespace ins {
+namespace sdk {
+    class InsSDK;
+} // namespace sdk
+} // namespace ins
+} // namespace galaxy
+
+namespace tera {
+namespace observer {
+
+class RowlockNodeImpl;
+
+// Rowlock node adapter that registers the node in the fake coordination
+// store (FLAGS_rowlock_fake_root_path) rather than in zk or ins.
+class FakeRowlockNodeZkAdapter : public RowlockNodeZkAdapterBase {
+public:
+    // rowlocknode_impl: node implementation this adapter belongs to; stored
+    //                   as a raw pointer and not deleted by the adapter.
+    // server_addr:      address of this rowlock server, used in log output.
+    FakeRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, const std::string& server_addr);
+    virtual ~FakeRowlockNodeZkAdapter();
+    // Writes the node count and claims a node id; returns once an id is claimed.
+    virtual void Init();
+    // Terminates the process with EXIT_FAILURE; both arguments are ignored.
+    void OnLockChange(std::string session_id, bool deleted);
+
+private:
+    // Watcher callbacks: all of them are no-ops in this adapter.
+    virtual void OnChildrenChanged(const std::string& path,
+            const std::vector<std::string>& name_list,
+            const std::vector<std::string>& data_list) {}
+    virtual void OnNodeValueChanged(const std::string& path,
+            const std::string& value) {}
+    virtual void OnNodeCreated(const std::string& path) {}
+    virtual void OnNodeDeleted(const std::string& path) {}
+    virtual void OnWatchFailed(const std::string& path, int watch_type,
+            int err) {}
+    virtual void OnSessionTimeout() {}
+
+private:
+    RowlockNodeImpl* rowlocknode_impl_;  // not owned
+    std::string server_addr_;            // address passed to the constructor
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_OBSERVER_ROWLOCKNODE_FAKE_ROWLOCKNODE_ZK_ADAPTER_H_
+
diff --git a/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc
new file mode 100644
index 000000000..01c9e8970
--- /dev/null
+++ b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlocknode/ins_rowlock_client_zk_adapter.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "ins_sdk.h"
+
+#include "sdk/rowlock_client.h"
+#include "types.h"
+
+DECLARE_string(rowlock_ins_root_path);
+DECLARE_string(tera_ins_addr_list);
+DECLARE_int32(rowlock_server_node_num);
+DECLARE_int64(tera_zk_retry_period); 
+DECLARE_int32(tera_zk_timeout);
+DECLARE_int32(tera_zk_retry_max_times);
+
+namespace tera {
+namespace observer {
+
+// Stores the client and server address and forwards both to the zk-based
+// base class. No ins session is opened here; that happens in Init().
+//
+// server_client: client whose node list Init() updates; not owned.
+// server_addr:   address string kept alongside the client.
+InsRowlockClientZkAdapter::InsRowlockClientZkAdapter(RowlockClient* server_client,
+                                                     const std::string& server_addr)
+    : ZkRowlockClientZkAdapter(server_client, server_addr),
+      client_(server_client),
+      server_addr_(server_addr),
+      // Previously left uninitialized, so the pointer held garbage until
+      // Init() ran; initialized in declaration order (after server_addr_).
+      ins_sdk_(NULL) {}
+      
+// Opens an ins session, scans the values stored under the rowlock proxy path
+// and passes them to the client via Update().
+// Always returns true; a scan error aborts the process through CHECK_EQ.
+bool InsRowlockClientZkAdapter::Init() {
+    // create session
+    ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list);
+
+    // scan the key range between "<root><proxy path>/!" and "<root><proxy path>/~"
+    const std::string proxy_dir = FLAGS_rowlock_ins_root_path + kRowlockProxyPath;
+    galaxy::ins::sdk::ScanResult* scan = ins_sdk_->Scan(proxy_dir + "/!",
+                                                        proxy_dir + "/~");
+
+    std::vector<std::string> values;
+    for (; !scan->Done(); scan->Next()) {
+        CHECK_EQ(scan->Error(), galaxy::ins::sdk::kOK);
+        values.push_back(scan->Value());
+    }
+    delete scan;
+
+    client_->Update(values);
+    return true;
+}
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.h b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.h
new file mode 100644
index 000000000..7f56389ce
--- /dev/null
+++ b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCK_CLIENT_ZK_ADAPTER_H_
+#define TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCK_CLIENT_ZK_ADAPTER_H_
+
+#include "observer/rowlocknode/zk_rowlock_client_zk_adapter.h"
+#include "zk/zk_adapter.h"
+
+namespace galaxy {
+namespace ins {
+namespace sdk {
+    class InsSDK;
+} // namespace sdk
+} // namespace ins
+} // namespace galaxy
+
+namespace tera {
+namespace observer {
+
+class RowlockClient;
+
+// Rowlock client adapter whose Init() is declared here as an override of the
+// zk-based base class. Every watcher callback is overridden as a no-op.
+// NOTE(review): the destructor is empty, so the InsSDK held in ins_sdk_ is
+// never deleted by this class -- confirm whether that is intended.
+class InsRowlockClientZkAdapter : public ZkRowlockClientZkAdapter {
+public:
+    InsRowlockClientZkAdapter(RowlockClient* server_client, const std::string& server_addr);
+    virtual ~InsRowlockClientZkAdapter() {}
+    virtual bool Init();
+
+protected:
+    // Watcher callbacks: intentionally empty.
+    virtual void OnNodeValueChanged(const std::string& path,
+                                    const std::string& value) {}
+    virtual void OnWatchFailed(const std::string& path, int watch_type,
+                               int err) {}
+    virtual void OnNodeDeleted(const std::string& path) {}
+    virtual void OnSessionTimeout() {}
+    virtual void OnNodeCreated(const std::string& path) {}
+    virtual void OnChildrenChanged(const std::string& path,
+                                   const std::vector<std::string>& name_list,
+                                   const std::vector<std::string>& data_list) {}
+
+private:
+    RowlockClient* client_;              // not owned
+    std::string server_addr_;            // address passed to the constructor
+    galaxy::ins::sdk::InsSDK* ins_sdk_;  // ins session handle
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCK_CLIENT_ZK_ADAPTER_H_
diff --git a/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.cc b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.cc
new file mode 100644
index 000000000..c0ec709d5
--- /dev/null
+++ b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <gflags/gflags.h>
+
+#include "common/this_thread.h"
+#include "ins_sdk.h"
+#include "observer/rowlocknode/ins_rowlocknode_zk_adapter.h"
+#include "types.h"
+
+DECLARE_int64(tera_zk_retry_period); 
+DECLARE_string(rowlock_ins_root_path);
+DECLARE_string(tera_ins_addr_list);
+DECLARE_int32(rowlock_server_node_num);
+DECLARE_string(rowlock_fake_root_path);
+
+namespace tera {
+namespace observer {
+
+// Stores the node implementation and this server's address. The ins session
+// is opened later, in Init().
+//
+// rowlocknode_impl: node implementation this adapter belongs to; not owned.
+// server_addr:      address registered for this node in Init().
+InsRowlockNodeZkAdapter::InsRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl,
+                                                 const std::string& server_addr)
+    : rowlocknode_impl_(rowlocknode_impl),
+      server_addr_(server_addr),
+      // Previously left uninitialized, so the pointer held garbage until
+      // Init() ran; initialized in declaration order (after server_addr_).
+      ins_sdk_(NULL) {
+}
+
+// NOTE(review): ins_sdk_ is not deleted here, so the session created by
+// Init() lives until process exit -- confirm this is intended.
+InsRowlockNodeZkAdapter::~InsRowlockNodeZkAdapter() {
+}
+
+// Watch callback handed to InsSDK::Watch(): logs the event and forwards it
+// to the adapter that was supplied as the watch context.
+static void InsOnLockChange(const galaxy::ins::sdk::WatchParam& param,
+                            galaxy::ins::sdk::SDKError error) {
+    (void)error;  // the SDK error code is not inspected
+    LOG(ERROR) << "recv lock change event" ;
+    InsRowlockNodeZkAdapter* adapter =
+        static_cast<InsRowlockNodeZkAdapter*>(param.context);
+    adapter->OnLockChange(param.value, param.deleted);
+}
+
+// Registers this rowlock node in ins under FLAGS_rowlock_ins_root_path:
+//   1. opens an ins session,
+//   2. publishes the configured node count (a failure is only logged),
+//   3. cycles through ids in [0, FLAGS_rowlock_server_node_num) until a Put
+//      of this server's address succeeds, then takes the host lock,
+//   4. installs a watch on the host lock; any change ends the process via
+//      OnLockChange().
+// Failing to take the host lock or to install the watch aborts through CHECK.
+void InsRowlockNodeZkAdapter::Init() {
+    const std::string root_path = FLAGS_rowlock_ins_root_path;
+    galaxy::ins::sdk::SDKError err;
+    // create session
+    ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list);
+    // The session id itself is not used. NOTE(review): the call is kept in
+    // case it has side effects inside the SDK -- drop it if it is a pure getter.
+    ins_sdk_->GetSessionID();
+
+    // put server_node_num
+    const std::string node_num_key = root_path + kRowlockNodeNumPath;
+    if (!ins_sdk_->Put(node_num_key, std::to_string(FLAGS_rowlock_server_node_num), &err)) {
+        LOG(WARNING) << "put NodeNum fail";
+    }
+
+    // create node
+    int id = 0;
+    std::string id_lock_key;
+    std::string host_lock_key;
+    while (true) {
+        id_lock_key = root_path + kRowlockNodeIdListPath + "/" + std::to_string(id);
+        if (ins_sdk_->Put(id_lock_key, server_addr_, &err) && galaxy::ins::sdk::kOK == err) {
+            host_lock_key = root_path + kRowlockNodeHostListPath + "/" + server_addr_;
+            CHECK(ins_sdk_->Lock(host_lock_key, &err)) << "register fail";
+            break;
+        }
+        if (++id >= FLAGS_rowlock_server_node_num) {
+            id = 0;
+        }
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+
+    // create watch node
+    CHECK(ins_sdk_->Watch(host_lock_key, &InsOnLockChange, this, &err)) << "watch lock fail";
+
+    // Successful registration is informational, not an error; this matches
+    // the level FakeRowlockNodeZkAdapter::Init uses for the same message.
+    LOG(INFO) << "RowlockNode Id=" << id << " host=" << server_addr_
+        << " nodenum=" << FLAGS_rowlock_server_node_num;
+}
+
+// Invoked from the ins watch callback: terminates the process immediately
+// with EXIT_FAILURE, whatever the arguments are.
+void InsRowlockNodeZkAdapter::OnLockChange(std::string session_id, bool deleted) {
+    (void)session_id;
+    (void)deleted;
+    _Exit(EXIT_FAILURE);
+}
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.h b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.h
new file mode 100644
index 000000000..b335115fa
--- /dev/null
+++ b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCKNODE_ZK_ADAPTER_H_
+#define TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCKNODE_ZK_ADAPTER_H_
+
+#include <string>
+#include <vector>
+
+#include "observer/rowlocknode/rowlocknode_impl.h"
+#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h"
+#include "zk/zk_adapter.h"
+
+namespace galaxy {
+namespace ins {
+namespace sdk {
+    class InsSDK;
+} // namespace sdk
+} // namespace ins
+} // namespace galaxy
+
+namespace tera {
+namespace observer {
+
+class RowlockNodeImpl;
+
+// Rowlock node adapter that registers the node through the ins SDK.
+class InsRowlockNodeZkAdapter : public RowlockNodeZkAdapterBase {
+public:
+    // rowlocknode_impl: node implementation this adapter belongs to; stored
+    //                   as a raw pointer and not deleted by the adapter.
+    // server_addr:      address written into ins for this node.
+    InsRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, const std::string& server_addr);
+    virtual ~InsRowlockNodeZkAdapter();
+    // Opens the ins session, claims a node id, locks and watches the host key.
+    virtual void Init();
+    // Terminates the process with EXIT_FAILURE; both arguments are ignored.
+    void OnLockChange(std::string session_id, bool deleted);
+
+private:
+    // Watcher callbacks: all of them are no-ops in this adapter.
+    virtual void OnChildrenChanged(const std::string& path,
+            const std::vector<std::string>& name_list,
+            const std::vector<std::string>& data_list) {}
+    virtual void OnNodeValueChanged(const std::string& path,
+            const std::string& value) {}
+    virtual void OnNodeCreated(const std::string& path) {}
+    virtual void OnNodeDeleted(const std::string& path) {}
+    virtual void OnWatchFailed(const std::string& path, int watch_type,
+            int err) {}
+    virtual void OnSessionTimeout() {}
+
+private:
+    RowlockNodeImpl* rowlocknode_impl_;  // not owned
+    std::string server_addr_;            // address passed to the constructor
+    galaxy::ins::sdk::InsSDK* ins_sdk_;  // session created in Init(); never deleted by this class
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCKNODE_ZK_ADAPTER_H_
+
diff --git a/src/observer/rowlocknode/remote_rowlocknode.cc b/src/observer/rowlocknode/remote_rowlocknode.cc
new file mode 100644
index 000000000..533672607
--- /dev/null
+++ b/src/observer/rowlocknode/remote_rowlocknode.cc
@@ -0,0 +1,36 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlocknode/remote_rowlocknode.h"
+
+#include "gflags/gflags.h"
+
+DECLARE_int32(rowlock_thread_max_num);
+
+namespace tera {
+namespace observer {
+
+// Wraps an existing node implementation; the pointer is stored as-is.
+RemoteRowlockNode::RemoteRowlockNode(RowlockNodeImpl* rowlocknode_impl)
+    : rowlocknode_impl_(rowlocknode_impl) {}
+
+// Releases nothing; rowlocknode_impl_ is not deleted here.
+RemoteRowlockNode::~RemoteRowlockNode() {}
+
+// RPC entry point for lock requests. Delegates to RowlockNodeImpl::TryLock,
+// which fills in the response and runs `done`.
+void RemoteRowlockNode::Lock(google::protobuf::RpcController* controller,
+                             const RowlockRequest* request,
+                             RowlockResponse* response,
+                             google::protobuf::Closure* done) {
+    (void)controller;  // not used by this service
+    rowlocknode_impl_->TryLock(request, response, done);
+}
+
+// RPC entry point for unlock requests. Delegates to RowlockNodeImpl::UnLock,
+// which fills in the response and runs `done`.
+void RemoteRowlockNode::UnLock(google::protobuf::RpcController* controller,
+                               const RowlockRequest* request,
+                               RowlockResponse* response,
+                               google::protobuf::Closure* done) {
+    (void)controller;  // not used by this service
+    rowlocknode_impl_->UnLock(request, response, done);
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/rowlocknode/remote_rowlocknode.h b/src/observer/rowlocknode/remote_rowlocknode.h
new file mode 100644
index 000000000..6c65d79d2
--- /dev/null
+++ b/src/observer/rowlocknode/remote_rowlocknode.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_REMOTE_ROWLOCKNODE_H_
+#define TERA_OBSERVER_ROWLOCKNODE_REMOTE_ROWLOCKNODE_H_
+
+#include "common/base/scoped_ptr.h"
+#include "common/thread_pool.h"
+#include "observer/rowlocknode/rowlocknode_impl.h"
+
+namespace tera {
+namespace observer {
+
+// RPC service facade for the rowlock node: each RPC method forwards to the
+// RowlockNodeImpl supplied at construction time.
+class RemoteRowlockNode : public RowlockService {
+public:
+    // rowlocknode_impl: implementation that serves the requests; not owned.
+    explicit RemoteRowlockNode(RowlockNodeImpl* rowlocknode_impl);
+    ~RemoteRowlockNode();
+
+    // Forwards to RowlockNodeImpl::TryLock; `controller` is unused.
+    void Lock(google::protobuf::RpcController* controller,
+            const RowlockRequest* request,
+            RowlockResponse* response,
+            google::protobuf::Closure* done);
+
+    // Forwards to RowlockNodeImpl::UnLock; `controller` is unused.
+    void UnLock(google::protobuf::RpcController* controller,
+            const RowlockRequest* request,
+            RowlockResponse* response,
+            google::protobuf::Closure* done);
+
+private:
+    RowlockNodeImpl* rowlocknode_impl_;  // not owned
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_OBSERVER_ROWLOCKNODE_REMOTE_ROWLOCKNODE_H_
+
diff --git a/src/observer/rowlocknode/rowlock_db.h b/src/observer/rowlocknode/rowlock_db.h
new file mode 100644
index 000000000..94c98889c
--- /dev/null
+++ b/src/observer/rowlocknode/rowlock_db.h
@@ -0,0 +1,161 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCK_DB_H_
+#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCK_DB_H_
+
+#include <map>
+#include <memory>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unordered_map>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "common/base/scoped_ptr.h"
+#include "common/mutex.h"
+#include "common/thread_pool.h"
+#include "common/timer.h"
+
+DECLARE_int32(rowlock_db_sharding_number);
+DECLARE_int32(rowlock_db_ttl);
+DECLARE_int32(rowlock_timing_wheel_patch_num);
+
+namespace tera {
+namespace observer {
+
+// In-memory lock table for one shard, with timing-wheel based expiry.
+//
+// Every held row key owns a shared_ptr stored in locks_. A weak_ptr to it is
+// appended to the wheel slot that is current when the lock is taken. Each
+// ClearTimeout() call advances the wheel by one slot and drops every lock in
+// the new slot that is still held, so a lock lives for at most
+// timing_wheel_patch_num_ calls. Locks released earlier through UnLock()
+// leave an expired weak_ptr behind, which is simply skipped.
+//
+// All public methods are serialized by mutex_.
+class RowlockDB {
+public:
+    RowlockDB()
+        : timing_wheel_pos_(0),
+          // The wheel needs at least one slot: a non-positive flag would make
+          // TryLock index an empty vector and ClearTimeout divide by zero.
+          timing_wheel_patch_num_(FLAGS_rowlock_timing_wheel_patch_num > 0
+                                  ? FLAGS_rowlock_timing_wheel_patch_num : 1) {
+        timing_wheel_.resize(timing_wheel_patch_num_);
+    }
+
+    ~RowlockDB() {}
+
+    // Takes the lock for `row`.
+    // Returns true when the lock was free, false when it is already held.
+    bool TryLock(uint64_t row) {
+        MutexLock locker(&mutex_);
+        if (locks_.find(row) != locks_.end()) {
+            return false;
+        }
+        std::shared_ptr<uint64_t> holder(new uint64_t(row));
+        locks_[row] = holder;
+        // Remember the lock in the current slot so ClearTimeout can expire it.
+        timing_wheel_[timing_wheel_pos_].push_back(std::weak_ptr<uint64_t>(holder));
+        return true;
+    }
+
+    // Releases the lock for `row`; a no-op when the row is not locked.
+    void UnLock(uint64_t row) {
+        MutexLock locker(&mutex_);
+        locks_.erase(row);
+    }
+
+    // Call this once per timeout period:
+    // 1. the wheel pointer moves forward by one slot,
+    // 2. every lock recorded in that slot and still held is removed from locks_,
+    // 3. the emptied slot then collects the locks taken until the next call.
+    void ClearTimeout() {
+        std::vector<std::weak_ptr<uint64_t>> timed_out;
+        {
+            MutexLock locker(&mutex_);
+            // pointer forward
+            timing_wheel_pos_ = (timing_wheel_pos_ + 1) % timing_wheel_patch_num_;
+            // take the slot's content so its memory is released with `timed_out`
+            timed_out.swap(timing_wheel_[timing_wheel_pos_]);
+        }
+
+        // remove keys from locks_
+        for (size_t i = 0; i < timed_out.size(); ++i) {
+            // Cheap pre-check without the mutex; the answer may go stale.
+            if (timed_out[i].expired()) {
+                continue;
+            }
+            MutexLock locker(&mutex_);
+            // Re-resolve under the mutex: a concurrent UnLock() may have
+            // released the entry after the pre-check, in which case lock()
+            // yields an empty pointer that must not be dereferenced.
+            std::shared_ptr<uint64_t> holder = timed_out[i].lock();
+            if (holder) {
+                locks_.erase(*holder);
+            }
+        }
+    }
+
+    // Number of rows currently locked in this shard.
+    size_t Size() const {
+        MutexLock locker(&mutex_);
+        return locks_.size();
+    }
+
+private:
+    mutable Mutex mutex_;
+
+    // row key -> owner of that key's lock record
+    std::unordered_map<uint64_t, std::shared_ptr<uint64_t>> locks_;
+
+    // timing wheel
+    uint32_t timing_wheel_pos_;        // slot that receives newly taken locks
+    uint32_t timing_wheel_patch_num_;  // number of slots, always >= 1
+    std::vector<std::vector<std::weak_ptr<uint64_t>>> timing_wheel_;
+};
+
+// Lock table split into independent RowlockDB shards (row % shard count), so
+// rows that map to different shards do not contend on the same mutex. A
+// single-thread pool periodically advances every shard's timing wheel.
+class ShardedRowlockDB {
+public:
+    ShardedRowlockDB() : thread_pool_(new ThreadPool(1)) {
+        // Always create at least one shard; a non-positive flag would
+        // otherwise lead to a modulo by zero when a shard is selected.
+        const int32_t shard_num =
+            FLAGS_rowlock_db_sharding_number > 0 ? FLAGS_rowlock_db_sharding_number : 1;
+        lock_map_.resize(shard_num);
+        for (int32_t i = 0; i < shard_num; ++i) {
+            lock_map_[i].reset(new RowlockDB());
+        }
+        ScheduleClearTimeout();
+    }
+
+    // thread_pool_ is declared after lock_map_, so it is destroyed first.
+    // NOTE(review): this presumably stops the periodic task before the shards
+    // go away -- confirm against ThreadPool's destructor.
+    ~ShardedRowlockDB() {}
+
+    // Takes the lock for `row`; returns false when it is already held.
+    bool TryLock(uint64_t row) {
+        return Shard(row).TryLock(row);
+    }
+
+    // Releases the lock for `row`; a no-op when the row is not locked.
+    void UnLock(uint64_t row) {
+        Shard(row).UnLock(row);
+    }
+
+    // Total number of held locks. Shards are read one after another, so the
+    // sum is not a consistent snapshot while locks are changing.
+    size_t Size() const {
+        size_t size = 0;
+        for (size_t i = 0; i < lock_map_.size(); ++i) {
+            size += lock_map_[i]->Size();
+        }
+        return size;
+    }
+
+private:
+    // Shard responsible for `row`. Uses the real shard count rather than the
+    // flag, so the index stays in range even if the flag changes later.
+    RowlockDB& Shard(uint64_t row) {
+        return *lock_map_[row % lock_map_.size()];
+    }
+
+    // Advances every wheel now and re-arms itself on the thread pool.
+    void ScheduleClearTimeout() {
+        ClearTimeout();
+
+        ThreadPool::Task task = std::bind(&ShardedRowlockDB::ScheduleClearTimeout, this);
+        // everytime timing wheel move forward one step, every patch_num steps data will be cleared
+        int32_t step = FLAGS_rowlock_timing_wheel_patch_num > 0
+                       ? FLAGS_rowlock_db_ttl / FLAGS_rowlock_timing_wheel_patch_num
+                       : FLAGS_rowlock_db_ttl;
+        if (step < 1) {
+            step = 1;  // never re-arm with a zero/negative delay (busy loop)
+        }
+        thread_pool_->DelayTask(step, task);
+    }
+
+    // Advances the timing wheel of every shard by one slot.
+    void ClearTimeout() {
+        for (size_t i = 0; i < lock_map_.size(); ++i) {
+            lock_map_[i]->ClearTimeout();
+        }
+    }
+
+private:
+    std::vector<std::unique_ptr<RowlockDB>> lock_map_;  // one entry per shard
+    scoped_ptr<ThreadPool> thread_pool_;                // runs ScheduleClearTimeout
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKNODE_ROWLOCK_DB_H_
diff --git a/src/observer/rowlocknode/rowlocknode_entry.cc b/src/observer/rowlocknode/rowlocknode_entry.cc
new file mode 100644
index 000000000..eb2eb4e17
--- /dev/null
+++ b/src/observer/rowlocknode/rowlocknode_entry.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlocknode/rowlocknode_entry.h"
+
+#include <glog/logging.h>
+#include <gflags/gflags.h>
+
+#include "common/base/string_ext.h"
+#include "common/base/string_number.h"
+#include "common/net/ip_address.h"
+#include "common/this_thread.h"
+#include "common/thread_attributes.h"
+#include "common/timer.h"
+#include "common/counter.h"
+#include "utils/rpc_timer_list.h"
+#include "common/timer.h"
+#include "observer/rowlocknode/remote_rowlocknode.h"
+
+DECLARE_string(rowlock_server_port);
+DECLARE_int32(rowlock_io_service_pool_size);
+DECLARE_int32(rowlock_rpc_work_thread_num);
+
+// Name of this server flavour; main() uses it as the default log prefix.
+std::string GetTeraEntryName() {
+    return std::string("rowlock");
+}
+
+// Factory used by main(): returns a newly allocated RowlockNodeEntry.
+tera::TeraEntry* GetTeraEntry() {
+    tera::TeraEntry* entry = new tera::observer::RowlockNodeEntry();
+    return entry;
+}
+
+namespace tera {
+namespace observer {
+
+// Builds the (not yet started) RPC server from the rowlock_* flags. The node
+// implementation and the RPC service are created later, in StartServer().
+RowlockNodeEntry::RowlockNodeEntry()
+    : rowlocknode_impl_(NULL),
+      remote_rowlocknode_(NULL) {
+    sofa::pbrpc::RpcServerOptions options;
+    options.max_throughput_in = -1;
+    options.max_throughput_out = -1;
+    options.work_thread_num = FLAGS_rowlock_rpc_work_thread_num;
+    options.io_service_pool_size = FLAGS_rowlock_io_service_pool_size;
+    options.no_delay = false;                    // use Nagle's Algorithm
+    options.write_buffer_base_block_factor = 0;  // 64Bytes per malloc
+    options.read_buffer_base_block_factor = 7;   // 8kBytes per malloc
+    rpc_server_.reset(new sofa::pbrpc::RpcServer(options));
+}
+
+// Members clean up through their own destructors.
+RowlockNodeEntry::~RowlockNodeEntry() {
+}
+
+// Creates the node implementation and its RPC service, starts the RPC server
+// on 0.0.0.0:<FLAGS_rowlock_server_port> and then initializes the node.
+// Returns false when the RPC server cannot start or the node fails to init.
+bool RowlockNodeEntry::StartServer() {
+    SetProcessorAffinity();
+
+    IpAddress listen_addr("0.0.0.0", FLAGS_rowlock_server_port);
+    LOG(INFO) << "Start RPC server at: " << listen_addr.ToString();
+
+    rowlocknode_impl_.reset(new RowlockNodeImpl());
+    remote_rowlocknode_ = new RemoteRowlockNode(rowlocknode_impl_.get());
+    rpc_server_->RegisterService(remote_rowlocknode_);
+
+    const bool rpc_started = rpc_server_->Start(listen_addr.ToString());
+    if (!rpc_started) {
+        LOG(ERROR) << "start RPC server error";
+        return false;
+    }
+
+    // The node registers itself only after the RPC server is up.
+    const bool node_ready = rowlocknode_impl_->Init();
+    if (!node_ready) {
+        LOG(ERROR) << "fail to init rowlocknode_impl";
+        return false;
+    }
+
+    LOG(INFO) << "finish starting RPC server";
+    return true;
+}
+
+// Stops the RPC server first, then lets the node implementation exit and
+// destroys it. Dereferences rowlocknode_impl_, which is only set by
+// StartServer().
+void RowlockNodeEntry::ShutdownServer() {
+    LOG(INFO) << "shut down server";
+    // Stop accepting/serving RPCs before the implementation goes away.
+    rpc_server_->Stop();
+    rowlocknode_impl_->Exit();
+    rowlocknode_impl_.reset();
+    LOG(INFO) << "RowlockNodeEntry stop done!";
+}
+
+// One iteration of the main loop: sleeps, then lets the node report its QPS.
+// Always returns true, so the caller's loop only ends on a signal.
+bool RowlockNodeEntry::Run() {
+    const int kRunLoopSleep = 3000;
+    ThisThread::Sleep(kRunLoopSleep);
+    rowlocknode_impl_->PrintQPS();
+    return true;
+}
+
+// Placeholder: no processor affinity is configured for the rowlock node.
+void RowlockNodeEntry::SetProcessorAffinity() {
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/rowlocknode/rowlocknode_entry.h b/src/observer/rowlocknode/rowlocknode_entry.h
new file mode 100644
index 000000000..b968e8d4e
--- /dev/null
+++ b/src/observer/rowlocknode/rowlocknode_entry.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ENTRY_H_
+#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ENTRY_H_
+
+#include <sofa/pbrpc/pbrpc.h>
+
+#include "common/base/scoped_ptr.h"
+#include "observer/rowlocknode/remote_rowlocknode.h"
+#include "observer/rowlocknode/rowlocknode_impl.h"
+#include "tera_entry.h"
+
+namespace tera {
+namespace observer {
+
+// TeraEntry implementation for the rowlock node process: owns the RPC server
+// and the node implementation served through it.
+class RowlockNodeEntry : public tera::TeraEntry {
+public:
+    // Builds the RPC server from the rowlock_* flags; nothing is started yet.
+    RowlockNodeEntry();
+    virtual ~RowlockNodeEntry();
+
+    // Starts the RPC server and initializes the node; false on failure.
+    virtual bool StartServer();
+    // One main-loop iteration; always returns true.
+    virtual bool Run();
+    // Stops the RPC server and destroys the node implementation.
+    virtual void ShutdownServer();
+    // Currently a no-op.
+    void SetProcessorAffinity();
+
+private:
+    // Not referenced by the method definitions in rowlocknode_entry.cc.
+    common::Mutex mutex_;
+
+    scoped_ptr<RowlockNodeImpl> rowlocknode_impl_;
+    // Created in StartServer() and passed to RpcServer::RegisterService().
+    // NOTE(review): never deleted by this class; presumably the RpcServer
+    // takes ownership on registration -- confirm.
+    RemoteRowlockNode* remote_rowlocknode_;
+    scoped_ptr<sofa::pbrpc::RpcServer> rpc_server_;
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ENTRY_H_
diff --git a/src/observer/rowlocknode/rowlocknode_impl.cc b/src/observer/rowlocknode/rowlocknode_impl.cc
new file mode 100644
index 000000000..a8563a156
--- /dev/null
+++ b/src/observer/rowlocknode/rowlocknode_impl.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlocknode/rowlocknode_impl.h"
+
+#include "common/timer.h"
+#include "observer/rowlocknode/fake_rowlocknode_zk_adapter.h"
+#include "observer/rowlocknode/ins_rowlocknode_zk_adapter.h"
+#include "observer/rowlocknode/rowlocknode_zk_adapter.h"
+#include "utils/utils_cmd.h"
+
+DECLARE_string(rowlock_server_port);
+DECLARE_string(tera_coord_type);
+
+namespace tera {
+namespace observer {
+
+// Members are default-constructed; the coordination adapter is created in Init().
+RowlockNodeImpl::RowlockNodeImpl() {
+}
+
+// Members clean up through their own destructors.
+RowlockNodeImpl::~RowlockNodeImpl() {
+}
+
+// Picks the coordination adapter matching FLAGS_tera_coord_type ("zk", "ins",
+// anything else selects the fake adapter), registers this node through it
+// and always returns true.
+bool RowlockNodeImpl::Init() {
+    const std::string local_addr =
+        tera::utils::GetLocalHostName() + ":" + FLAGS_rowlock_server_port;
+
+    RowlockNodeZkAdapterBase* adapter = NULL;
+    if (FLAGS_tera_coord_type == "zk") {
+        adapter = new RowlockNodeZkAdapter(this, local_addr);
+    } else if (FLAGS_tera_coord_type == "ins") {
+        adapter = new InsRowlockNodeZkAdapter(this, local_addr);
+    } else {
+        adapter = new FakeRowlockNodeZkAdapter(this, local_addr);
+    }
+    zk_adapter_.reset(adapter);
+
+    zk_adapter_->Init();
+
+    LOG(INFO) << "Rowlock node init finish";
+    return true;
+}
+
+// Shutdown hook; there is nothing to tear down, so it always succeeds.
+bool RowlockNodeImpl::Exit() {
+    const bool exited = true;
+    return exited;
+}
+
+// Tries to take the lock for (table_name, row) of the request and stores the
+// outcome in the response: kLockSucc when the lock was free, kLockFail (plus
+// a warning log) when it is already held. `done` is always run.
+void RowlockNodeImpl::TryLock(const RowlockRequest* request,
+                              RowlockResponse* response,
+                              google::protobuf::Closure* done) {
+    const uint64_t key = GetRowlockKey(request->table_name(), request->row());
+    const bool locked = rowlock_db_.TryLock(key);
+
+    response->set_lock_status(locked ? kLockSucc : kLockFail);
+    if (!locked) {
+        LOG(WARNING) << " table name: " << request->table_name()
+                     << " row :" << request->row();
+    }
+
+    done->Run();
+}
+
+// Releases the lock for (table_name, row) of the request. The response is
+// always kLockSucc, whether or not the row was locked; `done` is always run.
+void RowlockNodeImpl::UnLock(const RowlockRequest* request,
+                             RowlockResponse* response,
+                             google::protobuf::Closure* done) {
+    rowlock_db_.UnLock(GetRowlockKey(request->table_name(), request->row()));
+    response->set_lock_status(kLockSucc);
+    done->Run();
+}
+
+// Called from the entry's main loop; currently prints nothing.
+void RowlockNodeImpl::PrintQPS() {
+}
+
+// Maps (table_name, row) to the 64-bit key used by the lock table.
+//
+// The hashed string is "<table_name length>:<table_name><row>". The length
+// prefix keeps the two parts unambiguous: plain concatenation made
+// ("t1", "0x") and ("t10", "x") hash to the same key, so unrelated rows of
+// similarly named tables blocked each other on every request. Distinct pairs
+// can still collide through the hash itself, which only causes a spurious
+// lock failure.
+//
+// The key is only used inside this process (TryLock/UnLock), so changing its
+// derivation does not affect any stored or exchanged data.
+uint64_t RowlockNodeImpl::GetRowlockKey(const std::string& table_name,
+                                        const std::string& row) const {
+    // RowlockKey : len(TableName) + ":" + TableName + Row
+    std::string rowlock_key_str = std::to_string(table_name.size());
+    rowlock_key_str.reserve(rowlock_key_str.size() + 1 + table_name.size() + row.size());
+    rowlock_key_str += ':';
+    rowlock_key_str += table_name;
+    rowlock_key_str += row;
+    return std::hash<std::string>()(rowlock_key_str);
+}
+
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/observer/rowlocknode/rowlocknode_impl.h b/src/observer/rowlocknode/rowlocknode_impl.h
new file mode 100644
index 000000000..a60b89dde
--- /dev/null
+++ b/src/observer/rowlocknode/rowlocknode_impl.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_IMPL_H_
+#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_IMPL_H_
+
+#include <glog/logging.h>
+#include <memory>
+#include <pthread.h>
+
+#include "common/base/scoped_ptr.h"
+#include "common/counter.h"
+#include "common/mutex.h"
+#include "observer/rowlocknode/fake_rowlocknode_zk_adapter.h"
+#include "observer/rowlocknode/rowlock_db.h"
+#include "observer/rowlocknode/rowlocknode_zk_adapter.h"
+#include "proto/rowlocknode_rpc.pb.h"
+#include "zk/zk_adapter.h"
+
+namespace tera {
+namespace observer {
+
+// Core of the rowlock node: keeps the in-memory lock table and the adapter
+// that registers this node with the coordination service.
+class RowlockNodeImpl {
+public:
+    RowlockNodeImpl();
+    ~RowlockNodeImpl();
+
+    // Creates the adapter selected by FLAGS_tera_coord_type and runs its
+    // Init(); always returns true.
+    bool Init();
+
+    // Shutdown hook; always returns true.
+    bool Exit();
+
+    // Tries to lock (table_name, row) of the request, writes kLockSucc or
+    // kLockFail into the response and runs `done`.
+    void TryLock(const RowlockRequest* request,
+            RowlockResponse* response,
+            google::protobuf::Closure* done);
+
+    // Unlocks (table_name, row) of the request, writes kLockSucc into the
+    // response and runs `done`.
+    void UnLock(const RowlockRequest* request,
+            RowlockResponse* response,
+            google::protobuf::Closure* done);
+
+    // Currently a no-op.
+    void PrintQPS();
+private:
+    // Hash of table name and row, used as the key into rowlock_db_.
+    uint64_t GetRowlockKey(const std::string& table_name, const std::string& row) const;
+private:
+    ShardedRowlockDB rowlock_db_;                           // lock table
+    std::unique_ptr<RowlockNodeZkAdapterBase> zk_adapter_;  // set in Init()
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_IMPL_H_
diff --git a/src/observer/rowlocknode/rowlocknode_zk_adapter.cc b/src/observer/rowlocknode/rowlocknode_zk_adapter.cc
new file mode 100644
index 000000000..9d079a502
--- /dev/null
+++ b/src/observer/rowlocknode/rowlocknode_zk_adapter.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlocknode/rowlocknode_zk_adapter.h"
+
+#include <gflags/gflags.h>
+
+#include "common/this_thread.h"
+#include "ins_sdk.h"
+#include "types.h"
+
+DECLARE_string(rowlock_zk_root_path);
+DECLARE_string(tera_zk_addr_list);
+DECLARE_int32(rowlock_server_node_num);
+DECLARE_int64(tera_zk_retry_period); 
+DECLARE_int32(tera_zk_timeout);
+DECLARE_int32(tera_zk_retry_max_times);
+
+namespace tera {
+namespace observer {
+
+// Stores the node implementation pointer and the address this node registers in ZooKeeper.
+RowlockNodeZkAdapter::RowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl,
+                                           const std::string& server_addr)
+    : rowlocknode_impl_(rowlocknode_impl), server_addr_(server_addr) {}
+
+RowlockNodeZkAdapter::~RowlockNodeZkAdapter() {  // empty body: nothing is released explicitly here
+}
+
+// Registers this rowlock node in ZooKeeper: connects (retrying without limit), publishes
+// FLAGS_rowlock_server_node_num on the number node, then claims the first free id slot as
+// an ephemeral node whose value is server_addr_ and puts a watch on it.
+void RowlockNodeZkAdapter::Init() {
+    std::string root_path = FLAGS_rowlock_zk_root_path;
+    std::string node_num_key = root_path + kRowlockNodeNumPath;
+
+    int zk_errno = zk::ZE_OK;
+    // init zk client
+    while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list,
+                                   FLAGS_rowlock_zk_root_path, FLAGS_tera_zk_timeout,
+                                   server_addr_, &zk_errno)) {
+        LOG(ERROR) << "fail to init zk : " << zk::ZkErrnoToString(zk_errno);
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    LOG(INFO) << "init zk success";
+
+    // get session id; the value is not used afterwards, only the success of the call matters
+    int64_t session_id_int = 0;
+    if (!GetSessionId(&session_id_int, &zk_errno)) {
+        LOG(ERROR) << "get session id fail : " << zk::ZkErrnoToString(zk_errno);
+        return;
+    }
+
+    // put server_node_num: create the number node when absent, otherwise overwrite its value
+    zk_errno = zk::ZE_OK;
+    bool is_exist = true;
+    int32_t retry_count = 0;
+    std::string value = std::to_string(FLAGS_rowlock_server_node_num);
+    CheckExist(node_num_key, &is_exist, &zk_errno);
+    if (!is_exist) {
+        while (!CreateEphemeralNode(node_num_key, value, &zk_errno)) {
+            if (retry_count++ >= FLAGS_tera_zk_retry_max_times) {
+                LOG(ERROR) << "fail to create rowlock number node";
+                return;
+            }
+            LOG(ERROR) << "retry create rowlock number node in "
+                       << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count;
+            ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+            zk_errno = zk::ZE_OK;
+        }
+    } else {
+        WriteNode(node_num_key, value, &zk_errno);
+        zk_errno = zk::ZE_OK;
+    }
+
+    // create node: walk the id slots 0..node_num-1 round-robin until one can be claimed
+    int id = 0;
+    std::string id_lock_key;
+
+    while (true) {
+        id_lock_key = root_path + kRowlockNodeIdListPath + "/" + std::to_string(id);
+        zk_errno = zk::ZE_OK;
+
+        if (!CreateEphemeralNode(id_lock_key, server_addr_, &zk_errno)) {
+            LOG(ERROR) << "create rowlock node fail: " << id_lock_key;
+        } else {
+            break;
+        }
+        LOG(ERROR) << "fail to create serve-node : " << zk::ZkErrnoToString(zk_errno);
+
+        if (++id >= FLAGS_rowlock_server_node_num) {
+            id = 0;
+        }
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    LOG(INFO) << "create serve-node success";
+
+    is_exist = false;
+
+    // watch my node
+    while (!CheckAndWatchExist(id_lock_key, &is_exist, &zk_errno)) {
+        LOG(ERROR) << "fail to watch serve-node : " << zk::ZkErrnoToString(zk_errno);
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    LOG(INFO) << "watch rowlock-node success";
+
+    if (!is_exist) {
+        OnLockChange();
+    }
+}
+
+void RowlockNodeZkAdapter::OnLockChange() {
+    _Exit(EXIT_FAILURE);  // ends the process at once with a failure status; atexit handlers are not run
+}
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/observer/rowlocknode/rowlocknode_zk_adapter.h b/src/observer/rowlocknode/rowlocknode_zk_adapter.h
new file mode 100644
index 000000000..67324f85f
--- /dev/null
+++ b/src/observer/rowlocknode/rowlocknode_zk_adapter.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_H_
+#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_H_
+
+#include <string>
+#include <vector>
+
+#include "observer/rowlocknode/rowlocknode_impl.h"
+#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h"
+#include "zk/zk_adapter.h"
+
+namespace galaxy {
+namespace ins {
+namespace sdk {
+    class InsSDK;
+} // namespace sdk
+} // namespace ins
+} // namespace galaxy
+
+namespace tera {
+namespace observer {
+
+class RowlockNodeImpl;
+
+class RowlockNodeZkAdapter : public RowlockNodeZkAdapterBase {
+public:
+    RowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, const std::string& server_addr);
+    virtual ~RowlockNodeZkAdapter();
+    virtual void Init();  // implements the pure virtual RowlockNodeZkAdapterBase::Init()
+    void OnLockChange();
+    // Every watch callback below has an empty inline body, so this class ignores those events.
+private:
+    virtual void OnChildrenChanged(const std::string& path,
+            const std::vector<std::string>& name_list,
+            const std::vector<std::string>& data_list) {}
+    virtual void OnNodeValueChanged(const std::string& path,
+            const std::string& value) {}
+    virtual void OnNodeCreated(const std::string& path) {}
+    virtual void OnNodeDeleted(const std::string& path) {}
+    virtual void OnWatchFailed(const std::string& path, int watch_type,
+            int err) {}
+    virtual void OnSessionTimeout() {}
+    // rowlocknode_impl_ is stored by the constructor; rowlocknode_zk_adapter.cc makes no other use of it.
+private:
+    RowlockNodeImpl* rowlocknode_impl_;
+    std::string server_addr_;
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_H_
+
diff --git a/src/observer/rowlocknode/rowlocknode_zk_adapter_base.h b/src/observer/rowlocknode/rowlocknode_zk_adapter_base.h
new file mode 100644
index 000000000..1ef93ccfb
--- /dev/null
+++ b/src/observer/rowlocknode/rowlocknode_zk_adapter_base.h
@@ -0,0 +1,21 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_BASE_H_
+#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_BASE_H_
+
+#include "zk/zk_adapter.h"
+
+namespace tera {
+namespace observer {
+
+class RowlockNodeZkAdapterBase : public tera::zk::ZooKeeperAdapter {  // abstract base type for rowlock-node coordination adapters
+public:
+    virtual ~RowlockNodeZkAdapterBase() {}  // virtual, so a derived adapter can be destroyed through a base pointer
+    virtual void Init() = 0;  // pure virtual: each concrete adapter must provide it
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_BASE_H_
diff --git a/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc
new file mode 100644
index 000000000..cacd993fc
--- /dev/null
+++ b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlocknode/zk_rowlock_client_zk_adapter.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "sdk/rowlock_client.h"
+#include "types.h"
+
+DECLARE_string(rowlock_zk_root_path);
+DECLARE_string(tera_zk_addr_list);
+DECLARE_int32(rowlock_server_node_num);
+DECLARE_int64(tera_zk_retry_period); 
+DECLARE_int32(tera_zk_timeout);
+DECLARE_int32(tera_zk_retry_max_times);
+
+namespace tera {
+namespace observer {
+
+// Keeps the client that Init() will update and the address handed to ZooKeeperAdapter::Init().
+ZkRowlockClientZkAdapter::ZkRowlockClientZkAdapter(RowlockClient* server_client,
+                                                   const std::string& server_addr)
+    : client_(server_client), server_addr_(server_addr) {}
+
+ZkRowlockClientZkAdapter::~ZkRowlockClientZkAdapter() {
+    ZooKeeperAdapter::Finalize();  // presumably tears down the zk session opened by Init() -- confirm in zk_adapter.h
+}
+      
+// Connects to ZooKeeper (retrying until it succeeds), lists the children of the
+// rowlock proxy path and hands their values to the client. Always returns true.
+bool ZkRowlockClientZkAdapter::Init() {
+    const std::string proxy_dir = FLAGS_rowlock_zk_root_path + kRowlockProxyPath;
+    int zk_errno = zk::ZE_OK;
+    // Keep trying to open the zk session; no retry cap is applied.
+    for (;;) {
+        if (ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list, FLAGS_rowlock_zk_root_path,
+                                   FLAGS_tera_zk_timeout, server_addr_, &zk_errno)) {
+            break;
+        }
+        LOG(ERROR) << "fail to init zk : " << zk::ZkErrnoToString(zk_errno);
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    LOG(INFO) << "init zk success";
+    // Same policy for the listing: retry until the proxy children can be read.
+    std::vector<std::string> proxy_names, proxy_values;
+    while (!ListChildren(proxy_dir, &proxy_names, &proxy_values, &zk_errno)) {
+        LOG(ERROR) << "fail to get proxy addr : " << zk::ZkErrnoToString(zk_errno);
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    client_->Update(proxy_values);
+    return true;
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.h b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.h
new file mode 100644
index 000000000..76a388895
--- /dev/null
+++ b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_ZK_ROWLOCK_CLIENT_ZK_ADAPTER_H_
+#define TERA_OBSERVER_ROWLOCKNODE_ZK_ROWLOCK_CLIENT_ZK_ADAPTER_H_
+
+#include "zk/zk_adapter.h"
+
+namespace tera {
+namespace observer {
+
+class RowlockClient;
+
+class ZkRowlockClientZkAdapter : public zk::ZooKeeperLightAdapter {
+public:
+    ZkRowlockClientZkAdapter(RowlockClient* server_client, const std::string& server_addr);
+    virtual ~ZkRowlockClientZkAdapter();  // the .cc definition calls ZooKeeperAdapter::Finalize()
+    virtual bool Init();  // the .cc definition retries until zk answers and then returns true
+    // client_ receives the listed proxy values in Init(); server_addr_ is passed to ZooKeeperAdapter::Init().
+private:
+    RowlockClient* client_;
+    std::string server_addr_;
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKNODE_ZK_ROWLOCK_CLIENT_ZK_ADAPTER_H_
diff --git a/src/observer/rowlockproxy/remote_rowlock_proxy.cc b/src/observer/rowlockproxy/remote_rowlock_proxy.cc
new file mode 100644
index 000000000..845d30fbe
--- /dev/null
+++ b/src/observer/rowlockproxy/remote_rowlock_proxy.cc
@@ -0,0 +1,36 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ 
+#include "observer/rowlockproxy/remote_rowlock_proxy.h"
+
+#include "gflags/gflags.h"
+
+DECLARE_int32(rowlock_thread_max_num);
+
+namespace tera {
+namespace observer {
+
+// Keeps the pointer to the proxy implementation that Lock()/UnLock() forward to.
+RemoteRowlockProxy::RemoteRowlockProxy(RowlockProxyImpl* rowlock_proxy_impl)
+    : rowlock_proxy_impl_(rowlock_proxy_impl) {}
+
+RemoteRowlockProxy::~RemoteRowlockProxy() {  // empty: rowlock_proxy_impl_ is not deleted here
+}
+
+// RPC entry point for Lock: delegates to RowlockProxyImpl::TryLock(); `controller` is unused.
+void RemoteRowlockProxy::Lock(google::protobuf::RpcController* controller,
+                              const RowlockRequest* request, RowlockResponse* response,
+                              google::protobuf::Closure* done) {
+    rowlock_proxy_impl_->TryLock(request, response, done);
+}
+
+// RPC entry point for UnLock: delegates to RowlockProxyImpl::UnLock(); `controller` is unused.
+void RemoteRowlockProxy::UnLock(google::protobuf::RpcController* controller,
+                                const RowlockRequest* request, RowlockResponse* response,
+                                google::protobuf::Closure* done) {
+    rowlock_proxy_impl_->UnLock(request, response, done);
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/rowlockproxy/remote_rowlock_proxy.h b/src/observer/rowlockproxy/remote_rowlock_proxy.h
new file mode 100644
index 000000000..df8e2c2b8
--- /dev/null
+++ b/src/observer/rowlockproxy/remote_rowlock_proxy.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKPROXY_REMOTE_ROWLOCK_PROXY_H_
+#define TERA_OBSERVER_ROWLOCKPROXY_REMOTE_ROWLOCK_PROXY_H_
+
+#include <memory>
+
+#include "common/base/scoped_ptr.h"
+#include "common/thread_pool.h"
+#include "observer/rowlockproxy/rowlock_proxy_impl.h"
+
+namespace tera {
+namespace observer {
+
+class RemoteRowlockProxy : public RowlockService {
+public:
+    explicit RemoteRowlockProxy(RowlockProxyImpl* rowlock_proxy_impl);
+    ~RemoteRowlockProxy();
+    // Lock RPC handler; the .cc definition forwards to RowlockProxyImpl::TryLock().
+    void Lock(google::protobuf::RpcController* controller,
+            const RowlockRequest* request,
+            RowlockResponse* response,
+            google::protobuf::Closure* done);
+    // UnLock RPC handler; the .cc definition forwards to RowlockProxyImpl::UnLock().
+    void UnLock(google::protobuf::RpcController* controller,
+            const RowlockRequest* request,
+            RowlockResponse* response,
+            google::protobuf::Closure* done);
+
+private:
+    RowlockProxyImpl* rowlock_proxy_impl_;  // set by the constructor; never deleted by this class
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_OBSERVER_ROWLOCKPROXY_REMOTE_ROWLOCK_PROXY_H_
diff --git a/src/observer/rowlockproxy/rowlock_proxy_entry.cc b/src/observer/rowlockproxy/rowlock_proxy_entry.cc
new file mode 100644
index 000000000..e9f19faa0
--- /dev/null
+++ b/src/observer/rowlockproxy/rowlock_proxy_entry.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlockproxy/rowlock_proxy_entry.h"
+
+#include <glog/logging.h>
+#include <gflags/gflags.h>
+
+#include "common/base/string_ext.h"
+#include "common/base/string_number.h"
+#include "common/net/ip_address.h"
+#include "common/this_thread.h"
+#include "common/thread_attributes.h"
+#include "common/timer.h"
+#include "common/counter.h"
+#include "utils/rpc_timer_list.h"
+#include "observer/rowlockproxy/remote_rowlock_proxy.h"
+
+DECLARE_string(rowlock_proxy_port);
+DECLARE_int32(rowlock_io_service_pool_size);
+DECLARE_int32(rowlock_rpc_work_thread_num);
+
+// Returns the entry name of this binary ("rowlock_proxy"); how the surrounding
+// framework consumes it is outside this file.
+std::string GetTeraEntryName() { return "rowlock_proxy"; }
+
+// Factory hook: returns a heap-allocated RowlockProxyEntry.
+// NOTE(review): the caller presumably owns and deletes it -- confirm against the entry framework.
+tera::TeraEntry* GetTeraEntry() { return new tera::observer::RowlockProxyEntry(); }
+
+namespace tera {
+namespace observer {
+
+RowlockProxyEntry::RowlockProxyEntry() : remote_rowlock_proxy_(nullptr) {  // the service object is created in StartServer()
+    // Build the RPC server from the rowlock thread-pool flags; both throughput limits are set to -1.
+    sofa::pbrpc::RpcServerOptions rpc_options;
+    rpc_options.max_throughput_in = rpc_options.max_throughput_out = -1;
+    rpc_options.work_thread_num = FLAGS_rowlock_rpc_work_thread_num;
+    rpc_options.io_service_pool_size = FLAGS_rowlock_io_service_pool_size;
+    rpc_server_.reset(new sofa::pbrpc::RpcServer(rpc_options));
+}
+
+RowlockProxyEntry::~RowlockProxyEntry() {}  // empty: the unique_ptr members clean up themselves; remote_rowlock_proxy_ is not deleted here
+
+// Creates the proxy impl and its RPC service, starts the server on 0.0.0.0:<rowlock_proxy_port>, then inits the impl.
+bool RowlockProxyEntry::StartServer() {
+    IpAddress rowlock_proxy_addr("0.0.0.0", FLAGS_rowlock_proxy_port);
+    LOG(INFO) << "Start RPC server at: " << rowlock_proxy_addr.ToString();
+    rowlock_proxy_impl_.reset(new RowlockProxyImpl());
+    remote_rowlock_proxy_ = new RemoteRowlockProxy(rowlock_proxy_impl_.get());
+    rpc_server_->RegisterService(remote_rowlock_proxy_);  // NOTE(review): confirm RegisterService takes ownership of the raw pointer
+    if (!rpc_server_->Start(rowlock_proxy_addr.ToString())) {
+        LOG(ERROR) << "start RPC server error";
+        return false;
+    }
+    if (!rowlock_proxy_impl_->Init()) {
+        LOG(ERROR) << "fail to init rowlock_proxy_impl";
+        return false;
+    }
+    LOG(INFO) << "finish starting RPC server";
+    return true;
+}
+
+void RowlockProxyEntry::ShutdownServer() {
+    LOG(INFO) << "shut down server";
+    rpc_server_->Stop();
+    // _exit(0) ends the process right here with status 0; destructors and atexit handlers do not run.
+    LOG(INFO) << "RowlockProxyEntry stop done!";
+    _exit(0);
+}
+
+bool RowlockProxyEntry::Run() {
+    ThisThread::Sleep(1000);  // presumably milliseconds (the zk retry logs pair Sleep() values with " ms") -- i.e. about one second per call
+    return true;  // always true; no work besides the sleep happens here
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/rowlockproxy/rowlock_proxy_entry.h b/src/observer/rowlockproxy/rowlock_proxy_entry.h
new file mode 100644
index 000000000..547cf8d04
--- /dev/null
+++ b/src/observer/rowlockproxy/rowlock_proxy_entry.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ENTRY_H_
+#define TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ENTRY_H_
+
+#include <memory>
+
+#include <sofa/pbrpc/pbrpc.h>
+
+#include "observer/rowlockproxy/remote_rowlock_proxy.h"
+#include "observer/rowlockproxy/rowlock_proxy_impl.h"
+#include "tera_entry.h"
+
+namespace tera {
+namespace observer {
+
+class RowlockProxyEntry : public tera::TeraEntry {  // TeraEntry implementation that hosts the rowlock proxy RPC server
+public:
+    RowlockProxyEntry();
+    virtual ~RowlockProxyEntry();
+    // In the .cc: StartServer() brings the RPC server up, Run() only sleeps, ShutdownServer() stops it and exits.
+    virtual bool StartServer();
+    virtual bool Run();
+    virtual void ShutdownServer();
+
+private:
+    std::unique_ptr<RowlockProxyImpl> rowlock_proxy_impl_;  // created in StartServer()
+    RemoteRowlockProxy* remote_rowlock_proxy_;  // raw pointer created in StartServer(); NOTE(review): confirm who frees it
+    std::unique_ptr<sofa::pbrpc::RpcServer> rpc_server_;  // created in the constructor
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ENTRY_H_
\ No newline at end of file
diff --git a/src/observer/rowlockproxy/rowlock_proxy_impl.cc b/src/observer/rowlockproxy/rowlock_proxy_impl.cc
new file mode 100644
index 000000000..0a499dabb
--- /dev/null
+++ b/src/observer/rowlockproxy/rowlock_proxy_impl.cc
@@ -0,0 +1,146 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlockproxy/rowlock_proxy_impl.h"
+
+#include <functional>
+
+#include "common/timer.h"
+#include "utils/utils_cmd.h"
+
+DECLARE_string(rowlock_proxy_port);
+DECLARE_string(tera_coord_type);
+DECLARE_bool(rowlock_proxy_async_enable);
+
+namespace tera {
+namespace observer {
+
+// Starts with an empty address table, an empty stub map and a server count of 1.
+RowlockProxyImpl::RowlockProxyImpl()
+    : server_addrs_(new std::vector<std::string>), clients_(new std::map<std::string, RowlockStub*>),
+      server_number_(1) {}
+
+// Frees every RowlockStub held in the current stub map; the map itself is released
+// by its shared_ptr.
+RowlockProxyImpl::~RowlockProxyImpl() {
+    for (auto& addr_and_stub : *clients_) delete addr_and_stub.second;
+}
+
+// Picks the coordination adapter from FLAGS_tera_coord_type ("zk" selects the
+// ZooKeeper adapter, anything else the ins one) and initializes it.
+bool RowlockProxyImpl::Init() {
+    const std::string self_addr =
+        tera::utils::GetLocalHostName() + ":" + FLAGS_rowlock_proxy_port;
+    if (FLAGS_tera_coord_type == "zk") {
+        zk_adapter_.reset(new RowlockProxyZkAdapter(this, self_addr));
+    } else {
+        zk_adapter_.reset(new InsRowlockProxyZkAdapter(this, self_addr));
+    }
+    if (zk_adapter_->Init()) {
+        LOG(INFO) << "Rowlock node init finish";
+        return true;
+    }
+    LOG(ERROR) << "init zk adapter fail";
+    return false;
+}
+
+// Routes a lock request to the rowlock server chosen for (table_name, row); `done` is run once
+// the stub call finishes (sync mode) or from the stub's callback (async mode).
+void RowlockProxyImpl::TryLock(const RowlockRequest* request, RowlockResponse* response,
+                               google::protobuf::Closure* done) {
+    uint64_t rowlock_key = GetRowKey(request->table_name(), request->row());
+    std::string addr = ScheduleRowKey(rowlock_key);
+    // Take a reference to the current stub map under the lock; UpdateServers() may replace it.
+    std::shared_ptr<std::map<std::string, RowlockStub*>> read_clients;
+    {
+        MutexLock locker(&client_mutex_);
+        read_clients = clients_;
+    }
+    // find(), not operator[]: the latter inserts a null stub into the shared map; fail loudly instead.
+    auto stub_it = read_clients->find(addr);
+    CHECK(stub_it != read_clients->end()) << "no rowlock stub for server: " << addr;
+    if (FLAGS_rowlock_proxy_async_enable == false) {
+        stub_it->second->TryLock(request, response);
+        done->Run();
+    } else {
+        // Capture `done` by value: the callback can fire after this frame has returned.
+        stub_it->second->TryLock(request, response, [done] (RowlockRequest*, RowlockResponse*, bool, int) { done->Run(); });
+    }
+}
+
+// Routes an unlock request to the rowlock server chosen for (table_name, row); runs `done` when finished.
+void RowlockProxyImpl::UnLock(const RowlockRequest* request, RowlockResponse* response,
+                              google::protobuf::Closure* done) {
+    uint64_t rowlock_key = GetRowKey(request->table_name(), request->row());
+    std::string addr = ScheduleRowKey(rowlock_key);
+    // Take a reference to the current stub map under the lock; UpdateServers() may replace it.
+    std::shared_ptr<std::map<std::string, RowlockStub*>> read_clients;
+    {
+        MutexLock locker(&client_mutex_);
+        read_clients = clients_;
+    }
+    // find(), not operator[]: the latter inserts a null stub into the shared map; fail loudly instead.
+    auto stub_it = read_clients->find(addr);
+    CHECK(stub_it != read_clients->end()) << "no rowlock stub for server: " << addr;
+    if (FLAGS_rowlock_proxy_async_enable == false) {
+        stub_it->second->UnLock(request, response);
+        done->Run();
+    } else {
+        // Capture `done` by value: the callback can fire after this frame has returned.
+        stub_it->second->UnLock(request, response, [done] (RowlockRequest*, RowlockResponse*, bool, int) { done->Run(); });
+    }
+}
+
+// Hashes the concatenation table_name + row with std::hash<std::string>.
+uint64_t RowlockProxyImpl::GetRowKey(const std::string& table_name, const std::string& row) const {
+    std::hash<std::string> hasher;
+    return hasher(table_name + row);
+}
+
+// Maps a row key onto a server address: slot = row_key % server_number_.
+// Returns "" while no valid slot exists (count is 0 or the table is shorter than the count).
+std::string RowlockProxyImpl::ScheduleRowKey(uint64_t row_key) {
+    MutexLock locker(&server_addrs_mutex_);  // held for the whole lookup, so no extra copy is needed
+    if (server_number_ == 0 || server_addrs_->size() < server_number_) {
+        return std::string();
+    }
+    return (*server_addrs_)[row_key % server_number_];
+}
+
+// Records the new server count and, when the address table is shorter than that,
+// grows it to `number` slots (new slots are empty strings). The table is never
+// shrunk here.
+void RowlockProxyImpl::SetServerNumber(uint32_t number) {
+    MutexLock locker(&server_addrs_mutex_);
+    server_number_ = number;
+    std::vector<std::string>& addr_table = *server_addrs_;
+    if (number > addr_table.size()) addr_table.resize(number);
+}
+
+// Stores `addr` in slot `id` and makes sure a stub exists for it; ids outside the table are rejected.
+void RowlockProxyImpl::UpdateServers(uint32_t id, const std::string& addr) {
+    {
+        MutexLock locker(&server_addrs_mutex_);
+        if (id >= server_addrs_->size()) {
+            LOG(ERROR) << "rowlock server id out of range: " << id;
+            return;
+        }
+        (*server_addrs_)[id] = addr;
+    }
+    MutexLock locker(&client_mutex_);
+    if (!clients_.unique()) {  // a reader still holds this map: edit a private copy instead
+        clients_.reset(new std::map<std::string, RowlockStub*>(*clients_));
+    }
+    if (clients_->count(addr) == 0) clients_->insert(std::make_pair(addr, new RowlockStub(addr)));
+}
+
+uint32_t RowlockProxyImpl::GetServerNumber() {
+    return server_number_;  // NOTE(review): read without server_addrs_mutex_, unlike the write in SetServerNumber() -- confirm this is safe
+}
+
+} // namespace observer
+} // namespace tera
+
+
+
diff --git a/src/observer/rowlockproxy/rowlock_proxy_impl.h b/src/observer/rowlockproxy/rowlock_proxy_impl.h
new file mode 100644
index 000000000..4417c3973
--- /dev/null
+++ b/src/observer/rowlockproxy/rowlock_proxy_impl.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_IMPL_H_
+#define TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_IMPL_H_
+
+#include <glog/logging.h>
+#include <memory>
+#include <pthread.h>
+
+#include "common/counter.h"
+#include "common/mutex.h"
+#include "observer/rowlockproxy/rowlock_proxy_zk_adapter.h"
+#include "proto/rowlocknode_rpc.pb.h"
+#include "sdk/rowlock_client.h"
+#include "zk/zk_adapter.h"
+
+namespace tera {
+namespace observer {
+
+class RowlockProxyZkAdapterBase;
+class RowLockStub;  // NOTE(review): spelled "RowLockStub", while the members below use "RowlockStub" -- confirm which is intended
+
+class RowlockProxyImpl {
+public:
+    RowlockProxyImpl();
+    ~RowlockProxyImpl();
+    // Creates the coordination adapter selected by FLAGS_tera_coord_type and runs its Init().
+    bool Init();
+    // TryLock()/UnLock() pick a server for (table_name, row) and forward the request to that server's stub.
+    void TryLock(const RowlockRequest* request,
+        RowlockResponse* response,
+        google::protobuf::Closure* done);
+
+    void UnLock(const RowlockRequest* request,
+        RowlockResponse* response,
+        google::protobuf::Closure* done);
+
+    // for zk: called by the coordination adapters to publish the server count and the id -> address mapping
+    void SetServerNumber(uint32_t number);
+    uint32_t GetServerNumber();
+    void UpdateServers(uint32_t id, const std::string& addr);
+private:
+	uint64_t GetRowKey(const std::string& table_name,
+					   const std::string& row) const;
+	// rowkey -> server addr
+	std::string ScheduleRowKey(uint64_t row_key);
+
+private:
+    common::Mutex server_addrs_mutex_;
+    // a map from virtual node to server addr
+    // key: vector index, virtual node number
+    // value: vector value, server address
+    // NOTE(review): held in a shared_ptr, but the .cc edits this vector in place under the mutex (no copy is made)
+    std::shared_ptr<std::vector<std::string>> server_addrs_;
+
+    common::Mutex client_mutex_;
+    std::shared_ptr<std::map<std::string, RowlockStub*>> clients_;  // address -> stub; UpdateServers() copies the map when a reader still holds it
+
+    uint32_t server_number_;  // divisor used by ScheduleRowKey(); starts at 1
+    std::unique_ptr<RowlockProxyZkAdapterBase> zk_adapter_;  // created in Init()
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_IMPL_H_
diff --git a/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.cc b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.cc
new file mode 100644
index 000000000..290c6815c
--- /dev/null
+++ b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.cc
@@ -0,0 +1,411 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/rowlockproxy/rowlock_proxy_zk_adapter.h"
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "common/base/string_number.h"
+#include "observer/rowlockproxy/rowlock_proxy_impl.h"
+#include "types.h"
+#include "ins_sdk.h"
+
+DECLARE_string(rowlock_zk_root_path);
+DECLARE_string(tera_zk_addr_list);
+DECLARE_int32(rowlock_server_node_num);
+DECLARE_int64(tera_zk_retry_period); 
+DECLARE_int32(tera_zk_timeout);
+DECLARE_int32(tera_zk_retry_max_times);
+
+DECLARE_string(rowlock_ins_root_path);
+DECLARE_string(tera_ins_addr_list);
+
+namespace tera {
+namespace observer {
+
+// Keeps the proxy implementation to update and the address this proxy announces in ZooKeeper.
+RowlockProxyZkAdapter::RowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl,
+                                             const std::string& server_addr)
+    : rowlock_proxy_impl_(rowlock_proxy_impl), server_addr_(server_addr) {}
+
+// Connects to ZooKeeper, loads the rowlock server count and the id -> address list into
+// the proxy implementation (using the *AndWatch* calls), then announces this proxy with an
+// ephemeral node. Returns false once any bounded retry loop is exhausted.
+bool RowlockProxyZkAdapter::Init() {
+    std::string root_path = FLAGS_rowlock_zk_root_path;
+    std::string node_num_key = root_path + kRowlockNodeNumPath;
+    std::string id_lock_path = root_path + kRowlockNodeIdListPath;
+    std::string proxy_path = root_path + kRowlockProxyPath + "/" + server_addr_;
+
+    int zk_errno = zk::ZE_OK;
+    int32_t retry_count = 0;
+    // init zk client
+    while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list,
+                                   FLAGS_rowlock_zk_root_path, FLAGS_tera_zk_timeout,
+                                   server_addr_, &zk_errno)) {
+        if (retry_count++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to init zk: " << zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
+        LOG(ERROR) << "init zk fail: " << zk::ZkErrnoToString(zk_errno)
+            << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: "
+            << retry_count;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;
+    }
+    LOG(INFO) << "init zk success";
+
+    // get session id
+    int64_t session_id_int = 0;
+    if (!GetSessionId(&session_id_int, &zk_errno)) {
+        LOG(ERROR) << "get session id fail : " << zk::ZkErrnoToString(zk_errno);
+        return false;
+    }
+
+    // wait (without a retry limit) until the server-number node exists
+    bool is_exist = false;
+    while (!is_exist) {
+        CheckExist(node_num_key, &is_exist, &zk_errno);
+        if (!is_exist) {
+            LOG(ERROR) << "rowlock service number node not found: " << node_num_key
+                << "  make sure rowlock zk available";
+            ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        }
+    }
+    std::string value;
+    ReadAndWatchNode(node_num_key, &value, &zk_errno);
+    uint32_t node_num = 0;
+    if (!StringToNumber(value, &node_num)) {
+        LOG(ERROR) << "read number node fail";
+        return false;
+    }
+    rowlock_proxy_impl_->SetServerNumber(node_num);
+
+    // list the server id nodes; the loop also retries while fewer/more than node_num are present
+    retry_count = 0;
+    std::vector<std::string> name_list;
+    std::vector<std::string> data_list;
+    while (!ListAndWatchChildren(id_lock_path, &name_list, &data_list,
+                                 &zk_errno) || name_list.size() != node_num) {
+        if (retry_count++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to watch rowlock server list or lack rowlock server";
+            return false;
+        }
+        LOG(ERROR) << "retry watch rowlock server list in "
+            << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count
+            << " node_num: " << node_num << " list size: " << name_list.size();
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;
+    }
+    for (size_t i = 0; i < name_list.size() && i < data_list.size(); i++) {
+        uint32_t id = 0;
+        // a child whose name is not a number cannot be mapped to a slot: skip it
+        if (!StringToNumber(name_list[i], &id)) {
+            LOG(ERROR) << "skip rowlock server node with non-numeric name: " << name_list[i];
+            continue;
+        }
+        rowlock_proxy_impl_->UpdateServers(id, data_list[i]);
+    }
+
+    // create proxy node
+    retry_count = 0;
+    while (!CreateEphemeralNode(proxy_path, server_addr_, &zk_errno)) {
+        if (retry_count++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to create proxy node";
+            return false;
+        }
+        LOG(ERROR) << "retry create rowlock proxy node in "
+                   << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;
+    }
+    return true;
+}
+// Watch callback: a new value on the server-number node becomes the proxy's server count; other paths are ignored.
+void RowlockProxyZkAdapter::OnNodeValueChanged(const std::string& path, const std::string& value) {
+    if (path != FLAGS_rowlock_zk_root_path + kRowlockNodeNumPath) return;
+    LOG(WARNING) << "rowlock service server number changed to " << value;
+    uint32_t node_num = 0;
+    if (StringToNumber(value, &node_num)) {
+        rowlock_proxy_impl_->SetServerNumber(node_num);
+    } else {
+        LOG(ERROR) << "invalid rowlock server number, keep the previous one: " << value;
+    }
+    std::string value_str;
+    int zk_errno = zk::ZE_OK;
+    ReadAndWatchNode(path, &value_str, &zk_errno);  // presumably re-arms the watch; the value read here is unused
+}
+
+// Any failed watch is treated as fatal: log the path and terminate the process at once.
+void RowlockProxyZkAdapter::OnWatchFailed(const std::string& path, int watch_type, int err) {
+    LOG(ERROR) << "watch failed ! " << path;
+    _Exit(EXIT_FAILURE);
+}
+
+void RowlockProxyZkAdapter::OnSessionTimeout() {
+    LOG(ERROR) << "zk session timeout!";
+    _Exit(EXIT_FAILURE);  // a timed-out session is fatal: terminate at once with a failure status
+}
+
+// Watch callback for a created node: the number node refreshes the server count; any other
+// path is treated as <id list>/<id> (the id is the text after the last '/', hence the "+ 1")
+// and refreshes that slot with the node's value.
+void RowlockProxyZkAdapter::OnNodeCreated(const std::string& path) {
+    std::string value;
+    int zk_errno = zk::ZE_OK;
+    ReadAndWatchNode(path, &value, &zk_errno);
+    uint32_t number = 0;
+    if (path == FLAGS_rowlock_zk_root_path + kRowlockNodeNumPath) {
+        LOG(WARNING) << "rowlock service number node create";
+        if (StringToNumber(value, &number)) {
+            rowlock_proxy_impl_->SetServerNumber(number);
+            return;
+        }
+    } else if (StringToNumber(path.substr(path.find_last_of("/") + 1), &number)) {
+        rowlock_proxy_impl_->UpdateServers(number, value);
+        return;
+    }
+    LOG(ERROR) << "cannot parse a number for created node: " << path;
+}
+
+// Watch callback for a deleted node. Waits (polling every FLAGS_tera_zk_retry_period) until
+// the node exists again, re-reads it with ReadAndWatchNode() and pushes the value to the proxy:
+// the number node updates the server count, a server id node updates that id's address.
+void RowlockProxyZkAdapter::OnNodeDeleted(const std::string& path) {
+    LOG(ERROR) << "node deleted: " << path;
+
+    const bool is_number_node = (path == FLAGS_rowlock_zk_root_path + kRowlockNodeNumPath);
+    uint32_t id = 0;
+    if (!is_number_node) {
+        // server node: the id is the path component after the last '/' ("+ 1" skips the slash)
+        std::string id_str = path.substr(path.find_last_of("/") + 1);
+        if (!StringToNumber(id_str, &id)) {
+            LOG(ERROR) << "invalid rowlock server id in path: " << path;
+            return;
+        }
+        // ids at or beyond the current server count are ignored
+        if (id >= rowlock_proxy_impl_->GetServerNumber()) {
+            return;
+        }
+    }
+
+    // Poll until the node has been re-created; the read below happens only after the node
+    // has been seen to exist.
+    int zk_errno = zk::ZE_OK;
+    bool is_exist = false;
+    while (true) {
+        CheckExist(path, &is_exist, &zk_errno);
+        if (is_exist) {
+            break;
+        }
+        if (is_number_node) {
+            LOG(ERROR) << "rowlock service number node not found: " << path
+                << "  make sure rowlock zk available";
+        } else {
+            LOG(ERROR) << "rowlock server node not found: " << path;
+        }
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+
+    std::string value;
+    ReadAndWatchNode(path, &value, &zk_errno);
+    if (!is_number_node) {
+        // the node value is the server address; it is stored as-is, not parsed as a number
+        rowlock_proxy_impl_->UpdateServers(id, value);
+        return;
+    }
+
+    uint32_t node_num = 0;
+    if (!StringToNumber(value, &node_num)) {
+        LOG(ERROR) << "read number node fail";
+        return;
+    }
+    rowlock_proxy_impl_->SetServerNumber(node_num);
+}
+
+// Watch callback for the server id list: re-lists the children with ListAndWatchChildren() and
+// pushes every id -> address pair to the proxy. Exits the process if listing keeps failing.
+void RowlockProxyZkAdapter::OnChildrenChanged(const std::string& path,
+                                           const std::vector<std::string>& name_list,
+                                           const std::vector<std::string>& data_list) {
+    // the id-list path under the zk root (this is the ZooKeeper adapter, so not the ins root)
+    std::string id_lock_path = FLAGS_rowlock_zk_root_path + kRowlockNodeIdListPath;
+    int32_t retry_count = 0;
+    int zk_errno = zk::ZE_OK;
+    std::vector<std::string> names;
+    std::vector<std::string> datum;
+    while (!ListAndWatchChildren(id_lock_path, &names, &datum, &zk_errno)) {
+        if (retry_count++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to watch rowlock server list or lack rowlock server";
+            _Exit(EXIT_FAILURE);
+        }
+        LOG(ERROR) << "retry watch rowlock server list in "
+            << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    // iterate over the freshly listed vectors, bounded by their own sizes
+    for (size_t i = 0; i < names.size() && i < datum.size(); i++) {
+        uint32_t id = 0;
+        if (!StringToNumber(names[i], &id)) {
+            LOG(ERROR) << "skip rowlock server node with non-numeric name: " << names[i];
+            continue;
+        }
+        rowlock_proxy_impl_->UpdateServers(id, datum[i]);
+    }
+}
+
+// ins
+
+// Stores the proxy to update and this server's address. The ins client is
+// only created later by Init(), so the pointer starts out as NULL instead of
+// holding an indeterminate value.
+InsRowlockProxyZkAdapter::InsRowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl, 
+                                                 const std::string& server_addr)
+    : rowlock_proxy_impl_(rowlock_proxy_impl),
+      server_addr_(server_addr),
+      ins_sdk_(NULL) {}
+
+// ins watch callback for the server-number key: forwards the watched key and
+// its value to the adapter stored in the watch context.
+static void InsOnNumberChange(const galaxy::ins::sdk::WatchParam& param,
+                            galaxy::ins::sdk::SDKError error) {
+    static_cast<InsRowlockProxyZkAdapter*>(param.context)
+        ->OnValueChange(param.key, param.value);
+}
+
+// ins watch callback for the server id-list key: asks the adapter stored in
+// the watch context to reload the server list.
+static void InsOnServerChange(const galaxy::ins::sdk::WatchParam& param,
+                            galaxy::ins::sdk::SDKError error) {
+    static_cast<InsRowlockProxyZkAdapter*>(param.context)->OnServerChange();
+}
+
+// Connects to ins and primes the proxy with the current rowlock topology.
+//
+// Steps, in order:
+//   1. create the ins client from FLAGS_tera_ins_addr_list;
+//   2. read the server-number key, pass it to SetServerNumber(), watch it;
+//   3. watch the server id-list key (retrying with a pause between attempts);
+//   4. scan every entry under the id-list key and pass (id, value) to
+//      UpdateServers(), where id is the last path component of the key;
+//   5. register this proxy by writing server_addr_ under the proxy path.
+//
+// Returns false when the number key is missing or unparsable, when a watch
+// cannot be set, or when the proxy key cannot be written. An unparsable
+// server id found during the scan terminates the process.
+bool InsRowlockProxyZkAdapter::Init() {
+    std::string root_path = FLAGS_rowlock_ins_root_path;
+    std::string node_num_key = root_path + kRowlockNodeNumPath;
+    std::string proxy_path = root_path + kRowlockProxyPath + "/" + server_addr_;
+    std::string value;
+    galaxy::ins::sdk::SDKError err;
+
+    ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list);
+
+    LOG(INFO) << "init ins success";
+
+    if (!ins_sdk_->Get(node_num_key, &value, &err)) {
+        LOG(ERROR) << "ins rowlock service number node not found: " << node_num_key
+            << "  make sure rowlock ins available";
+        return false;
+    }
+
+    uint32_t node_num;
+    if (!StringToNumber(value, &node_num)) {
+        LOG(ERROR) << "read number node fail";
+        return false;
+    }
+    rowlock_proxy_impl_->SetServerNumber(node_num);
+
+    if (!ins_sdk_->Watch(node_num_key, InsOnNumberChange, this, &err)) {
+        LOG(ERROR) << "try to watch number node ,path=" << node_num_key << " failed,"
+                << ins_sdk_->ErrorToString(err);
+        return false;
+    }
+
+    // read server addr
+    int32_t retry_count = 0;
+    std::string id_lock_path = root_path + kRowlockNodeIdListPath;
+
+    while (!ins_sdk_->Watch(id_lock_path, InsOnServerChange, this, &err)) {
+        LOG(ERROR) << "try to watch server node ,path=" << id_lock_path << " failed,"
+                << ins_sdk_->ErrorToString(err);
+        if (retry_count++ > FLAGS_tera_zk_retry_max_times) {
+            return false;
+        }
+        // Pause between attempts, as the proxy-node loop below does, so a
+        // transient failure does not burn every retry at once.
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+
+    // "/!" .. "/~" brackets the keys directly under the id-list path.
+    galaxy::ins::sdk::ScanResult* result = ins_sdk_->Scan(id_lock_path+"/!",
+                                                           id_lock_path+"/~");
+    while (!result->Done()) {
+        CHECK_EQ(result->Error(), galaxy::ins::sdk::kOK);
+        const std::string server_addr = result->Value();
+        std::string key = result->Key();
+        VLOG(12) << "Key: " << key << " value: " << server_addr;
+
+        // Keep only the last path component, which carries the server id.
+        const std::string::size_type pos = key.find_last_of("/") + 1;
+        key = key.substr(pos);
+        VLOG(12) << "key: " << key;
+
+        uint32_t server_id;
+        if (!StringToNumber(key, &server_id)) {
+            LOG(ERROR) << "read number node fail";
+            _Exit(EXIT_FAILURE);
+        }
+
+        rowlock_proxy_impl_->UpdateServers(server_id, server_addr);
+        result->Next();
+    }
+    delete result;
+
+    // create proxy node
+    retry_count = 0;
+    while (!ins_sdk_->Put(proxy_path, server_addr_, &err)) {
+        if (retry_count++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to create proxy node";
+            return false;
+        }
+        LOG(ERROR) << "retry create rowlock proxy node in "
+                   << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    return true;
+}
+
+// Called from the ins watch callback when the server-number key changes.
+//
+// path  - the watched key, used to register the watch again.
+// value - the new content of the key, expected to be a decimal server count.
+//
+// The watch is registered again before the value is looked at, so a malformed
+// value no longer leaves the key unwatched.
+// NOTE(review): this assumes an ins watch fires once and must be re-armed,
+// which is what re-registering inside the callback suggests — confirm
+// against the ins SDK.
+void InsRowlockProxyZkAdapter::OnValueChange(const std::string& path, const std::string& value) {
+    galaxy::ins::sdk::SDKError err;
+    if (!ins_sdk_->Watch(path, InsOnNumberChange, this, &err)) {
+        LOG(ERROR) << "try to watch number node ,path=" << path << " failed,"
+                << ins_sdk_->ErrorToString(err);
+    }
+
+    uint32_t node_num;
+    if (!StringToNumber(value, &node_num)) {
+        LOG(ERROR) << "read number node fail";
+        return;
+    }
+    rowlock_proxy_impl_->SetServerNumber(node_num);
+}
+
+// Called from the ins watch callback when something under the server id-list
+// key changes.
+//
+// Registers the watch on the id-list key again, then scans every entry below
+// it and passes (id, value) to UpdateServers(), where id is the last path
+// component of the entry's key. The process exits if the watch cannot be
+// registered after the allowed retries or if an id is not numeric.
+void InsRowlockProxyZkAdapter::OnServerChange() {
+    galaxy::ins::sdk::SDKError err;
+    std::string root_path = FLAGS_rowlock_ins_root_path;
+
+    int32_t retry_count = 0;
+    std::string id_lock_path = root_path + kRowlockNodeIdListPath;
+
+    while (!ins_sdk_->Watch(id_lock_path, InsOnServerChange, this, &err)) {
+        LOG(ERROR) << "try to watch server node ,path=" << id_lock_path << " failed,"
+                << ins_sdk_->ErrorToString(err);
+        if (retry_count++ > FLAGS_tera_zk_retry_max_times) {
+            _Exit(EXIT_FAILURE);
+        }
+        // Pause between attempts so a transient ins failure does not use up
+        // every retry at once and kill the process.
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+
+    // "/!" .. "/~" brackets the keys directly under the id-list path.
+    galaxy::ins::sdk::ScanResult* result = ins_sdk_->Scan(id_lock_path+"/!",
+                                                           id_lock_path+"/~");
+    while (!result->Done()) {
+        CHECK_EQ(result->Error(), galaxy::ins::sdk::kOK);
+        const std::string server_addr = result->Value();
+        std::string key = result->Key();
+
+        // Keep only the last path component, which carries the server id.
+        const std::string::size_type pos = key.find_last_of("/") + 1;
+        key = key.substr(pos);
+        VLOG(12) << "key: " << key;
+
+        uint32_t server_id;
+        if (!StringToNumber(key, &server_id)) {
+            LOG(ERROR) << "read number node fail";
+            _Exit(EXIT_FAILURE);
+        }
+
+        rowlock_proxy_impl_->UpdateServers(server_id, server_addr);
+        result->Next();
+    }
+    delete result;
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.h b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.h
new file mode 100644
index 000000000..02125135c
--- /dev/null
+++ b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ZK_ADAPTER_H_
+#define TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ZK_ADAPTER_H_
+
+#include "zk/zk_adapter.h"
+
+namespace galaxy {
+namespace ins {
+namespace sdk {
+    class InsSDK;
+} // namespace sdk
+} // namespace ins
+} // namespace galaxy
+
+namespace tera {
+namespace observer {
+
+class RowlockProxyImpl;
+
+// Common interface of the rowlock proxy's coordination adapters. Both
+// RowlockProxyZkAdapter and InsRowlockProxyZkAdapter derive from it, so a
+// caller can hold either one through this type and call Init().
+class RowlockProxyZkAdapterBase : public zk::ZooKeeperAdapter {
+public:
+    virtual ~RowlockProxyZkAdapterBase() {}
+    // Performs the adapter's start-up work; returns false on failure.
+    virtual bool Init() = 0;
+};
+
+// Adapter that keeps a RowlockProxyImpl in sync through the
+// zk::ZooKeeperAdapter callbacks declared below.
+class RowlockProxyZkAdapter : public RowlockProxyZkAdapterBase {
+public:
+    // rowlock_proxy_impl - proxy to update; stored as a raw pointer and not
+    //                      deleted by this class.
+    // server_addr        - address of this proxy server.
+    RowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl, const std::string& server_addr);
+    virtual ~RowlockProxyZkAdapter() {}
+    virtual bool Init();
+
+protected:
+    // Watch/session callbacks; all are implemented out of line.
+	  virtual void OnNodeValueChanged(const std::string& path,
+                                    const std::string& value);
+    virtual void OnWatchFailed(const std::string& path, int watch_type,
+                               int err);
+    virtual void OnNodeDeleted(const std::string& path);
+    virtual void OnSessionTimeout();
+    virtual void OnNodeCreated(const std::string& path);
+    virtual void OnChildrenChanged(const std::string& path,
+                                   const std::vector<std::string>& name_list,
+                                   const std::vector<std::string>& data_list);
+
+private:
+    RowlockProxyImpl* rowlock_proxy_impl_;  // not owned
+    std::string server_addr_;
+
+};
+
+// Adapter that keeps a RowlockProxyImpl in sync through an ins client instead
+// of ZooKeeper. It still derives from the ZooKeeper adapter base, so the
+// inherited callbacks are overridden with empty bodies; updates arrive
+// through OnValueChange() and OnServerChange() instead.
+class InsRowlockProxyZkAdapter : public RowlockProxyZkAdapterBase {
+public:
+    // rowlock_proxy_impl - proxy to update; stored as a raw pointer and not
+    //                      deleted by this class.
+    // server_addr        - address of this proxy server.
+    InsRowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl, const std::string& server_addr);
+    // NOTE(review): ins_sdk_ is allocated with new in Init() and this empty
+    // destructor never deletes it. Releasing it has to happen where InsSDK is
+    // a complete type (it is only forward-declared in this header).
+    virtual ~InsRowlockProxyZkAdapter() {}
+    virtual bool Init();
+
+    // Invoked by the ins watch callbacks.
+    void OnValueChange(const std::string& path, const std::string& value);
+    void OnServerChange();
+
+protected:
+    // ZooKeeper callbacks: intentionally no-ops for the ins-backed adapter.
+    virtual void OnNodeValueChanged(const std::string& path,
+                                    const std::string& value) {}
+    virtual void OnWatchFailed(const std::string& path, int watch_type,
+                               int err) {}
+    virtual void OnNodeDeleted(const std::string& path) {}
+    virtual void OnSessionTimeout() {}
+    virtual void OnNodeCreated(const std::string& path) {}
+    virtual void OnChildrenChanged(const std::string& path,
+                                   const std::vector<std::string>& name_list,
+                                   const std::vector<std::string>& data_list) {}
+
+private:
+    RowlockProxyImpl* rowlock_proxy_impl_;  // not owned
+    std::string server_addr_;
+    galaxy::ins::sdk::InsSDK* ins_sdk_;     // created in Init()
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif  // TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ZK_ADAPTER_H_
\ No newline at end of file
diff --git a/src/observer/test/observer_test.cc b/src/observer/test/observer_test.cc
new file mode 100644
index 000000000..299ec4581
--- /dev/null
+++ b/src/observer/test/observer_test.cc
@@ -0,0 +1,587 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <atomic>
+#include <memory>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "common/thread_pool.h"
+#include "observer/executor/observer.h"
+#include "observer/executor/random_key_selector.h"
+#include "observer/executor/scanner.h"
+#include "observer/executor/scanner_impl.h"
+#include "observer/executor/notification_impl.h"
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+#include "tera.h"
+#include "types.h"
+
+DECLARE_bool(tera_gtxn_test_opened);
+DECLARE_int64(start_ts);
+DECLARE_int64(begin_commit_ts);
+DECLARE_int64(begin_prewrite_ts);
+DECLARE_int64(end_prewrite_ts);
+DECLARE_int64(commit_ts);
+DECLARE_string(flagfile);
+DECLARE_string(tera_coord_type);
+DECLARE_bool(tera_sdk_client_for_gtxn);
+DECLARE_bool(mock_rowlock_enable);
+
+namespace tera {
+namespace observer {
+
+// Observer used by OnNotifyTest. It records the fields of the most recent
+// notification so the test can compare them, then acknowledges the
+// notification.
+class TestWorker : public Observer {
+public:
+    TestWorker(): counter_(0), notified_(false) {}
+    virtual ~TestWorker() {}
+
+    // Logs and stores the notified cell, sets notified_, increments counter_,
+    // then opens the table and acks the notification.
+    //
+    // Returns the error reported by OpenTable(). If the table cannot be
+    // opened the notification is not acked, because Ack() would otherwise
+    // receive an invalid table pointer.
+    virtual ErrorCode  OnNotify(tera::Transaction* t,
+                                tera::Client* client,
+                                const std::string& table_name,
+                                const std::string& family,
+                                const std::string& qualifier,
+                                const std::string& row,
+                                const std::string& value,
+                                int64_t timestamp,
+                                Notification* notification) {
+        LOG(INFO) << "[Notify DemoObserver] table:family:qualifer=" <<
+            table_name << ":" << family << ":" <<
+            qualifier << " row=" << row <<
+            " value=" << value << " timestamp=" << timestamp;
+
+        table_name_ = table_name;
+        family_ = family;
+        qualifier_ = qualifier;
+        row_ = row;
+        value_ = value;
+
+        // Publish the flag only after the fields above are written. It is set
+        // before the table is opened so the waiting test is released even
+        // when the open fails.
+        tera::ErrorCode err;
+        notified_ = true;
+        ++counter_;
+
+        // NOTE(review): the opened table is never released here; whether it
+        // may be deleted right after Ack() depends on Notification — confirm.
+        tera::Table* table = client->OpenTable(table_name, &err);
+        if (table == NULL || err.GetType() != tera::ErrorCode::kOK) {
+            LOG(ERROR) << "open table failed: " << table_name;
+            return err;
+        }
+        notification->Ack(table, row, family, qualifier);
+
+        return err;
+    }
+
+    virtual std::string GetObserverName() const {
+        return "DemoObserver";
+    }
+
+    virtual TransactionType GetTransactionType() const {
+        return kGlobalTransaction;
+    }
+private:
+    std::atomic<int> counter_;     // number of OnNotify calls
+    std::atomic<bool> notified_;   // true once OnNotify has run
+
+    // Fields of the most recent notification.
+    std::string table_name_;
+    std::string family_;
+    std::string qualifier_;
+    std::string row_;
+    std::string value_;
+};
+
+// Observer used by SingleRowTransactionTest. It records the fields of the
+// most recent notification, writes a derived cell through the supplied
+// transaction, acks the notification and commits.
+class TestWorkerGTX : public Observer {
+public:
+    TestWorkerGTX(): counter_(0), notified_(false) {}
+    virtual ~TestWorkerGTX() {}
+
+    // Logs and stores the notified cell, sets notified_, increments counter_,
+    // then, inside transaction t, puts (family, qualifier + "_test") =
+    // row + "_" on the same row, acks the notification and commits.
+    //
+    // Returns the OpenTable() error when the table cannot be opened,
+    // otherwise the transaction's error after the commit.
+    virtual ErrorCode  OnNotify(tera::Transaction* t,
+                                tera::Client* client,
+                                const std::string& table_name,
+                                const std::string& family,
+                                const std::string& qualifier,
+                                const std::string& row,
+                                const std::string& value,
+                                int64_t timestamp,
+                                Notification* notification) {
+        LOG(INFO) << "[Notify TestWorkerGTX] table:family:qualifer=" <<
+            table_name << ":" << family << ":" <<
+            qualifier << " row=" << row <<
+            " value=" << value << " timestamp=" << timestamp;
+
+        table_name_ = table_name;
+        family_ = family;
+        qualifier_ = qualifier;
+        row_ = row;
+        value_ = value;
+
+        // Publish the flag only after the fields above are written. It is set
+        // before the table is opened so the waiting test is released even
+        // when the open fails.
+        tera::ErrorCode err;
+        notified_ = true;
+        ++counter_;
+
+        tera::Table* table = client->OpenTable(table_name, &err);
+        if (table == NULL || err.GetType() != tera::ErrorCode::kOK) {
+            LOG(ERROR) << "open table failed: " << table_name;
+            return err;
+        }
+
+        // write ForwordIndex column
+        std::unique_ptr<tera::RowMutation> mutation(table->NewRowMutation(row));
+        mutation->Put(family, qualifier + "_test", row + "_");
+        t->ApplyMutation(mutation.get());
+
+        t->Ack(table, row, family, qualifier);
+        table->CommitRowTransaction(t);
+
+        // Report the commit outcome instead of an untouched, always-OK code.
+        return t->GetError();
+    }
+
+    virtual std::string GetObserverName() const {
+        return "DemoObserver";
+    }
+
+    virtual TransactionType GetTransactionType() const {
+        return kSingleRowTransaction;
+    }
+private:
+    std::atomic<int> counter_;     // number of OnNotify calls
+    std::atomic<bool> notified_;   // true once OnNotify has run
+
+    // Fields of the most recent notification.
+    std::string table_name_;
+    std::string family_;
+    std::string qualifier_;
+    std::string row_;
+    std::string value_;
+};
+
+// Observer that only logs the notification it receives and reports success.
+class DemoObserver : public tera::observer::Observer {
+public:
+    DemoObserver() {}
+    virtual ~DemoObserver() {}
+
+    // Logs the notified cell; touches neither the transaction nor the table.
+    virtual ErrorCode OnNotify(tera::Transaction* t,
+                              tera::Client* client,
+                              const std::string& table_name,
+                              const std::string& family,
+                              const std::string& qualifier,
+                              const std::string& row,
+                              const std::string& value,
+                              int64_t timestamp,
+                              Notification* notification) {
+        LOG(INFO) << "[Notify ParseObserver] table:family:qualifer="
+                  << table_name << ":" << family << ":" << qualifier
+                  << " row=" << row
+                  << " value=" << value
+                  << " timestamp=" << timestamp;
+
+        // A default-constructed code is returned untouched.
+        return tera::ErrorCode();
+    }
+
+    virtual std::string GetObserverName() const { return "DemoObserver"; }
+
+    virtual TransactionType GetTransactionType() const { return kGlobalTransaction; }
+};
+
+// Observer used by the non-transactional test: it records the fields of the
+// most recent notification and does nothing else.
+class TestWorkerNTX : public Observer {
+public:
+    TestWorkerNTX(): counter_(0), notified_(false) {}
+    virtual ~TestWorkerNTX() {}
+
+    // Logs and stores the notified cell, then sets notified_ and increments
+    // counter_. Always returns a default-constructed ErrorCode.
+    virtual ErrorCode  OnNotify(tera::Transaction* t,
+                                tera::Client* client,
+                                const std::string& table_name,
+                                const std::string& family,
+                                const std::string& qualifier,
+                                const std::string& row,
+                                const std::string& value,
+                                int64_t timestamp,
+                                Notification* notification) {
+        LOG(INFO) << "[Notify TestWorkerNTX] table:family:qualifer="
+                  << table_name << ":" << family << ":" << qualifier
+                  << " row=" << row
+                  << " value=" << value
+                  << " timestamp=" << timestamp;
+
+        // Capture the notification; the flag is raised only afterwards.
+        value_ = value;
+        row_ = row;
+        qualifier_ = qualifier;
+        family_ = family;
+        table_name_ = table_name;
+
+        notified_ = true;
+        ++counter_;
+
+        // do something without transaction
+
+        return tera::ErrorCode();
+    }
+
+    virtual std::string GetObserverName() const { return "DemoObserver"; }
+
+    virtual TransactionType GetTransactionType() const { return kNoneTransaction; }
+
+private:
+    std::atomic<int> counter_;     // number of OnNotify calls
+    std::atomic<bool> notified_;   // true once OnNotify has run
+
+    // Fields of the most recent notification.
+    std::string table_name_;
+    std::string family_;
+    std::string qualifier_;
+    std::string row_;
+    std::string value_;
+};
+
+class ObserverImplTest : public ::testing::Test {
+public:
+    // End-to-end check that a started ScannerImpl delivers a notification for
+    // observer_test_table / cf:Page to a registered TestWorker.
+    //
+    // Flow: create the client and the table, write the "_N_" cell in a row
+    // transaction, write cf:Page in a global transaction, register a
+    // TestWorker and a DemoObserver on the same column, start the scanner,
+    // wait for TestWorker::notified_, then compare the recorded fields.
+    //
+    // NOTE(review): the wait loop has no timeout, and client, table and the
+    // observers are never deleted here (the early-return paths also skip
+    // scanner->Exit()); observer ownership by the scanner is unconfirmed.
+    void OnNotifyTest() {
+        tera::ErrorCode err;
+        tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err);
+        // for ut test
+        EXPECT_EQ(tera::ErrorCode::kOK, err.GetType());
+        // for no core
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "new client failed";
+            return;
+        }
+
+        // create table
+        tera::TableDescriptor table_desc("observer_test_table");
+        table_desc.EnableTxn();
+
+        table_desc.AddLocalityGroup("lg1");
+        tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1");
+        cf1->EnableGlobalTransaction();
+        cf1->EnableNotify();
+        ExtendNotifyLgToDescriptor(&table_desc);
+
+        // A create failure is only logged; the OpenTable check below decides
+        // whether the test continues.
+        client->CreateTable(table_desc, &err);
+        if (err.GetType() != tera::ErrorCode::kOK) {
+        	LOG(ERROR) << "Create table fail";
+        }
+
+        tera::Table* table = client->OpenTable("observer_test_table", &err);
+        EXPECT_EQ(tera::ErrorCode::kOK, err.GetType());
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "open table failed"; 
+            return;
+        }
+
+		std::unique_ptr<tera::Transaction> t(table->StartRowTransaction("www.baidu.com"));
+
+        // presumably "_N_" / "cf:Page" is the notify marker for column cf:Page
+        // — TODO confirm against the notify column-family convention
+        assert(t != NULL);
+        std::unique_ptr<tera::RowMutation> mu0(table->NewRowMutation("www.baidu.com"));
+        mu0->Put("_N_", "cf:Page", "I am not important");
+        t->ApplyMutation(mu0.get());
+        t->Commit();
+
+        // The observed cell itself is written through a global transaction.
+        std::unique_ptr<tera::Transaction> g_txn(client->NewGlobalTransaction());
+        assert(g_txn != NULL);
+        std::unique_ptr<tera::RowMutation> mu1(table->NewRowMutation("www.baidu.com"));
+
+        mu1->Put("cf", "Page", "hello world", -1);
+        g_txn->ApplyMutation(mu1.get());
+        g_txn->Commit();
+
+	    if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) {
+	        std::cout << g_txn->GetError().ToString() << std::endl; 
+	    } else {
+	        std::cout << "commit success" << std::endl;
+	    }
+
+        // varibles for fake timeoracle
+        FLAGS_start_ts = 10;
+        FLAGS_begin_commit_ts = 1;
+        FLAGS_begin_prewrite_ts = 1;
+        FLAGS_end_prewrite_ts = 1;
+        FLAGS_commit_ts = 13;
+
+        Observer* observer = new TestWorker();
+        Observer* demo = new DemoObserver();
+        
+        Scanner* scanner = new ScannerImpl();
+        bool ret = scanner->Init();
+        EXPECT_EQ(true, ret);
+	    if(!ret) {
+			LOG(ERROR) << "fail to init scanner_impl";
+	        return;
+		}
+
+        // Two different observers on the same column are both expected to
+        // register successfully.
+        err = scanner->Observe("observer_test_table", "cf", "Page", observer);
+        EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK);
+
+        err = scanner->Observe("observer_test_table", "cf", "Page", demo);
+        EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK);
+
+		if(!scanner->Start()) {
+			LOG(ERROR) << "fail to start scanner_impl";
+	        return;
+		}
+
+        // Poll once per second until the worker has been notified.
+        while (!static_cast<TestWorker*>(observer)->notified_) {
+            sleep(1);
+        }
+
+
+        EXPECT_EQ("www.baidu.com", static_cast<TestWorker*>(observer)->row_);
+        EXPECT_EQ("observer_test_table", static_cast<TestWorker*>(observer)->table_name_);
+        EXPECT_EQ("cf", static_cast<TestWorker*>(observer)->family_);
+        EXPECT_EQ("Page", static_cast<TestWorker*>(observer)->qualifier_);
+        EXPECT_EQ("hello world", static_cast<TestWorker*>(observer)->value_);
+
+        scanner->Exit();
+        delete scanner;
+    }
+
+    // End-to-end check that a started ScannerImpl delivers a notification for
+    // observer_table_gtx / cf:Page to a TestWorkerGTX, whose transaction type
+    // is kSingleRowTransaction.
+    //
+    // Flow: create the client and the table (cf has notify enabled but not
+    // global transactions), write the "_N_" cell and cf:Page in one row
+    // transaction, register the worker, start the scanner, wait for
+    // notified_, then compare the recorded fields.
+    //
+    // NOTE(review): the wait loop has no timeout, and client, table and the
+    // observer are never deleted here.
+    void SingleRowTransactionTest() {
+        tera::ErrorCode err;
+        tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err);
+        // for ut test
+        EXPECT_EQ(tera::ErrorCode::kOK, err.GetType());
+        // for no core
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "new client failed";
+            return;
+        }
+
+        // create table
+        tera::TableDescriptor table_desc("observer_table_gtx");
+        table_desc.EnableTxn();
+
+        table_desc.AddLocalityGroup("lg1");
+        tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1");
+        cf1->EnableNotify();
+        ExtendNotifyLgToDescriptor(&table_desc);
+
+        // A create failure is only logged; the OpenTable check below decides
+        // whether the test continues.
+        client->CreateTable(table_desc, &err);
+        if (err.GetType() != tera::ErrorCode::kOK) {
+            LOG(ERROR) << "Create table fail";
+        }
+
+        tera::Table* table = client->OpenTable("observer_table_gtx", &err);
+        EXPECT_EQ(tera::ErrorCode::kOK, err.GetType());
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "open table failed"; 
+            return;
+        }
+
+        std::unique_ptr<tera::Transaction> t(table->StartRowTransaction("www.baidu.com"));
+
+        // Both the "_N_" cell and the observed cell go into the same row
+        // transaction.
+        assert(t != NULL);
+        std::unique_ptr<tera::RowMutation> mu0(table->NewRowMutation("www.baidu.com"));
+        mu0->Put("_N_", "cf:Page", "I am not important");
+        mu0->Put("cf", "Page", "hello world", -1);
+        t->ApplyMutation(mu0.get());
+        t->Commit();
+
+        if (t->GetError().GetType() != tera::ErrorCode::kOK) {
+            std::cout << t->GetError().ToString() << std::endl; 
+        } else {
+            std::cout << "commit success" << std::endl;
+        }
+
+        Observer* observer = new TestWorkerGTX();
+        
+        Scanner* scanner = new ScannerImpl();
+        bool ret = scanner->Init();
+
+        EXPECT_EQ(true, ret);
+        if(!ret) {
+            LOG(ERROR) << "fail to init scanner_impl";
+            return;
+        }
+
+        err = scanner->Observe("observer_table_gtx", "cf", "Page", observer);
+        EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK);
+
+        if(!scanner->Start()) {
+            LOG(ERROR) << "fail to start scanner_impl";
+            return;
+        }
+
+        // Poll once per second until the worker has been notified.
+        while (!static_cast<TestWorkerGTX*>(observer)->notified_) {
+            sleep(1);
+        }
+
+        EXPECT_EQ("www.baidu.com", static_cast<TestWorkerGTX*>(observer)->row_);
+        EXPECT_EQ("observer_table_gtx", static_cast<TestWorkerGTX*>(observer)->table_name_);
+        EXPECT_EQ("cf", static_cast<TestWorkerGTX*>(observer)->family_);
+        EXPECT_EQ("Page", static_cast<TestWorkerGTX*>(observer)->qualifier_);
+        EXPECT_EQ("hello world", static_cast<TestWorkerGTX*>(observer)->value_);
+        scanner->Exit();
+        delete scanner;
+    }
+
+    // End-to-end check that a started ScannerImpl delivers a notification for
+    // observer_table_ntx / cf:Page to a TestWorkerNTX, whose transaction type
+    // is kNoneTransaction.
+    //
+    // Flow: create the client and the table (no transactions enabled), write
+    // the "_N_" cell and cf:Page with plain Put calls, register the worker,
+    // start the scanner, wait for notified_, then compare the recorded
+    // fields.
+    //
+    // The worker is kept as a TestWorkerNTX* so its fields are read through
+    // the object's real type rather than through a cast to the unrelated
+    // TestWorkerGTX.
+    //
+    // NOTE(review): the wait loop has no timeout, and client, table and the
+    // worker are never deleted here.
+    void NonTransactionTest() {
+        tera::ErrorCode err;
+        tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err);
+        // for ut test
+        EXPECT_EQ(tera::ErrorCode::kOK, err.GetType());
+        // for no core
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "new client failed";
+            return;
+        }
+
+        // create table
+        tera::TableDescriptor table_desc("observer_table_ntx");
+
+        table_desc.AddLocalityGroup("lg1");
+        tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1");
+        cf1->EnableNotify();
+        ExtendNotifyLgToDescriptor(&table_desc);
+
+        // A create failure is only logged; the OpenTable check below decides
+        // whether the test continues.
+        client->CreateTable(table_desc, &err);
+        if (err.GetType() != tera::ErrorCode::kOK) {
+            LOG(ERROR) << "Create table fail";
+        }
+
+        tera::Table* table = client->OpenTable("observer_table_ntx", &err);
+        EXPECT_EQ(tera::ErrorCode::kOK, err.GetType());
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "open table failed";
+            return;
+        }
+
+        table->Put("www.baidu.com", "_N_", "cf:Page", "I am not important", &err);
+        table->Put("www.baidu.com", "cf", "Page", "hello world", -1, &err);
+
+        TestWorkerNTX* worker = new TestWorkerNTX();
+
+        Scanner* scanner = new ScannerImpl();
+        bool ret = scanner->Init();
+
+        EXPECT_EQ(true, ret);
+        if(!ret) {
+            LOG(ERROR) << "fail to init scanner_impl";
+            return;
+        }
+
+        err = scanner->Observe("observer_table_ntx", "cf", "Page", worker);
+        EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK);
+
+        if(!scanner->Start()) {
+            LOG(ERROR) << "fail to start scanner_impl";
+            return;
+        }
+
+        // Poll once per second until the worker has been notified.
+        while (!worker->notified_) {
+            sleep(1);
+        }
+
+        EXPECT_EQ("www.baidu.com", worker->row_);
+        EXPECT_EQ("observer_table_ntx", worker->table_name_);
+        EXPECT_EQ("cf", worker->family_);
+        EXPECT_EQ("Page", worker->qualifier_);
+        EXPECT_EQ("hello world", worker->value_);
+        scanner->Exit();
+        delete scanner;
+    }
+
+    // Exercises ScannerImpl::Observe() bookkeeping without calling Init() or
+    // Start(): the scanner's key_selector_ and tera_client_ members are set
+    // directly by the test.
+    //
+    // Checks, in order:
+    //   - Observe() fails while tera_client_ has not been set;
+    //   - after the client is set, the first Observe() on (cf, qualifier)
+    //     succeeds, a repeat of the same call fails, and a different column
+    //     family succeeds;
+    //   - ten Observe() calls issued through a thread pool with distinct
+    //     qualifiers leave exactly one observer and 12 observed columns.
+    void ObserveTest() {
+        tera::ErrorCode err;
+        tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err);
+        // for ut test
+        EXPECT_EQ(tera::ErrorCode::kOK, err.GetType());
+        // for no core
+        if (tera::ErrorCode::kOK != err.GetType()) {
+            LOG(ERROR) << "new client failed";
+            return;
+        }
+
+        // create table
+        tera::TableDescriptor table_desc("observer_table");
+        table_desc.EnableTxn();
+        table_desc.AddLocalityGroup("notify");
+        tera::ColumnFamilyDescriptor* cf_t = table_desc.AddColumnFamily(kNotifyColumnFamily, "notify");
+        cf_t->EnableGlobalTransaction();
+
+        table_desc.AddLocalityGroup("lg1");
+        tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1");
+        cf1->EnableGlobalTransaction();
+        cf1->EnableNotify();
+        tera::ColumnFamilyDescriptor* cf2 = table_desc.AddColumnFamily("cf_1", "lg1");
+        cf2->EnableGlobalTransaction();
+        cf2->EnableNotify();
+
+        ExtendNotifyLgToDescriptor(&table_desc);
+
+        // A create failure is only logged; the test carries on regardless.
+        client->CreateTable(table_desc, &err);
+        if (err.GetType() != tera::ErrorCode::kOK) {
+            LOG(ERROR) << "Create table fail";
+        }
+
+        FLAGS_tera_sdk_client_for_gtxn = true;
+        FLAGS_tera_coord_type = "ins";
+        common::ThreadPool thread_pool(5);
+        ScannerImpl* scanner = new ScannerImpl();
+        Observer* observer =  new DemoObserver();
+        scanner->key_selector_.reset(new RandomKeySelector());
+
+        // single thread
+
+        // tera_client_ has not been assigned yet, so this call must fail.
+        err = scanner->Observe("observer_table", "cf", "qualifier", observer);
+        EXPECT_TRUE(err.GetType() != tera::ErrorCode::kOK);
+
+        scanner->tera_client_ = tera::Client::NewClient(FLAGS_flagfile, &err);
+        EXPECT_EQ(scanner->table_observe_info_->size(), 0);
+
+        err = scanner->Observe("observer_table", "cf", "qualifier", observer);
+        EXPECT_TRUE(err.GetType() == tera::ErrorCode::kOK);
+
+        // Same observer on the same column again: expected to be rejected.
+        err = scanner->Observe("observer_table", "cf", "qualifier", observer);
+        EXPECT_FALSE(err.GetType() == tera::ErrorCode::kOK);
+
+        err = scanner->Observe("observer_table", "cf_1", "qualifier", observer);
+        EXPECT_TRUE(err.GetType() == tera::ErrorCode::kOK);  
+
+        // multi thread
+        std::string qualifier;
+
+        // Each task gets its own qualifier: "a", "aa", ... (std::bind copies
+        // the string at the time of the call).
+        for (uint32_t i = 0; i < 10; ++i) {
+            qualifier += 'a';
+            thread_pool.AddTask(std::bind(&ScannerImpl::Observe, scanner, "observer_table", "cf", qualifier, observer));
+        }
+        // The counts are checked only after Stop(true) has returned.
+        thread_pool.Stop(true);
+        EXPECT_EQ(1, scanner->observers_.size());
+        // 2 columns from the single-thread part + 10 from the pool.
+        EXPECT_EQ(10 + 2, (*(scanner->table_observe_info_))["observer_table"].observe_columns.size());   
+        scanner->Exit();
+        delete scanner;
+    }
+};
+
+// Turns on the gtxn test hooks, the "ins" coordinator and the mock rowlock,
+// then runs the notification scenario.
+TEST_F(ObserverImplTest, OnNotifyTest) {
+    FLAGS_mock_rowlock_enable = true;
+    FLAGS_tera_coord_type = "ins";
+    FLAGS_tera_gtxn_test_opened = true;
+
+    OnNotifyTest();
+}
+
+// Turns on the gtxn test hooks, the "ins" coordinator and the mock rowlock,
+// then runs the single-row-transaction scenario.
+TEST_F(ObserverImplTest, SingleRowTransactionTest) {
+    FLAGS_mock_rowlock_enable = true;
+    FLAGS_tera_coord_type = "ins";
+    FLAGS_tera_gtxn_test_opened = true;
+
+    SingleRowTransactionTest();
+}
+
+// Turns on the gtxn test hooks, the "ins" coordinator and the mock rowlock,
+// then runs the non-transactional scenario.
+TEST_F(ObserverImplTest, NoneTransactionTest) {
+    FLAGS_mock_rowlock_enable = true;
+    FLAGS_tera_coord_type = "ins";
+    FLAGS_tera_gtxn_test_opened = true;
+
+    NonTransactionTest();
+}
+
+// Turns on the gtxn test hooks, the "ins" coordinator and the mock rowlock,
+// then runs the Observe() bookkeeping scenario.
+TEST_F(ObserverImplTest, ObserveTest) {
+    FLAGS_mock_rowlock_enable = true;
+    FLAGS_tera_coord_type = "ins";
+    FLAGS_tera_gtxn_test_opened = true;
+
+    ObserveTest();
+}
+
+} // namespace observer
+} // namespace tera
+
+// Test entry point: presets the gtxn SDK-client flag, parses gflags, hands
+// the remaining arguments to googletest and runs every registered test.
+int main(int argc, char** argv) {
+    // Assigned before flag parsing, so it acts as the default for this binary.
+    FLAGS_tera_sdk_client_for_gtxn = true;
+
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    ::testing::InitGoogleTest(&argc, argv);
+
+    const int exit_code = RUN_ALL_TESTS();
+    return exit_code;
+}
+
diff --git a/src/observer/test/rowlock_proxy_test.cc b/src/observer/test/rowlock_proxy_test.cc
new file mode 100644
index 000000000..3b690686b
--- /dev/null
+++ b/src/observer/test/rowlock_proxy_test.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <atomic>
+#include <string>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <sofa/pbrpc/pbrpc.h>
+
+#include "observer/rowlockproxy/remote_rowlock_proxy.h"
+#include "observer/rowlockproxy/rowlock_proxy_impl.h"
+#include "proto/rpc_client.h"
+#include "sdk/rowlock_client.h"
+#include "utils/utils_cmd.h"
+
+class TestClosure : public google::protobuf::Closure {  // no-op completion callback used by the proxy tests
+public:
+	TestClosure() {}
+	virtual void Run() {}  // intentionally empty; note that it does not delete itself
+};
+
+namespace tera {
+namespace observer {
+
+class TestClient : public RowlockStub {  // fake stub: both overrides answer kLockSucc locally
+public:
+	TestClient() : RowlockStub("127.0.0.1:22222") {}
+	~TestClient() {}
+	// Always marks the response kLockSucc and returns true; `done` is never invoked.
+	virtual bool TryLock(const RowlockRequest* request,
+			RowlockResponse* response,
+			std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL) {
+		response->set_lock_status(kLockSucc);
+		return true;
+	}
+	// Same canned answer as TryLock; `done` is never invoked.
+	virtual bool UnLock(const RowlockRequest* request,
+			RowlockResponse* response,
+			std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL) {
+		response->set_lock_status(kLockSucc);
+		return true;
+	}
+};
+
+TEST(RowlockProxyTest, ValueTest) {  // checks server-number bookkeeping, client creation and row-key scheduling
+	RowlockProxyImpl rowlock_proxy_impl;
+
+	rowlock_proxy_impl.SetServerNumber(100);
+	EXPECT_EQ(100, rowlock_proxy_impl.server_number_);
+	EXPECT_EQ(100, rowlock_proxy_impl.GetServerNumber());
+
+	rowlock_proxy_impl.SetServerNumber(1000);
+	EXPECT_EQ(1000, rowlock_proxy_impl.server_number_);
+	EXPECT_EQ(1000, rowlock_proxy_impl.GetServerNumber());
+
+	rowlock_proxy_impl.SetServerNumber(2);
+	EXPECT_EQ(1000u, rowlock_proxy_impl.server_addrs_->size());  // the address table is expected to keep its earlier size
+	EXPECT_EQ(0u, rowlock_proxy_impl.clients_->size());
+	rowlock_proxy_impl.UpdateServers(0, "0.0.0.0:9999");
+
+	EXPECT_EQ(1u, rowlock_proxy_impl.clients_->size());
+	rowlock_proxy_impl.UpdateServers(0, "0.0.1.1:9999");
+
+	EXPECT_EQ(2u, rowlock_proxy_impl.clients_->size());  // re-pointing slot 0 adds a client; the previous one is expected to remain
+
+	EXPECT_EQ(std::hash<std::string>()("tablerow"),
+		rowlock_proxy_impl.GetRowKey("table", "row"));
+
+	EXPECT_EQ((*rowlock_proxy_impl.server_addrs_)[0], rowlock_proxy_impl.ScheduleRowKey(0));
+	EXPECT_EQ((*rowlock_proxy_impl.server_addrs_)[1], rowlock_proxy_impl.ScheduleRowKey(1));
+}
+
+TEST(RowlockProxyTest, LockTest) {  // swaps in a TestClient and checks that TryLock/UnLock report kLockSucc
+	RowlockProxyImpl rowlock_proxy_impl;
+
+	rowlock_proxy_impl.SetServerNumber(1);
+	rowlock_proxy_impl.UpdateServers(0, "0.0.0.0:9999");
+	EXPECT_EQ(1, rowlock_proxy_impl.server_addrs_->size());
+	EXPECT_EQ(1, rowlock_proxy_impl.clients_->size());
+
+	EXPECT_TRUE(rowlock_proxy_impl.clients_->find("0.0.0.0:9999") != 
+		rowlock_proxy_impl.clients_->end());
+	delete (*rowlock_proxy_impl.clients_)["0.0.0.0:9999"];  // drop the client UpdateServers created for this address
+	(*rowlock_proxy_impl.clients_)["0.0.0.0:9999"] = new TestClient();  // presumably owned by the proxy from here on — confirm
+
+	RowlockRequest request;
+	RowlockResponse response;
+	request.set_table_name("table");
+	request.set_row("row");
+
+	google::protobuf::Closure* closure = new TestClosure();  // NOTE(review): never deleted here and Run() does not self-delete — confirm ownership
+
+	rowlock_proxy_impl.TryLock(&request, &response, closure);
+	EXPECT_EQ(response.lock_status(), kLockSucc);
+
+	google::protobuf::Closure* unlock_closure = new TestClosure();  // NOTE(review): same ownership question as above
+	rowlock_proxy_impl.UnLock(&request, &response, unlock_closure);
+	EXPECT_EQ(response.lock_status(), kLockSucc);
+}
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/observer/test/rowlock_test.cc b/src/observer/test/rowlock_test.cc
new file mode 100644
index 000000000..611cf195c
--- /dev/null
+++ b/src/observer/test/rowlock_test.cc
@@ -0,0 +1,184 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. 
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "observer/rowlocknode/rowlock_db.h"
+#include "common/counter.h"
+
+DECLARE_int32(rowlock_timing_wheel_patch_num);
+
+namespace tera {
+namespace observer {
+
+class LockTest {
+public:
+    // Tries ten times to lock key 1 on `db`; bumps `succeed` for every attempt that wins.
+    void Lock(tera::observer::ShardedRowlockDB* db, Counter* succeed) {
+        const uint64_t shared_key = 1;
+        for (uint32_t attempts_left = 10; attempts_left > 0; --attempts_left) {
+            if (db->TryLock(shared_key)) {
+                succeed->Inc();
+            }
+        }
+    }
+};
+
+TEST(ShardedRowlockDB, LockTest) {
+    ShardedRowlockDB db;
+
+    // a freshly constructed db holds no locks
+    EXPECT_EQ(0, db.Size());
+
+    // distinct keys can each be locked once
+    EXPECT_TRUE(db.TryLock(0));
+    EXPECT_TRUE(db.TryLock(1));
+    EXPECT_TRUE(db.TryLock(2));
+
+    // locking a key that is already held fails
+    EXPECT_FALSE(db.TryLock(0));
+    EXPECT_FALSE(db.TryLock(1));
+    EXPECT_FALSE(db.TryLock(2));
+
+    // after an unlock the key can be locked again
+    db.UnLock(0);
+    EXPECT_TRUE(db.TryLock(0));
+
+    // unlocking key 0 must not have released the other held keys
+    EXPECT_FALSE(db.TryLock(1));
+    EXPECT_FALSE(db.TryLock(2));
+
+    // unlocking the same key twice in a row is harmless
+    db.UnLock(0);
+    db.UnLock(0);
+    EXPECT_TRUE(db.TryLock(0));
+
+    // Size() drops by one per effective unlock; a repeated unlock changes nothing
+    EXPECT_EQ(3, db.Size());
+    db.UnLock(0);
+    EXPECT_EQ(2, db.Size());
+    db.UnLock(0);
+    EXPECT_EQ(2, db.Size());
+    db.UnLock(1);
+    EXPECT_EQ(1, db.Size());
+    db.UnLock(2);
+    EXPECT_EQ(0, db.Size());
+
+    // ClearTimeout: lock one key per timing-wheel step
+    for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) {
+        // no key is expected to expire before the timing wheel completes a turn
+        EXPECT_TRUE(db.TryLock(i));
+        EXPECT_EQ(i + 1, db.Size());
+        db.ClearTimeout();
+    }
+
+    // the timing wheel has completed one turn, so the oldest key has been released
+    EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 1, db.Size());
+
+    // one more step releases the second-oldest key
+    db.ClearTimeout();
+    EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 2, db.Size());
+
+    // ClearTimeout with several keys locked per timing-wheel step
+    for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) {
+        // three fresh keys per step; each TryLock is expected to succeed
+        EXPECT_TRUE(db.TryLock(i * 10 + 1000000));
+        EXPECT_TRUE(db.TryLock(i * 10 + 1000001));
+        EXPECT_TRUE(db.TryLock(i * 10 + 1000002));
+        db.ClearTimeout();
+    }
+
+    // after a full turn the size shows the oldest batch of 3 keys already released
+    EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num * 3 - 3, db.Size());
+
+    // one more step releases the next batch of 3 keys
+    db.ClearTimeout();
+    EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num * 3 - 6, db.Size());
+}
+
+TEST(RowlockDB, LockTest) {
+    RowlockDB db;
+
+    // a freshly constructed db holds no locks
+    EXPECT_EQ(0, db.Size());
+
+    // distinct keys can each be locked once
+    EXPECT_TRUE(db.TryLock(0));
+    EXPECT_TRUE(db.TryLock(1));
+    EXPECT_TRUE(db.TryLock(2));
+
+    // locking a key that is already held fails
+    EXPECT_FALSE(db.TryLock(0));
+    EXPECT_FALSE(db.TryLock(1));
+    EXPECT_FALSE(db.TryLock(2));
+
+    // after an unlock the key can be locked again
+    db.UnLock(0);
+    EXPECT_TRUE(db.TryLock(0));
+
+    // unlocking key 0 must not have released the other held keys
+    EXPECT_FALSE(db.TryLock(1));
+    EXPECT_FALSE(db.TryLock(2));
+
+    // unlocking the same key twice in a row is harmless
+    db.UnLock(0);
+    db.UnLock(0);
+    EXPECT_TRUE(db.TryLock(0));
+
+    // Size() drops by one per effective unlock; a repeated unlock changes nothing
+    EXPECT_EQ(3, db.Size());
+    db.UnLock(0);
+    EXPECT_EQ(2, db.Size());
+    db.UnLock(0);
+    EXPECT_EQ(2, db.Size());
+    db.UnLock(1);
+    EXPECT_EQ(1, db.Size());
+    db.UnLock(2);
+    EXPECT_EQ(0, db.Size());
+
+    // ClearTimeout: lock one key per timing-wheel step
+    for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) {
+        // no key is expected to expire before the timing wheel completes a turn
+        EXPECT_TRUE(db.TryLock(i));
+        EXPECT_EQ(i + 1, db.Size());
+        db.ClearTimeout();
+    }
+
+    // the timing wheel has completed one turn, so the oldest key has been released
+    EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 1, db.Size());
+
+    // one more step releases the second-oldest key
+    db.ClearTimeout();
+    EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 2, db.Size());
+}
+
+TEST(ShardedRowlockDB, ParaTest) {
+    Counter counter;
+    ShardedRowlockDB db;
+    LockTest test;
+
+    // 10 tasks race for the same key; across all attempts exactly one lock may be granted
+    ThreadPool thread_pool(10);
+    for (uint32_t i = 0; i < 10; ++i) {
+        ThreadPool::Task task = std::bind(&LockTest::Lock, &test, &db, &counter);
+        thread_pool.AddTask(task);
+    }
+    thread_pool.Stop(true);  // wait for every queued task instead of sleeping a fixed second
+    EXPECT_EQ(1, db.Size());
+    EXPECT_EQ(1, counter.Get());
+
+    for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) {
+        db.ClearTimeout();  // stepping once per wheel slot is expected to expire the held key
+    }
+    EXPECT_EQ(0, db.Size());
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/test/scanner_test.cc b/src/observer/test/scanner_test.cc
new file mode 100644
index 000000000..fc1b91c05
--- /dev/null
+++ b/src/observer/test/scanner_test.cc
@@ -0,0 +1,495 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <atomic>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "observer/executor/random_key_selector.h"
+#include "observer/executor/scanner_impl.h"
+#include "observer/observer_demo/demo_observer.h"
+#include "sdk/client_impl.h"
+#include "sdk/global_txn.h"
+#include "sdk/mutate_impl.h"
+#include "sdk/read_impl.h"
+#include "sdk/table_impl.h"
+#include "sdk/sdk_utils.h"
+#include "tera.h"
+
+DECLARE_bool(tera_sdk_client_for_gtxn);
+DECLARE_bool(tera_sdk_tso_client_enabled);
+DECLARE_string(tera_coord_type);
+DECLARE_bool(rowlock_test);
+
+namespace tera {
+namespace observer {
+
+
+class TestRowReader : public RowReaderImpl {  // canned row reader: the row key selects the values it yields
+public:
+    TestRowReader(TableImpl* table, const std::string& row_key)
+        : RowReaderImpl(table, row_key), seq_(0) {
+        if (row_key == "empty") {
+            // empty case: no values at all
+        } else if (row_key == "900") {
+            value_.push_back("900");
+            value_.push_back("900");
+            value_.push_back("901");
+            value_.push_back("920");
+        } else if (row_key == "1100") {
+            value_.push_back("1000");
+            value_.push_back("1000");
+            value_.push_back("1100");
+            value_.push_back("1100");
+        } else if (row_key == "1hour") {
+            value_.push_back("810");
+            value_.push_back("820");
+            value_.push_back("830");
+            value_.push_back("840");
+        } else if (row_key == "collision_mix") {
+            value_.push_back("100");
+            value_.push_back("1000");
+            value_.push_back("4700");
+            value_.push_back("1100");
+        } else if (row_key == "error_ts") {
+            value_.push_back("100:sffaeeew");  // not a plain number
+        } else if (row_key == "some_error_ts") {
+            value_.push_back("wrong_string");  // not a plain number
+            value_.push_back("900");
+            value_.push_back("900");
+            value_.push_back("900");
+        } else {
+            value_.push_back("1010");
+            value_.push_back("1012");
+            value_.push_back("1013");
+            value_.push_back("1014");
+            value_.push_back("1015");
+            value_.push_back("1016");
+            value_.push_back("1017");
+        }
+    }
+    virtual std::string Value() {
+        // Once Done() (always so for the "empty" row) yield "" instead of indexing out of bounds.
+        return seq_ < value_.size() ? value_[seq_] : std::string();
+    }
+    virtual int64_t Timestamp() {
+        return 9999999;  // fixed, independent of the cursor position
+    }
+    virtual void AddColumn(const std::string& family, const std::string& qualifier) {}
+    virtual bool Done() {
+        return seq_ == value_.size();
+    }
+    virtual void Next() {
+        seq_++;
+    }
+private:
+    std::vector<std::string> value_;  // canned values, in iteration order
+    uint32_t seq_;                    // index of the current value
+};
+
+class TestTransaction : public GlobalTxn {  // GlobalTxn stand-in with a fixed start timestamp and an optional forced error
+public:
+    TestTransaction(int64_t start_ts, common::ThreadPool* thread_pool, bool error = false)
+        : GlobalTxn(NULL, thread_pool, NULL),
+        start_timestamp_(1000), error_(error) {}  // NOTE(review): start_ts is ignored; the start timestamp is always 1000
+
+    virtual ~TestTransaction() {}
+    virtual ErrorCode Get(RowReader* row_reader) {  // no-op read: returns a default-constructed ErrorCode
+        ErrorCode err;
+        return err;
+    }
+    virtual int64_t GetStartTimestamp() {
+        return start_timestamp_;
+    }
+    virtual const ErrorCode& GetError() {  // reports kSystem when constructed with error == true
+        if (error_ == true) {
+            err_.SetFailed(ErrorCode::kSystem, "");
+        }
+        return err_;
+    }
+private:
+    int64_t start_timestamp_;
+    ErrorCode err_;
+    bool error_;
+};
+
+class TestRowMutationImpl : public RowMutationImpl {  // row mutation whose Put/ApplyMutation overrides do nothing
+public:
+    TestRowMutationImpl(Table* table, const std::string& row_key)
+        : RowMutationImpl(table, row_key) {}
+    virtual void Put(const std::string& value, int32_t ttl = -1) {}  // no-op
+    virtual void ApplyMutation(RowMutation* row_mu) {}  // no-op
+};
+
+class TestTable : public TableImpl {  // table fake that hands out the Test* reader, transaction and mutation doubles
+public:
+    TestTable(const std::string& table_name,
+              ThreadPool* thread_pool,
+              sdk::ClusterFinder* cluster) 
+                : TableImpl(table_name, thread_pool, cluster), 
+                  global_txn_(true),
+                  thread_pool_(thread_pool) {} 
+    virtual RowReader* NewRowReader(const std::string& row_key) {  // caller receives a heap-allocated TestRowReader
+        return new TestRowReader(this, row_key);
+    }
+    virtual Transaction* StartRowTransaction(const std::string& row_key) {  // row_key is ignored
+        return new TestTransaction(1, thread_pool_);
+    }
+    virtual RowMutation* NewRowMutation(const std::string& row_key) {
+        return new TestRowMutationImpl(this, row_key);
+    }
+    virtual void CommitRowTransaction(Transaction* transaction) {}  // no-op
+    virtual bool GetDescriptor(TableDescriptor* schema, ErrorCode* err) {  // fills a fixed schema: lg0 + notify-enabled cf1; always true
+        schema->AddLocalityGroup("lg0");
+        tera::ColumnFamilyDescriptor* cfd1 = schema->AddColumnFamily("cf1");
+        cfd1->EnableNotify();
+        ExtendNotifyLgToDescriptor(schema);
+        if (!global_txn_) {  // never taken as written: global_txn_ is only ever set to true in this class
+            cfd1->DisableGlobalTransaction();
+        }
+        return true;
+    }
+private:
+    bool global_txn_;
+    common::ThreadPool* thread_pool_;
+};
+
+class TestResultStream : public tera::ResultStream {
+public:
+    // Starts as an empty stream positioned at index 0; tests fill row_name_/qualifier_ directly.
+    TestResultStream() : next_number_(0), done_(false) {}
+
+    // The stream is exhausted once the cursor has passed the last row name; `err` is not written.
+    virtual bool Done(ErrorCode* err) {
+        return next_number_ >= row_name_.size();
+    }
+    virtual void Next() {
+        next_number_++;
+    }
+
+    // RowName()/Qualifier() index at the cursor without a bounds check: call only while !Done().
+    virtual std::string RowName() const {
+        return row_name_[next_number_];
+    }
+    virtual std::string Qualifier() const {
+        return qualifier_[next_number_];
+    }
+
+    // The remaining accessors are fixed-value stubs.
+    virtual std::string Family() const {
+        return "";
+    }
+    virtual int64_t Timestamp() const {
+        return 0;
+    }
+    virtual std::string Value() const {
+        return "";
+    }
+    virtual int64_t ValueInt64() const {
+        return 0;
+    }
+    virtual bool LookUp(const std::string& row_key) {
+        return true;
+    }
+    virtual std::string ColumnName() const {
+        return "";
+    }
+
+
+
+private:
+    uint32_t next_number_;                // cursor into row_name_ / qualifier_
+    std::vector<std::string> row_name_;   // row key of each cell
+    std::vector<std::string> qualifier_;  // qualifier of each cell, parallel to row_name_
+    bool done_;                           // not read by any method of this class
+};
+
+class TestObserver : public tera::observer::Observer {  // observer that only logs and counts the notifications it receives
+public:
+    TestObserver() : count_(0) {}
+    virtual ~TestObserver() {}
+    virtual ErrorCode OnNotify(tera::Transaction* t,  // logs the cell, bumps count_, returns a default ErrorCode
+                              tera::Client* client,
+                              const std::string& table_name,
+                              const std::string& family,
+                              const std::string& qualifier,
+                              const std::string& row,
+                              const std::string& value,
+                              int64_t timestamp,
+                              Notification* notification) {
+        LOG(INFO) << "[Notify TestObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+
+        count_++;  // atomic, so concurrent notifications are all counted
+
+        tera::ErrorCode err;
+        // nothing else to do; report the default-constructed ErrorCode
+        return err;
+    }
+    virtual std::string GetObserverName() const {
+        return "TestObserver";
+    }
+
+    virtual TransactionType GetTransactionType() const {
+        return kGlobalTransaction;
+    }
+private:
+    std::atomic<uint32_t> count_;  // number of OnNotify calls so far
+};
+
+class TestClient : public ClientImpl {  // client fake whose OpenTable always hands back a new TestTable
+public:
+    TestClient() : ClientImpl("", "") {}
+    ~TestClient() {}
+    virtual Table* OpenTable(const std::string& table_name, ErrorCode* err) {  // err is not written; a new TestTable is returned each call
+        return  static_cast<tera::Table*>(new TestTable(table_name, &thread_pool_, NULL)); 
+    }
+};
+
+class TestKeySelector : public RandomKeySelector {  // key selector whose Observe() override does nothing
+public:
+    TestKeySelector() {}
+    virtual ErrorCode Observe(const std::string& table_name) {  // ignores table_name; returns a default ErrorCode
+        tera::ErrorCode err;
+        return err;
+    }
+};
+
+TEST(ScannerImpl, ParseNotifyQualifier) {  // "<family>:<qualifier>" is split at the first ':'; input without ':' is rejected
+    FLAGS_tera_sdk_client_for_gtxn = true;
+    FLAGS_tera_coord_type = "mock_zk";
+    ScannerImpl scanner;
+
+    std::string data_family;
+    std::string data_qualfier;
+
+    EXPECT_TRUE(scanner.ParseNotifyQualifier("C:url", &data_family, &data_qualfier));
+    EXPECT_EQ(data_family, "C");
+    EXPECT_EQ(data_qualfier, "url");
+
+    EXPECT_TRUE(scanner.ParseNotifyQualifier("cf:page", &data_family, &data_qualfier));
+    EXPECT_EQ(data_family, "cf");
+    EXPECT_EQ(data_qualfier, "page");
+
+    EXPECT_TRUE(scanner.ParseNotifyQualifier("cf::::::", &data_family, &data_qualfier));  // later ':' stay in the qualifier
+    EXPECT_EQ(data_family, "cf");
+    EXPECT_EQ(data_qualfier, ":::::");
+
+    EXPECT_TRUE(scanner.ParseNotifyQualifier("cf:___", &data_family, &data_qualfier));
+    EXPECT_EQ(data_family, "cf");
+    EXPECT_EQ(data_qualfier, "___");
+
+    EXPECT_FALSE(scanner.ParseNotifyQualifier("Curl", &data_family, &data_qualfier));  // no ':' separator in any of these
+    EXPECT_FALSE(scanner.ParseNotifyQualifier("C_url", &data_family, &data_qualfier));
+    EXPECT_FALSE(scanner.ParseNotifyQualifier("C.urlN_", &data_family, &data_qualfier));
+    EXPECT_FALSE(scanner.ParseNotifyQualifier("++page", &data_family, &data_qualfier));
+
+}
+
+TEST(ScannerImpl, DoReadValue) {  // DoReadValue succeeds only once an observer is registered for the cell's column
+    FLAGS_tera_sdk_client_for_gtxn = true;
+    FLAGS_mock_rowlock_enable = true;
+    FLAGS_tera_coord_type = "mock_zk";
+    common::ThreadPool thread_pool(2);
+    ScannerImpl scanner;
+    TestTable table("test_table", &thread_pool, NULL);
+
+    std::shared_ptr<NotifyCell> notify_cell(new NotifyCell(new TestTransaction(1, &thread_pool)));
+    Column column = {"test_table", "family", "qualifier"};
+
+    notify_cell->row = "row";
+    notify_cell->value = "value";
+    notify_cell->timestamp = 999999999;
+    notify_cell->observed_column = column;
+    notify_cell->table = &table;
+
+    // the table is not registered in table_observe_info_ yet
+    EXPECT_FALSE(scanner.DoReadValue(notify_cell));
+    // table registered, but without any observed column
+    ScannerImpl::TableObserveInfo cell;
+    (*scanner.table_observe_info_)["test_table"] = cell;
+    EXPECT_FALSE(scanner.DoReadValue(notify_cell));
+    // column registered, but its observer set is empty
+    (*scanner.table_observe_info_)["test_table"].observe_columns[column].clear();
+    EXPECT_FALSE(scanner.DoReadValue(notify_cell));
+
+    Observer* observer = new TestObserver();  // NOTE(review): not deleted in this test — confirm who owns it
+    // one observer on the column
+    (*scanner.table_observe_info_)["test_table"].observe_columns[column].insert(observer);
+    EXPECT_TRUE(scanner.DoReadValue(notify_cell));
+
+    // two observers on the same column
+    Observer* parse = new TestObserver();  // NOTE(review): not deleted in this test — confirm who owns it
+    (*scanner.table_observe_info_)["test_table"].observe_columns[column].insert(parse);
+    EXPECT_TRUE(scanner.DoReadValue(notify_cell));
+}
+
+TEST(ScannerImpl, MultiThreadDoReadValue) {  // ten concurrent DoReadValue calls must each notify the observer once
+    FLAGS_tera_sdk_client_for_gtxn = true;
+    FLAGS_mock_rowlock_enable = true;
+    FLAGS_tera_coord_type = "mock_zk";
+    common::ThreadPool thread_pool(2);
+    ScannerImpl scanner;
+    TestTable table("test_table", &thread_pool, NULL);
+
+    std::shared_ptr<NotifyCell> notify_cell(new NotifyCell(new TestTransaction(1, &thread_pool)));
+    Column column = {"test_table", "family", "qualifier"};
+
+    notify_cell->row = "row";
+    notify_cell->value = "value";
+    notify_cell->timestamp = 100;
+    notify_cell->observed_column = column;
+    notify_cell->table = &table;
+
+    Observer* observer = new TestObserver();
+    (*scanner.table_observe_info_)["test_table"].observe_columns[column].insert(observer);
+
+    common::ThreadPool worker_thread(10);
+    for (uint32_t i = 0; i < 10; ++i) {
+        worker_thread.AddTask(std::bind(&ScannerImpl::DoReadValue, &scanner, notify_cell));
+    }
+    worker_thread.Stop(true);  // wait for all ten DoReadValue calls
+    scanner.transaction_threads_->Stop(true);  // then for the work they queued on the scanner's own pool
+    EXPECT_EQ(10u, static_cast<TestObserver*>(observer)->count_.load());  // compare a plain unsigned value, not the atomic itself
+}
+
+TEST(ScannerImpl, NextRow) {
+    // Drives ScannerImpl::NextRow over a hand-filled TestResultStream and checks that
+    // consecutive cells sharing a row key are handed back together, one row per call,
+    // until the stream runs dry.
+    FLAGS_tera_sdk_client_for_gtxn = true;
+    FLAGS_tera_coord_type = "mock_zk";
+
+    // Keep a typed handle on the fake stream so its rows can be filled in below.
+    TestResultStream* fake_stream = new TestResultStream();
+    std::unique_ptr<tera::ResultStream> result_stream(fake_stream);
+    ScannerImpl scanner;
+    std::set<Column> columns;
+    bool finished = false;
+    std::string vec_rowkey;
+    std::vector<Column> vec_col;
+
+    // an empty stream is reported as finished straight away
+    EXPECT_FALSE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col));
+    EXPECT_EQ(true, finished);
+
+    // two notify cells for row1 followed by two for row2
+    finished = false;
+    const char* const rows[] = {"row1", "row1", "row2", "row2"};
+    const char* const qualifiers[] = {"cf:page1", "cf:page2", "cf:page3", "cf:page4"};
+    const char* const pages[] = {"page1", "page2", "page3", "page4"};
+    for (size_t i = 0; i < 4; ++i) {
+        fake_stream->row_name_.push_back(rows[i]);
+        fake_stream->qualifier_.push_back(qualifiers[i]);
+        // observe the data column named by this cell's "cf:<page>" qualifier
+        Column observed = {"table_name", "cf", pages[i]};
+        columns.insert(observed);
+    }
+
+    // first call yields row1
+    EXPECT_TRUE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col));
+    EXPECT_FALSE(finished);
+
+    // row1 carries both of its columns, in stream order
+    EXPECT_EQ(vec_col.size(), 2);
+    EXPECT_EQ(vec_rowkey, "row1");
+    EXPECT_EQ(vec_col[0].qualifier, "page1");
+    EXPECT_EQ(vec_col[1].qualifier, "page2");
+
+    // second call yields row2
+    EXPECT_TRUE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col));
+    EXPECT_FALSE(finished);
+
+    // row2 carries both of its columns, in stream order
+    EXPECT_EQ(vec_col.size(), 2);
+    EXPECT_EQ(vec_rowkey, "row2");
+    EXPECT_EQ(vec_col[0].qualifier, "page3");
+    EXPECT_EQ(vec_col[1].qualifier, "page4");
+
+    // the stream is exhausted: no row, finished is set
+    EXPECT_FALSE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col));
+    EXPECT_TRUE(finished);
+}
+
+
+
+TEST(ScannerImpl, CheckConflictOnAckColumn) {
+    FLAGS_tera_sdk_client_for_gtxn = true;
+    FLAGS_tera_coord_type = "mock_zk";
+    common::ThreadPool thread_pool(2);
+    ScannerImpl scanner;
+    TestTable table("test_table", &thread_pool, NULL);
+
+    std::shared_ptr<NotifyCell> notify_cell(new NotifyCell(new TestTransaction(1, &thread_pool)));
+    Column column = {"test_table", "family", "qualifier"};
+
+    notify_cell->row = "row";
+    notify_cell->value = "value";
+    notify_cell->timestamp = 1000;
+    notify_cell->observed_column = column;
+    notify_cell->table = &table;
+
+    std::set<Observer*> observers;
+
+    TestObserver observer;
+    observers.insert(&observer);
+
+    // the row key selects which canned ack values TestRowReader yields
+    notify_cell->row = "empty";
+    EXPECT_TRUE(scanner.CheckConflictOnAckColumn(notify_cell, observers));
+
+    // row reader ts < transaction(notify) ts
+    notify_cell->row = "900";
+    EXPECT_TRUE(scanner.CheckConflictOnAckColumn(notify_cell, observers));
+
+    // row reader ts > transaction(notify) ts
+    notify_cell->row = "1100";
+    EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers));
+
+    // transaction ts - row reader ts < 600
+    notify_cell->timestamp = 700;
+    notify_cell->row = "1hour";
+    EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers));
+
+    // collision_mix: some legal, some illegal
+    notify_cell->row = "collision_mix";
+    EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers));
+
+    // ack parse fail
+    notify_cell->timestamp = 1000;
+    notify_cell->row = "error_ts";
+    EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers));
+
+    // some ack parse fail
+    notify_cell->row = "some_error_ts";
+    EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers));
+
+    // mutation fail: same conflict-free rows as above, but the transaction reports an error
+    std::shared_ptr<NotifyCell> notify_cell_fail(new NotifyCell(new TestTransaction(1, &thread_pool, true)));
+
+    notify_cell_fail->row = "row";
+    notify_cell_fail->value = "value";
+    notify_cell_fail->timestamp = 1000;
+    notify_cell_fail->observed_column = column;
+    notify_cell_fail->table = &table;
+
+    // empty case (the row must be set on the failing cell, not on notify_cell)
+    notify_cell_fail->row = "empty";
+    EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell_fail, observers));
+
+    // row reader ts < transaction(notify) ts
+    notify_cell_fail->row = "900";
+    EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell_fail, observers));
+}
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/proto/lb_client.cc b/src/proto/lb_client.cc
new file mode 100644
index 000000000..0b70af707
--- /dev/null
+++ b/src/proto/lb_client.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+
+#include "gflags/gflags.h"
+
+#include "proto/lb_client.h"
+
+DECLARE_int32(tera_master_connect_retry_times);
+DECLARE_int32(tera_master_connect_retry_period);
+DECLARE_int32(tera_master_connect_timeout_period);
+
+namespace tera {
+namespace load_balancer {
+
+LBClient::LBClient(const std::string& server_addr,  // binds the RPC stub to server_addr and stores the per-call timeout
+                   int32_t rpc_timeout)
+    : RpcClient<LoadBalancerService::Stub>(server_addr),
+    rpc_timeout_(rpc_timeout) {
+}
+
+LBClient::~LBClient() {  // nothing to release in this class
+}
+
+bool LBClient::CmdCtrl(const CmdCtrlRequest* request,
+                       CmdCtrlResponse* response) {
+    typedef std::function<void (CmdCtrlRequest*, CmdCtrlResponse*, bool, int)> DoneCallback;
+    DoneCallback no_callback;  // left empty: no completion callback is supplied for this call
+    return SendMessageWithRetry(&LoadBalancerService::Stub::CmdCtrl, request, response,
+                                no_callback, "CmdCtrl", rpc_timeout_);
+}
+
+} // namespace load_balancer
+} // namespace tera
+
diff --git a/src/proto/lb_client.h b/src/proto/lb_client.h
new file mode 100644
index 000000000..faf47b59a
--- /dev/null
+++ b/src/proto/lb_client.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_LB_CLIENT_H_
+#define TERA_LOAD_BALANCER_LB_CLIENT_H_
+
+#include <string>
+
+#include "proto/load_balancer_rpc.pb.h"
+#include "proto/rpc_client.h"
+
+DECLARE_int32(tera_rpc_timeout_period);
+
+namespace tera {
+namespace load_balancer {
+
+class LBClient : public RpcClient<LoadBalancerService::Stub> {  // RPC client for LoadBalancerService
+public:
+    LBClient(const std::string& server_addr = "",  // the timeout default is read from FLAGS_tera_rpc_timeout_period
+             int32_t rpc_timeout = FLAGS_tera_rpc_timeout_period);
+    virtual ~LBClient();
+
+    virtual bool CmdCtrl(const CmdCtrlRequest* request,  // sends a CmdCtrl request; the reply is written to response
+                         CmdCtrlResponse* response);
+
+private:
+    int32_t rpc_timeout_;  // timeout handed to every RPC sent by this client
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_LB_CLIENT_H_
+
diff --git a/src/proto/load_balancer_rpc.proto b/src/proto/load_balancer_rpc.proto
new file mode 100644
index 000000000..d7b077fa1
--- /dev/null
+++ b/src/proto/load_balancer_rpc.proto
@@ -0,0 +1,11 @@
+import "sofa/pbrpc/rpc_option.proto";
+import "master_rpc.proto";
+
+package tera;
+
+service LoadBalancerService {  // exposes a single CmdCtrl RPC
+    rpc CmdCtrl(CmdCtrlRequest) returns(CmdCtrlResponse);  // message types are not defined in this file (master_rpc.proto is imported)
+}
+
+option cc_generic_services = true;
+
diff --git a/src/proto/rowlocknode_rpc.proto b/src/proto/rowlocknode_rpc.proto
new file mode 100644
index 000000000..6e8107710
--- /dev/null
+++ b/src/proto/rowlocknode_rpc.proto
@@ -0,0 +1,19 @@
+import "sofa/pbrpc/rpc_option.proto";
+import "status_code.proto";
+
+package tera;
+
+message RowlockRequest {  // names the row to lock or unlock; both fields are required
+    required string table_name = 1;
+    required string row = 2;
+}
+
+message RowlockResponse {  // result of a Lock/UnLock call
+    required StatusCode lock_status = 1;  // presumably kLockSucc or kLockFail from status_code.proto — confirm
+}
+
+service RowlockService {  // row-lock RPCs; both take a RowlockRequest and return a RowlockResponse
+    rpc Lock(RowlockRequest) returns(RowlockResponse);
+    rpc UnLock(RowlockRequest) returns(RowlockResponse);
+}
+option cc_generic_services = true;
diff --git a/src/proto/rpc_client.h b/src/proto/rpc_client.h
index 74ded0212..9067cc96d 100644
--- a/src/proto/rpc_client.h
+++ b/src/proto/rpc_client.h
@@ -144,7 +144,7 @@ class RpcClient : public RpcClientBase {
                               int32_t rpc_timeout, ThreadPool* thread_pool = 0) {
         if (NULL == server_client_.get()) {
             // sync call
-            if (closure == NULL) {
+            if (!closure) {
                 return false;
             }
 
@@ -168,7 +168,7 @@ class RpcClient : public RpcClientBase {
         (server_client_.get()->*func)(rpc_controller, request, response, done);
 
         // sync call
-        if (closure == NULL) {
+        if (!closure) {
             sync_call_event.Wait();
             return (!sync_call_failed);
         }
@@ -196,7 +196,7 @@ class RpcClient : public RpcClientBase {
         delete param;
 
         // sync call
-        if (closure == NULL) {
+        if (!closure) {
             client->sync_call_failed = failed;
             client->sync_call_event.Set();
             return;
diff --git a/src/proto/status_code.proto b/src/proto/status_code.proto
index 24b0ff595..cb99c7235 100644
--- a/src/proto/status_code.proto
+++ b/src/proto/status_code.proto
@@ -96,6 +96,18 @@ enum StatusCode {
 
     kTableStatusEnable = 1000;
     kTableStatusDisable = 1001;
+
+    // Timeoracle
+    kTimeoracleOk   = 2000;
+    kTimeoracleBusy = 2001;
+
+    // rowlock service
+    kLockSucc = 2100;
+    kLockFail = 2101;
+
+    // LoadBalancer
+    kLoadBalancerOk = 2200;
+    kLoadBalancerError = 2201;
 }
 
 enum TabletStatus {
@@ -118,6 +130,9 @@ enum TabletStatus {
     kTabletPending = 65;
     kTabletOnSnapshot = 66;
     kTabletDelSnapshot = 67;
+
+    // runtime status
+    kTabletCorruption = 90;
 }
 
 enum TableStatus {
diff --git a/src/proto/table_meta.proto b/src/proto/table_meta.proto
index c0df47e63..cdf18b689 100644
--- a/src/proto/table_meta.proto
+++ b/src/proto/table_meta.proto
@@ -64,6 +64,7 @@ message TabletCounter {
     optional double write_workload = 11 [default = 0.0];
 
     optional bool is_on_busy = 15 [default = false];
+    optional TabletStatus db_status = 16;
 }
 
 message TableCounter {
@@ -107,6 +108,7 @@ message TabletMeta {
     repeated uint64 parent_tablets = 12;
     repeated int64 lg_size = 13;
     repeated Rollback rollbacks = 14;
+    optional int64 last_move_time_us = 15;
 }
 
 message TableMetaList {
@@ -130,3 +132,12 @@ message SdkCookie {
     required string table_name = 1;
     repeated SdkTabletCookie tablets = 2;
 }
+
+message PrimaryInfo {  // NOTE(review): presumably identifies the primary cell of a global transaction — confirm
+    optional string table_name = 1;
+    optional bytes row_key = 2;
+    optional bytes column_family = 3;
+    optional bytes qualifier = 4;
+    optional int64 gtxn_start_ts = 5;  // presumably the global transaction's start timestamp — confirm
+    optional string client_session = 6;
+}
diff --git a/src/proto/table_schema.proto b/src/proto/table_schema.proto
index 9f6c8727d..62c716c53 100644
--- a/src/proto/table_schema.proto
+++ b/src/proto/table_schema.proto
@@ -39,6 +39,8 @@ message ColumnFamilySchema {
     optional int32 time_to_live = 8 [default = 0]; // 单位:秒(0:不过期, <0:提前过期, >0:延后过期)
     optional int64 disk_quota = 9;
     optional string type = 10;
+    optional bool gtxn = 11 [default = false]; // 'gtxn=on' for global transaction feature availability 
+    optional bool notify = 12 [default = false]; // 'notify=on' for notify feature availability
 }
 
 message TableSchema {
diff --git a/src/proto/tabletnode.proto b/src/proto/tabletnode.proto
index fff28caa5..d36f5e0f2 100644
--- a/src/proto/tabletnode.proto
+++ b/src/proto/tabletnode.proto
@@ -14,6 +14,7 @@ message TabletNodeInfo {
     optional uint64 timestamp = 4;
     optional uint32 tablet_total = 5;
     optional uint32 tablet_onbusy = 6;
+    optional uint32 tablet_corruption = 7;
 
     optional uint32 low_read_cell = 11;
     optional uint32 scan_rows = 12;
diff --git a/src/proto/tabletnode_client.cc b/src/proto/tabletnode_client.cc
index b6b347d2d..e57a5e8a8 100644
--- a/src/proto/tabletnode_client.cc
+++ b/src/proto/tabletnode_client.cc
@@ -105,6 +105,14 @@ bool TabletNodeClient::SplitTablet(const SplitTabletRequest* request,
                                 request, response, done, "SplitTablet",
                                 rpc_timeout_, thread_pool_);
 }
+// Issues the ComputeSplitKey RPC through SendMessageWithRetry, using this client's
+// rpc_timeout_ and thread_pool_. It reuses the SplitTablet request/response types,
+// and `done` is forwarded to SendMessageWithRetry unchanged.
+bool TabletNodeClient::ComputeSplitKey(const SplitTabletRequest* request,
+                                   SplitTabletResponse* response,
+                                   std::function<void (SplitTabletRequest*, SplitTabletResponse*, bool, int)> done) {
+    return SendMessageWithRetry(&TabletNodeServer::Stub::ComputeSplitKey,
+                                request, response, done, "ComputeSplitKey",
+                                rpc_timeout_, thread_pool_);
+}
+
 
 bool TabletNodeClient::CompactTablet(const CompactTabletRequest* request,
                                      CompactTabletResponse* response,
diff --git a/src/proto/tabletnode_client.h b/src/proto/tabletnode_client.h
index c56e0d7c0..1033841d0 100644
--- a/src/proto/tabletnode_client.h
+++ b/src/proto/tabletnode_client.h
@@ -69,6 +69,9 @@ class TabletNodeClient : public RpcClient<TabletNodeServer::Stub> {
     bool SplitTablet(const SplitTabletRequest* request,
                      SplitTabletResponse* response,
                      std::function<void (SplitTabletRequest*, SplitTabletResponse*, bool, int)> done = NULL);
+    bool ComputeSplitKey(const SplitTabletRequest* request, SplitTabletResponse* response, 
+                     std::function<void (SplitTabletRequest*, SplitTabletResponse*, bool, int)> done = NULL);
+
 
     bool CompactTablet(const CompactTabletRequest* request,
                        CompactTabletResponse* response,
diff --git a/src/proto/tabletnode_rpc.proto b/src/proto/tabletnode_rpc.proto
index 0d79ce0c7..45651203e 100644
--- a/src/proto/tabletnode_rpc.proto
+++ b/src/proto/tabletnode_rpc.proto
@@ -90,6 +90,7 @@ message LoadTabletRequest {
     repeated uint64 snapshots_sequence = 10;
     repeated uint64 parent_tablets = 11;
     repeated Rollback rollbacks = 12;
+    repeated string ignore_err_lgs = 13; 
 }
 
 message LoadTabletResponse {
@@ -263,6 +264,7 @@ message ScanTabletRequest {
     optional int64 timestamp = 18 [default = 0];
     optional int64 timeout = 19;
     optional int64 number_limit = 21;
+    optional uint64 max_qualifiers = 22;
 }
 
 message ScanTabletResponse {
@@ -282,6 +284,7 @@ message RowReaderInfo {
     optional TimeRange time_range = 3;
     optional FilterList filter_list = 4;
     optional uint32 max_version = 5;
+    optional uint64 max_qualifiers = 6;
 }
 
 message ReadTabletRequest {
@@ -309,11 +312,13 @@ message SplitTabletRequest {
     optional TabletMeta tablet_meta = 4;
     repeated uint64 child_tablets = 5;
     optional bytes split_key = 6;
+    optional bool master_update_meta = 7;
 }
 
 message SplitTabletResponse {
     required StatusCode status = 1 [default = kTableMergeError];
     required uint64 sequence_id = 2;
+    repeated string split_keys = 3;
 }
 
 message MergeTabletRequest {
@@ -367,6 +372,7 @@ service TabletNodeServer {
     rpc Rollback(SnapshotRollbackRequest) returns(SnapshotRollbackResponse);
 
     rpc SplitTablet(SplitTabletRequest) returns(SplitTabletResponse);
+    rpc ComputeSplitKey(SplitTabletRequest) returns (SplitTabletResponse);
 
     rpc CmdCtrl(TsCmdCtrlRequest) returns(TsCmdCtrlResponse);
     rpc Update(UpdateRequest) returns(UpdateResponse);
diff --git a/src/proto/timeoracle_rpc.proto b/src/proto/timeoracle_rpc.proto
new file mode 100644
index 000000000..f96661b9f
--- /dev/null
+++ b/src/proto/timeoracle_rpc.proto
@@ -0,0 +1,20 @@
+import "sofa/pbrpc/rpc_option.proto";
+import "status_code.proto";
+
+package tera;
+
+// Request of TimeoracleServer.GetTimestamp.
+message GetTimestampRequest {
+    // NOTE(review): presumably the number of timestamps asked for -- confirm against the server.
+    optional uint64 count = 1;
+}
+
+// Response of TimeoracleServer.GetTimestamp.
+message GetTimestampResponse {
+    optional StatusCode status = 1;
+    // NOTE(review): presumably the first timestamp of the granted range -- confirm.
+    optional int64 start_timestamp = 2;
+    // NOTE(review): presumably how many timestamps were granted -- confirm.
+    optional uint64 count = 3;
+}
+
+// RPC service of the timeoracle; its only method is GetTimestamp.
+service TimeoracleServer {
+    rpc GetTimestamp(GetTimestampRequest) returns(GetTimestampResponse);
+}
+
+option cc_generic_services = true;
diff --git a/src/sample/Makefile b/src/sample/Makefile
index 81698c729..02268f2ff 100644
--- a/src/sample/Makefile
+++ b/src/sample/Makefile
@@ -10,15 +10,15 @@ SHARED_LDFLAGS = -shared -Wl,-soname -Wl,
 
 INCPATH += -I../../include $(DEPS_INCPATH)
 CFLAGS += $(OPT) $(SHARED_CFLAGS) $(INCPATH)
-CXXFLAGS += $(OPT) $(SHARED_CFLAGS) $(INCPATH)
+CXXFLAGS += -std=gnu++11 $(OPT) $(SHARED_CFLAGS) $(INCPATH)
 LDFLAGS += ../../build/lib/libtera.a $(DEPS_LDPATH) $(DEPS_LDFLAGS) -lpthread -lz
 
-SAMPLE_SRC := ./tera_sample.cc tera_row_txn_sample.cc atomic_sample.cc
+# Every sample built by `all` is listed here so that SAMPLE_OBJ covers all of them.
+SAMPLE_SRC := ./tera_sample.cc tera_row_txn_sample.cc atomic_sample.cc global_txn_async_sample.cc global_txn_sync_sample.cc
 SAMPLE_OBJ := $(SAMPLE_SRC:.cc=.o)
 
 .PHONY: clean
 
-all: sample_demo tera_row_txn_sample atomic_sample
+all: sample_demo tera_row_txn_sample atomic_sample global_txn_async_sample global_txn_sync_sample
 
 sample_demo: ./tera_sample.o
 	$(CXX) -o $@ $^ $(LDFLAGS)
@@ -26,6 +26,12 @@ sample_demo: ./tera_sample.o
 tera_row_txn_sample: tera_row_txn_sample.o
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
+global_txn_async_sample: global_txn_async_sample.o
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+global_txn_sync_sample: global_txn_sync_sample.o
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
 atomic_sample: atomic_sample.o
 	$(CXX) -o $@ $^ $(LDFLAGS)
 
@@ -36,5 +42,7 @@ clean:
 	rm -f *.o
 	rm -f ./sample_demo
 	rm -f ./tera_row_txn_sample
+	rm -f ./global_txn_async_sample
+	rm -f ./global_txn_sync_sample
 	rm -f ./atomic_sample
 
diff --git a/src/sample/atomic_sample.cc b/src/sample/atomic_sample.cc
index ce35fbe6b..3053ec8b4 100644
--- a/src/sample/atomic_sample.cc
+++ b/src/sample/atomic_sample.cc
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <assert.h>
 #include "tera.h"
 
 int main() {
diff --git a/src/sample/global_txn_async_sample.cc b/src/sample/global_txn_async_sample.cc
new file mode 100644
index 000000000..a2f77896e
--- /dev/null
+++ b/src/sample/global_txn_async_sample.cc
@@ -0,0 +1,143 @@
+#include <atomic>
+#include <iostream>
+#include <memory>
+#include <thread>
+
+#include <assert.h>
+#include <unistd.h>
+
+#include "tera.h"
+
+std::string read_result = "";
+std::atomic<bool> all_gtxn_thread_done(false);
+std::atomic<int> finish_cnt(0);
+    
+struct RowReaderContext {
+    tera::Transaction* gtxn;
+    tera::Table* t1;
+    tera::Table* t2;
+};
+
+// Opens `tablename`, creating it first when it does not exist yet.
+// A newly created table gets the locality group "lg0" and two column families:
+// "cf1" with global transactions enabled and "cf2" with them disabled.
+// Failures are only caught by assert(), so nothing checks them when NDEBUG is defined.
+tera::Table* InitTable(tera::Client* client, const std::string& tablename) {
+    tera::ErrorCode error_code;
+    if (!client->IsTableExist(tablename, &error_code)) {
+        tera::TableDescriptor schema(tablename);
+        schema.EnableTxn(); // every table schema taking part in a global transaction must set txn=true
+        schema.AddLocalityGroup("lg0");
+        tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1");
+        cfd1->EnableGlobalTransaction();
+        tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2");
+        cfd2->DisableGlobalTransaction();
+        client->CreateTable(schema, &error_code);
+        assert(error_code.GetType() == tera::ErrorCode::kOK);
+    }
+    
+    tera::Table* table = client->OpenTable(tablename, &error_code);
+    assert(table && error_code.GetType() == tera::ErrorCode::kOK);
+    return table;
+}
+
+// Commit callback: reports the outcome of the transaction, frees it and
+// raises all_gtxn_thread_done so that main() can leave its wait loop.
+void TxnCallBack(tera::Transaction* txn) {
+    const bool committed = (txn->GetError().GetType() == tera::ErrorCode::kOK);
+    if (committed) {
+        std::cout << "gtxn success" << std::endl;
+    } else {
+        std::cout << "txn failed, start_ts= " << txn->GetStartTimestamp()
+                  << ", reason= " << txn->GetError().ToString()
+                  << std::endl;
+    }
+    // the transaction is owned by this callback once the commit has finished
+    delete txn;
+    all_gtxn_thread_done.store(true);
+}
+
+// Reader callback shared by the two reads issued from DoTxn().
+// Prints every cell of the finished reader, appends the cell values to
+// read_result and frees the reader. The callback that finishes last (the second
+// one) writes read_result to cf1:q1 of row "r1" in both tables, commits the
+// transaction and frees the RowReaderContext allocated by DoTxn().
+void ReadRowCallBack(tera::RowReader* row_reader) {
+    // Serialises appends to read_result: the two callbacks may run on different threads.
+    static std::atomic_flag result_lock = ATOMIC_FLAG_INIT;
+
+    RowReaderContext* ctx = (RowReaderContext*)row_reader->GetContext();
+    std::string values;
+    while (!row_reader->Done()) {
+        // same fields, same order and no separators, as the previous printf produced
+        std::cout << "Row: " << row_reader->RowName() << row_reader->ColumnName()
+                  << row_reader->Timestamp() << row_reader->Value() << std::endl;
+        // take the value of the current cell *before* advancing the reader
+        values += row_reader->Value();
+        row_reader->Next();
+    }
+    delete row_reader;
+
+    while (result_lock.test_and_set(std::memory_order_acquire)) {
+        // spin: the critical section is a single string append
+    }
+    read_result += values;
+    result_lock.clear(std::memory_order_release);
+
+    // fetch_add hands every callback a distinct count, so exactly one of them
+    // observes "both reads done" and performs the commit.
+    if (finish_cnt.fetch_add(1) + 1 != 2) {
+        return;
+    }
+
+    // mutations begin once all reader callbacks are done: write to other columns
+    tera::Transaction* g_txn = ctx->gtxn;
+    tera::RowMutation* m1 = ctx->t1->NewRowMutation("r1");
+    tera::RowMutation* m2 = ctx->t2->NewRowMutation("r1");
+    delete ctx; // this is the last callback, nobody reads the context any more
+    m1->Put( "cf1", "q1", read_result);
+    m2->Put( "cf1", "q1", read_result);
+
+    // ApplyMutation only modifies local memory and does not need to be asynchronous;
+    // an asynchronous interface for RowMutation is supported as well, as you like
+    g_txn->ApplyMutation(m1);
+    g_txn->ApplyMutation(m2);
+    g_txn->SetCommitCallback(TxnCallBack);
+    delete m1;
+    delete m2;
+    // no need to check ApplyMutation, the transaction is checked before commit.
+    g_txn->Commit();
+}
+
+// Starts one global transaction that reads t1:r1:cf1:q2 and t2:r1:cf1:q2
+// asynchronously; ReadRowCallBack continues the transaction when both reads are done.
+// When no transaction can be created, all_gtxn_thread_done is raised right away,
+// because no commit callback will ever do it.
+void DoTxn(tera::Client* client, tera::Table* t1, tera::Table* t2) {
+
+    // begin global transaction
+    tera::Transaction* g_txn = client->NewGlobalTransaction();
+    if (g_txn == NULL) {
+        std::cout << "create global transaction failed" << std::endl;
+        all_gtxn_thread_done.store(true);
+        return;
+    }
+
+    // The context is read by callbacks that run after this function has returned,
+    // so it lives on the heap; the last ReadRowCallBack deletes it.
+    RowReaderContext* ctx = new RowReaderContext;
+    ctx->gtxn = g_txn;
+    ctx->t1 = t1;
+    ctx->t2 = t2;
+
+    // read from different tables
+    tera::RowReader* r1 = t1->NewRowReader("r1");
+    tera::RowReader* r2 = t2->NewRowReader("r1");
+    r1->AddColumn("cf1", "q2");
+    r2->AddColumn("cf1", "q2");
+    r1->SetCallBack(ReadRowCallBack);
+    r2->SetCallBack(ReadRowCallBack);
+    r1->SetContext(ctx);
+    r2->SetContext(ctx);
+    // read from t1:r1:cf1:q2 and check
+    g_txn->Get(r1);
+    // read from t2:r1:cf1:q2 and check
+    g_txn->Get(r2);
+}
+
+// Entry point of the asynchronous global-transaction sample: creates the client,
+// opens tables "t1" and "t2", starts the transaction with DoTxn() and then polls
+// all_gtxn_thread_done until it becomes true.
+int main(int argc, char *argv[]) {
+
+    tera::ErrorCode error_code;
+    
+    tera::Client* client = tera::Client::NewClient("../conf/tera.flag", "global_txn_sample_async", &error_code);
+    if (client == NULL) {
+        return -1;
+    }
+
+    // create or open the tables
+    // before starting a global transaction you should
+    // (1) OpenTable every table you will read/write
+    // (2) check that OpenTable succeeded
+    tera::Table* t1 = InitTable(client, "t1");
+    tera::Table* t2 = InitTable(client, "t2");
+
+    // the global transaction may also be run from a thread pool that you implement yourself.
+    //
+    // In this example,
+    //
+    // first, two cell values are read from different tables,
+    // next, all values are concatenated in the reader callback,
+    // last, the concatenated result is put into both tables.
+    DoTxn(client, t1, t2);
+
+    // DoTxn() only starts the asynchronous reads, the work continues in callbacks;
+    // wait in the main thread until all_gtxn_thread_done has been set.
+    // if you know the program can't exit before the callbacks are done, this is not necessary.
+    while (!all_gtxn_thread_done.load()) {
+        usleep(100);
+    }
+    return 0;
+}
diff --git a/src/sample/global_txn_sync_sample.cc b/src/sample/global_txn_sync_sample.cc
new file mode 100644
index 000000000..66bb94b7d
--- /dev/null
+++ b/src/sample/global_txn_sync_sample.cc
@@ -0,0 +1,107 @@
+#include <memory>
+#include <iostream>
+
+#include <assert.h>
+#include "tera.h"
+
+// Synchronous global-transaction sample.
+// Creates (when missing) and opens tables "t1" and "t2", reads cf1:q2 of row "r1"
+// from both inside one global transaction, then writes the value read from t2 to
+// t1:r1:cf1:q1 and the value read from t1 to t2:r1:cf1:q1 and commits.
+// Returns 0 when the commit was attempted, -1 when a step before it failed.
+int main(int argc, char *argv[]) {
+
+    tera::ErrorCode error_code;
+    
+    tera::Client* client = tera::Client::NewClient("../conf/tera.flag", "global_txn_sample", &error_code);
+    // explicit check instead of assert(): it must also hold when NDEBUG is defined
+    if (client == NULL) {
+        return -1;
+    }
+    // create or open tables
+    tera::Table* t1 = nullptr;
+    tera::Table* t2 = nullptr;
+    if (!client->IsTableExist("t1", &error_code)) {
+        tera::TableDescriptor schema("t1");
+        schema.EnableTxn(); // every table schema taking part in a global transaction must set txn=true
+        schema.AddLocalityGroup("lg0");
+        tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1");
+        cfd1->EnableGlobalTransaction();
+        tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2");
+        cfd2->EnableGlobalTransaction();
+        client->CreateTable(schema, &error_code);
+        assert(error_code.GetType() == tera::ErrorCode::kOK);
+    }
+
+    if (!client->IsTableExist("t2", &error_code)) {
+        tera::TableDescriptor schema("t2");
+        schema.EnableTxn(); // every table schema taking part in a global transaction must set txn=true
+        schema.AddLocalityGroup("lg0");
+        tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1");
+        cfd1->EnableGlobalTransaction();
+        tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2");
+        cfd2->EnableGlobalTransaction();
+        client->CreateTable(schema, &error_code);
+        assert(error_code.GetType() == tera::ErrorCode::kOK);
+    }
+    // before starting a global transaction you should
+    // (1) OpenTable every table you will read/write
+    // (2) check that OpenTable succeeded
+    t1 = client->OpenTable("t1", &error_code);
+    assert(t1 && error_code.GetType() == tera::ErrorCode::kOK);
+    
+    t2 = client->OpenTable("t2", &error_code);
+    assert(t2 && error_code.GetType() == tera::ErrorCode::kOK);
+
+    // begin global transaction
+    // owned by a unique_ptr so that every early return below releases it
+    std::unique_ptr<tera::Transaction> g_txn(client->NewGlobalTransaction());
+    if (!g_txn) {
+        return -1;
+    }
+    // error_code still holds the result of the last OpenTable call at this point
+    if (error_code.GetType()!=tera::ErrorCode::kOK) {
+        std::cout << error_code.ToString() << std::endl;
+        return -1;
+    }
+    // read from different tables
+    std::unique_ptr<tera::RowReader> r1(t1->NewRowReader("r1"));
+    std::unique_ptr<tera::RowReader> r2(t2->NewRowReader("r1"));
+    r1->AddColumn("cf1", "q2");
+    r2->AddColumn("cf1", "q2");
+    // read from t1:r1:cf1:q2 and check
+    g_txn->Get(r1.get());
+    if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) {
+        std::cout << g_txn->GetError().ToString() << std::endl;
+        return -1;
+    }
+    std::string r1_v = "";
+    while(!r1->Done()) {
+        std::cout << r1->Value() << std::endl;
+        r1_v = r1->Value();
+        r1->Next();
+    }
+
+    // read from t2:r1:cf1:q2 and check
+    g_txn->Get(r2.get());
+    if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) {
+        std::cout << g_txn->GetError().ToString() << std::endl;
+        return -1;
+    }
+    std::string r2_v = "";
+    while(!r2->Done()) {
+        std::cout << r2->Value() << std::endl;
+        r2_v = r2->Value();
+        r2->Next();
+    }
+
+    // write to other columns
+    std::unique_ptr<tera::RowMutation> m1(t1->NewRowMutation("r1"));
+    std::unique_ptr<tera::RowMutation> m2(t2->NewRowMutation("r1"));
+    m1->Put( "cf1", "q1", r2_v);
+    m2->Put( "cf1", "q1", r1_v);
+
+    g_txn->ApplyMutation(m1.get());
+    g_txn->ApplyMutation(m2.get());
+    // no need to check ApplyMutation, the transaction is checked before commit.
+    g_txn->Commit();
+    if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) {
+        std::cout << g_txn->GetError().ToString() << std::endl;
+    } else {
+        std::cout << "commit success" << std::endl;
+    }
+
+    // released explicitly here so that, on this path, the transaction is still
+    // destroyed before the readers and mutations, as it was with `delete g_txn`
+    g_txn.reset();
+    // end global transaction
+    return 0;
+}
diff --git a/src/sample/tera_row_txn_sample.cc b/src/sample/tera_row_txn_sample.cc
index 4c9897708..879652dfc 100644
--- a/src/sample/tera_row_txn_sample.cc
+++ b/src/sample/tera_row_txn_sample.cc
@@ -1,3 +1,6 @@
+#include <assert.h>
+#include <stdio.h>
+
 #include "tera.h"
 
 int main() {
diff --git a/src/sdk/client_impl.cc b/src/sdk/client_impl.cc
index bc9eb1998..3599b2e9e 100644
--- a/src/sdk/client_impl.cc
+++ b/src/sdk/client_impl.cc
@@ -10,6 +10,7 @@
 #include "gflags/gflags.h"
 
 #include "common/file/file_path.h"
+#include "common/log/log_cleaner.h"
 #include "common/mutex.h"
 #include "proto/kv_helper.h"
 #include "proto/master_client.h"
@@ -17,6 +18,8 @@
 #include "proto/table_meta.pb.h"
 #include "proto/tabletnode_client.h"
 #include "sdk/table_impl.h"
+#include "sdk/global_txn.h"
+#include "sdk/sdk_perf.h"
 #include "sdk/sdk_utils.h"
 #include "sdk/sdk_zk.h"
 #include "utils/config_utils.h"
@@ -43,6 +46,12 @@ DECLARE_int32(tera_sdk_rpc_max_pending_buffer_size);
 DECLARE_int32(tera_sdk_rpc_work_thread_num);
 DECLARE_int32(tera_sdk_show_max_num);
 DECLARE_bool(tera_online_schema_update_enabled);
+DECLARE_string(tera_log_prefix);
+DECLARE_bool(tera_info_log_clean_enable);
+DECLARE_bool(tera_sdk_perf_collect_enabled);
+DECLARE_int32(tera_gtxn_thread_max_num);
+DECLARE_bool(tera_sdk_client_for_gtxn);
+DECLARE_bool(tera_sdk_tso_client_enabled);
 
 namespace tera {
 
@@ -55,14 +64,40 @@ void LogSdkVersionInfo() {
 ClientImpl::ClientImpl(const std::string& user_identity,
                        const std::string& user_passcode)
     : thread_pool_(FLAGS_tera_sdk_thread_max_num),
+      gtxn_thread_pool_(NULL),
       user_identity_(user_identity),
-      user_passcode_(user_passcode) {
+      user_passcode_(user_passcode),
+      client_zk_adapter_(NULL),
+      tso_cluster_(NULL),
+      collecter_(NULL),
+      session_str_("") {
     tabletnode::TabletNodeClient::SetThreadPool(&thread_pool_);
     tabletnode::TabletNodeClient::SetRpcOption(
         FLAGS_tera_sdk_rpc_limit_enabled ? FLAGS_tera_sdk_rpc_limit_max_inflow : -1,
         FLAGS_tera_sdk_rpc_limit_enabled ? FLAGS_tera_sdk_rpc_limit_max_outflow : -1,
         FLAGS_tera_sdk_rpc_max_pending_buffer_size, FLAGS_tera_sdk_rpc_work_thread_num);
-    cluster_ = sdk::NewClusterFinder();
+
+    if (FLAGS_tera_sdk_client_for_gtxn) {
+        client_zk_adapter_ = sdk::NewClientZkAdapter();
+        client_zk_adapter_->Init();
+        cluster_ = sdk::NewClusterFinder(client_zk_adapter_);
+        if (FLAGS_tera_sdk_tso_client_enabled) {
+            tso_cluster_ = sdk::NewTimeoracleClusterFinder();
+        }
+        gtxn_thread_pool_ = new ThreadPool(FLAGS_tera_gtxn_thread_max_num);
+        RegisterSelf();
+    } else {
+        cluster_ = sdk::NewClusterFinder();
+    }
+
+    if (FLAGS_tera_sdk_perf_collect_enabled) {
+        collecter_ = new sdk::PerfCollecter();
+        collecter_->Run();
+        LOG(INFO) << "start perf collect";
+    } else {
+        LOG(INFO) << "perf collect disable";
+    }
+
     pthread_once(&sdk_client_once_control, LogSdkVersionInfo);
 }
 
@@ -77,6 +112,17 @@ ClientImpl::~ClientImpl() {
         }
     }
     delete cluster_;
+    // Release by pointer, not by flag: gflags can change after construction, and a
+    // flipped flag must neither dereference a NULL collecter_ nor leak an object.
+    if (collecter_ != NULL) {
+        collecter_->Stop();
+        delete collecter_;
+    }
+    // Same release order as before; each pointer is NULL when the constructor did
+    // not create the object, and deleting NULL is a no-op.
+    delete gtxn_thread_pool_;
+    delete tso_cluster_;
+    delete client_zk_adapter_;
 }
 
 bool ClientImpl::CreateTable(const TableDescriptor& desc, ErrorCode* err) {
@@ -1173,6 +1219,29 @@ bool ClientImpl::ParseTabletEntry(const TabletMeta& meta, std::vector<TabletInfo
     return true;
 }
 
+// Creates a global transaction bound to this client.
+// Returns NULL when the client was constructed without global-transaction support
+// (gtxn_thread_pool_ is only created when FLAGS_tera_sdk_client_for_gtxn was set),
+// because GlobalTxn schedules its work on that pool and would dereference NULL.
+Transaction* ClientImpl::NewGlobalTransaction() {
+    if (gtxn_thread_pool_ == NULL) {
+        LOG(ERROR) << "client is not initialized for global transaction, "
+                   << "tera_sdk_client_for_gtxn must be enabled";
+        return NULL;
+    }
+    return GlobalTxn::NewGlobalTxn(this, gtxn_thread_pool_, tso_cluster_);
+}
+
+// Asks the zk adapter whether the client registered under `path` is alive.
+// Without an adapter every client is reported as alive.
+bool ClientImpl::IsClientAlive(const std::string& path) {
+    if (client_zk_adapter_ == NULL) {
+        return true;
+    }
+    return client_zk_adapter_->IsClientAlive(path);
+}
+
+// Returns a copy of session_str_, the string filled in by RegisterSelf();
+// it is empty when no registration took place.
+std::string ClientImpl::ClientSession() {
+    return session_str_;
+}
+
+// Registers this client through the zk adapter, which writes the session
+// string into session_str_. Returns false when there is no adapter, otherwise
+// the adapter's result.
+bool ClientImpl::RegisterSelf() {
+    if (client_zk_adapter_ == NULL) {
+        return false;
+    }
+    return client_zk_adapter_->RegisterClient(&session_str_);
+}
+
 static Mutex g_mutex;
 static bool g_is_glog_init = false;
 
@@ -1223,6 +1292,14 @@ static int InitFlags(const std::string& confpath, const std::string& log_prefix)
     if (!g_is_glog_init) {
         ::google::InitGoogleLogging(log_prefix.c_str());
         utils::SetupLog(log_prefix);
+        FLAGS_tera_log_prefix = log_prefix;
+    	// start log cleaner
+    	if (FLAGS_tera_info_log_clean_enable) {
+    	    common::LogCleaner::StartCleaner();
+    		LOG(INFO) << "start log cleaner";
+    	} else {
+    		LOG(INFO) << "log cleaner is disable";
+    	}
         g_is_glog_init = true;
     }
 
diff --git a/src/sdk/client_impl.h b/src/sdk/client_impl.h
index f401111f3..246e7608d 100644
--- a/src/sdk/client_impl.h
+++ b/src/sdk/client_impl.h
@@ -8,9 +8,11 @@
 #include "common/thread_pool.h"
 #include "proto/master_rpc.pb.h"
 #include "proto/tabletnode_client.h"
+#include "sdk/sdk_perf.h"
 #include "sdk/sdk_zk.h"
+#include "sdk/timeoracle_client_impl.h"
 #include "tera.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 using std::string;
 
@@ -97,6 +99,8 @@ class ClientImpl : public Client {
                          string* str_result,
                          ErrorCode* err);
 
+    virtual Transaction* NewGlobalTransaction();
+
     bool ShowTableSchema(const string& name, TableSchema* meta, ErrorCode* err);
 
     bool ShowTablesInfo(const string& name, TableMeta* meta,
@@ -117,6 +121,10 @@ class ClientImpl : public Client {
     void CloseTable(const string& table_name);
     TableImpl* OpenTableInternal(const string& table_name, ErrorCode* err);
 
+    bool IsClientAlive(const string& path);
+
+    string ClientSession();
+
 private:
     bool ListInternal(std::vector<TableInfo>* table_list,
                       std::vector<TabletInfo>* tablet_list,
@@ -147,10 +155,13 @@ class ClientImpl : public Client {
                           bool is_brief,
                           ErrorCode* err);
 
+    bool RegisterSelf();
+
 private:
     ClientImpl(const ClientImpl&);
     void operator=(const ClientImpl&);
     ThreadPool thread_pool_;
+    ThreadPool* gtxn_thread_pool_;
 
     std::string user_identity_;
     std::string user_passcode_;
@@ -160,7 +171,11 @@ class ClientImpl : public Client {
     ///    we have to access zookeeper whenever we need master_addr or root_table_addr.
     /// if there is cluster_,
     ///    we save master_addr & root_table_addr in cluster_, access zookeeper only once.
+    sdk::ClientZkAdapterBase* client_zk_adapter_;
     sdk::ClusterFinder* cluster_;
+    sdk::ClusterFinder* tso_cluster_;
+    sdk::PerfCollecter* collecter_;
+    std::string session_str_;
 
     Mutex open_table_mutex_;
     struct TableHandle {
diff --git a/src/sdk/global_txn.cc b/src/sdk/global_txn.cc
new file mode 100644
index 000000000..a003cbd64
--- /dev/null
+++ b/src/sdk/global_txn.cc
@@ -0,0 +1,1142 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include <functional>
+#include <thread> 
+
+#include "common/metric/metric_counter.h"
+#include "common/this_thread.h"
+#include "common/thread.h"
+#include "proto/table_meta.pb.h"
+#include "proto/tabletnode_rpc.pb.h"
+#include "sdk/global_txn.h"
+#include "sdk/read_impl.h"
+#include "sdk/timeoracle_client_impl.h"
+
+DECLARE_bool(tera_gtxn_test_opened);
+DECLARE_string(tera_gtxn_test_flagfile);
+DECLARE_int32(tera_gtxn_get_waited_times_limit);
+DECLARE_int32(tera_gtxn_timeout_ms);
+DECLARE_bool(tera_sdk_tso_client_enabled);
+
+namespace tera {
+
+extern tera::MetricCounter gtxn_read_cnt;
+extern tera::MetricCounter gtxn_read_fail_cnt;
+extern tera::MetricCounter gtxn_read_retry_cnt;
+extern tera::MetricCounter gtxn_read_rollback_cnt;
+extern tera::MetricCounter gtxn_read_rollforward_cnt;
+extern tera::MetricCounter gtxn_commit_cnt;
+extern tera::MetricCounter gtxn_commit_fail_cnt;
+extern tera::MetricCounter gtxn_prewrite_cnt;
+extern tera::MetricCounter gtxn_prewrite_fail_cnt;
+extern tera::MetricCounter gtxn_primary_cnt;
+extern tera::MetricCounter gtxn_primary_fail_cnt;
+extern tera::MetricCounter gtxn_secondaries_cnt;
+extern tera::MetricCounter gtxn_secondaries_fail_cnt;
+extern tera::MetricCounter gtxn_acks_cnt;
+extern tera::MetricCounter gtxn_acks_fail_cnt;
+extern tera::MetricCounter gtxn_notifies_cnt;
+extern tera::MetricCounter gtxn_notifies_fail_cnt;
+
+// Factory for global transactions.
+// Returns a new GlobalTxn bound to `client`, or NULL (after logging) when
+// `client` is NULL. `thread_pool` and `tso_cluster` are handed to the
+// constructor without being checked.
+Transaction* GlobalTxn::NewGlobalTxn(tera::Client* client,
+                                     common::ThreadPool* thread_pool,
+                                     sdk::ClusterFinder* tso_cluster) {
+    if (client == NULL) {
+        // only `client` is validated here, so the message must not blame tso_cluster
+        LOG(ERROR) << "client is NULL";
+        return NULL;
+    }
+    return new GlobalTxn(client, thread_pool, tso_cluster);
+}
+
+// Builds a transaction in its initial state and picks its start timestamp.
+// The start timestamp comes from one of three sources:
+//   - FLAGS_tera_gtxn_test_opened:            gtxn_internal_->TEST_Init(flagfile)
+//   - !FLAGS_tera_sdk_tso_client_enabled:     get_micros()
+//   - otherwise:                              one timestamp from the timeoracle client
+// A timeoracle result of 0 marks the transaction as failed with kGTxnTimestampLost.
+GlobalTxn::GlobalTxn(tera::Client* client,
+        common::ThreadPool* thread_pool,
+        sdk::ClusterFinder* tso_cluster) :
+    gtxn_internal_(new GlobalTxnInternal(client)),
+    status_returned_(false),
+    primary_write_(NULL),
+    writes_size_(0), 
+    commit_ts_(0),
+    isolation_level_(IsolationLevel::kSnapshot),
+    serialized_primary_(""),
+    finish_(false),
+    finish_cond_(&finish_mutex_),
+    has_commited_(false),
+    user_commit_callback_(NULL),
+    user_commit_context_(NULL),
+    thread_pool_(thread_pool),
+    tso_cluster_(tso_cluster),
+    timeout_ms_(FLAGS_tera_gtxn_timeout_ms),
+    all_task_pushed_(false) {
+    if (FLAGS_tera_gtxn_test_opened) {
+        VLOG(12) << "conf_file = " << FLAGS_tera_gtxn_test_flagfile;
+        start_ts_ = gtxn_internal_->TEST_Init(FLAGS_tera_gtxn_test_flagfile);
+    } else if (!FLAGS_tera_sdk_tso_client_enabled) {
+        start_ts_ = get_micros();
+    } else {
+        timeoracle::TimeoracleClientImpl tsoc(thread_pool_, tso_cluster_); 
+        start_ts_ = tsoc.GetTimestamp(1);
+        // 0 is treated as "no timestamp obtained"
+        if (start_ts_ == 0) {
+            status_.SetFailed(ErrorCode::kGTxnTimestampLost);
+            status_returned_ = true;
+        }
+    }
+    // the prewrite timestamp starts out equal to the start timestamp
+    prewrite_start_ts_ = start_ts_;
+    gtxn_internal_->SetStartTimestamp(start_ts_);
+}
+
+// Nothing is released explicitly here; cleanup is left to the members' own destructors.
+GlobalTxn::~GlobalTxn() {
+}
+
+// Selects the isolation level of the transaction.
+// Asserts that the transaction has not been committed yet.
+void GlobalTxn::SetIsolation(const IsolationLevel& isolation_level) {
+    assert(!has_commited_.load());
+    isolation_level_ = isolation_level;
+}
+
+// Overrides the timeout (milliseconds) that the constructor took from
+// FLAGS_tera_gtxn_timeout_ms.
+void GlobalTxn::SetTimeout(int64_t timeout_ms) {
+    timeout_ms_ = timeout_ms;
+}
+
+// Returns the current timeout of the transaction in milliseconds.
+int64_t GlobalTxn::Timeout() {
+    return timeout_ms_;
+}
+
+// Finishes a user read: bumps gtxn_read_cnt, records the finish time, copies
+// `status` (type and reason) into the user's reader and schedules
+// RowReaderImpl::RunCallback for it on thread_pool_.
+void GlobalTxn::SetReaderStatusAndRunCallback(RowReaderImpl* reader_impl, 
+                                              ErrorCode* status) {
+    gtxn_read_cnt.Inc();
+    gtxn_internal_->PerfReadDelay(0, get_micros()); // finish_time
+    VLOG(12) << "[gtxn][get][" << start_ts_ << "][status] :" << status->ToString();
+    reader_impl->SetError(status->GetType(), status->GetReason());
+    // RunCallback is executed by the pool, not by the calling thread
+    thread_pool_->AddTask(std::bind(&RowReaderImpl::RunCallback, reader_impl));
+}
+
+// Reads the columns selected by `row_reader` inside this transaction.
+// One Cell is created per (column family, qualifier) pair of the reader and each
+// cell is fetched with its own asynchronous internal read (AsyncGetCell).
+// For a synchronous reader the call blocks in Wait() and returns the reader's
+// final error; for an asynchronous reader it returns right after the reads have
+// been issued and the result is delivered through the reader's callback.
+// When the transaction is already committed or the reader fails verification,
+// the error is set on the reader, its callback is scheduled and the error is returned.
+// NOTE(review): if the reader yields no cell at all, nothing here completes it;
+// presumably VerifyUserRowReader rejects such readers -- confirm.
+ErrorCode GlobalTxn::Get(RowReader* row_reader) {
+    assert(row_reader != NULL);
+    gtxn_internal_->PerfReadDelay(get_micros(), 0); // begin_time
+    gtxn_internal_->TEST_GetSleep();
+
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(row_reader);
+    reader_impl->SetTransaction(this);
+    // Sample the mode before any read is issued: an asynchronous user callback may
+    // delete the reader as soon as the last cell arrives, so it must not be
+    // touched after the reads have been started.
+    const bool is_async = reader_impl->IsAsync();
+
+    // Pre Check can read
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    if (has_commited_.load()) {
+        std::string reason = "get failed, txn has commited @ [" +
+            std::to_string(start_ts_) + "," + std::to_string(commit_ts_) + "]";
+        LOG(ERROR) << "[gtxn][get][" << start_ts_ <<"] " << reason;
+        status.SetFailed(ErrorCode::kGTxnOpAfterCommit, reason);
+        SetReaderStatusAndRunCallback(reader_impl, &status);
+        return status;
+    }
+
+    Table* table = row_reader->GetTable();
+    const std::string& row_key = row_reader->RowKey();
+    // Check UserReader and Build cells
+    if (!gtxn_internal_->VerifyUserRowReader(row_reader)) {
+        status = reader_impl->GetError();
+        SetReaderStatusAndRunCallback(reader_impl, &status);
+        return status;
+    }
+
+    std::vector<Cell*> cells;
+    // bind by reference: copying every (family, qualifier set) entry is not needed
+    for (const auto& cf_entry : row_reader->GetReadColumnList()) {
+        const std::string& column_family = cf_entry.first;
+        for (const std::string& qualifier : cf_entry.second) {
+            cells.push_back(new Cell(table, row_key, column_family, qualifier));
+        }
+    }
+    int expected_cells_cnt = cells.size();
+
+    InternalReaderContext* ctx = new InternalReaderContext(expected_cells_cnt, reader_impl, this);
+    // Register every cell before the first read is started, so that cell_map is
+    // no longer being inserted into while callbacks of earlier cells look it up.
+    for (Cell* cell : cells) {
+        ctx->cell_map[cell] = 0; // cell* -> try_time, default = 0
+    }
+    for (Cell* cell : cells) {
+        AsyncGetCell(cell, reader_impl, ctx);
+    }
+
+    // sync wait and set status
+    if (!is_async) {
+        reader_impl->Wait();
+        status = reader_impl->GetError();
+    }
+    return status;
+}
+
+// Starts one asynchronous internal read for `cell`.
+// The internal reader fetches three columns of the cell's column family --
+// cell->LockName(), cell->WriteName() and the data qualifier itself -- over the
+// time range [0, kMaxTimeStamp] with up to UINT32_MAX versions.
+// Its callback does no work itself: it re-schedules DoGetCellReaderCallback on
+// the transaction's thread_pool_. `user_reader_impl` is not used in this body.
+void GlobalTxn::AsyncGetCell(Cell* cell,
+                             RowReaderImpl* user_reader_impl, 
+                             InternalReaderContext* ctx) {
+    VLOG(12) << "[gtxn][get][" << start_ts_ << "] " 
+             << gtxn_internal_->DebugString(*cell, "TryGet times(" + std::to_string(ctx->cell_map[cell]) + ")");
+    
+    Table* table = cell->Table();
+    RowReader* reader = table->NewRowReader(cell->RowKey());
+    reader->AddColumn(cell->ColFamily(), cell->LockName());
+    reader->AddColumn(cell->ColFamily(), cell->WriteName());
+    reader->AddColumn(cell->ColFamily(), cell->Qualifier());
+    reader->SetTimeRange(0, kMaxTimeStamp);
+    reader->SetMaxVersions(UINT32_MAX);
+    reader->SetCallBack([] (RowReader* r) {
+        // the transaction is recovered from the reader's context
+        CellReaderContext* ctx = (CellReaderContext*)r->GetContext();
+        GlobalTxn* gtxn = static_cast<GlobalTxn*>(ctx->internal_reader_ctx->gtxn);
+        gtxn->thread_pool_->AddTask(std::bind(&GlobalTxn::DoGetCellReaderCallback, 
+            gtxn, static_cast<RowReaderImpl*>(r)));
+    });
+    // per-read context linking the cell to the context shared by the whole user read
+    reader->SetContext(new CellReaderContext(cell, ctx));
+    table->Get(reader);
+}
+
+// Handles the result of one internal cell read.
+//   - read error, or column family missing from the result: merge the failure
+//   - cell locked by another transaction: back off (cleaning the lock up once the
+//     retry count reaches FLAGS_tera_gtxn_get_waited_times_limit - 1) and, if that
+//     succeeded, issue the read again without merging anything yet
+//   - otherwise: resolve the value with FindValueFromResultRow; kNotFound on failure
+// Every path except the retry ends in MergeCellToRow, which deletes `reader`.
+void GlobalTxn::DoGetCellReaderCallback(RowReader* reader) {
+    ErrorCode status = reader->GetError();
+    if (status.GetType() != ErrorCode::kOK) {
+        MergeCellToRow(reader, status);
+        return;
+    }
+
+    RowReader::TRow row;
+    reader->ToMap(&row);
+    CellReaderContext* ctx = (CellReaderContext*)reader->GetContext();
+    Cell* cell = ctx->cell;
+    if (row.find(cell->ColFamily()) == row.end()) {
+        status.SetFailed(ErrorCode::kNotFound, "columnfamily not found");
+        MergeCellToRow(reader, status);
+        return;
+    }
+    // local check lock
+    if (gtxn_internal_->IsLockedByOthers(row, *cell)) {
+        // sync operate
+        status.SetFailed(ErrorCode::kOK);
+        InternalReaderContext* internal_reader_ctx = ctx->internal_reader_ctx;
+        bool do_clean = false;
+        // check clean lock before reading the cell the next time:
+        // when read times >= limit - 1, do the clean lock operations
+        if (internal_reader_ctx->cell_map[cell] >= FLAGS_tera_gtxn_get_waited_times_limit - 1) {
+            do_clean = true;   
+        }
+        BackoffAndMaybeCleanupLock(row, *cell, do_clean, &status);
+        if (status.GetType() == ErrorCode::kOK) {
+            // call AsyncGetCell again for the next attempt;
+            // don't merge until the next attempt is ok or failed
+            // NOTE(review): this path returns without deleting `reader` -- confirm who frees it
+            ++ internal_reader_ctx->cell_map[cell];
+            gtxn_read_retry_cnt.Inc();
+            AsyncGetCell(cell, 
+                         static_cast<RowReaderImpl*>(internal_reader_ctx->user_reader), 
+                         internal_reader_ctx);
+            return;
+        }
+    } else if (!FindValueFromResultRow(row, cell)) {
+        status.SetFailed(ErrorCode::kNotFound, "build data col from write col failed");
+    }
+    MergeCellToRow(reader, status);
+}
+
+// Stores the final per-cell `status` in the reader's CellReaderContext, hands
+// that context to GetCellCallback() for aggregation, and then frees the
+// internal reader.
+void GlobalTxn::MergeCellToRow(RowReader* internal_reader, 
+                               const ErrorCode& status) {
+    CellReaderContext* cell_ctx = (CellReaderContext*)internal_reader->GetContext();
+    cell_ctx->status = status;
+    VLOG(12) << "[gtxn][get][" << start_ts_ << "] " 
+             << gtxn_internal_->DebugString(*(cell_ctx->cell), status.ToString());
+    GetCellCallback(cell_ctx);
+    // The internal reader is single-use: a retry allocates a new RowReader.
+    delete internal_reader;
+}
+
+// Aggregates the result of one finished cell read into the shared
+// InternalReaderContext and, when the last expected cell has reported,
+// publishes the combined result to the user's reader.
+//
+// Per-cell rules (evaluated under mu_):
+//   * kOK and no earlier failure -> the cell is appended to `results`;
+//   * kOK after an earlier failure -> the value is dropped, the recorded
+//     error is kept;
+//   * kNotFound                   -> counted in not_found_cnt;
+//   * any other error             -> counted as a failure, collected results
+//     are discarded and the error is remembered in last_err.
+//
+// Final status: the last recorded error if any cell failed, kNotFound if
+// every cell was not found, kOK otherwise.
+void GlobalTxn::GetCellCallback(CellReaderContext* ctx) {
+    InternalReaderContext* internal_reader_ctx = ctx->internal_reader_ctx;
+    Cell* cell = ctx->cell;
+    bool last_cell = false;
+    {
+        MutexLock lock(&mu_);
+        ++internal_reader_ctx->active_cell_cnt;
+        const ErrorCode::ErrorCodeType cell_status = ctx->status.GetType();
+        if (cell_status == ErrorCode::kOK) {
+            // Previously a successful cell arriving after a failed one fell
+            // through to the error branch, was counted as a failure and
+            // overwrote last_err with kOK, so the user saw "OK" with an empty
+            // result. A success must never touch the failure bookkeeping.
+            if (internal_reader_ctx->fail_cell_cnt == 0) {
+                KeyValuePair* kv = internal_reader_ctx->results.add_key_values();
+                kv->set_key(cell->RowKey());
+                kv->set_column_family(cell->ColFamily());
+                kv->set_qualifier(cell->Qualifier());
+                kv->set_timestamp(cell->Timestamp());
+                kv->set_value(cell->Value());
+            }
+        } else if (cell_status != ErrorCode::kNotFound) {
+            ++internal_reader_ctx->fail_cell_cnt;
+            internal_reader_ctx->results.clear_key_values();
+            internal_reader_ctx->last_err = ctx->status;
+        } else {
+            ++internal_reader_ctx->not_found_cnt;
+        }
+        last_cell = (internal_reader_ctx->active_cell_cnt == internal_reader_ctx->expected_cell_cnt);
+    }
+    if (last_cell) {
+        // Only the thread that observed the final cell reaches this point, so
+        // the context is read here without holding mu_.
+        ErrorCode last_err = internal_reader_ctx->last_err;
+        RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(internal_reader_ctx->user_reader);
+        if (internal_reader_ctx->fail_cell_cnt > 0) {
+            gtxn_read_fail_cnt.Inc();
+        } else if (internal_reader_ctx->not_found_cnt == internal_reader_ctx->expected_cell_cnt) {
+            // all cell not found
+            last_err.SetFailed(ErrorCode::kNotFound); 
+        } else {
+            reader_impl->SetResult(internal_reader_ctx->results);
+            last_err.SetFailed(ErrorCode::kOK); 
+        }
+        delete internal_reader_ctx;
+        SetReaderStatusAndRunCallback(reader_impl, &last_err);
+    }
+}
+
+// Resolves the value of `target_cell` that is visible at start_ts_ from a raw
+// multi-version row.
+//
+// The newest decodable "write" record with timestamp <= start_ts_ is located;
+// it names the data timestamp. If that record is a kPut and a data version
+// with exactly that timestamp exists, the cell's timestamp and value are set
+// and true is returned. In every other case (columns missing, no usable write
+// record, record is not a put, data version missing) false is returned and
+// `target_cell` is left untouched.
+//
+// Note: like the original, this may insert an empty column-family entry into
+// `result_row` through operator[].
+bool GlobalTxn::FindValueFromResultRow(RowReader::TRow& result_row, Cell* target_cell) {
+    // Look the family up once and work on references; the previous code
+    // repeated the lookup and deep-copied both version maps on every call.
+    auto& family = result_row[target_cell->ColFamily()];
+    auto write_col_it = family.find(target_cell->WriteName());
+    auto data_col_it = family.find(target_cell->Qualifier());
+
+    // both the write column and the data column must exist
+    if (write_col_it == family.end() || data_col_it == family.end()) {
+        return false;
+    }
+    const auto& write_col = write_col_it->second;
+    const auto& data_col = data_col_it->second;
+
+    // newest write record first
+    for (auto k1 = write_col.rbegin(); k1 != write_col.rend(); ++k1) {
+        const int64_t write_ts = k1->first;
+        const std::string& write_value = k1->second;
+        VLOG(12) << "[gtxn][get][" << start_ts_ << "] found write col, ts=" 
+                 << write_ts << ", internal val = " << write_value;
+        int write_type = -1;
+        int64_t data_ts = -1;
+        // skip versions newer than our snapshot and records that do not decode
+        if (write_ts > start_ts_ || !DecodeWriteValue(write_value, &write_type, &data_ts)) {
+            continue;
+        }
+        VLOG(12) << "[gtxn][get][" << start_ts_ << "] decode write col, ts=" 
+                 << write_ts << ", type=" << write_type << ", value=" << data_ts;
+        // find the data version whose timestamp equals data_ts
+        for (auto k2 = data_col.rbegin(); k2 != data_col.rend(); ++k2) {
+            VLOG(12) << "[gtxn][get][" << start_ts_ << "] found data col, ts=" 
+                     << k2->first << ", internal val = " << k2->second;
+            if (k2->first == data_ts && write_type == RowMutation::kPut) {
+                target_cell->SetTimestamp(data_ts);
+                target_cell->SetValue(k2->second);
+                return true;
+            } else if (k2->first < data_ts) {
+                // versions are visited newest-first; nothing older can match
+                VLOG(12) << "[gtxn][get][" << start_ts_ 
+                         << "] data cell version not found, v=" << k2->first;
+                break;
+            }
+        }
+        VLOG(12) << "[gtxn][get][" << start_ts_ << "] check data col failed, no data";
+        // only the newest visible write record is considered
+        break;
+    }
+    VLOG(12) << "[gtxn][get][" << start_ts_ 
+             << "] write col versions count" << write_col.size();
+    return false;
+}
+
+// Handles a cell that a reader found locked by another transaction.
+//
+// The newest lock record older than start_ts_ is decoded to learn the lock
+// owner's primary. Depending on whether that primary is still locked this
+// either waits (100 ms), removes the stale lock (CleanLock) or rolls the cell
+// forward (RollForward). `try_clean` forces a cleanup instead of waiting.
+// `status` carries the outcome back to the caller.
+void GlobalTxn::BackoffAndMaybeCleanupLock(RowReader::TRow& row, const Cell& cell,
+                                           const bool try_clean, ErrorCode* status) {
+    VLOG(12) << gtxn_internal_->DebugString(cell, "[gtxn][get][" + 
+            std::to_string(start_ts_) + " backoff or cleanup lock");
+
+    // Locate the newest lock version written before our start timestamp.
+    int64_t lock_ts = -1;
+    int lock_type = -1;
+    tera::PrimaryInfo primary_info;
+    auto& lock_versions = row[cell.ColFamily()][cell.LockName()];
+    for (auto ver = lock_versions.rbegin(); ver != lock_versions.rend(); ++ver) {
+        if (ver->first >= start_ts_) {
+            continue;
+        }
+        lock_ts = ver->first;
+        VLOG(12) << "lock_ts=" << lock_ts << ", primary_str=" << ver->second;
+        if (!DecodeLockValue(ver->second, &lock_type, &primary_info)) {
+            status->SetFailed(ErrorCode::kGTxnPrimaryLost, "can't found primary");
+            return;
+        }
+        break;
+    }
+
+    // Shared policy: clean the lock when asked to, or when the owner no longer
+    // looks alive; otherwise give the owner a little time.
+    auto clean_or_wait = [&]() {
+        if (try_clean || !gtxn_internal_->SuspectLive(primary_info)) {
+            CleanLock(cell, primary_info, status);
+        } else {
+            // TODO add a better sleep strategy
+            ThisThread::Sleep(100);
+        }
+    };
+
+    // Check the state of the primary lock.
+    const std::string process = "[gtxn][get][" + std::to_string(start_ts_) 
+        + "][check locked and writed]";
+    const bool primary_locked = gtxn_internal_->PrimaryIsLocked(primary_info, lock_ts, status);
+    if (status->GetType() != ErrorCode::kOK && status->GetType() != ErrorCode::kNotFound) {
+        LOG(ERROR) << gtxn_internal_->DebugString(cell, process + " failed," + status->ToString());
+        return;
+    }
+
+    if (primary_locked) {
+        // NotFound means : other txn on prewrite process
+        // and this cell locked but primary unlocked(failed)
+        VLOG(12) << gtxn_internal_->DebugString(cell, process + " succeed");
+        // primary is still in prewrite: clean or wait
+        clean_or_wait();
+        return;
+    }
+
+    if (gtxn_internal_->IsPrimary(cell, primary_info)) {
+        VLOG(12) << gtxn_internal_->DebugString(cell, process + ", ignore(primary)");
+        return;
+    }
+
+    VLOG(12) << gtxn_internal_->DebugString(cell, process + ", will do rollforward");
+    // the primary may already be committed: try to roll this cell forward
+    RollForward(cell, primary_info, lock_type, status);
+    if (status->GetType() == ErrorCode::kGTxnPrimaryLost) {
+        VLOG(12) << gtxn_internal_->DebugString(cell, process + ", rollforward failed, try clean lock");
+        // the primary's prewrite failed
+        status->SetFailed(ErrorCode::kOK);
+        clean_or_wait();
+    }
+}
+
+// Removes a stale lock left by another transaction: deletes the lock column
+// versions up to start_ts_ on the primary cell (when it differs from `cell`)
+// and then on `cell` itself. A failing mutation's error is copied into
+// `status`; the error of the second mutation overwrites that of the first.
+void GlobalTxn::CleanLock(const Cell& cell, const tera::PrimaryInfo& primary, ErrorCode* status) {
+    gtxn_read_rollback_cnt.Inc();
+    Table* primary_table = gtxn_internal_->FindTable(primary.table_name());
+    assert(primary_table != NULL);
+    const Cell primary_cell(primary_table, primary.row_key(), 
+                            primary.column_family(), primary.qualifier());
+
+    // Is `cell` itself the primary? If so only one delete is needed.
+    const bool cell_is_primary = cell.Table()->GetName() == primary_table->GetName() 
+                                 && cell.RowKey() == primary_cell.RowKey() 
+                                 && cell.ColFamily() == primary_cell.ColFamily() 
+                                 && cell.LockName() == primary_cell.LockName();
+    if (!cell_is_primary) {
+        VLOG(12) << "[gtxn][get][" << start_ts_ << "] " 
+                 << gtxn_internal_->DebugString(primary_cell, "clean lock primary");
+        RowMutation* primary_del = primary_table->NewRowMutation(primary_cell.RowKey());
+        // delete all info between [0, start_ts_] at lock col
+        primary_del->DeleteColumns(primary_cell.ColFamily(), primary_cell.LockName(), start_ts_);
+        primary_table->ApplyMutation(primary_del);
+        if (primary_del->GetError().GetType() != tera::ErrorCode::kOK) {
+            LOG(WARNING) << primary_del->GetError().ToString();
+            *status = primary_del->GetError();
+        }
+        delete primary_del;
+    }
+
+    VLOG(12) << "[gtxn][get][" << start_ts_ << "] " 
+             << gtxn_internal_->DebugString(cell, "clean lock this cell");
+    Table* cell_table = cell.Table();
+    RowMutation* cell_del = cell_table->NewRowMutation(cell.RowKey());
+    // delete all info between [0, start_ts_] at lock col
+    cell_del->DeleteColumns(cell.ColFamily(), cell.LockName(), start_ts_);
+    cell_table->ApplyMutation(cell_del);
+    if (cell_del->GetError().GetType() != tera::ErrorCode::kOK) {
+        LOG(WARNING) << "[gtxn][get][" << start_ts_ << "] clean lock failed :" 
+                     << cell_del->GetError().ToString();
+        *status = cell_del->GetError();
+    }
+    delete cell_del;
+}
+
+// Completes another transaction's commit on `cell` when its primary has
+// already been committed.
+//
+// All versions of the primary's "write" column are read synchronously and
+// scanned for the record whose data timestamp equals the primary's
+// gtxn_start_ts. If found, its timestamp is the commit timestamp: a matching
+// write record is put on `cell` and the cell's lock is deleted.
+//
+// `status` is set to:
+//   * kGTxnPrimaryLost when the primary write column is missing, an older
+//     write record is reached first, or no matching record exists;
+//   * the reader's / mutation's error when the read or the mutation fails;
+//   * left unchanged on success.
+void GlobalTxn::RollForward(const Cell& cell, const tera::PrimaryInfo& primary, 
+                            int lock_type, ErrorCode* status) {
+    gtxn_read_rollforward_cnt.Inc();
+    // find primary write col start_ts
+    Table* pri_table = gtxn_internal_->FindTable(primary.table_name());
+    assert(pri_table != NULL);
+    std::unique_ptr<Cell> primary_cell(new Cell(pri_table, primary.row_key(), 
+                                                primary.column_family(), 
+                                                primary.qualifier()));
+    // Owned by unique_ptr so every return path releases the reader.
+    std::unique_ptr<RowReader> reader(pri_table->NewRowReader(primary_cell->RowKey()));
+    reader->AddColumn(primary_cell->ColFamily(), primary_cell->WriteName());
+    reader->SetTimeRange(0, kMaxTimeStamp);
+    reader->SetMaxVersions(UINT32_MAX);
+    pri_table->Get(reader.get());
+    if (reader->GetError().GetType() != ErrorCode::kOK) {
+        if (reader->GetError().GetType() == ErrorCode::kNotFound) {
+            status->SetFailed(ErrorCode::kGTxnPrimaryLost, "primary lost, not 'lock' and 'write'");
+        } else {
+            // Assign first, then log: the old code logged the stale reason
+            // held in *status instead of the reader's error.
+            *status = reader->GetError();
+            LOG(WARNING) << status->GetReason();
+        }
+        return;
+    }
+
+    int64_t commit_ts = -1;
+    int64_t data_ts = -1;
+    for (; !reader->Done(); reader->Next()) {
+        // decode primary cell write col value
+        int write_type = -1;
+        int64_t decoded_ts = -1;
+        if (!DecodeWriteValue(reader->Value(), &write_type, &decoded_ts)) {
+            // Malformed record: skip it rather than comparing against
+            // uninitialised or stale output values.
+            continue;
+        }
+        data_ts = decoded_ts;
+        VLOG(12) << "[gtxn][get][ " << start_ts_ << "] decode primary 'write', ts=" << reader->Timestamp()
+                 << ", type=" << write_type << ", value=" << data_ts;
+        VLOG(12) << "[gtxn][get][ " << start_ts_ << "] primary start_ts=" << primary.gtxn_start_ts();
+        if (data_ts > 0 && data_ts < primary.gtxn_start_ts()) {
+            // reached a record older than the primary's transaction
+            status->SetFailed(ErrorCode::kGTxnPrimaryLost, "primary lost, not 'lock' and 'write'");
+            return;
+        } else if (data_ts == primary.gtxn_start_ts()) {
+            commit_ts = reader->Timestamp();
+            break;
+        }
+    }
+    reader.reset();
+
+    if (commit_ts > 0) {
+        RowMutation* this_mu = cell.Table()->NewRowMutation(cell.RowKey());
+        this_mu->Put(cell.ColFamily(), 
+                     cell.WriteName(), 
+                     EncodeWriteValue(lock_type, data_ts), 
+                     commit_ts);
+        this_mu->DeleteColumns(cell.ColFamily(), cell.LockName(), commit_ts);
+        cell.Table()->ApplyMutation(this_mu);
+        if (this_mu->GetError().GetType() != tera::ErrorCode::kOK) {
+            LOG(WARNING) << this_mu->GetError().GetReason();
+            *status = this_mu->GetError();
+        }
+        delete this_mu;
+    } else {
+        status->SetFailed(ErrorCode::kGTxnPrimaryLost, "not found primary cell");
+    }
+}
+
+// Buffers write `w` under its (table, row key) bucket while holding mu_.
+// writes_cnt_ counts buckets (rows), so it is bumped only when a new bucket
+// is created.
+void GlobalTxn::SaveWrite(const std::string& tablename, const std::string& row_key, 
+                         tera::Write& w) {
+    MutexLock lock(&mu_);
+    TableWithRowkey bucket_key(tablename, row_key);
+    auto bucket = writes_.find(bucket_key);
+    if (bucket == writes_.end()) {
+        // first write for this row
+        writes_[bucket_key] = std::vector<Write>(1, w);
+        writes_cnt_.Inc();
+        return;
+    }
+    bucket->second.push_back(w);
+}
+
+// Records the transaction's final status exactly once: the first caller wins,
+// later calls are ignored. Guarded by mu_.
+void GlobalTxn::SetLastStatus(ErrorCode* status) {
+    MutexLock lock(&mu_);
+    if (status_returned_) {
+        return;
+    }
+    VLOG(12) << "[gtxn][commit][status][" << start_ts_ << "]" << status->ToString();
+    status_.SetFailed(status->GetType(), status->GetReason());
+    status_returned_ = true;
+}
+
+// Finishes a commit: updates the success/failure counters, records the finish
+// time, then either invokes the user's commit callback (async mode) or marks
+// the transaction finished and signals finish_cond_ (sync mode).
+void GlobalTxn::RunUserCallback() {
+    if (status_.GetType() != ErrorCode::kOK) {
+        gtxn_commit_fail_cnt.Inc();
+    } else {
+        gtxn_commit_cnt.Inc();
+    }
+    gtxn_internal_->PerfCommitDelay(0, get_micros()); // finish_time
+
+    if (user_commit_callback_ == NULL) {
+        // no callback registered: signal completion through finish_cond_
+        MutexLock lock(&finish_mutex_);
+        VLOG(12) << "[gtxn][commit][finish][" << start_ts_ << "]" << status_.ToString();
+        finish_ = true;
+        finish_cond_.Signal(); 
+        return;
+    }
+    VLOG(12) << "[gtxn][commit][callback][" << start_ts_ << "]" << status_.ToString();
+    user_commit_callback_(this);
+}
+
+// Entry point of the commit protocol.
+//
+// Fails immediately with kGTxnOpAfterCommit when an earlier put failed or the
+// transaction was already committed; succeeds immediately with kOK when no
+// write was buffered. Otherwise InternalCommit() is scheduled on thread_pool_.
+// When no user commit callback is set this call waits for completion
+// (WaitForComplete) before returning status_; with a callback it returns
+// status_ right after scheduling.
+// NOTE(review): in the callback case status_ is read here while the commit
+// may still be running on the pool - confirm callers do not rely on it.
+ErrorCode GlobalTxn::Commit() { 
+    /// begin commit
+    gtxn_internal_->TEST_Sleep(); 
+    gtxn_internal_->PerfCommitDelay(get_micros(), 0); // begin_time
+    ErrorCode status;
+    // reject: a previous put failed, or Commit() was already called
+    if (put_fail_cnt_.Get() > 0 || has_commited_) {
+        std::string reason("commit failed, has_commited[" + 
+                std::to_string(has_commited_.load()) +
+                "], put_fail_cnt[" + std::to_string(put_fail_cnt_.Get()) + "]");
+        VLOG(12) << reason;
+        status.SetFailed(ErrorCode::kGTxnOpAfterCommit, reason);
+        SetLastStatus(&status);
+        // Callback Point : put applyMutation failed or has commited
+        RunUserCallback();
+        return status;
+    }
+    has_commited_ = true;
+    // nothing was written: report kOK without contacting any server
+    if (writes_cnt_.Get() == 0) {
+        status.SetFailed(ErrorCode::kOK, "No modification exists");
+        SetLastStatus(&status);
+        // Callback Point
+        RunUserCallback();
+        return status;
+    }
+    // run the prewrite/commit state machine on the thread pool
+    thread_pool_->AddTask(std::bind(&GlobalTxn::InternalCommit, this));
+
+    // without a user callback, wait here until the commit has finished
+    if (user_commit_callback_ == NULL) {
+        WaitForComplete();
+    }
+    return status_;
+}
+
+// First stage of the commit, run on thread_pool_: optionally obtains a fresh
+// prewrite timestamp, selects the primary write, and starts the prewrite
+// chain with the first row.
+//
+// Under kReadCommitedSnapshot a new prewrite timestamp is taken from (in
+// order) the test hook, the local clock when the TSO client is disabled, or
+// the timestamp oracle. A prewrite timestamp older than start_ts_ aborts the
+// commit with kGTxnTimestampLost.
+void GlobalTxn::InternalCommit() {
+    gtxn_internal_->SetCommitDuration(timeout_ms_);
+
+    /// begin prewrite
+    gtxn_internal_->TEST_Sleep();
+
+    // on ReadCommitedSnapshot level will get new timestamp before prewrite
+    if (isolation_level_ == IsolationLevel::kReadCommitedSnapshot) {
+        if (FLAGS_tera_gtxn_test_opened) {
+            prewrite_start_ts_ = gtxn_internal_->TEST_GetPrewriteStartTimestamp();
+        } else if (!FLAGS_tera_sdk_tso_client_enabled) {
+            // Fix: this branch used to overwrite start_ts_, leaving
+            // prewrite_start_ts_ stale so the check below compared an old
+            // prewrite timestamp against a brand-new start timestamp.
+            prewrite_start_ts_ = get_micros();
+        } else {
+            timeoracle::TimeoracleClientImpl tsoc(thread_pool_, tso_cluster_);
+            prewrite_start_ts_ = tsoc.GetTimestamp(1);
+        }
+        if (prewrite_start_ts_ < start_ts_) {
+            ErrorCode status;
+            LOG(ERROR) << "[gtxn][prewrite][" << start_ts_ <<"] get prewrite new ts failed";
+            status.SetFailed(ErrorCode::kGTxnTimestampLost, "get prewrite new ts failed");
+            SetLastStatus(&status);
+            RunUserCallback();
+            return;
+        }
+        gtxn_internal_->SetPrewriteStartTimestamp(prewrite_start_ts_);
+    }
+    VLOG(12) << "[gtxn][prewrite][" << start_ts_ << "]";
+    gtxn_internal_->PerfPrewriteDelay(get_micros(), 0); // begin_time
+    gtxn_prewrite_cnt.Inc();
+
+    // The first write of the first buffered row acts as the primary; its
+    // serialized form is embedded in every lock written during prewrite.
+    prewrite_iterator_ = writes_.begin();
+    primary_write_ = &(prewrite_iterator_->second[0]);
+    primary_write_->Serialize(prewrite_start_ts_, 
+                              gtxn_internal_->GetClientSession(), 
+                              &serialized_primary_);
+    AsyncPrewrite(&prewrite_iterator_->second);
+}
+
+// [prewrite] Step(1): 
+//      read "lock", "write" column from tera
+//
+// Asynchronously prewrites one row through a single-row transaction: a reader
+// built for the row's writes is issued inside that transaction, and its
+// callback continues in DoPrewriteReaderCallback() on thread_pool_.
+// If the global transaction has already timed out, nothing is sent and the
+// prewrite fails with kGTxnPrewriteTimeout.
+//
+void GlobalTxn::AsyncPrewrite(std::vector<Write>* ws) {
+    assert(ws->size() > 0);
+    // find table and rowkey to new reader and single row txn
+    Write w = *(ws->begin());
+    Table* table = w.Table();
+    Transaction* single_row_txn = table->StartRowTransaction(w.RowKey());
+    RowReader* reader = table->NewRowReader(w.RowKey());
+    // set internal reader timeout 
+    gtxn_internal_->SetInternalSdkTaskTimeout(reader);
+    // set cf qu and timerange for reader
+    gtxn_internal_->BuildRowReaderForPrewrite(*ws, reader);
+    // set callback, context, single row txn for reader
+    reader->SetCallBack([](RowReader* r){
+        GlobalTxn* gtxn = static_cast<GlobalTxn*>(((PrewriteContext*)r->GetContext())->gtxn);
+        gtxn->thread_pool_->AddTask(std::bind(&GlobalTxn::DoPrewriteReaderCallback, gtxn, r));
+    });
+    PrewriteContext* ctx = new PrewriteContext(ws, this, w.TableName(), w.RowKey());
+    if (gtxn_internal_->IsTimeOut()) {
+        ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, "global transaction prewrite timeout");
+        VLOG(12) << "[gtxn][prewrite][stxn_read] ignored : " << ctx->DebugString();
+        // The read is never issued, so no callback will ever release these
+        // two objects; free them here (they used to leak). This must happen
+        // before RunAfterPrewriteFailed(), which ends the transaction.
+        delete reader;
+        delete single_row_txn;
+        RunAfterPrewriteFailed(ctx);
+    } else {
+        reader->SetContext(ctx);
+        // get async
+        VLOG(12) << "[gtxn][prewrite][stxn_read] invoked : " << ctx->DebugString();
+        single_row_txn->Get(reader);
+    }
+}
+
+// [prewrite] Step(2): 
+//      a) verify [prewrite] step(1) read result status and no conflict 
+//      b) write "lock" and "data" column to tera, through same single_row_txn in step(1)
+//
+// Called by [prewrite] step(1) through the reader callback.
+//
+// Ownership: this function always deletes `reader`. The single-row
+// transaction attached to the reader is deleted on every failure path and
+// handed to CommitRowTransaction() on success (DoPrewriteCallback() deletes
+// it afterwards).
+// 
+void GlobalTxn::DoPrewriteReaderCallback(RowReader* reader) {
+    PrewriteContext* ctx = (PrewriteContext*)reader->GetContext();
+    // Fetch the transaction up front: it must be released on the failure
+    // paths too, otherwise it leaks (only the reader used to be deleted).
+    SingleRowTxn* single_row_txn = static_cast<SingleRowTxn*>(reader->GetTransaction());
+    if (reader->GetError().GetType() != ErrorCode::kNotFound
+        && reader->GetError().GetType() != ErrorCode::kOK) {
+        // the read itself failed (kNotFound is acceptable: the row is empty)
+        ctx->status = reader->GetError();
+        VLOG(12) << "[gtxn][prewrite][stxn_read] failed : " << ctx->status.ToString();
+        if (gtxn_internal_->IsTimeOut() || reader->GetError().GetType() == ErrorCode::kTimeout) {
+            ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, ctx->status.ToString());
+        }
+        delete reader;
+        delete single_row_txn;
+        RunAfterPrewriteFailed(ctx);
+    } else if (gtxn_internal_->ConflictWithOtherWrite(ctx->ws, reader, &(ctx->status))) {
+        // another transaction's lock or newer write is in the way
+        VLOG(12) << "[gtxn][prewrite][stxn_read] failed : " << ctx->status.ToString();
+        delete reader;
+        delete single_row_txn;
+        RunAfterPrewriteFailed(ctx);
+    } else {
+        VLOG(12) << "[gtxn][prewrite][stxn_read] succeed, table=" << ctx->DebugString();
+        Table* t = reader->GetTable();
+        RowMutation* prewrite_mu = t->NewRowMutation(reader->RowKey());
+        // set internal task timeout
+        gtxn_internal_->SetInternalSdkTaskTimeout(prewrite_mu);
+        gtxn_internal_->BuildRowMutationForPrewrite(ctx->ws, prewrite_mu, 
+                                                    serialized_primary_);
+        
+        // commit single_row_txn
+        delete reader;
+        single_row_txn->SetContext(ctx);
+        single_row_txn->SetCommitCallback([](Transaction* single_txn) {
+            GlobalTxn* gtxn = static_cast<GlobalTxn*>(((PrewriteContext*)single_txn->GetContext())->gtxn);
+            SingleRowTxn* stxn = static_cast<SingleRowTxn*>(single_txn); 
+            gtxn->thread_pool_->AddTask(std::bind(&GlobalTxn::DoPrewriteCallback, gtxn, stxn));
+        });
+        if (gtxn_internal_->IsTimeOut()) {
+            ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, "global transaction prewrite timeout");
+            VLOG(12) << "[gtxn][prewrite][stxn_commit] ignored : " << ctx->DebugString();
+            delete single_row_txn;
+            delete prewrite_mu;
+            RunAfterPrewriteFailed(ctx);
+        } else {
+            single_row_txn->ApplyMutation(prewrite_mu);
+            VLOG(12) << "[gtxn][prewrite][stxn_commit] invoked : " << ctx->DebugString();
+            t->CommitRowTransaction(single_row_txn);
+            delete prewrite_mu;
+        }
+    }
+}
+
+// [prewrite] Step(3):
+//      verify the commit status of the single_row_txn from [prewrite] step(2);
+//      on success either prewrite the next row or, after the last row,
+//      continue with InternalCommitPhase2()
+//
+// Called by [prewrite] step(2) through the single_row_txn commit callback.
+// Takes ownership of `single_row_txn` and deletes it.
+//      
+void GlobalTxn::DoPrewriteCallback(SingleRowTxn* single_row_txn) {
+    ErrorCode status = single_row_txn->GetError();
+    PrewriteContext* ctx = (PrewriteContext*)single_row_txn->GetContext();
+    delete single_row_txn;
+    if (gtxn_internal_->IsTimeOut() || status.GetType() != ErrorCode::kOK) {
+        // wrap a timeout status for the global transaction
+        if (gtxn_internal_->IsTimeOut() || status.GetType() == ErrorCode::kTimeout) {
+            ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, status.ToString()); 
+        } else {
+            ctx->status.SetFailed(status.GetType(), status.ToString());
+        }
+        VLOG(12) << "[gtxn][prewrite][stxn_commit] failed : " << ctx->DebugString();
+        RunAfterPrewriteFailed(ctx);
+    } else if (++prewrite_iterator_ != writes_.end()) {
+        // rows are prewritten one after another: schedule the next row
+        // NOTE(review): ctx is not released on this or the next branch here -
+        // confirm where the PrewriteContext of a successful row is freed.
+        thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncPrewrite, this, &(prewrite_iterator_->second)));
+    } else {
+        // every row has been prewritten
+        gtxn_internal_->PerfPrewriteDelay(0, get_micros()); // finish_time
+        VLOG(12) << "prewrite done, next step";
+        InternalCommitPhase2();
+    }
+}
+
+// Common failure exit of the prewrite phase: records metrics, rewrites a
+// timeout into kGTxnPrewriteTimeout, publishes the context's status as the
+// transaction status, frees `ctx` and completes the commit.
+void GlobalTxn::RunAfterPrewriteFailed(PrewriteContext* ctx) {
+    gtxn_internal_->PerfPrewriteDelay(0, get_micros()); // finish_time
+    gtxn_prewrite_fail_cnt.Inc();
+
+    const bool timed_out = gtxn_internal_->IsTimeOut()
+                           || ctx->status.GetType() == ErrorCode::kTimeout;
+    if (timed_out) {
+        ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, ctx->status.ToString()); 
+    }
+
+    SetLastStatus(&ctx->status);
+    delete ctx;
+    RunUserCallback();
+}
+
+// commit phase2 Step(1):
+//      a) obtain commit_ts_ (test hook, local clock when the TSO client is
+//         disabled, or the timestamp oracle)
+//      b) abort with kGTxnTimestampLost if commit_ts_ is older than
+//         prewrite_start_ts_
+//      c) continue with VerifyPrimaryLocked(), which leads to the primary
+//         commit
+//
+// call by [prewrite] step(3)
+void GlobalTxn::InternalCommitPhase2() {
+    gtxn_internal_->PerfPrimaryCommitDelay(get_micros(), 0); // begin_time
+    gtxn_primary_cnt.Inc();
+    gtxn_internal_->TEST_Sleep(); // end prewrite
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    gtxn_internal_->TEST_Sleep(); // wait to begin commit
+
+    if (FLAGS_tera_gtxn_test_opened) {
+        commit_ts_ = gtxn_internal_->TEST_GetCommitTimestamp();
+    } else if (!FLAGS_tera_sdk_tso_client_enabled) {
+        // Fix: this branch used to assign start_ts_, so commit_ts_ was never
+        // set from the local clock and the transaction's start timestamp was
+        // clobbered instead.
+        commit_ts_ = get_micros();
+    } else {
+        timeoracle::TimeoracleClientImpl tsoc(thread_pool_, tso_cluster_);
+        commit_ts_ = tsoc.GetTimestamp(1);
+    }
+    if (commit_ts_ < prewrite_start_ts_) {
+        LOG(ERROR) << "[gtxn][commit] get commit ts failed";
+        status.SetFailed(ErrorCode::kGTxnTimestampLost, "get commit ts failed");
+        SetLastStatus(&status);
+        gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros());
+        gtxn_primary_fail_cnt.Inc();
+        RunUserCallback();
+        return;
+    }
+
+    VLOG(12) << "[gtxn][commit] commit_ts:" << commit_ts_;
+    gtxn_internal_->TEST_Sleep(); // wait to begin primary commit
+    
+    /// begin to commit primary
+    VerifyPrimaryLocked();
+}
+
+// Opens a single-row transaction on the primary row and asynchronously reads
+// the primary's lock column at exactly prewrite_start_ts_. The result is
+// handled by DoVerifyPrimaryLockedCallback().
+void GlobalTxn::VerifyPrimaryLocked() {
+    Table* primary_table = primary_write_->Table();
+    tera::Transaction* primary_txn = primary_table->StartRowTransaction(primary_write_->RowKey());
+    RowReader* lock_reader = primary_table->NewRowReader(primary_write_->RowKey());
+    // set internal task timeout
+    gtxn_internal_->SetInternalSdkTaskTimeout(lock_reader); 
+    lock_reader->AddColumn(primary_write_->ColFamily(), primary_write_->LockName());
+    lock_reader->SetTimeRange(prewrite_start_ts_, prewrite_start_ts_);
+    lock_reader->SetContext(this);
+    lock_reader->SetCallBack([](RowReader* done_reader) {
+        GlobalTxn* self = (GlobalTxn*)done_reader->GetContext();
+        self->DoVerifyPrimaryLockedCallback(done_reader);
+    });
+    primary_txn->Get(lock_reader);
+}
+
+// Result handler for VerifyPrimaryLocked(). If the primary lock was read
+// successfully the primary is committed through the same single-row
+// transaction; otherwise the transaction is discarded and the commit fails
+// (kNotFound -> kGTxnPrimaryLost, kTimeout -> kGTxnPrimaryCommitTimeout,
+// anything else is reported unchanged). Always deletes `reader`.
+void GlobalTxn::DoVerifyPrimaryLockedCallback(RowReader* reader) {
+    ErrorCode read_status = reader->GetError();
+    SingleRowTxn* primary_txn = static_cast<SingleRowTxn*>(reader->GetTransaction());
+    delete reader;
+
+    if (read_status.GetType() == ErrorCode::kOK) {
+        CommitPrimary(primary_txn);
+        return;
+    }
+
+    // the lock could not be confirmed: abandon the primary transaction
+    delete primary_txn;
+    if (read_status.GetType() == ErrorCode::kNotFound) {
+        read_status.SetFailed(ErrorCode::kGTxnPrimaryLost, "primary 'lock' lost before commit");
+    } else if (read_status.GetType() == ErrorCode::kTimeout) {
+        read_status.SetFailed(ErrorCode::kGTxnPrimaryCommitTimeout, read_status.ToString()); 
+    }
+    SetLastStatus(&read_status);
+    gtxn_primary_fail_cnt.Inc();
+    gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros()); // finish_time
+    RunUserCallback();
+}
+
+// Commits the primary cell inside `pri_txn`: puts the write record (write
+// type + prewrite_start_ts_) at commit_ts_ and deletes the primary lock up to
+// commit_ts_. Completion is reported to
+// CheckPrimaryStatusAndCommmitSecondaries().
+void GlobalTxn::CommitPrimary(SingleRowTxn* pri_txn) {
+    Table* primary_table = primary_write_->Table();
+    RowMutation* commit_mu = primary_table->NewRowMutation(primary_write_->RowKey());
+    // set internal task timeout
+    gtxn_internal_->SetInternalSdkTaskTimeout(commit_mu); 
+
+    // write record first, then drop the lock
+    commit_mu->Put(primary_write_->ColFamily(),
+                   primary_write_->WriteName(),
+                   EncodeWriteValue(primary_write_->WriteType(), prewrite_start_ts_),
+                   commit_ts_);
+    commit_mu->DeleteColumns(primary_write_->ColFamily(),
+                             primary_write_->LockName(),
+                             commit_ts_);
+
+    pri_txn->ApplyMutation(commit_mu);
+    pri_txn->SetCommitCallback([] (Transaction* done_txn) {
+        GlobalTxn* self = (GlobalTxn*)done_txn->GetContext();
+        self->CheckPrimaryStatusAndCommmitSecondaries(done_txn);
+    });
+    pri_txn->SetContext(this);
+    pri_txn->Commit();
+    delete commit_mu;
+}
+
+// Commit callback of the primary's single-row transaction.
+//
+// On failure the commit ends here (kTimeout is reported as
+// kGTxnPrimaryCommitTimeout). On success the primary write is removed from
+// writes_, and one task per remaining row is queued on thread_pool_ for the
+// secondary commits, the acks and the notifies. The user callback is run by
+// whichever party - this function or one of the task callbacks - first
+// observes, under mu_, that every task is done and all_task_pushed_ is set.
+// Takes ownership of `pri_txn` and deletes it.
+void GlobalTxn::CheckPrimaryStatusAndCommmitSecondaries(Transaction* pri_txn) {
+	ErrorCode status = pri_txn->GetError();
+    delete pri_txn;
+    gtxn_internal_->TEST_Sleep();
+    // primary commit failed callback and return
+    if (status.GetType() != tera::ErrorCode::kOK) {
+        VLOG(12) << "[gtxn][commit] primary failed :[" << status.ToString() << "]";
+        // Callback Point : primary commit failed
+        if (status.GetType() == ErrorCode::kTimeout) {
+            status.SetFailed(ErrorCode::kGTxnPrimaryCommitTimeout, status.ToString()); 
+        }
+        SetLastStatus(&status);
+        gtxn_primary_fail_cnt.Inc();
+        gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros()); // finish_time
+        RunUserCallback();
+        return;
+    }
+    gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros()); // finish_time
+    // With no acks and no notifies pending nothing later can change the
+    // outcome, so the OK status is recorded now.
+    if (acks_cnt_.Get() == 0 && notifies_cnt_.Get() == 0) {
+        SetLastStatus(&status);
+    }
+    // the primary commit is done
+	VLOG(12) << "[gtxn][commit] succeed :[" << start_ts_ 
+              << "," << prewrite_start_ts_ << "," << commit_ts_ << "]";
+
+    // Drop the primary write (first write of the first row); if it was the
+    // only write of that row, drop the row and its count as well.
+    std::vector<Write>* ws = &(writes_.begin()->second);
+    if (ws->size() == 1) {
+        writes_.erase(writes_.begin());
+        writes_cnt_.Dec();
+    } else {
+        ws->erase(ws->begin());
+    }
+
+    // Task callbacks must not complete the commit while tasks are still
+    // being queued below.
+    all_task_pushed_ = false;
+    /// begin commit secondaries
+    for (auto &same_row_writes : writes_) {
+        thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncCommitSecondaries, 
+                                       this, &(same_row_writes.second)));
+    }
+
+    /// begin ack
+    for (auto &same_row_acks : acks_) {
+        thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncAck, 
+                                       this, &(same_row_acks.second)));
+    }
+    /// begin notify
+    for (auto &same_row_notifies : notifies_) {
+        thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncNotify, 
+                                       this, &(same_row_notifies.second)));
+    }
+    bool should_callback = false;
+    {
+        MutexLock lock(&mu_);
+        all_task_pushed_ = true;
+        // true only if every queued task has already finished (or none was
+        // queued); otherwise the last task callback runs the user callback
+        should_callback = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get() 
+                            && acks_cnt_.Get() == ack_done_cnt_.Get() 
+                            && notifies_cnt_.Get() == notify_done_cnt_.Get()
+                            && all_task_pushed_ == true;
+    }
+    if (should_callback) {
+        RunUserCallback();
+    }
+
+}
+
+// Sends the ack mutation for one row asynchronously; completion is handled by
+// DoAckCallback(). `ws` holds the acks of a single row and must be non-empty.
+void GlobalTxn::AsyncAck(std::vector<Write>* ws) {
+    gtxn_internal_->PerfAckDelay(get_micros(), 0);
+    gtxn_acks_cnt.Inc();
+    assert(ws->size() > 0);
+    // all entries share one table and row key; use the first to address the row
+    Write first = ws->front();
+    Table* target_table = first.Table();
+    RowMutation* ack_mu = target_table->NewRowMutation(first.RowKey());
+    gtxn_internal_->SetInternalSdkTaskTimeout(ack_mu); 
+    gtxn_internal_->BuildRowMutationForAck(ws, ack_mu);
+    ack_mu->SetContext(this);
+    ack_mu->SetCallBack([](RowMutation* done_mu) {
+        GlobalTxn* self = (GlobalTxn*)done_mu->GetContext();
+        self->DoAckCallback(done_mu);
+    });
+    target_table->ApplyMutation(ack_mu);
+}
+
+// Completion handler for one ack mutation. A failure is recorded as
+// kGTxnOKButAckFailed (the data itself is already committed). Deletes
+// `mutation`, counts the ack as done and runs the user callback if this was
+// the last outstanding task of the commit.
+void GlobalTxn::DoAckCallback(RowMutation* mutation) {
+    if (mutation->GetError().GetType() != tera::ErrorCode::kOK) {
+        LOG(WARNING) << "[gtxn][commit][ack], failed"
+                     << mutation->GetError().GetReason();
+        ErrorCode status;
+        status.SetFailed(ErrorCode::kGTxnOKButAckFailed, mutation->GetError().ToString());
+        SetLastStatus(&status);
+        gtxn_acks_fail_cnt.Inc();
+    }
+    delete mutation;
+    bool should_callback = false;
+    {
+        MutexLock lock(&mu_);
+        ack_done_cnt_.Inc();
+        gtxn_internal_->PerfAckDelay(0, get_micros());
+        // Fix: also require all_task_pushed_, as DoNotifyCallback() and the
+        // dispatcher do. Without it, an ack finishing while the dispatcher is
+        // still queuing tasks could see all counters balanced, run the user
+        // callback, and then the dispatcher would run it a second time.
+        should_callback = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get() 
+                            && acks_cnt_.Get() == ack_done_cnt_.Get() 
+                            && notifies_cnt_.Get() == notify_done_cnt_.Get()
+                            && all_task_pushed_ == true;
+    }
+
+    if (should_callback) {
+        RunUserCallback();
+    }
+}
+
+// Sends the notify mutation for one row asynchronously (built with
+// commit_ts_); completion is handled by DoNotifyCallback(). `ws` holds the
+// notifies of a single row and must be non-empty.
+void GlobalTxn::AsyncNotify(std::vector<Write>* ws) {
+    gtxn_internal_->PerfNotifyDelay(get_micros(), 0);
+    gtxn_notifies_cnt.Inc();
+    assert(ws->size() > 0);
+    // all entries share one table and row key; use the first to address the row
+    Write first = ws->front();
+    Table* target_table = first.Table();
+    RowMutation* notify_mu = target_table->NewRowMutation(first.RowKey());
+    gtxn_internal_->SetInternalSdkTaskTimeout(notify_mu); 
+    gtxn_internal_->BuildRowMutationForNotify(ws, notify_mu, commit_ts_);
+    notify_mu->SetContext(this);
+    notify_mu->SetCallBack([](RowMutation* done_mu) {
+        GlobalTxn* self = (GlobalTxn*)done_mu->GetContext();
+        self->DoNotifyCallback(done_mu);
+    });
+    target_table->ApplyMutation(notify_mu);
+}
+
+// Completion handler for one notify mutation. A failure is recorded as
+// kGTxnOKButNotifyFailed. Deletes `mutation`, counts the notify as done and
+// runs the user callback if every commit task has been queued and finished.
+void GlobalTxn::DoNotifyCallback(RowMutation* mutation) {
+    const bool notify_failed = mutation->GetError().GetType() != tera::ErrorCode::kOK;
+    if (notify_failed) {
+        LOG(WARNING) << "[gtxn][commit][notify], failed"
+                     << mutation->GetError().GetReason();
+        ErrorCode notify_status;
+        notify_status.SetFailed(ErrorCode::kGTxnOKButNotifyFailed, mutation->GetError().ToString());
+        gtxn_notifies_fail_cnt.Inc();
+        SetLastStatus(&notify_status);
+    }
+    delete mutation;
+
+    bool commit_finished = false;
+    {
+        MutexLock lock(&mu_);
+        notify_done_cnt_.Inc();
+        gtxn_internal_->PerfNotifyDelay(0, get_micros());
+        // finished only when secondaries, acks and notifies are all done and
+        // the dispatcher has queued every task
+        commit_finished = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get() 
+                            && acks_cnt_.Get() == ack_done_cnt_.Get() 
+                            && notifies_cnt_.Get() == notify_done_cnt_.Get()
+                            && all_task_pushed_ == true;
+    }
+
+    if (commit_finished) {
+        RunUserCallback();
+    }
+}
+
+void GlobalTxn::AsyncCommitSecondaries(std::vector<Write>* ws) { // ws: writes of one row
+    gtxn_internal_->PerfSecondariesCommitDelay(get_micros(), 0); // begin time
+    gtxn_secondaries_cnt.Inc();
+    assert(!ws->empty());
+    Write head = ws->front(); // supplies the table and row key of the mutation
+    Table* target = head.Table();
+    RowMutation* commit_mu = target->NewRowMutation(head.RowKey());
+    gtxn_internal_->SetInternalSdkTaskTimeout(commit_mu);
+    gtxn_internal_->BuildRowMutationForCommit(ws, commit_mu, commit_ts_);
+    commit_mu->SetContext(this); // read back inside the completion lambda
+    commit_mu->SetCallBack([](RowMutation* m) {
+            ((GlobalTxn*)m->GetContext())->DoCommitSecondariesCallback(m);});
+    target->ApplyMutation(commit_mu);
+}
+
+// Completion callback of one async secondary commit; a failure is only logged and counted.
+void GlobalTxn::DoCommitSecondariesCallback(RowMutation* mutation) {
+    if (mutation->GetError().GetType() != tera::ErrorCode::kOK) {
+        LOG(WARNING) << "[gtxn][commit][secondaries], failed"
+                     << mutation->GetError().GetReason();
+        gtxn_secondaries_fail_cnt.Inc();
+    }
+    delete mutation;
+    bool last_task_done = false;
+    {
+        MutexLock lock(&mu_);
+        commit_secondaries_done_cnt_.Inc();
+        gtxn_internal_->PerfSecondariesCommitDelay(0, get_micros()); // finish time
+        const bool secondaries_done = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get();
+        const bool acks_done = acks_cnt_.Get() == ack_done_cnt_.Get();
+        const bool notifies_done = notifies_cnt_.Get() == notify_done_cnt_.Get();
+        last_task_done = secondaries_done && acks_done && notifies_done && all_task_pushed_ == true;
+    }
+    if (!last_task_done) {
+        return;
+    }
+    RunUserCallback();
+}
+
+void GlobalTxn::ApplyMutation(RowMutation* row_mu) {
+    assert(row_mu != NULL);
+    // Records row_mu in this txn: every mutation becomes a Write(start_ts_) passed to SaveWrite().
+    RowMutationImpl* row_mu_impl = static_cast<RowMutationImpl*>(row_mu);
+    row_mu_impl->SetTransaction(this);
+    row_mu_impl->SetError(ErrorCode::kOK);
+    // New mutations are accepted only before commit; the size limit is checked under mu_.
+    bool can_apply = false;
+	if (!has_commited_.load()) {
+		assert(put_fail_cnt_.Get() > -1);
+		put_fail_cnt_.Inc();
+		// check that the accumulated writes_size_ stays within the limit
+        MutexLock lock(&mu_);
+		can_apply = gtxn_internal_->VerifyWritesSize(row_mu, &writes_size_);
+	} else {
+        std::string reason = "ApplyMutation failed, txn has committed at [" 
+            + std::to_string(commit_ts_) + "]";
+		LOG(ERROR) << "[gtxn][apply_mutation][" << start_ts_ << "]" << reason;
+		row_mu_impl->SetError(ErrorCode::kGTxnOpAfterCommit, reason);
+	}
+    // writes_cnt stays 0 unless both verifications pass, which routes into the failure path.
+    size_t writes_cnt = 0;
+    
+    if (can_apply && gtxn_internal_->VerifyUserRowMutation(row_mu)) {
+        Table* table = row_mu->GetTable();
+        const std::string& tablename = table->GetName();
+        const std::string& row_key = row_mu->RowKey();
+        for (size_t i = 0; i < row_mu->MutationNum(); ++i) {
+            const RowMutation::Mutation& mu = row_mu->GetMutation(i);
+            Cell cell(table, row_key, mu.family, mu.qualifier, start_ts_, mu.value);
+            Write w(cell, mu.type);
+            ++writes_cnt;
+            SaveWrite(tablename, row_key, w);
+        }
+    }
+    // An async mutation gets its callback scheduled on thread_pool_ in both outcomes below.
+    bool is_async = row_mu_impl->IsAsync();
+    ErrorCode mu_err = row_mu_impl->GetError();
+    // Failure path: the first result is latched into status_ (status_returned_ blocks overwrites).
+    if (mu_err.GetType() != ErrorCode::kOK || writes_cnt == 0) {
+        if (!status_returned_) {
+            status_.SetFailed(mu_err.GetType(), mu_err.GetReason());
+            status_returned_ = true;
+        }
+        if (is_async) {
+            thread_pool_->AddTask(std::bind(&RowMutationImpl::RunCallback, row_mu_impl));
+        } else {
+            // nothing to do
+            // a failed sync mutation returns here, before put_fail_cnt_ is decremented
+        }
+        return;
+    }
+    if (is_async) {
+        thread_pool_->AddTask(std::bind(&RowMutationImpl::RunCallback, row_mu_impl));
+    }
+    // only a successful put decrements put_fail_cnt_; failed puts leave it above zero
+    assert(put_fail_cnt_.Get() > 0);
+    put_fail_cnt_.Dec();
+}
+
+// Blocks the caller until finish_ has been set.
+void GlobalTxn::WaitForComplete() {
+    MutexLock guard(&finish_mutex_);
+    for (; !finish_; ) {
+        finish_cond_.Wait(); // finish_ is re-checked after every wakeup
+    }
+}
+
+// Registers the cell <t, row_key, column_family, qualifier> in acks_.
+// Cells are grouped per <table name, row key>; acks_cnt_ counts the groups, not the cells.
+// A NULL table is logged and ignored.
+void GlobalTxn::Ack(Table* t, 
+                    const std::string& row_key, 
+                    const std::string& column_family, 
+                    const std::string& qualifier) {
+    if (t == NULL) {
+        LOG(ERROR) << "set ack cell failed";
+        return;
+    }
+    // build the entry before taking the lock
+    Cell ack_cell(t, row_key, column_family, qualifier);
+    Write ack_write(ack_cell);
+    const TableWithRowkey group_key(t->GetName(), row_key);
+
+    MutexLock lock(&mu_);
+    // remember whether the group exists before operator[] creates it
+    const bool is_new_group = acks_.find(group_key) == acks_.end();
+    acks_[group_key].push_back(ack_write);
+    if (is_new_group) {
+        acks_cnt_.Inc();
+    }
+}
+
+// Registers the cell <t, row_key, column_family, qualifier> in notifies_.
+// Cells are grouped per <table name, row key>; notifies_cnt_ counts the groups, not the cells.
+// A NULL table is logged and ignored.
+void GlobalTxn::Notify(Table* t,
+                       const std::string& row_key, 
+                       const std::string& column_family, 
+                       const std::string& qualifier) {
+    if (t == NULL) {
+        LOG(ERROR) << "set notify cell failed"; // used to say "ack", a copy-paste from Ack()
+        return;
+    }
+    // build the entry before taking the lock
+    Cell notify_cell(t, row_key, column_family, qualifier);
+    Write notify_write(notify_cell);
+    const TableWithRowkey group_key(t->GetName(), row_key);
+
+    MutexLock lock(&mu_);
+    // remember whether the group exists before operator[] creates it
+    const bool is_new_group = notifies_.find(group_key) == notifies_.end();
+    notifies_[group_key].push_back(notify_write);
+    if (is_new_group) {
+        notifies_cnt_.Inc();
+    }
+}
+
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/sdk/global_txn.h b/src/sdk/global_txn.h
new file mode 100644
index 000000000..de5832166
--- /dev/null
+++ b/src/sdk/global_txn.h
@@ -0,0 +1,273 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef  TERA_SDK_GLOBAL_TXN_H_
+#define  TERA_SDK_GLOBAL_TXN_H_
+
+#include <map>
+#include <string>
+#include <set>
+#include <utility> 
+
+#include "common/mutex.h"
+#include "io/coding.h"
+#include "proto/table_meta.pb.h"
+#include "sdk/global_txn_internal.h"
+#include "sdk/single_row_txn.h"
+#include "sdk/sdk_utils.h"
+#include "sdk/table_impl.h"
+#include "sdk/sdk_zk.h"
+#include "tera.h"
+#include "common/counter.h"
+#include "common/timer.h"
+
+namespace tera {
+
+class Cell;
+class Write;
+class GlobalTxnInternal;
+class CellReaderContext;
+class InternalReaderContext;
+class PrewriteContext;
+
+class GlobalTxn : public Transaction {
+public:
+    static Transaction* NewGlobalTxn(tera::Client* client, 
+            common::ThreadPool* thread_pool, 
+            sdk::ClusterFinder* tso_cluster); // factory: the constructors below are private
+
+    virtual ~GlobalTxn();
+
+    virtual void ApplyMutation(RowMutation* row_mu);
+    virtual ErrorCode Get(RowReader* row_reader);
+    virtual ErrorCode Commit();
+    
+    virtual int64_t GetStartTimestamp() { return start_ts_; }
+    virtual int64_t GetCommitTimestamp() { return commit_ts_; }
+
+    virtual const ErrorCode& GetError() { return status_; }
+
+    typedef void (*Callback)(Transaction* transaction);
+    
+    virtual void SetCommitCallback(Callback callback) {
+        user_commit_callback_ = callback;
+    }
+
+    virtual Callback GetCommitCallback() { 
+        return user_commit_callback_; 
+    }
+
+    virtual void SetContext(void* context) {
+        user_commit_context_ = context;
+    }
+    
+    virtual void* GetContext() {
+        return user_commit_context_;
+    }
+    // Registers a cell in acks_; cells are grouped per <table name, row key>.
+    virtual void Ack(Table* t, 
+                     const std::string& row_key, 
+                     const std::string& column_family, 
+                     const std::string& qualifier);
+    // Registers a cell in notifies_; cells are grouped per <table name, row key>.
+    virtual void Notify(Table* t,
+                        const std::string& row_key, 
+                        const std::string& column_family, 
+                        const std::string& qualifier);
+
+    virtual void SetIsolation(const IsolationLevel& isolation_level);
+
+    virtual IsolationLevel Isolation() { return isolation_level_; }
+
+    virtual void SetTimeout(int64_t timeout_ms);
+
+    virtual int64_t Timeout();
+
+private:    
+    // ----------------------- begin get process --------------------------- //
+    // read one cell from db
+    // 
+    // read the "lock", "write" and "data" columns from db,
+    // using the async interface of tera [RowReader]
+    void AsyncGetCell(Cell* cell, RowReaderImpl* user_reader_impl, InternalReaderContext* ctx);
+
+    // check lock and write columns, then build the cell result
+    // (1) check the read result; on failure call [MergeCellToRow]
+    // (2) may call [BackoffAndMaybeCleanupLock] and retry through [AsyncGetCell]
+    // (3) may call [FindValueFromResultRow] and then [MergeCellToRow]
+    void DoGetCellReaderCallback(RowReader* reader);
+
+    // check the "lock" and "write" columns, as percolator does;
+    // may call CleanLock, RollForward, or wait for some time
+    //
+    // if try_clean == true the lock is cleaned instead of waiting
+    void BackoffAndMaybeCleanupLock(RowReader::TRow& row, 
+                                    const Cell& cell, 
+                                    const bool try_clean, 
+                                    ErrorCode* status);
+    void CleanLock(const Cell& cell, const tera::PrimaryInfo& primary,  
+                   ErrorCode* status);
+    
+    void RollForward(const Cell& cell, 
+                     const tera::PrimaryInfo& primary,
+                     int lock_type, 
+                     ErrorCode* status);
+
+    // get the result from "result_row" and set it into "target_cell"
+    bool FindValueFromResultRow(RowReader::TRow& result_row, Cell* target_cell);
+
+    // call the GetCellCallback function on another thread
+    void MergeCellToRow(RowReader* internal_reader, const ErrorCode& status);
+
+    // set cell result, merge to value_list and call user_reader_callback
+    void GetCellCallback(CellReaderContext* ctx);
+    
+    void SetReaderStatusAndRunCallback(RowReaderImpl* reader_impl, ErrorCode* status);
+
+    // ------------- begin commit prewrite (commit phase1) ----------------- //
+    void SaveWrite(const std::string& tablename, 
+                  const std::string& row_key, 
+                  tera::Write& w);
+
+    // commit entry point
+    //
+    // runs [commit phase1]; [commit phase2] begins in its callback
+    void InternalCommit();
+
+    // [prewrite] Step(1): 
+    //      read "data", "lock", "write" column from tera
+    //
+    // async prewrite of one row using single_row_txn
+    void AsyncPrewrite(std::vector<Write>* same_row_writes);
+    
+    // [prewrite] Step(2): 
+    //      a) verify the read result status of [prewrite] step(1) and that there is no conflict 
+    //      b) write "lock" and "data" column to tera, 
+    //         through the same single_row_txn as in step(1)
+    //
+    // called by [prewrite] step(1), through the reader callback
+    void DoPrewriteReaderCallback(RowReader* reader);
+
+    // prewrite Step(3):
+    //      verify the single_row_txn commit status of [prewrite] step(2);
+    //      if this is the last prewrite callback and the status is ok, call [commit]
+    //
+    // called by [prewrite] step(2), through the single_row_txn commit callback
+    void DoPrewriteCallback(SingleRowTxn* single_row_txn);
+    void RunAfterPrewriteFailed(PrewriteContext* ctx);
+
+    // --------------------- begin commit phase2 ---------------------- //
+    
+    // commit phase2 Step(1):
+    //      a) get timestamp from timeoracle for commit_ts
+    //      b) sync commit primary write through single_row_txn
+    //         (for this gtxn, on this step only one thread can work)
+    //      c) call [commit phase2] step(2) in a loop
+    //
+    // called by [prewrite] step(3)
+    void InternalCommitPhase2(); 
+
+    void VerifyPrimaryLocked();
+
+    void DoVerifyPrimaryLockedCallback(RowReader* reader);
+    
+    void CommitPrimary(SingleRowTxn* primary_single_txn);
+
+    void CheckPrimaryStatusAndCommmitSecondaries(Transaction* primary_single_txn);
+
+    // commit phase2 Step(2):
+    //      async commit of the secondary writes through RowMutation
+    //
+    // called by [commit phase2] step(1)
+    void AsyncCommitSecondaries(std::vector<Write>* same_row_writes);
+    
+    void DoCommitSecondariesCallback(RowMutation* mutation);
+
+    // commit phase2 Step(3):
+    //      async ack through RowMutation
+    //
+    // called by [commit phase2] step(1)
+    void AsyncAck(std::vector<Write>* same_row_acks);
+    
+    void DoAckCallback(RowMutation* mutation);
+
+    // commit phase2 Step(4):
+    //      async notify through RowMutation
+    //
+    // called by [commit phase2] step(1)
+    void AsyncNotify(std::vector<Write>* same_row_notifies);
+    
+    void DoNotifyCallback(RowMutation* mutation);
+
+    /// Blocks until finish_ is set. NOTE(review): presumably used so the transaction is
+    /// not deleted while its async tasks are still running -- confirm against the destructor.
+    void WaitForComplete();
+    
+    void SetLastStatus(ErrorCode* status);
+
+    void RunUserCallback();
+
+    // -------------------- end commit phase1 and phase2 ------------------- //
+private:
+    GlobalTxn(tera::Client* client, 
+              common::ThreadPool* thread_pool, 
+              sdk::ClusterFinder* tso_cluster);
+
+    GlobalTxn(const GlobalTxn&) = delete;
+    void operator=(const GlobalTxn&) = delete;    
+
+    // <tablename, row_key> 
+    typedef std::pair<std::string, std::string> TableWithRowkey;
+    // TableWithRowkey -> list of writes on that row
+    typedef std::map<TableWithRowkey, std::vector<Write>> WriterMap;
+
+    std::unique_ptr<GlobalTxnInternal> gtxn_internal_;
+    ErrorCode status_;
+    bool status_returned_; // if true gtxn will not change "status_"
+    
+    Write* primary_write_;
+    WriterMap writes_;
+    WriterMap::iterator prewrite_iterator_;
+    int64_t writes_size_; // running total checked by VerifyWritesSize() in ApplyMutation()
+    
+    int64_t start_ts_;
+    int64_t prewrite_start_ts_;
+    int64_t commit_ts_;
+    IsolationLevel isolation_level_;
+	std::string serialized_primary_;
+
+    WriterMap acks_; // filled by Ack()
+    WriterMap notifies_; // filled by Notify()
+    
+    mutable Mutex mu_; // held by Ack()/Notify() and by the completion checks in Do*Callback()
+    std::atomic<bool> finish_; // WaitForComplete() returns once this is true
+    mutable Mutex finish_mutex_;
+    common::CondVar finish_cond_;
+    
+    std::atomic<bool> has_commited_; // once true, ApplyMutation() rejects new mutations
+    
+    Callback user_commit_callback_;
+    void* user_commit_context_;
+
+    common::ThreadPool* thread_pool_;
+    sdk::ClusterFinder* tso_cluster_;
+
+    int64_t timeout_ms_;
+   
+    Counter put_fail_cnt_; // put begin +1, done -1
+    Counter commit_secondaries_done_cnt_;
+    Counter ack_done_cnt_;
+    Counter notify_done_cnt_;
+
+    Counter writes_cnt_;
+    Counter acks_cnt_; // number of <table, row> groups in acks_
+    Counter notifies_cnt_; // number of <table, row> groups in notifies_
+    std::atomic<bool> all_task_pushed_; // part of the completion check in the Do*Callback methods
+};
+
+} // namespace tera
+
+#endif  // TERA_SDK_GLOBAL_TXN_H_
diff --git a/src/sdk/global_txn_internal.cc b/src/sdk/global_txn_internal.cc
new file mode 100644
index 000000000..8c69651ed
--- /dev/null
+++ b/src/sdk/global_txn_internal.cc
@@ -0,0 +1,559 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "sdk/global_txn_internal.h"
+
+#include "common/metric/metric_counter.h"
+#include "common/this_thread.h"
+#include "proto/table_meta.pb.h"
+#include "proto/tabletnode_rpc.pb.h"
+#include "sdk/global_txn.h"
+#include "sdk/read_impl.h"
+#include "sdk/sdk_metric_name.h"
+
+DECLARE_bool(tera_gtxn_test_opened);
+DECLARE_string(tera_gtxn_test_flagfile);
+DECLARE_int32(tera_gtxn_all_puts_size_limit);
+DECLARE_int32(tera_sdk_timeout);
+
+namespace tera {
+
+// Metric counters shared by all SDK global transactions (delay, count and failure count per stage).
+tera::MetricCounter gtxn_read_delay_us(kGTxnReadDelayMetric, kGTxnLabelRead);
+tera::MetricCounter gtxn_read_cnt(kGTxnReadCountMetric, kGTxnLabelRead);
+tera::MetricCounter gtxn_read_fail_cnt(kGTxnReadFailCountMetric, kGTxnLabelRead);
+tera::MetricCounter gtxn_read_retry_cnt(kGTxnReadRetryCountMetric, kGTxnLabelRead);
+tera::MetricCounter gtxn_read_rollback_cnt(kGTxnReadRollBackCountMetric, kGTxnLabelRead);
+tera::MetricCounter gtxn_read_rollforward_cnt(kGTxnReadRollForwardCountMetric, kGTxnLabelRead);
+// commit
+tera::MetricCounter gtxn_commit_delay_us(kGTxnCommitDelayMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_commit_cnt(kGTxnCommitCountMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_commit_fail_cnt(kGTxnCommitFailCountMetric, kGTxnLabelCommit);
+// prewrite
+tera::MetricCounter gtxn_prewrite_delay_us(kGTxnPrewriteDelayMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_prewrite_cnt(kGTxnPrewriteCountMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_prewrite_fail_cnt(kGTxnPrewriteFailCountMetric, kGTxnLabelCommit);
+// primary
+tera::MetricCounter gtxn_primary_delay_us(kGTxnPrimaryDelayMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_primary_cnt(kGTxnPrimaryCountMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_primary_fail_cnt(kGTxnPrimaryFailCountMetric, kGTxnLabelCommit);
+// secondaries
+tera::MetricCounter gtxn_secondaries_delay_us(kGTxnSecondariesDelayMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_secondaries_cnt(kGTxnSecondariesCountMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_secondaries_fail_cnt(kGTxnSecondariesFailCountMetric, kGTxnLabelCommit);
+// acks
+tera::MetricCounter gtxn_acks_delay_us(kGTxnAcksDelayMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_acks_cnt(kGTxnAcksCountMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_acks_fail_cnt(kGTxnAcksFailCountMetric, kGTxnLabelCommit);
+// notifies
+tera::MetricCounter gtxn_notifies_delay_us(kGTxnNotifiesDelayMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_notifies_cnt(kGTxnNotifiesCountMetric, kGTxnLabelCommit);
+tera::MetricCounter gtxn_notifies_fail_cnt(kGTxnNotifiesFailCountMetric, kGTxnLabelCommit);
+// tso
+tera::MetricCounter gtxn_tso_delay_us(kGTxnTsoDelayMetric, kGTxnLabelTso);
+tera::MetricCounter gtxn_tso_req_cnt(kGTxnTsoRequestCountMetric, kGTxnLabelTso);
+
+GlobalTxnInternal::GlobalTxnInternal(tera::Client* client) 
+    : TEST_GtxnTestHelper_(NULL),
+      start_ts_(0), // both timestamps are assigned later by SetStartTimestamp()
+      prewrite_start_ts_(0), 
+      terminal_time_(0), // deadline in ms; assigned by SetCommitDuration()
+      is_timeout_(false), // set by SetInternalSdkTaskTimeout() once the deadline has passed
+      client_(client) {} // used by FindTable() to open tables and by SuspectLive()
+
+GlobalTxnInternal::~GlobalTxnInternal() {
+    PerfReport(); // the only work done on destruction; PerfReport() is defined elsewhere in this file
+}
+
+void GlobalTxnInternal::SetStartTimestamp(int64_t ts) {
+    // the prewrite timestamp starts out identical to the start timestamp
+    prewrite_start_ts_ = start_ts_ = ts;
+}
+
+// Checks that `table` can be used in a global transaction and caches the verdict:
+// a table already present in tables_ is accepted without looking at its schema again.
+bool GlobalTxnInternal::CheckTable(Table* table, ErrorCode* status) {
+    assert(table != NULL);
+    MutexLock lock(&tables_mu_);
+    if (tables_.find(table->GetName()) != tables_.end()) {
+        return true;
+    }
+    // first use of this table: inspect its schema
+    TableSchema schema = static_cast<TableImpl*>(table)->GetTableSchema();
+    if (!IsTransactionTable(schema)) {
+        status->SetFailed(ErrorCode::kBadParam, 
+                         "schema check fail: " + table->GetName() + " not txn table");
+        return false;
+    }
+    std::set<std::string> gtxn_cfs;
+    FindGlobalTransactionCfs(schema, &gtxn_cfs);
+    if (gtxn_cfs.empty()) {
+        status->SetFailed(ErrorCode::kBadParam, 
+                         "schema check fail: " + table->GetName() + " haven't gtxn cf");
+        return false;
+    }
+    // remember the table together with its gtxn column families
+    tables_[table->GetName()] = std::pair<Table*, std::set<std::string> >(table, gtxn_cfs);
+    return true;
+}
+
+
+
+// True when the cell's lock column holds at least one version with a timestamp
+// smaller than start_ts_. Looking up the column family uses operator[], as before.
+bool GlobalTxnInternal::IsLockedByOthers(RowReader::TRow& row, const Cell& cell) {
+    auto& qualifiers = row[cell.ColFamily()];
+    auto lock_col = qualifiers.find(cell.LockName());
+    if (lock_col == qualifiers.end()) { return false; }
+    for (const auto& version : lock_col->second) {
+        if (version.first < start_ts_) { return true; }
+    }
+    return false;
+}
+
+bool GlobalTxnInternal::SuspectLive(const tera::PrimaryInfo& primary_info) {
+    // liveness is decided by the client session recorded in the primary info
+    const std::string owner_session = primary_info.client_session();
+    VLOG(12) << "suppect_live : " << owner_session;
+    return static_cast<ClientImpl*>(client_)->IsClientAlive(owner_session);
+}
+
+// Validates a user RowReader before it is used inside a global transaction.
+// Checks, in order: CheckTable(); at least one requested column; no snapshot; every column
+// family has qualifiers and is a gtxn column family; no qualifier fails BadQualifier().
+// The first violation is stored in the reader and false is returned immediately.
+bool GlobalTxnInternal::VerifyUserRowReader(RowReader* user_reader) {
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(user_reader);
+    const RowReader::ReadColumnList& read_col_list = user_reader->GetReadColumnList();
+    ErrorCode status; 
+    std::string reason("");
+    
+    Table* table = reader_impl->GetTable();
+    if (!CheckTable(table, &status)) {
+        // table schema error for gtxn
+        reader_impl->SetError(status.GetType(), status.GetReason());
+        return false;
+    } else if (read_col_list.size() == 0) {
+        // TODO support read full
+        reason = "not support read full line in global transaction";
+        LOG(ERROR) << "[gtxn][get] " << reason;
+        reader_impl->SetError(ErrorCode::kBadParam, reason);
+        return false;
+    } else if (reader_impl->GetSnapshot() != 0) {
+        reason = "not support read a snapshot in global transaction";
+        LOG(ERROR) << "[gtxn][get] " << reason;
+        reader_impl->SetError(ErrorCode::kBadParam, reason);
+        return false;
+    }
+    
+    // check schema valid
+    const std::string& tablename = table->GetName();
+
+    for (auto it = read_col_list.begin(); it != read_col_list.end(); ++it) {
+        const std::string& column_family = it->first;
+        const std::set<std::string>& qualifier_set = it->second;
+
+        if (qualifier_set.size() == 0) {
+            reason = "not set any qualifier";
+            LOG(ERROR) << "[gtxn][get] " << reason;
+            reader_impl->SetError(ErrorCode::kBadParam, reason);
+            return false;
+        }
+        if (!IsGTxnColumnFamily(tablename, column_family)) {
+            reason = "table:" + tablename + ",cf:" + column_family + " not set gtxn=\"on\"";
+            LOG(ERROR) << "[gtxn][get] " << reason;
+            reader_impl->SetError(ErrorCode::kBadParam, reason);
+            return false;
+        }
+        for (auto q_it = qualifier_set.begin(); q_it != qualifier_set.end(); ++q_it) {
+            const std::string& qualifier = *q_it;
+            // leave at once: a bare break would only exit this inner loop
+            if (BadQualifier(qualifier)) {
+                reason = "table:" + tablename + ",qu:" + qualifier + " can't end with \"_*_\"";
+                LOG(ERROR) << "[gtxn][get] " << reason;
+                reader_impl->SetError(ErrorCode::kBadParam, reason);
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+// Validates a user RowMutation before it is buffered by a global transaction.
+// On the first violation the error is stored in the mutation and false is returned.
+bool GlobalTxnInternal::VerifyUserRowMutation(RowMutation* user_mu) {
+    RowMutationImpl* mu_impl = static_cast<RowMutationImpl*>(user_mu);
+    Table* table = user_mu->GetTable();
+
+    // table-level checks
+    ErrorCode table_status;
+    if (!CheckTable(table, &table_status)) {
+        // table schema error for gtxn
+        mu_impl->SetError(table_status.GetType(), table_status.GetReason());
+        return false;
+    }
+    if (mu_impl->MutationNum() <= 0) {
+        mu_impl->SetError(ErrorCode::kBadParam, "nothing to mutation");
+        return false;
+    }
+
+    // per-mutation checks: qualifier name, column family, mutation type (in that order)
+    const std::string& tablename = table->GetName();
+    const size_t mutation_num = user_mu->MutationNum();
+    for (size_t idx = 0; idx != mutation_num; ++idx) {
+        const RowMutation::Mutation& cur = user_mu->GetMutation(idx);
+        std::string reason;
+        bool unsupported_type = false; // selects kGTxnNotSupport instead of kBadParam
+        if (BadQualifier(cur.qualifier)) {
+            reason = "@table" + tablename + ",qu:" + cur.qualifier + " can't end with \"_*_\"";
+        } else if (!IsGTxnColumnFamily(tablename, cur.family)) {
+            // the column family must have gtxn="on"
+            reason = "@table" + tablename + ",cf:" + cur.family + " not set gtxn=\"on\"";
+        } else if (cur.type != RowMutation::kPut && cur.type != RowMutation::kDeleteColumn 
+                && cur.type != RowMutation::kDeleteColumns) {
+            reason = "@table " + tablename + ",row mutation type is " + std::to_string(cur.type);
+            unsupported_type = true;
+        } else {
+            continue; // this mutation is acceptable
+        }
+        // shared failure exit for the three checks above
+        LOG(ERROR) << "[gtxn][apply_mutation] " << reason;
+        mu_impl->SetError(unsupported_type ? ErrorCode::kGTxnNotSupport
+                                           : ErrorCode::kBadParam, reason);
+        return false;
+    }
+    // every mutation passed
+    return true;
+}
+
+// Adds user_mu's size to the running total *size, then checks the total against the flag limit.
+bool GlobalTxnInternal::VerifyWritesSize(RowMutation* user_mu, int64_t* size) {
+    RowMutationImpl* mu_impl = static_cast<RowMutationImpl*>(user_mu);
+    *size += mu_impl->Size(); // the total grows even when a check below fails
+    const bool too_large = *size > FLAGS_tera_gtxn_all_puts_size_limit;
+    if (too_large) {
+        LOG(ERROR) << "[gtxn][apply_mutation][" << start_ts_ << "] failed, "
+                   << "mutations size " << *size << " > limit (" 
+                   << FLAGS_tera_gtxn_all_puts_size_limit << ")";
+        mu_impl->SetError(ErrorCode::kGTxnDataTooLarge);
+    } else if (*size <= 0) {
+        LOG(ERROR) << "[gtxn][apply_mutation][" << start_ts_ << "] failed, "
+                   << "mutaions size " << *size;
+        mu_impl->SetError(ErrorCode::kBadParam);
+    }
+    return !too_large && *size > 0;
+}
+
+// True when the primary cell's lock column has a version at exactly lock_ts.
+// Returns false with *status set when the table cannot be found/checked or the read fails.
+bool GlobalTxnInternal::PrimaryIsLocked(const tera::PrimaryInfo& primary,
+                                        const int64_t lock_ts, 
+                                        ErrorCode* status) {
+    Table* primary_table = FindTable(primary.table_name());
+    if (primary_table == NULL) {
+        status->SetFailed(ErrorCode::kGTxnPrimaryLost, 
+                "not found primary table and open failed");
+        return false;
+    }
+    if (!CheckTable(primary_table, status)) {
+        status->SetFailed(ErrorCode::kGTxnPrimaryLost, 
+                "primary table check failed" + status->ToString());
+        return false;
+    }
+    Cell primary_cell(primary_table, primary.row_key(), 
+                      primary.column_family(), primary.qualifier());
+    // synchronous read of the lock column, restricted to the time range [lock_ts, lock_ts]
+    std::unique_ptr<RowReader> reader(primary_table->NewRowReader(primary_cell.RowKey()));
+    reader->AddColumn(primary_cell.ColFamily(), primary_cell.LockName());
+    reader->SetTimeRange(lock_ts, lock_ts);
+    primary_table->Get(reader.get());
+    const bool read_failed = reader->GetError().GetType() != tera::ErrorCode::kOK
+                             && reader->GetError().GetType() != tera::ErrorCode::kNotFound;
+    if (read_failed) {
+        *status = reader->GetError();
+        return false;
+    }
+    for (; !reader->Done(); reader->Next()) {
+        if (reader->Timestamp() != lock_ts) { continue; }
+        VLOG(12) << DebugString(primary_cell, "other transaction on prewrite @" + std::to_string(lock_ts));
+        return true;
+    }
+    return false;
+}
+
+void GlobalTxnInternal::BuildRowReaderForPrewrite(const std::vector<Write>& ws, RowReader* reader) {
+    for (std::vector<Write>::const_iterator it = ws.begin(); it != ws.end(); ++it) {
+        reader->AddColumn(it->ColFamily(), it->Qualifier()); // data column
+        reader->AddColumn(it->ColFamily(), it->LockName());  // lock column
+        reader->AddColumn(it->ColFamily(), it->WriteName()); // write column
+        reader->SetTimeRange(0, kMaxTimeStamp); // applied per write, so never for an empty ws
+        reader->SetMaxVersions(UINT32_MAX);
+    }
+}
+
+// Fills prewrite_mu with two puts per write in ws, both at prewrite_start_ts_:
+// the lock column (encoded write type + primary_info) and the data column (the value).
+void GlobalTxnInternal::BuildRowMutationForPrewrite(std::vector<Write>* ws, 
+                                                    RowMutation* prewrite_mu,
+                                                    const std::string& primary_info) {
+    const int64_t prewrite_ts = (int64_t)prewrite_start_ts_;
+    for (const Write& w : *ws) {
+        // lock column
+        prewrite_mu->Put(w.ColFamily(), w.LockName(),
+                         EncodeLockValue(w.WriteType(), primary_info), prewrite_ts);
+        // data column
+        prewrite_mu->Put(w.ColFamily(), w.Qualifier(), w.Value(), prewrite_ts);
+    }
+    // an empty ws leaves prewrite_mu untouched
+}
+
+// For every write in ws: puts the write column at commit_ts and deletes the lock column.
+void GlobalTxnInternal::BuildRowMutationForCommit(std::vector<Write>* ws, 
+                                                  RowMutation* commit_mu, 
+                                                  const int64_t commit_ts) {
+    for (const Write& w : *ws) {
+        // value = type + start_ts
+        const auto write_value = EncodeWriteValue(w.WriteType(), prewrite_start_ts_);
+        commit_mu->Put(w.ColFamily(), w.WriteName(), write_value, commit_ts);
+        commit_mu->DeleteColumns(w.ColFamily(), w.LockName(), commit_ts);
+    }
+    // an empty ws leaves commit_mu untouched
+}
+
+void GlobalTxnInternal::BuildRowMutationForAck(std::vector<Write>* ws, 
+                                               RowMutation* commit_mu) {
+    // one DeleteColumns on the notify column family per acked cell, at start_ts_
+    for (const Write& ack : *ws) {
+        commit_mu->DeleteColumns(kNotifyColumnFamily, ack.NotifyName(), start_ts_);
+    }
+}
+
+void GlobalTxnInternal::BuildRowMutationForNotify(std::vector<Write>* ws, 
+                                                  RowMutation* commit_mu, 
+                                                  const int64_t commit_ts) {
+    // one put on the notify column family per cell; the value is the encoded commit_ts
+    for (const Write& notify : *ws) {
+        commit_mu->Put(kNotifyColumnFamily, notify.NotifyName(),
+                       Int64ToEncodedString(commit_ts), commit_ts);
+    }
+}
+
+void GlobalTxnInternal::SetCommitDuration(int64_t timeout_ms) {
+     terminal_time_ = timeout_ms + get_millis(); // deadline in ms: current time plus timeout_ms
+}
+
+void GlobalTxnInternal::SetInternalSdkTaskTimeout(RowReader* reader) {
+    // Gives the reader the time left until terminal_time_, at least 1ms and at most
+    // FLAGS_tera_sdk_timeout. A budget of exactly 0 now also counts as timed out.
+    int64_t duration = terminal_time_ - get_millis();
+    if (duration <= 0) {
+        is_timeout_ = true;
+        duration = 1;
+    }
+    reader->SetTimeOut(duration > FLAGS_tera_sdk_timeout ? FLAGS_tera_sdk_timeout : duration);
+}
+
+void GlobalTxnInternal::SetInternalSdkTaskTimeout(RowMutation* mutation) {
+    // Gives the mutation the time left until terminal_time_, at least 1ms and at most
+    // FLAGS_tera_sdk_timeout. A budget of exactly 0 now also counts as timed out.
+    int64_t duration = terminal_time_ - get_millis();
+    if (duration <= 0) {
+        is_timeout_ = true;
+        duration = 1;
+    }
+    mutation->SetTimeOut(duration > FLAGS_tera_sdk_timeout ? FLAGS_tera_sdk_timeout : duration);
+}
+
+bool GlobalTxnInternal::IsTimeOut() {
+    return is_timeout_; // set by SetInternalSdkTaskTimeout() once the deadline has passed
+}
+
+bool GlobalTxnInternal::IsPrimary(const tera::Cell& cell, 
+                                  const tera::PrimaryInfo& primary_info) {
+    const bool same_row = primary_info.table_name() == cell.TableName()
+                          && primary_info.row_key() == cell.RowKey();
+    return same_row && primary_info.column_family() == cell.ColFamily()
+                    && primary_info.qualifier() == cell.Qualifier(); // all four fields match
+}
+
+// Returns the table cached in tables_, else opens it through client_ (NULL when that fails).
+Table* GlobalTxnInternal::FindTable(const std::string& tablename) {
+    assert(!tablename.empty());
+    MutexLock lock(&tables_mu_);
+    TableInfoMap::const_iterator cached = tables_.find(tablename);
+    if (cached != tables_.end()) {
+        return (cached->second).first;
+    }
+    ErrorCode open_status;
+    Table* opened = client_->OpenTable(tablename, &open_status);
+    const bool open_ok = opened != NULL && open_status.GetType() == ErrorCode::kOK;
+    if (open_ok) { return opened; } // not inserted into tables_ here
+    LOG(ERROR) << "[gtxn] can't create table :" << tablename << "," << open_status.ToString();
+    return NULL;
+}
+
+bool GlobalTxnInternal::ConflictWithOtherWrite(const std::vector<Write>* ws, 
+                                               RowReader* reader, 
+                                               ErrorCode* status) {  // true + *status set on first conflict
+    RowReader::TRow row;
+    reader->ToMap(&row);  // indexed below as row[column family][qualifier]
+
+    // check every cell
+    for (auto it = ws->begin(); it != ws->end(); ++it) {
+        const Write& w = *it;
+		const std::string& w_cf = w.ColFamily();
+        if (row.find(w_cf) == row.end()) {
+            VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() 
+                     << "not found [" << w_cf << "]";
+            continue;
+        } else {
+            // check Write column: an entry whose key is >= prewrite_start_ts_ is a write conflict
+            const std::string& w_write = w.WriteName();
+            if (row[w_cf].find(w_write) != row[w_cf].end()) {
+                for (auto k = row[w_cf][w_write].rbegin(); k != row[w_cf][w_write].rend(); ++k) {
+                    std::string write_value = k->second;
+                    int write_type;
+                    int64_t data_start_ts;
+                    DecodeWriteValue(write_value, &write_type, &data_start_ts);  // only logged below
+                    VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() 
+                             << " prewrite_start_ts:" << prewrite_start_ts_
+                             << " found _W_ :" <<  k->first 
+                             << " type: " << write_type 
+                             << " data_ts: " << data_start_ts;
+                    if (k->first >= prewrite_start_ts_) {
+                        status->SetFailed(ErrorCode::kGTxnWriteConflict, 
+                                          "writing by others ts:" + std::to_string(k->first));
+                        return true;
+                    }
+                }
+            } else {
+                VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() 
+                         << "not found _W_ col";
+            }
+            // check Lock column: any entry at all under the lock qualifier is a lock conflict
+            const std::string& w_lock = w.LockName();
+            if (row[w_cf].find(w_lock) != row[w_cf].end()) {
+                auto k = row[w_cf][w_lock].rbegin();
+                if (k != row[w_cf][w_lock].rend()) {
+                    VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() 
+                             << "locked@: " << k->first;
+                    status->SetFailed(ErrorCode::kGTxnLockConflict, 
+                                      w.DebugString() + "locked@:" + std::to_string(k->first));
+                    return true;
+                }
+            }
+        }
+    }
+    return false;  // no write or lock conflict found for any cell
+}
+
+void GlobalTxnInternal::SetPrewriteStartTimestamp(const int64_t prewrite_start_ts) {
+    prewrite_start_ts_ = prewrite_start_ts;  // threshold used by ConflictWithOtherWrite()
+}
+
+bool GlobalTxnInternal::IsGTxnColumnFamily(const std::string& tablename,
+                                           const std::string& column_family) {
+    // Membership test against the column-family set stored for this table in tables_.
+    MutexLock lock(&tables_mu_);
+    auto table_it = tables_.find(tablename);
+    if (table_it == tables_.end()) {
+        return false;  // table has no entry in tables_
+    }
+
+    // the map value is (Table*, set of column-family names); only the set is consulted
+    const std::set<std::string>& gtxn_cfs = table_it->second.second;
+    return gtxn_cfs.count(column_family) > 0;
+}
+
+std::string GlobalTxnInternal::GetClientSession() {
+    // client_ is held as tera::Client*; ClientSession() is reached through ClientImpl.
+    return static_cast<ClientImpl*>(client_)->ClientSession();
+}
+
+std::string GlobalTxnInternal::DebugString(const Cell& cell, const std::string& msg) const {
+    // Renders "<msg> @ [table:row:cf:qualifier:timestamp]".
+    std::string text = msg + " @ [" + cell.Table()->GetName() + ":";
+    text += cell.RowKey() + ":" + cell.ColFamily() + ":" + cell.Qualifier() + ":";
+    text += std::to_string(cell.Timestamp()) + "]";
+    return text;
+}
+
+int64_t GlobalTxnInternal::TEST_Init(const std::string& conf_file) {
+    if (!FLAGS_tera_gtxn_test_opened) { return start_ts_; }
+    // test mode: build the helper from conf_file and take both timestamps from it
+    TEST_GtxnTestHelper_ = new GlobalTxnTestHelper(conf_file);
+    TEST_GtxnTestHelper_->LoadTxnConf();
+    start_ts_ = TEST_GtxnTestHelper_->GetStartTs();
+    prewrite_start_ts_ = TEST_GtxnTestHelper_->GetPrewriteStartTs();
+    return start_ts_;
+}
+
+void GlobalTxnInternal::TEST_GetSleep() {
+    if (!FLAGS_tera_gtxn_test_opened) { return; }
+    // test mode only: forward this transaction's start_ts_ to the helper's GetWait()
+    TEST_GtxnTestHelper_->GetWait(start_ts_);
+}
+
+void GlobalTxnInternal::TEST_Sleep() {
+    if (!FLAGS_tera_gtxn_test_opened) { return; }
+    // test mode only: forward this transaction's start_ts_ to the helper's Wait()
+    TEST_GtxnTestHelper_->Wait(start_ts_);
+}
+
+void GlobalTxnInternal::TEST_Destory() {
+    if (!FLAGS_tera_gtxn_test_opened) { return; }  // the helper only exists in test mode
+    delete TEST_GtxnTestHelper_;
+    TEST_GtxnTestHelper_ = NULL;  // a repeated call now deletes NULL instead of double-freeing
+}
+
+int64_t GlobalTxnInternal::TEST_GetCommitTimestamp() {  // NOTE(review): no test-flag guard here
+    return TEST_GtxnTestHelper_->GetCommitTs(); 
+}
+
+int64_t GlobalTxnInternal::TEST_GetPrewriteStartTimestamp() {  // NOTE(review): no test-flag guard
+    return TEST_GtxnTestHelper_->GetPrewriteStartTs();
+}
+
+void GlobalTxnInternal::PerfReadDelay(int64_t begin_time, int64_t finish_time) {
+    read_cost_time_.Add(finish_time - begin_time);  // flushed by PerfReport()
+}
+void GlobalTxnInternal::PerfCommitDelay(int64_t begin_time, int64_t finish_time) {
+    commit_cost_time_.Add(finish_time - begin_time);  // flushed by PerfReport()
+}
+
+void GlobalTxnInternal::PerfPrewriteDelay(int64_t begin_time, int64_t finish_time) {
+    prewrite_cost_time_.Add(finish_time - begin_time);  // flushed by PerfReport()
+}
+
+void GlobalTxnInternal::PerfPrimaryCommitDelay(int64_t begin_time, int64_t finish_time) { 
+    primary_cost_time_.Add(finish_time - begin_time);  // flushed by PerfReport()
+}
+
+void GlobalTxnInternal::PerfSecondariesCommitDelay(int64_t begin_time, int64_t finish_time) {
+    secondaries_cost_time_.Add(finish_time - begin_time);  // flushed by PerfReport()
+}
+
+void GlobalTxnInternal::PerfAckDelay(int64_t begin_time, int64_t finish_time) {
+    acks_cost_time_.Add(finish_time - begin_time);  // flushed by PerfReport()
+}
+
+void GlobalTxnInternal::PerfNotifyDelay(int64_t begin_time, int64_t finish_time) {
+    notifies_cost_time_.Add(finish_time - begin_time);  // flushed by PerfReport()
+}
+
+void GlobalTxnInternal::PerfReport() {  // moves each per-transaction counter into its gtxn_*_us one
+    gtxn_read_delay_us.Add(read_cost_time_.Clear());  // Clear() presumably returns-and-resets; confirm
+    gtxn_commit_delay_us.Add(commit_cost_time_.Clear());
+    gtxn_prewrite_delay_us.Add(prewrite_cost_time_.Clear());
+    gtxn_primary_delay_us.Add(primary_cost_time_.Clear());
+    gtxn_secondaries_delay_us.Add(secondaries_cost_time_.Clear());
+    gtxn_acks_delay_us.Add(acks_cost_time_.Clear());
+    gtxn_notifies_delay_us.Add(notifies_cost_time_.Clear());
+}
+
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/sdk/global_txn_internal.h b/src/sdk/global_txn_internal.h
new file mode 100644
index 000000000..95eaae825
--- /dev/null
+++ b/src/sdk/global_txn_internal.h
@@ -0,0 +1,366 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef  TERA_SDK_GLOBAL_TXN_INTERNAL_H_
+#define  TERA_SDK_GLOBAL_TXN_INTERNAL_H_
+
+#include <atomic>
+#include <map>
+#include <string>
+#include <set>
+#include <utility> 
+
+#include "common/mutex.h"
+#include "io/coding.h"
+#include "sdk/global_txn.h"
+#include "sdk/test/global_txn_testutils.h"
+#include "sdk/sdk_utils.h"
+#include "sdk/single_row_txn.h"
+#include "sdk/table_impl.h"
+#include "sdk/timeoracle_client_impl.h"
+#include "tera.h"
+#include "common/timer.h"
+
+namespace tera {
+
+class Cell;
+class GlobalTxnTestHelper;
+class Write;
+
+inline void PrintCostTime(const std::string& msg, int64_t begin_time) {
+    VLOG(12) << msg << " cost: " << (get_micros() - begin_time);  // elapsed since begin_time
+}
+
+inline std::string Int64ToEncodedString(int64_t n) {
+    // Fixed-width binary form: exactly sizeof(int64_t) bytes filled by io::EncodeBigEndian.
+    char encoded[sizeof(int64_t)];
+    io::EncodeBigEndian(encoded, n);
+    return std::string(encoded, sizeof(encoded));
+}
+
+inline int64_t EncodedStringToInt64(const std::string& s) {
+    return io::DecodeBigEndain(s.data());  // NOTE(review): s is not length-checked here
+}
+
+inline std::string PackLockName(const std::string& qualifier) {
+    return std::string(qualifier).append("_L_");  // lock column name: "<qualifier>_L_"
+}
+
+inline std::string PackWriteName(const std::string& qualifier) {
+    return std::string(qualifier).append("_W_");  // write column name: "<qualifier>_W_"
+}
+
+inline std::string EncodeLockValue(int type, const std::string& primary_str) {
+    return std::string(1, static_cast<char>(type)) + primary_str;  // one type byte, then payload
+}
+
+inline bool DecodeLockValue(const std::string& value,
+                            int* type, tera::PrimaryInfo* info) {
+    // Layout (see EncodeLockValue): one type byte followed by a serialized PrimaryInfo.
+    if (value.length() <= 1) {
+        *type = -1;
+        return false;
+    }
+    *type = static_cast<int>(value[0]);
+    return info->ParseFromString(value.substr(1));
+}
+
+inline std::string EncodeWriteValue(int type, int64_t timestamp) {
+    return std::string(1, static_cast<char>(type)) + Int64ToEncodedString(timestamp);  // type byte + ts
+}
+
+inline bool DecodeWriteValue(const std::string& value, int* type, int64_t* timestamp) {
+    // Layout (see EncodeWriteValue): 1 type byte + sizeof(int64_t) encoded timestamp bytes.
+    if (value.length() < 1 + sizeof(int64_t)) {  // too short: decoding would read past the data
+        *type = -1;
+        *timestamp = -1;
+        return false;
+    }
+    *type = (int)value[0];
+    *timestamp = EncodedStringToInt64(value.substr(1));
+    return true;
+}
+
+inline std::string PackNotifyName(const std::string& column_family,
+                                  const std::string& qualifier) {
+    return std::string(column_family).append(":").append(qualifier);  // "<cf>:<qualifier>"
+}
+
+inline bool BadQualifier(const std::string& qualifier) {
+    if (qualifier.length() < 3) { return false; }  // too short to end in "_?_"
+    return *qualifier.rbegin() == '_' && *(qualifier.rbegin() + 2) == '_';  // e.g. "_L_", "_W_"
+}
+
+struct PrewriteContext {  // bundles one prewrite's inputs with its resulting status
+    std::vector<Write>* ws;  // presumably the writes of a single row (ctor arg same_row_ws)
+    Transaction* gtxn;
+    std::string table_name;
+    std::string row_key;
+    ErrorCode status;  // initialised to kOK by the constructor
+    PrewriteContext(std::vector<Write>* same_row_ws, 
+                    Transaction* txn, 
+                    const std::string& tablename, 
+                    const std::string& rowkey) :
+        ws(same_row_ws),
+        gtxn(txn),
+        table_name(tablename),
+        row_key(rowkey) {
+        status.SetFailed(ErrorCode::kOK);  // SetFailed() is used here only to store kOK
+    }
+    const std::string DebugString() const {  // "[tablename=..,rowkey=..]" + status text
+        return "[tablename=" + table_name + ",rowkey=" + row_key + "]" + status.ToString();
+    }
+};
+// one user reader will have one InternalReaderContext
+struct InternalReaderContext {
+    int expected_cell_cnt;  // set from the constructor argument
+    int active_cell_cnt;    // starts at 0
+    int fail_cell_cnt;      // starts at 0
+    int not_found_cnt;      // starts at 0
+    RowReader* user_reader;
+    Transaction* gtxn;
+    std::map<Cell*, int> cell_map;  // left empty by the constructor
+    RowResult results;
+    ErrorCode last_err;
+    // The pointer members are stored as given; no ownership handling is visible in this struct.
+    InternalReaderContext(int expected_cnt, RowReader* reader, Transaction* txn)
+        : expected_cell_cnt(expected_cnt),
+          active_cell_cnt(0),
+          fail_cell_cnt(0),
+          not_found_cnt(0),
+          user_reader(reader),
+          gtxn(txn) {}
+};
+// one cell reader will have one CellReaderContext
+struct CellReaderContext {
+    Cell* cell;  // the cell this reader is for
+    InternalReaderContext* internal_reader_ctx;  // back-pointer to the enclosing reader's context
+    ErrorCode status;  // default-constructed
+    CellReaderContext(Cell* c, InternalReaderContext* ctx)
+        : cell(c),
+          internal_reader_ctx(ctx) {}
+};
+
+class Cell {  // a (table, row, column family, qualifier) coordinate plus timestamp and value
+public:
+    Cell(tera::Table* table, 
+         const std::string& row_key, 
+         const std::string& column_family, 
+         const std::string& qualifier, 
+         const int64_t timestamp = 0, 
+         const std::string& value = "") : 
+        table_(table), 
+        row_key_(row_key),
+        column_family_(column_family),
+        qualifier_(qualifier),
+        timestamp_(timestamp),
+        value_(value),
+        tablename_("") {
+        // table must be non-NULL; its name is copied so TableName() needs no call into Table
+        assert(table_ != NULL);
+        tablename_ = table_->GetName();
+    }
+
+    tera::Table* Table() const { return table_; }
+
+    const std::string TableName() const { return tablename_; }  // returns a copy
+    const std::string& RowKey() const { return row_key_; }
+    const std::string& ColFamily() const { return column_family_; }
+    const std::string& Qualifier() const { return qualifier_; }
+    const std::string LockName() const { return PackLockName(qualifier_); }  // qualifier + "_L_"
+    const std::string WriteName() const { return PackWriteName(qualifier_); }  // qualifier + "_W_"
+    const std::string NotifyName() const { return PackNotifyName(column_family_, qualifier_); }
+    const int64_t Timestamp() const { return timestamp_; }
+    void SetTimestamp(const int64_t timestamp) {
+        timestamp_ = timestamp;
+    }
+    const std::string& Value() const { return value_; }
+    void SetValue(const std::string& value) {
+        value_ = value;
+    }
+private:
+    tera::Table* table_;  // stored as given; nothing in this class deletes it
+    std::string row_key_;  
+    std::string column_family_;
+    std::string qualifier_;
+    int64_t timestamp_;
+    std::string value_;
+    std::string tablename_;  // copy of table_->GetName() taken in the constructor
+};
+
+class Write {  // one cell to be written by a global transaction, with its write type
+public:
+    Write(const Cell& cell, const int& type = 0) 
+        : cell_(cell),
+          type_(type),
+          is_primary_(false) {}  // NOTE(review): no member of this class ever sets it to true
+
+    int WriteType() const { return type_; }
+    bool IsPrimary() const { return is_primary_; }
+    tera::Table* Table() const { return cell_.Table(); }
+    const std::string TableName() const { return cell_.TableName(); }
+    const std::string& RowKey() const { return cell_.RowKey(); }
+    const std::string& ColFamily() const { return cell_.ColFamily(); }
+    const std::string& Qualifier() const { return cell_.Qualifier(); }
+    const std::string LockName() const { return cell_.LockName(); }
+    const std::string WriteName() const { return cell_.WriteName(); }
+    const std::string NotifyName() const { return cell_.NotifyName(); }
+    const int64_t Timestamp() const { return cell_.Timestamp(); }
+    const std::string& Value() const { return cell_.Value(); }
+    const int64_t GetSize() {  // summed byte lengths of row key, cf, qualifier and value
+        return cell_.RowKey().length() + cell_.ColFamily().length() +
+               cell_.Qualifier().length() + cell_.Value().length();
+    }
+    bool IsSameRow(Write* w) {  // same row key and the same Table pointer
+        return RowKey() == w->RowKey() 
+               && Table() == w->Table();
+    }
+
+    void Serialize(const int64_t start_ts,  // fills *primary_info with a serialized PrimaryInfo
+                   const std::string& session, 
+                   std::string* primary_info) {
+        tera::PrimaryInfo primary;
+        primary.set_table_name(TableName());
+        primary.set_row_key(RowKey());
+        primary.set_column_family(ColFamily());
+        primary.set_qualifier(Qualifier());
+        primary.set_gtxn_start_ts(start_ts);
+        primary.set_client_session(session);  // was a comma-operator typo (',' instead of ';')
+        primary.SerializeToString(primary_info);
+    }
+
+    const std::string DebugString() const {  // "[table:row:cf:qualifier]"
+        std::stringstream ss;
+        ss <<"[" << TableName() << ":" << RowKey() << ":" << ColFamily()
+           << ":" << Qualifier() << "]";
+        return ss.str();
+    }
+
+private:
+    tera::Cell cell_;  // held by value (copied from the constructor argument)
+    int type_;
+    bool is_primary_;
+};    
+
+class GlobalTxnInternal {  // helper state and utilities used by GlobalTxn (declared friend below)
+public:
+    friend class GlobalTxn;
+    GlobalTxnInternal(tera::Client* client);
+
+    ~GlobalTxnInternal();
+    // for common
+    void SetStartTimestamp(int64_t ts);
+
+    bool CheckTable(Table* table, ErrorCode* status);
+    // Returns the handle recorded in tables_, else opens the table via client_; NULL on failure.
+    Table* FindTable(const std::string& tablename);
+    // True iff table name, row key, column family and qualifier of cell all equal primary_info's.
+    bool IsPrimary(const tera::Cell& cell, 
+                   const tera::PrimaryInfo& primary_info);
+    // True iff tables_ has an entry for tablename whose column-family set contains column_family.
+    bool IsGTxnColumnFamily(const std::string& tablename, 
+                            const std::string& column_family);
+    
+    // for get
+    bool VerifyUserRowReader(RowReader* user_reader);
+    
+    bool PrimaryIsLocked(const tera::PrimaryInfo& primary_info, 
+                         const int64_t lock_ts,
+                         ErrorCode* status);
+
+    bool IsLockedByOthers(RowReader::TRow& row, const tera::Cell& cell);
+    
+    bool SuspectLive(const tera::PrimaryInfo& primary_info);
+
+    // for prewrite
+    void BuildRowReaderForPrewrite(const std::vector<Write>& ws, RowReader* reader);
+
+    void BuildRowMutationForPrewrite(std::vector<Write>* ws, 
+                                     RowMutation* txn_mu, 
+                                     const std::string& primary_info);
+    // Returns true and fills *status on the first write/lock conflict found in reader's row.
+	bool ConflictWithOtherWrite(const std::vector<Write>* ws, 
+                                RowReader* reader, 
+                                ErrorCode* status);
+    
+    // for applyMutation 
+    bool VerifyUserRowMutation(RowMutation* user_mu);
+    bool VerifyWritesSize(RowMutation* user_mu, int64_t* size);
+    
+    // for commit
+    void BuildRowMutationForCommit(std::vector<Write>* ws, 
+                                   RowMutation* txn_mu, 
+                                   const int64_t commit_ts);
+
+    void BuildRowMutationForAck(std::vector<Write>* ws, RowMutation* txn_mu);
+    // Adds one notify-column Put per write, carrying commit_ts.
+    void BuildRowMutationForNotify(std::vector<Write>* ws, 
+                                   RowMutation* txn_mu, 
+                                   const int64_t commit_ts);
+
+    void SetPrewriteStartTimestamp(const int64_t prewrite_start_ts);
+
+    // for timeout: SetCommitDuration fixes the deadline, the setters below derive task timeouts
+    void SetCommitDuration(int64_t timeout_ms);
+    void SetInternalSdkTaskTimeout(RowMutation* mutation);
+    void SetInternalSdkTaskTimeout(RowReader* reader);
+    bool IsTimeOut();
+
+    // for other transaction alive
+    std::string GetClientSession();
+private:    
+    // for perf
+    void UpdateTimerCounter(Counter* c) {
+        c->Set(get_micros() - c->Get());  // overwrite the counter with (now - its current value)
+    }
+
+    // for debug and test
+    std::string DebugString(const tera::Cell& cell, const std::string& msg) const ;
+    int64_t TEST_Init(const std::string& conf_file);
+    void TEST_Sleep();
+    void TEST_GetSleep();
+    void TEST_Destory();
+    int64_t TEST_GetCommitTimestamp();
+    int64_t TEST_GetPrewriteStartTimestamp();
+
+    void PerfReadDelay(int64_t begin_time, int64_t finish_time);
+    void PerfCommitDelay(int64_t begin_time, int64_t finish_time);
+    void PerfPrewriteDelay(int64_t begin_time, int64_t finish_time);
+    void PerfPrimaryCommitDelay(int64_t begin_time, int64_t finish_time);
+    void PerfSecondariesCommitDelay(int64_t begin_time, int64_t finish_time);
+    void PerfAckDelay(int64_t begin_time, int64_t finish_time);
+    void PerfNotifyDelay(int64_t begin_time, int64_t finish_time);
+
+    void PerfReport();
+private:
+    GlobalTxnInternal(const GlobalTxnInternal&) = delete;
+    GlobalTxnInternal& operator=(const GlobalTxnInternal&) = delete;
+    // for test: created in TEST_Init() and deleted in TEST_Destory(), both only in test mode
+    GlobalTxnTestHelper* TEST_GtxnTestHelper_;
+    // tablename-> (Table*, set(gtxn_cf_name))
+    typedef std::map<std::string, std::pair<Table*, std::set<std::string> > > TableInfoMap;
+    TableInfoMap tables_;
+    mutable Mutex tables_mu_;  // held by FindTable() and IsGTxnColumnFamily() around tables_
+    int64_t start_ts_;
+    int64_t prewrite_start_ts_;
+
+    // per-transaction perf counters; PerfReport() drains them into the gtxn_*_delay_us counters
+    Counter read_cost_time_;
+    Counter commit_cost_time_;
+    Counter prewrite_cost_time_;
+    Counter primary_cost_time_;
+    Counter secondaries_cost_time_;
+    Counter acks_cost_time_;
+    Counter notifies_cost_time_;
+
+    int64_t terminal_time_;  // deadline in get_millis() time, set by SetCommitDuration()
+    std::atomic<bool> is_timeout_;  // set once a task timeout is requested after the deadline
+    tera::Client* client_;
+};
+
+} // namespace tera
+
+#endif  // TERA_SDK_GLOBAL_TXN_INTERNAL_H_
diff --git a/src/sdk/http/http.cc b/src/sdk/http/http.cc
index 562c647f3..da7b571c0 100644
--- a/src/sdk/http/http.cc
+++ b/src/sdk/http/http.cc
@@ -13,7 +13,7 @@
 
 #include "proto/http.pb.h"
 #include "tera.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 
 DECLARE_int32(tera_http_ctrl_thread_num);
 DECLARE_int32(tera_http_request_thread_num);
diff --git a/src/sdk/multi_row_txn.cc b/src/sdk/multi_row_txn.cc
deleted file mode 100644
index 7f9b1a8c8..000000000
--- a/src/sdk/multi_row_txn.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "common/thread_pool.h"
-
-#include "sdk/read_impl.h"
-#include "sdk/single_row_txn.h"
-#include "sdk/table_impl.h"
-#include "sdk/multi_row_txn.h"
-
-namespace tera {
-
-Transaction* NewTransaction() {
-    return MultiRowTxn::NewMultiRowTxn();
-}
-
-Transaction* MultiRowTxn::NewMultiRowTxn() {
-    // int64_t start_ts = TimeOracle::GetTimestamp();
-    int64_t start_ts = 42;
-    if (start_ts > 0) {
-        return new MultiRowTxn(start_ts);
-    } else {
-        return NULL;
-    }
-}
-
-MultiRowTxn::MultiRowTxn(int64_t start_ts)
-   : start_ts_(start_ts) {}
-
-MultiRowTxn::~MultiRowTxn() {}
-
-std::string LockColumnName(const std::string& c) {
-    return c + "__l__"; // lock
-}
-
-std::string WriteColumnName(const std::string& c) {
-    return c + "__w__"; // write
-}
-
-bool MultiRowTxn::IsWritingByOthers(RowMutation* row_mu, RowReader* reader) {
-    return false;
-}
-
-bool MultiRowTxn::IsLockedByOthers(RowMutation* row_mu, RowReader* reader) {
-    return false;
-}
-
-ErrorCode MultiRowTxn::Prewrite(RowMutation* w, RowMutation* primary) {
-    ErrorCode status;
-    return status;
-}
-
-bool MultiRowTxn::LockExists(tera::Transaction* single_row_txn, RowMutation* row_mu) {
-    return false;
-}
-
-ErrorCode MultiRowTxn::Commit() {
-    assert(writes_.size() > 0);
-
-    ErrorCode status;
-    return status;
-}
-
-void MultiRowTxn::ApplyMutation(RowMutation* row_mu) {
-    assert(row_mu != NULL);
-    writes_.push_back(row_mu);
-}
-
-ErrorCode MultiRowTxn::Get(RowReader* row_reader) {
-    assert(row_reader != NULL);
-
-    ErrorCode status;
-    return status;
-}
-
-} // namespace tera
-
-/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/sdk/multi_row_txn.h b/src/sdk/multi_row_txn.h
deleted file mode 100644
index acc9998a6..000000000
--- a/src/sdk/multi_row_txn.h
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) 2016, Baidu.com, Inc. All Rights Reserved
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef  TERA_SDK_TXN_H_
-#define  TERA_SDK_TXN_H_
-
-#include <string>
-#include <vector>
-
-#include "tera.h"
-
-namespace tera {
-
-/// cross-row, cross-table transaction
-/// 跨行，跨表事务
-
-class MultiRowTxn: public Transaction {
-public:
-    static Transaction* NewMultiRowTxn();
-    virtual ~MultiRowTxn();
-
-    virtual ErrorCode Get(RowReader* row_reader);
-    virtual void ApplyMutation(RowMutation* row_mu);
-    /// 提交事务
-    /// 同步模式下，Commit()的返回值代表了提交操作的结果(成功 或者 失败及其原因)
-    /// 异步模式下，通过GetError()获取提交结果
-    virtual ErrorCode Commit();
-
-    typedef void (*Callback)(Transaction* transaction);
-    virtual void SetCommitCallback(Callback callback) {}
-    virtual Callback GetCommitCallback() { return NULL; }
-    virtual void SetContext(void* context) {}
-    virtual void* GetContext() { return NULL; }
-    virtual const ErrorCode& GetError() { return status_; }
-    virtual int64_t GetStartTimestamp() { return 0; }
-
-private:
-    MultiRowTxn(int64_t start_ts);
-    MultiRowTxn(const MultiRowTxn&);
-    void operator=(const MultiRowTxn&);
-
-    bool IsWritingByOthers(RowMutation* row_mu, RowReader* reader);
-    bool IsLockedByOthers(RowMutation* row_mu, RowReader* reader);
-    bool LockExists(tera::Transaction* single_row_txn, RowMutation* row_mu);
-    ErrorCode Prewrite(RowMutation* w, RowMutation* primary);
-
-private:
-    int64_t start_ts_;
-    std::vector<RowMutation*> writes_;
-    ErrorCode status_;
-};
-
-} // namespace tera
-
-#endif  // TERA_SDK_TXN_H_
diff --git a/src/sdk/mutate_impl.cc b/src/sdk/mutate_impl.cc
index a90f850d8..634fb3817 100644
--- a/src/sdk/mutate_impl.cc
+++ b/src/sdk/mutate_impl.cc
@@ -5,7 +5,7 @@
 #include "common/base/string_format.h"
 #include "io/coding.h"
 #include "sdk/mutate_impl.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 namespace tera {
 
diff --git a/src/sdk/mutate_impl.h b/src/sdk/mutate_impl.h
index 9b22af41f..c86a98c8d 100644
--- a/src/sdk/mutate_impl.h
+++ b/src/sdk/mutate_impl.h
@@ -13,7 +13,7 @@
 #include "sdk/sdk_task.h"
 #include "tera.h"
 #include "types.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 namespace tera {
 
diff --git a/src/sdk/read_impl.cc b/src/sdk/read_impl.cc
index 352e645b0..35738cc53 100644
--- a/src/sdk/read_impl.cc
+++ b/src/sdk/read_impl.cc
@@ -19,6 +19,7 @@ RowReaderImpl::RowReaderImpl(TableImpl* table, const std::string& row_key)
       ts_start_(kOldestTs),
       ts_end_(kLatestTs),
       max_version_(1),
+      max_qualifiers_(std::numeric_limits<uint64_t>::max()),
       snapshot_id_(0),
       timeout_ms_(0),
       retry_times_(0),
@@ -78,6 +79,12 @@ uint32_t RowReaderImpl::GetMaxVersions() {
     return max_version_;
 }
 
+void RowReaderImpl::SetMaxQualifiers(uint64_t max_qualifiers) {
+    max_qualifiers_ = max_qualifiers;  // copied into RowReaderInfo by ToProtoBuf()
+}
+uint64_t RowReaderImpl::GetMaxQualifiers() {
+    return max_qualifiers_;  // constructor default is std::numeric_limits<uint64_t>::max()
+}
 
 /// 设置超时时间(只影响当前操作,不影响Table::SetReadTimeout设置的默认读超时)
 void RowReaderImpl::SetTimeOut(int64_t timeout_ms) {
@@ -303,6 +310,7 @@ const RowReader::ReadColumnList& RowReaderImpl::GetReadColumnList() {
 void RowReaderImpl::ToProtoBuf(RowReaderInfo* info) {
     info->set_key(row_key_);
     info->set_max_version(max_version_);
+    info->set_max_qualifiers(max_qualifiers_);
     info->mutable_time_range()->set_ts_start(ts_start_);
     info->mutable_time_range()->set_ts_end(ts_end_);
 
diff --git a/src/sdk/read_impl.h b/src/sdk/read_impl.h
index cf88cd65c..23dabcda1 100644
--- a/src/sdk/read_impl.h
+++ b/src/sdk/read_impl.h
@@ -13,7 +13,7 @@
 #include "sdk/sdk_task.h"
 #include "tera.h"
 #include "types.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 namespace tera {
 
@@ -44,6 +44,8 @@ class RowReaderImpl : public RowReader, public SdkTask {
     void SetMaxVersions(uint32_t max_version);
     /// 返回max_version
     uint32_t GetMaxVersions();
+    void SetMaxQualifiers(uint64_t max_qualifiers);
+    uint64_t GetMaxQualifiers();
     /// 设置超时时间(只影响当前操作,不影响Table::SetReadTimeout设置的默认读超时)
     void SetTimeOut(int64_t timeout_ms);
     /// 设置异步回调, 操作会异步返回
@@ -120,6 +122,8 @@ class RowReaderImpl : public RowReader, public SdkTask {
 
     Table* GetTable() { return (Table*)table_; }
 
+    uint32_t Size() { return 0; }
+
 private:
     TableImpl* table_;
     std::string row_key_;
@@ -137,6 +141,7 @@ class RowReaderImpl : public RowReader, public SdkTask {
     int64_t ts_start_;
     int64_t ts_end_;
     uint32_t max_version_;
+    uint64_t max_qualifiers_;
     uint64_t snapshot_id_;
 
     int64_t timeout_ms_;
diff --git a/src/sdk/rowlock_client.cc b/src/sdk/rowlock_client.cc
new file mode 100644
index 000000000..ff145eeb5
--- /dev/null
+++ b/src/sdk/rowlock_client.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "sdk/rowlock_client.h"
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "gflags/gflags.h"
+
+#include "observer/rowlocknode/ins_rowlock_client_zk_adapter.h"
+#include "proto/rowlocknode_rpc.pb.h"
+#include "types.h"
+#include "utils/utils_cmd.h"
+
+DECLARE_string(rowlock_server_port);
+DECLARE_string(tera_coord_type);
+DECLARE_bool(rowlock_test);
+DECLARE_int32(rowlock_client_max_fail_times);
+DECLARE_bool(mock_rowlock_enable);
+
+namespace tera{
+namespace observer {
+
+ThreadPool* RowlockStub::thread_pool_ = NULL;
+
+void RowlockStub::SetThreadPool(ThreadPool* thread_pool) {
+    thread_pool_ = thread_pool;  // static member: shared by every RowlockStub
+}
+
+void RowlockStub::SetRpcOption(int32_t max_inflow, int32_t max_outflow,
+        int32_t pending_buffer_size, int32_t thread_num) {
+    tera::RpcClientBase::SetOption(max_inflow, max_outflow,  // all four forwarded unchanged
+            pending_buffer_size, thread_num);
+}
+
+RowlockStub::RowlockStub(const std::string& server_addr,
+        int32_t rpc_timeout)
+    : tera::RpcClient<RowlockService::Stub>(server_addr),
+      rpc_timeout_(rpc_timeout) {  // later handed to SendMessageWithRetry() by TryLock/UnLock
+}
+
+RowlockStub::~RowlockStub() {}  // empty: thread_pool_ is static and is not released here
+
+bool RowlockStub::TryLock(const RowlockRequest* request,
+        RowlockResponse* response,
+        std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done) {
+    return SendMessageWithRetry(&RowlockService::Stub::Lock,  // issues the service's Lock rpc
+            request, response, done, "TryLock",
+            rpc_timeout_, thread_pool_);
+}
+
+bool RowlockStub::UnLock(const RowlockRequest* request,
+        RowlockResponse* response,
+        std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done) {
+    return SendMessageWithRetry(&RowlockService::Stub::UnLock,  // issues the service's UnLock rpc
+            request, response, done, "UnLock",
+            rpc_timeout_, thread_pool_);
+}
+
+void RowlockClient::SetThreadPool(ThreadPool* thread_pool) {
+    RowlockStub::SetThreadPool(thread_pool);  // the pool is stored on RowlockStub, not on the client
+}
+
+RowlockClient::RowlockClient(const std::string& addr, int32_t rpc_timeout)
+    : local_addr_(tera::utils::GetLocalHostName() + ":" + FLAGS_rowlock_server_port) {
+    srand((unsigned int)(time(NULL)));  // seeds rand(), which Update() uses to pick an address
+    // NOTE(review): addr and rpc_timeout are not used anywhere in this constructor.
+    SetZkAdapter();
+}
+
+void RowlockClient::Update(const std::vector<std::string>& addrs) {
+    if (addrs.empty()) { return; }  // keep the current stub; `rand() % 0` would be undefined
+    std::shared_ptr<RowlockStub> client(new RowlockStub(addrs[rand() % addrs.size()]));
+    // Replace the current stub with one for a randomly chosen address from addrs.
+    MutexLock locker(&client_mutex_);
+    client_.swap(client);
+}
+
+bool RowlockClient::TryLock(const RowlockRequest* request,
+        RowlockResponse* response,
+        std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done) {
+    for (int32_t i = 0; i < FLAGS_rowlock_client_max_fail_times; ++i) {
+        std::shared_ptr<RowlockStub> client;
+        {
+            // Snapshot client_ on every attempt (taking a reference under the lock) so that a
+            // stub installed by Update() after a failed attempt is the one that gets retried.
+            MutexLock locker(&client_mutex_);
+            client = client_;
+        }
+        if (client && client->TryLock(request, response, done)) {  // no stub yet counts as failure
+            return true;
+        }
+        // rpc fail
+        SetZkAdapter();
+    }
+    return false;
+}
+
+bool RowlockClient::UnLock(const RowlockRequest* request,
+        RowlockResponse* response,
+        std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done) {
+    for (int32_t i = 0; i < FLAGS_rowlock_client_max_fail_times; ++i) {
+        std::shared_ptr<RowlockStub> client;
+        {
+            // Snapshot client_ on every attempt (taking a reference under the lock) so that a
+            // stub installed by Update() after a failed attempt is the one that gets retried.
+            MutexLock locker(&client_mutex_);
+            client = client_;
+        }
+        if (client && client->UnLock(request, response, done)) {  // was TryLock(): wrong rpc
+            return true;
+        }
+        // rpc fail
+        SetZkAdapter();
+    }
+    return false;
+}
+
+void RowlockClient::SetZkAdapter() {
+    if (FLAGS_mock_rowlock_enable) {
+        return;  // mock rowlock, do not need a real zk adapter
+    }
+    const bool use_zk = (FLAGS_tera_coord_type == "zk");  // adapter is picked by coord type
+    const bool use_ins = (FLAGS_tera_coord_type == "ins");
+    if (!use_zk && !use_ins) {
+        LOG(ERROR) << "Unknow coord type for rowlock client";
+        return;
+    }
+    if (use_zk) {
+        zk_adapter_.reset(new ZkRowlockClientZkAdapter(this, local_addr_));
+    } else {
+        zk_adapter_.reset(new InsRowlockClientZkAdapter(this, local_addr_));
+    }
+    zk_adapter_->Init();
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/sdk/rowlock_client.h b/src/sdk/rowlock_client.h
new file mode 100644
index 000000000..c475a180f
--- /dev/null
+++ b/src/sdk/rowlock_client.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_SDK_ROWLOCK_CLIENT_H_
+#define TERA_SDK_ROWLOCK_CLIENT_H_
+
+#include <atomic>
+
+#include <gflags/gflags.h>
+#include <sofa/pbrpc/pbrpc.h>
+
+#include "common/mutex.h"
+#include "observer/rowlocknode/zk_rowlock_client_zk_adapter.h"
+#include "proto/rpc_client.h"
+#include "proto/rowlocknode_rpc.pb.h"
+
+namespace tera {
+namespace observer {
+
+class RowlockClientZkAdapter;
+
+// RPC stub for the rowlock service, built on tera::RpcClient<RowlockService::Stub>.
+class RowlockStub : public tera::RpcClient<RowlockService::Stub> {
+public:
+    static void SetThreadPool(ThreadPool* thread_pool);
+
+    static void SetRpcOption(int32_t max_inflow = -1, int32_t max_outflow = -1,
+            int32_t pending_buffer_size = -1,
+            int32_t thread_num = -1);
+
+    RowlockStub(const std::string& addr = "", int32_t rpc_timeout = 60000);
+    ~RowlockStub();
+
+    virtual bool TryLock(const RowlockRequest* request,
+            RowlockResponse* response,
+            std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL);
+
+    virtual bool UnLock(const RowlockRequest* request,
+            RowlockResponse* response,
+            std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL);
+
+private:
+    int32_t rpc_timeout_;  // presumably milliseconds (constructor default 60000) - TODO confirm
+    static ThreadPool* thread_pool_;
+};
+
+// Client for the rowlock service; TryLock/UnLock forward to the current RowlockStub.
+class RowlockClient {
+public:
+    static void SetThreadPool(ThreadPool* thread_pool);
+    RowlockClient(const std::string& addr = "", int32_t rpc_timeout = 60000);
+    // Virtual: the class has virtual methods, so it may be deleted via a base pointer.
+    virtual ~RowlockClient() {}
+
+    virtual bool TryLock(const RowlockRequest* request,
+            RowlockResponse* response,
+            std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL);
+
+    virtual bool UnLock(const RowlockRequest* request,
+            RowlockResponse* response,
+            std::function<void (RowlockRequest*, RowlockResponse*, bool, int)> done = NULL);
+
+    void Update(const std::vector<std::string>& addrs);
+
+private:
+    void SetZkAdapter();
+
+    mutable Mutex client_mutex_;  // guards client_
+    std::shared_ptr<RowlockStub> client_;
+    std::unique_ptr<ZkRowlockClientZkAdapter> zk_adapter_;
+    std::string local_addr_;
+};
+
+} // namespace observer
+} // namespace tera
+#endif  // TERA_SDK_ROWLOCK_CLIENT_H_
diff --git a/src/sdk/scan.cc b/src/sdk/scan.cc
index 846cc7044..f4b630216 100644
--- a/src/sdk/scan.cc
+++ b/src/sdk/scan.cc
@@ -31,6 +31,10 @@ void ScanDescriptor::SetMaxVersions(int32_t versions) {
     impl_->SetMaxVersions(versions);
 }
 
+void ScanDescriptor::SetMaxQualifiers(uint64_t max_qualifiers) {
+    impl_->SetMaxQualifiers(max_qualifiers);
+}
+
 void ScanDescriptor::SetPackInterval(int64_t interval) {
     impl_->SetPackInterval(interval);
 }
diff --git a/src/sdk/scan_impl.cc b/src/sdk/scan_impl.cc
index 786a05547..68049b017 100644
--- a/src/sdk/scan_impl.cc
+++ b/src/sdk/scan_impl.cc
@@ -5,6 +5,7 @@
 #include "sdk/scan_impl.h"
 
 #include <functional>
+#include <algorithm>
 
 #include "common/this_thread.h"
 #include "common/base/string_ext.h"
@@ -14,16 +15,18 @@
 #include "sdk/filter_utils.h"
 #include "sdk/sdk_utils.h"
 #include "sdk/table_impl.h"
-#include "utils/atomic.h"
-#include "utils/timer.h"
+#include "common/atomic.h"
+#include "common/timer.h"
 
 DECLARE_bool(tera_sdk_batch_scan_enabled);
 DECLARE_int64(tera_sdk_scan_number_limit);
 DECLARE_int64(tera_sdk_scan_buffer_size);
 DECLARE_int32(tera_sdk_max_batch_scan_req);
 DECLARE_int32(tera_sdk_batch_scan_max_retry);
+DECLARE_int32(tera_sdk_sync_scan_max_retry);
 DECLARE_int64(tera_sdk_scan_timeout);
 DECLARE_int64(batch_scan_delay_retry_in_us);
+DECLARE_int64(sync_scan_delay_retry_in_ms);
 
 namespace tera {
 
@@ -374,6 +377,7 @@ ResultStreamSyncImpl::ResultStreamSyncImpl(TableImpl* table,
       response_(new tera::ScanTabletResponse),
       result_pos_(0),
       finish_cond_(&finish_mutex_),
+      retry_times_(0),
       finish_(false) {
     table_ptr_->ScanTabletSync(this);
 }
@@ -392,13 +396,37 @@ bool ResultStreamSyncImpl::Done(ErrorCode* err) {
     while (1) {
         const string& scan_end_key = scan_desc_impl_->GetEndRowKey();
         /// scan failed
-        if (response_->status() != kTabletNodeOk) {
+        while (response_->status() != kTabletNodeOk &&
+               retry_times_ <= FLAGS_tera_sdk_sync_scan_max_retry) {
+            // Advance the counter outside the log statement so the loop
+            // still terminates if WARNING logging is compiled out.
+            ++retry_times_;
+            LOG(WARNING) << "[RETRY " << retry_times_ << "] scan error: "
+                         << StatusCodeToString(response_->status());
+            /// Fixed delay for kKeyNotInRange; otherwise exponential backoff, at most 60 seconds
+            int64_t wait_time = FLAGS_sync_scan_delay_retry_in_ms;
+            if (response_->status() != kKeyNotInRange) {
+                // Cap the exponent: an unbounded int shift is undefined for large retry counts.
+                const int shift = std::min(retry_times_ - 1, 16);
+                wait_time = std::min(wait_time * (static_cast<int64_t>(1) << shift), static_cast<int64_t>(60000));
+            }
+            delete response_;
+            response_ = new tera::ScanTabletResponse;
+            result_pos_ = 0;
+            Reset();
+
+            ThisThread::Sleep(wait_time);
+            table_ptr_->ScanTabletSync(this);
+        }
+
+        if(response_->status() != kTabletNodeOk) {
             if (err) {
                 err->SetFailed(ErrorCode::kSystem,
-                               StatusCodeToString(response_->status()));
+                                StatusCodeToString(response_->status()));
             }
             return true;
         }
+
         if (result_pos_ < response_->results().key_values_size()) {
             break;
         }
@@ -542,6 +570,7 @@ ScanDescImpl::ScanDescImpl(const string& rowkey)
       number_limit_(FLAGS_tera_sdk_scan_number_limit),
       is_async_(FLAGS_tera_sdk_batch_scan_enabled),
       max_version_(1),
+      max_qualifiers_(std::numeric_limits<uint64_t>::max()),
       pack_interval_(FLAGS_tera_sdk_scan_timeout),
       snapshot_(0),
       value_converter_(&DefaultValueConverter) {
@@ -558,6 +587,7 @@ ScanDescImpl::ScanDescImpl(const ScanDescImpl& impl)
       number_limit_(impl.number_limit_),
       is_async_(impl.is_async_),
       max_version_(impl.max_version_),
+      max_qualifiers_(impl.max_qualifiers_),
       pack_interval_(impl.pack_interval_),
       snapshot_(impl.snapshot_),
       table_schema_(impl.table_schema_) {
@@ -622,6 +652,10 @@ void ScanDescImpl::SetMaxVersions(int32_t versions) {
     max_version_ = versions;
 }
 
+void ScanDescImpl::SetMaxQualifiers(int64_t max_qualifiers) {
+    max_qualifiers_ = max_qualifiers;
+}
+
 void ScanDescImpl::SetPackInterval(int64_t interval) {
     pack_interval_ = interval;
 }
@@ -693,6 +727,10 @@ int32_t ScanDescImpl::GetMaxVersion() const {
     return max_version_;
 }
 
+int64_t ScanDescImpl::GetMaxQualifiers() const {
+    return max_qualifiers_;
+}
+
 int64_t ScanDescImpl::GetPackInterval() const {
     return pack_interval_;
 }
diff --git a/src/sdk/scan_impl.h b/src/sdk/scan_impl.h
index 2d808044f..32d647c8b 100644
--- a/src/sdk/scan_impl.h
+++ b/src/sdk/scan_impl.h
@@ -16,7 +16,7 @@
 #include "sdk/sdk_task.h"
 #include "tera.h"
 #include "types.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 
 namespace tera {
@@ -162,21 +162,10 @@ class ResultStreamSyncImpl : public ResultStreamImpl {
     int32_t result_pos_;
     mutable Mutex finish_mutex_;
     common::CondVar finish_cond_;
+    int32_t retry_times_;
     bool finish_;
 };
 
-struct ScanTask : public SdkTask {
-    ResultStreamImpl* stream;
-    tera::ScanTabletRequest* request;
-    tera::ScanTabletResponse* response;
-
-    uint32_t retry_times;
-    void IncRetryTimes() { retry_times++; }
-    uint32_t RetryTimes() { return retry_times; }
-    ScanTask() : SdkTask(SdkTask::SCAN), stream(NULL), request(NULL),
-        response(NULL), retry_times(0) {}
-};
-
 typedef ScanDescriptor::ValueConverter ValueConverter;
 
 class ScanDescImpl {
@@ -195,6 +184,8 @@ class ScanDescImpl {
 
     void SetMaxVersions(int32_t versions);
 
+    void SetMaxQualifiers(int64_t max_qualifiers);
+
     void SetPackInterval(int64_t timeout);
 
     void SetTimeRange(int64_t ts_end, int64_t ts_start);
@@ -238,6 +229,8 @@ class ScanDescImpl {
 
     int32_t GetMaxVersion() const;
 
+    int64_t GetMaxQualifiers() const;
+
     int64_t GetPackInterval() const;
 
     uint64_t GetSnapshot() const;
@@ -272,6 +265,7 @@ class ScanDescImpl {
     int64_t number_limit_;
     bool is_async_;
     int32_t max_version_;
+    int64_t max_qualifiers_;
     int64_t pack_interval_;
     uint64_t snapshot_;
     std::string filter_string_;
@@ -280,6 +274,26 @@ class ScanDescImpl {
     TableSchema table_schema_;
 };
 
+struct ScanTask : public SdkTask {
+    ResultStreamImpl* stream;
+    tera::ScanTabletRequest* request;
+    tera::ScanTabletResponse* response;
+
+    uint32_t retry_times;
+    void IncRetryTimes() { retry_times++; }
+    uint32_t RetryTimes() { return retry_times; }
+    ScanTask() : SdkTask(SdkTask::SCAN), stream(NULL), request(NULL),
+        response(NULL), retry_times(0) {}
+
+    virtual bool IsAsync() { return false; }
+    virtual uint32_t Size() { return 0; }
+    virtual int64_t TimeOut() { return 0; }
+    virtual void Wait() {}
+    virtual void SetError(ErrorCode::ErrorCodeType err,
+                          const std::string& reason) {}
+    virtual const std::string& RowKey() { return stream->GetScanDesc()->GetStartRowKey(); }
+};
+
 } // namespace tera
 
 #endif  // TERA_SDK_SCAN_IMPL_H_
diff --git a/src/sdk/schema_impl.cc b/src/sdk/schema_impl.cc
index 7e9e3b264..bf8cc6f00 100644
--- a/src/sdk/schema_impl.cc
+++ b/src/sdk/schema_impl.cc
@@ -14,6 +14,7 @@ DECLARE_int64(tera_master_merge_tablet_size);
 namespace tera {
 
 const std::string TableDescImpl::DEFAULT_LG_NAME = "lg0";
+const std::string TableDescImpl::NOTIFY_LG_NAME = "notify";
 const std::string TableDescImpl::DEFAULT_CF_NAME = "";
 
 /// 列族名字仅允许使用字母、数字和下划线构造, 长度不超过256
@@ -29,7 +30,9 @@ CFDescImpl::CFDescImpl(const std::string& cf_name,
       acl_(0),
       owner_(0),
       disk_quota_(-1),
-      type_("") {
+      type_(""),
+      is_global_transaction_(false),
+      is_notify_enabled_(false) {
 }
 
 int32_t CFDescImpl::Id() const {
@@ -88,6 +91,30 @@ ACL CFDescImpl::Acl() const {
     return ACL();
 }
 
+void CFDescImpl::EnableGlobalTransaction() {
+    is_global_transaction_ = true;
+}
+
+void CFDescImpl::DisableGlobalTransaction() {
+    is_global_transaction_ = false;
+}
+
+bool CFDescImpl::GlobalTransaction() const {
+    return is_global_transaction_; 
+}
+
+void CFDescImpl::EnableNotify() {
+    is_notify_enabled_ = true;
+}
+
+void CFDescImpl::DisableNotify() {
+    is_notify_enabled_ = false;
+}
+
+bool CFDescImpl::IsNotifyEnabled() const {
+    return is_notify_enabled_;
+}
+
 void CFDescImpl::SetType(const std::string& type) {
     type_ = type;
 }
diff --git a/src/sdk/schema_impl.h b/src/sdk/schema_impl.h
index a68a09f77..dcdb6c9a6 100644
--- a/src/sdk/schema_impl.h
+++ b/src/sdk/schema_impl.h
@@ -48,6 +48,18 @@ class CFDescImpl : public ColumnFamilyDescriptor {
 
     ACL Acl() const;
 
+    void EnableGlobalTransaction();
+
+    void DisableGlobalTransaction();
+    
+    bool GlobalTransaction() const;
+
+    void EnableNotify();
+
+    void DisableNotify();
+
+    bool IsNotifyEnabled() const;
+
     void SetType(const std::string& type);
 
     const std::string& Type() const;
@@ -63,6 +75,8 @@ class CFDescImpl : public ColumnFamilyDescriptor {
     int32_t owner_;
     int32_t disk_quota_;
     std::string type_;
+    bool is_global_transaction_;
+    bool is_notify_enabled_;
 };
 
 /// 局部性群组描述
@@ -192,6 +206,7 @@ class TableDescImpl {
     std::string Alias() const;
 
     static const std::string DEFAULT_LG_NAME;
+    static const std::string NOTIFY_LG_NAME;
     static const std::string DEFAULT_CF_NAME;
 
 private:
diff --git a/src/sdk/sdk_metric_name.h b/src/sdk/sdk_metric_name.h
new file mode 100644
index 000000000..5b358e912
--- /dev/null
+++ b/src/sdk/sdk_metric_name.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_SDK_METRIC_NAME_H_
+#define TERA_SDK_METRIC_NAME_H_ 
+ 
+#include <string>
+
+#include "common/metric/hardware_collectors.h"
+
+namespace tera {
+
+// global transaction labels
+const char* const kGTxnLabelRead = "gtxn:read";
+const char* const kGTxnLabelCommit = "gtxn:commit";
+const char* const kGTxnLabelTso = "gtxn:tso";
+
+// global transaction read metric names
+const char* const kGTxnReadDelayMetric = "tera_sdk_gtxn_read_delay_us";
+const char* const kGTxnReadCountMetric = "tera_sdk_gtxn_read_count";
+const char* const kGTxnReadFailCountMetric = "tera_sdk_gtxn_read_fail_count";
+const char* const kGTxnReadRetryCountMetric = "tera_sdk_gtxn_read_retry_count";
+const char* const kGTxnReadRollBackCountMetric = "tera_sdk_gtxn_read_rollback_count";
+const char* const kGTxnReadRollForwardCountMetric = "tera_sdk_gtxn_read_rollforward_count";
+
+// global transaction commit metric names
+const char* const kGTxnCommitDelayMetric = "tera_sdk_gtxn_commit_delay_us";
+const char* const kGTxnCommitCountMetric = "tera_sdk_gtxn_commit_count";
+const char* const kGTxnCommitFailCountMetric = "tera_sdk_gtxn_commit_fail_count";
+
+const char* const kGTxnPrewriteDelayMetric = "tera_sdk_gtxn_prewrite_delay_us";
+const char* const kGTxnPrewriteCountMetric = "tera_sdk_gtxn_prewrite_count";
+const char* const kGTxnPrewriteFailCountMetric = "tera_sdk_gtxn_prewrite_fail_count";
+
+const char* const kGTxnPrimaryDelayMetric = "tera_sdk_gtxn_primary_delay_us";
+const char* const kGTxnPrimaryCountMetric = "tera_sdk_gtxn_primary_count";
+const char* const kGTxnPrimaryFailCountMetric = "tera_sdk_gtxn_primary_fail_count";
+
+const char* const kGTxnSecondariesDelayMetric = "tera_sdk_gtxn_secondaries_delay_us";
+const char* const kGTxnSecondariesCountMetric = "tera_sdk_gtxn_secondaries_count";
+const char* const kGTxnSecondariesFailCountMetric = "tera_sdk_gtxn_secondaries_fail_count";
+
+const char* const kGTxnAcksDelayMetric = "tera_sdk_gtxn_acks_delay_us";
+const char* const kGTxnAcksCountMetric = "tera_sdk_gtxn_acks_count";
+const char* const kGTxnAcksFailCountMetric = "tera_sdk_gtxn_acks_fail_count";
+
+const char* const kGTxnNotifiesDelayMetric = "tera_sdk_gtxn_notifies_delay_us";
+const char* const kGTxnNotifiesCountMetric = "tera_sdk_gtxn_notifies_count";
+const char* const kGTxnNotifiesFailCountMetric = "tera_sdk_gtxn_notifies_fail_count";
+
+const char* const kGTxnTsoDelayMetric = "tera_sdk_gtxn_tso_delay_us";
+const char* const kGTxnTsoRequestCountMetric = "tera_sdk_gtxn_tso_request_count";
+} // end namespace tera 
+ 
+#endif // TERA_SDK_METRIC_NAME_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/sdk/sdk_perf.cc b/src/sdk/sdk_perf.cc
new file mode 100644
index 000000000..7cc5704d8
--- /dev/null
+++ b/src/sdk/sdk_perf.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "sdk/sdk_perf.h"
+
+#include "gflags/gflags.h"
+
+#include "common/metric/metric_counter.h"
+#include "sdk/sdk_metric_name.h"
+
+namespace tera {
+namespace sdk {
+
+void PerfCollecter::DumpLog() {  // Logs gtxn delays and counters from the latest collector report.
+    std::shared_ptr<CollectorReport> latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    int64_t interval = latest_report->interval_ms;
+    if (interval <= 0) {
+        // May happen on the first report; a non-zero value is set to avoid div 0.
+        // NOTE(review): `interval` is not read again below, so this has no effect.
+        VLOG(16) << "Metric Report interval is 0";
+        interval = 1000;
+    }
+    int64_t read_delay = latest_report->FindMetricValue(kGTxnReadDelayMetric, kGTxnLabelRead);
+    int64_t read_cnt = latest_report->FindMetricValue(kGTxnReadCountMetric, kGTxnLabelRead);
+    read_delay = read_cnt > 0 ? read_delay / read_cnt : 0;  // delay metric divided by count; 0 when count is 0
+
+    LOG(INFO) << "[perf][gtxn] "
+              << "read_delay " << read_delay << " read_cnt " << read_cnt << " read_fail "
+              << latest_report->FindMetricValue(kGTxnReadFailCountMetric, kGTxnLabelRead)
+              << " read_retry_cnt "
+              << latest_report->FindMetricValue(kGTxnReadRetryCountMetric, kGTxnLabelRead)
+              << " read_rollback_cnt "
+              << latest_report->FindMetricValue(kGTxnReadRollBackCountMetric, kGTxnLabelRead)
+              << " read_rollforward_cnt "
+              << latest_report->FindMetricValue(kGTxnReadRollForwardCountMetric, kGTxnLabelRead);
+
+    int64_t commit_delay = latest_report->FindMetricValue(kGTxnCommitDelayMetric, kGTxnLabelCommit);
+    int64_t commit_cnt = latest_report->FindMetricValue(kGTxnCommitCountMetric, kGTxnLabelCommit);
+    commit_delay = commit_cnt > 0 ? commit_delay / commit_cnt : 0;
+
+    int64_t prewrite_delay = latest_report->FindMetricValue(kGTxnPrewriteDelayMetric, kGTxnLabelCommit);
+    int64_t prewrite_cnt = latest_report->FindMetricValue(kGTxnPrewriteCountMetric, kGTxnLabelCommit);
+    prewrite_delay = prewrite_cnt > 0 ? prewrite_delay / prewrite_cnt : 0;
+
+    int64_t primary_delay = latest_report->FindMetricValue(kGTxnPrimaryDelayMetric, kGTxnLabelCommit);
+    int64_t primary_cnt = latest_report->FindMetricValue(kGTxnPrimaryCountMetric, kGTxnLabelCommit);
+    primary_delay = primary_cnt > 0 ? primary_delay / primary_cnt : 0;
+
+    int64_t secondaries_delay = latest_report->FindMetricValue(kGTxnSecondariesDelayMetric, kGTxnLabelCommit);
+    int64_t secondaries_cnt = latest_report->FindMetricValue(kGTxnSecondariesCountMetric, kGTxnLabelCommit);
+    secondaries_delay = secondaries_cnt > 0 ? secondaries_delay / secondaries_cnt : 0;
+
+    LOG(INFO) << "[perf][gtxn] "
+              << "commit_delay " << commit_delay << " commit_cnt " << commit_cnt << " commit_fail "
+              << latest_report->FindMetricValue(kGTxnCommitFailCountMetric, kGTxnLabelCommit)
+              << " prew_delay " << prewrite_delay << " prew_cnt " << prewrite_cnt << " prew_fail "
+              << latest_report->FindMetricValue(kGTxnPrewriteFailCountMetric, kGTxnLabelCommit)
+              << " pri_delay " << primary_delay << " pri_cnt " << primary_cnt << " pri_fail "
+              << latest_report->FindMetricValue(kGTxnPrimaryFailCountMetric, kGTxnLabelCommit)
+              << " se_delay " << secondaries_delay << " se_cnt " << secondaries_cnt << " se_fail "
+              << latest_report->FindMetricValue(kGTxnSecondariesFailCountMetric, kGTxnLabelCommit);
+
+    int64_t tso_delay = latest_report->FindMetricValue(kGTxnTsoDelayMetric, kGTxnLabelTso);
+    int64_t tso_cnt = latest_report->FindMetricValue(kGTxnTsoRequestCountMetric, kGTxnLabelTso);
+    tso_delay = tso_cnt > 0 ? tso_delay / tso_cnt : 0;
+    LOG(INFO) << "[perf][gtxn] tso_delay " << tso_delay << " tso_cnt " << tso_cnt;
+
+    int64_t notify_delay = latest_report->FindMetricValue(kGTxnNotifiesDelayMetric, kGTxnLabelCommit);
+    int64_t notify_cnt = latest_report->FindMetricValue(kGTxnNotifiesCountMetric, kGTxnLabelCommit);
+    notify_delay = notify_cnt > 0 ? notify_delay / notify_cnt : 0;
+
+    int64_t ack_delay = latest_report->FindMetricValue(kGTxnAcksDelayMetric, kGTxnLabelCommit);
+    int64_t ack_cnt = latest_report->FindMetricValue(kGTxnAcksCountMetric, kGTxnLabelCommit);
+    ack_delay = ack_cnt > 0 ? ack_delay / ack_cnt : 0;
+
+    LOG(INFO) << "[perf][gtxn] "
+              << "notify_delay " << notify_delay << " notify_cnt " << notify_cnt << " notify_fail "
+              << latest_report->FindMetricValue(kGTxnNotifiesFailCountMetric, kGTxnLabelCommit)
+              << " ack_delay " << ack_delay << " ack_cnt " << ack_cnt << " ack_fail "
+              << latest_report->FindMetricValue(kGTxnAcksFailCountMetric, kGTxnLabelCommit);
+}
+   
+}  // namespace sdk
+}  // namespace tera
diff --git a/src/sdk/sdk_perf.h b/src/sdk/sdk_perf.h
new file mode 100644
index 000000000..d6b756a9e
--- /dev/null
+++ b/src/sdk/sdk_perf.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef  TERA_SDK_SDK_PERF_H_
+#define  TERA_SDK_SDK_PERF_H_
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "common/metric/metric_counter.h"
+#include "common/metric/collector_report.h"
+#include "common/thread.h"
+#include "common/this_thread.h"
+#include "tera.h"
+
+DECLARE_int32(tera_sdk_perf_collect_interval);
+
+namespace tera {
+namespace sdk {
+
+class PerfCollecter {  // Runs a thread that refreshes the collector report and logs it in a loop.
+public:
+    PerfCollecter() : stopped_(false){}
+    ~PerfCollecter() {}  // NOTE(review): does not call Stop(); callers must stop the thread themselves
+    // Starts the thread that runs ScheduleCollect().
+    void Run() {
+        thread_.Start(std::bind(&PerfCollecter::ScheduleCollect, this));
+    }
+    // Sets the stop flag and joins the thread.
+    void Stop() {
+        stopped_ = true;
+        thread_.Join();
+    }
+
+private:
+    void ScheduleCollect() {
+        while (!stopped_) {
+            CollectorReportPublisher::GetInstance().Refresh(); 
+            DumpLog();
+            ThisThread::Sleep(FLAGS_tera_sdk_perf_collect_interval);  // presumably milliseconds - TODO confirm
+        }
+    }
+
+    void DumpLog();
+private:
+    common::Thread thread_;
+    bool stopped_;  // NOTE(review): written by Stop(), read by the collector thread, no synchronization
+};
+
+}  // namespace sdk
+}  // namespace tera
+
+#endif  // TERA_SDK_SDK_PERF_H_
diff --git a/src/sdk/sdk_task.cc b/src/sdk/sdk_task.cc
index ce1d64e2d..834bb4a97 100644
--- a/src/sdk/sdk_task.cc
+++ b/src/sdk/sdk_task.cc
@@ -6,7 +6,7 @@
 
 #include <glog/logging.h>
 
-#include "utils/timer.h"
+#include "common/timer.h"
 
 DECLARE_int32(tera_sdk_timeout_precision);
 
@@ -124,7 +124,8 @@ SdkTask* SdkTimeoutManager::PopTask(int64_t task_id) {
         SdkTask* task = it->second;
         CHECK_EQ(task->GetId(), task_id);
         map.id_hash_map.erase(it);
-        map.due_time_map.erase(task);
+        // exactly one entry must go; CHECK_EQ (not assert) so erase still runs under NDEBUG
+        CHECK_EQ(map.due_time_map.erase(task), 1u);
         return task;
     } else {
         return NULL;
diff --git a/src/sdk/sdk_task.h b/src/sdk/sdk_task.h
index 58f61f65a..34ec25b1d 100644
--- a/src/sdk/sdk_task.h
+++ b/src/sdk/sdk_task.h
@@ -24,7 +24,8 @@ class SdkTask {
     enum TYPE {
         READ,
         MUTATION,
-        SCAN
+        SCAN,
+        TASKBATCH,
     };
     TYPE Type() { return type_; }
 
@@ -48,6 +49,14 @@ class SdkTask {
     void DecRef();
     void ExcludeOtherRef();
 
+    virtual bool IsAsync() = 0;
+    virtual uint32_t Size() = 0;
+    virtual int64_t TimeOut() = 0;
+    virtual void Wait() = 0;
+    virtual void SetError(ErrorCode::ErrorCodeType err,
+                          const std::string& reason) = 0;
+    virtual const std::string& RowKey() = 0;
+
 protected:
     SdkTask(TYPE type)
         : type_(type),
@@ -76,7 +85,10 @@ typedef void (*StatCallback)(Table* table, SdkTask* task);
 
 struct SdkTaskDueTimeComp {
     bool operator() (SdkTask* lhs, SdkTask* rhs) {
-        return lhs->DueTime() < rhs->DueTime();
+        if (lhs->DueTime() != rhs->DueTime()) {
+            return lhs->DueTime() < rhs->DueTime();
+        }
+        return lhs->GetId() < rhs->GetId();
     }
 };
 
diff --git a/src/sdk/sdk_utils.cc b/src/sdk/sdk_utils.cc
index 175bc7245..b135b99ed 100644
--- a/src/sdk/sdk_utils.cc
+++ b/src/sdk/sdk_utils.cc
@@ -18,6 +18,7 @@
 
 #include "sdk/schema_impl.h"
 #include "sdk/filter_utils.h"
+#include "types.h"
 
 DECLARE_int64(tera_tablet_write_block_size);
 DECLARE_int64(tera_tablet_ldb_sst_size);
@@ -184,6 +185,12 @@ void ShowTableSchema(const TableSchema& s, bool is_x) {
                     cf_ss << "type=bytes" << ",";
                 }
             }
+            if (is_x || (cf_schema.gtxn() != false)) {
+                cf_ss << "gtxn=" << Switch2Str(cf_schema.gtxn()) << ","; 
+            }
+            if (is_x || (cf_schema.notify() != false)) {
+                cf_ss << "notify=" << Switch2Str(cf_schema.notify()) << ","; 
+            }
             cf_ss << "\b>";
             if (cf_ss.str().size() > 5) {
                 ss << cf_ss.str();
@@ -281,6 +288,8 @@ void TableDescToSchema(const TableDescriptor& desc, TableSchema* schema) {
         cf->set_max_versions(cf_desc->MaxVersions());
         cf->set_min_versions(cf_desc->MinVersions());
         cf->set_type(cf_desc->Type());
+        cf->set_gtxn(cf_desc->GlobalTransaction());
+        cf->set_notify(cf_desc->IsNotifyEnabled());
     }
 }
 
@@ -365,6 +374,16 @@ void TableSchemaToDesc(const TableSchema& schema, TableDescriptor* desc) {
         cfd->SetMinVersions(cf.min_versions());
         cfd->SetTimeToLive(cf.time_to_live());
         cfd->SetType(cf.type());
+        if (cf.gtxn()) {
+            cfd->EnableGlobalTransaction();
+        } else {
+            cfd->DisableGlobalTransaction();
+        }
+        if (cf.notify()) {
+            cfd->EnableNotify();
+        } else {
+            cfd->DisableNotify();
+        }
     }
 }
 
@@ -402,6 +421,22 @@ bool SetCfProperties(const string& name, const string& value,
             return false;
         }
         desc->SetType(value);
+    } else if (name == "gtxn") {
+        if (value == "on") {
+            desc->EnableGlobalTransaction();
+        } else if (value == "off") {
+            desc->DisableGlobalTransaction();
+        } else {
+            return false;
+        }
+    } else if (name == "notify") {
+        if (value == "on") {
+            desc->EnableNotify();
+        } else if (value == "off") {
+            desc->DisableNotify();
+        } else {
+            return false;
+        }
     }else {
         return false;
     }
@@ -556,6 +591,13 @@ bool CheckTableDescrptor(const TableDescriptor& desc, ErrorCode* err) {
             }
             return false;
         }
+        if (!desc.IsTxnEnabled() && desc.ColumnFamily(i)->GlobalTransaction() == true) {
+            ss << " columnfamily property: gtxn is valid only when table set 'txn=on') ";
+            if (err != NULL) {
+                err->SetFailed(ErrorCode::kBadParam, ss.str());
+            }
+            return false;
+        }
     }
     if (desc.IsTxnEnabled() && (desc.RawKey() == kGeneralKv || desc.RawKey() == kTTLKv)) {
         ss << "kv and ttlkv don't support txn";
@@ -806,6 +848,8 @@ bool FillTableDescriptor(PropTree& schema_tree, TableDescriptor* table_desc) {
                 return false;
             }
         }
+        // extend notify locality group and _N_ columnfamily
+        return ExtendNotifyLgToDescriptor(table_desc); 
     } else if (schema_tree.MaxDepth() == 3) {
         // full mode, all elements are user-defined
         // e.g. table1<mergesize=100>{
@@ -860,6 +904,8 @@ bool FillTableDescriptor(PropTree& schema_tree, TableDescriptor* table_desc) {
                 return false;
             }
         }
+        // extend notify locality group and _N_ columnfamily
+        return ExtendNotifyLgToDescriptor(table_desc); 
     } else {
         LOG(FATAL) << "never here.";
     }
@@ -975,4 +1021,56 @@ bool IsKvTable(const TableSchema& schema) {
             schema.raw_key() == TTLKv);
 }
 
+bool IsTransactionTable(const TableSchema& schema) {
+    return schema.enable_txn();
+}
+
+void FindGlobalTransactionCfs(const TableSchema& schema,
+                              std::set<string>* column_families) {
+    // Insert into column_families the name of every column family in schema
+    // whose gtxn flag is set.
+    for (int idx = 0, total = schema.column_families_size(); idx < total; ++idx) {
+        const ColumnFamilySchema& cf = schema.column_families(idx);
+        if (!cf.gtxn()) { continue; }
+        column_families->insert(cf.name());
+    }
+}
+
+bool ExtendNotifyLgToDescriptor(TableDescriptor* desc) {
+    // If any column family enables notify, add the notify lg and column family; false on failure.
+    bool do_extend = false;
+    bool have_n_cf = false;
+    for (int32_t i = 0; i < desc->ColumnFamilyNum(); ++i) {
+        if (desc->ColumnFamily(i)->Name() == kNotifyColumnFamily) {
+            have_n_cf = true;
+        }
+        if (desc->ColumnFamily(i)->IsNotifyEnabled()) {
+            do_extend = true;
+        }
+    }
+    if (!do_extend) {
+        return true;
+    }
+    if (have_n_cf) {
+        LOG(ERROR) << "already exists column family: " << kNotifyColumnFamily;
+        return false;
+    }
+    if (desc->LocalityGroup(TableDescImpl::NOTIFY_LG_NAME) != NULL) {
+        LOG(ERROR) << "already exists locality group: " << TableDescImpl::NOTIFY_LG_NAME;
+        return false;
+    }
+    LocalityGroupDescriptor* lg_desc = desc->AddLocalityGroup(TableDescImpl::NOTIFY_LG_NAME);
+    if (lg_desc == NULL) {
+        LOG(ERROR) << "fail to add locality group: " << TableDescImpl::NOTIFY_LG_NAME;
+        return false;
+    }
+    ColumnFamilyDescriptor* cf_desc
+        = desc->AddColumnFamily(kNotifyColumnFamily, TableDescImpl::NOTIFY_LG_NAME);
+    if (cf_desc == NULL) {
+        LOG(ERROR) << "fail to add column family: " << kNotifyColumnFamily;
+        return false;
+    }
+    return true;
+}
+
 } // namespace tera
diff --git a/src/sdk/sdk_utils.h b/src/sdk/sdk_utils.h
index 4974575af..0e8ddad54 100644
--- a/src/sdk/sdk_utils.h
+++ b/src/sdk/sdk_utils.h
@@ -50,5 +50,11 @@ bool ParseDelimiterFile(const string& filename, std::vector<string>* delims);
 
 bool IsKvTable(const TableSchema& schema);
 
+bool ExtendNotifyLgToDescriptor(TableDescriptor* desc);
+
+bool IsTransactionTable(const TableSchema& schema);
+
+void FindGlobalTransactionCfs(const TableSchema& schema, std::set<string>* column_families);
+
 } // namespace tera
 #endif // TERA_SDK_SDK_UTILS_H_
diff --git a/src/sdk/sdk_zk.cc b/src/sdk/sdk_zk.cc
index e08bb6c9b..874b4912c 100644
--- a/src/sdk/sdk_zk.cc
+++ b/src/sdk/sdk_zk.cc
@@ -5,11 +5,15 @@
 #include "sdk/sdk_zk.h"
 
 #include <iostream>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <gflags/gflags.h>
 
+#include "common/this_thread.h"
 #include "ins_sdk.h"
-
 #include "types.h"
+#include "utils/utils_cmd.h"
 #include "zk/zk_adapter.h"
 
 DECLARE_string(tera_zk_lib_log_path);
@@ -18,14 +22,174 @@ DECLARE_bool(tera_zk_enabled);
 DECLARE_bool(tera_mock_zk_enabled);
 DECLARE_string(tera_zk_addr_list);
 DECLARE_string(tera_zk_root_path);
+DECLARE_int32(tera_zk_timeout);
+DECLARE_int32(tera_zk_retry_max_times);
+DECLARE_int64(tera_zk_retry_period);
 DECLARE_bool(tera_ins_enabled);
 DECLARE_string(tera_ins_root_path);
 DECLARE_string(tera_ins_addr_list);
+DECLARE_int64(tera_sdk_ins_session_timeout);
 DECLARE_bool(tera_mock_ins_enabled);
+DECLARE_bool(tera_timeoracle_mock_enabled);
+DECLARE_string(tera_timeoracle_mock_root_path);
+DECLARE_string(tera_coord_type);
 
 namespace tera {
 namespace sdk {
 
+static pthread_once_t zk_init_once = PTHREAD_ONCE_INIT;
+
+static void InitZkLogOnce() {  // invoked through pthread_once(&zk_init_once, ...): hands FLAGS_tera_zk_lib_log_path to SetLibraryLogOutput
+    zk::ZooKeeperLightAdapter::SetLibraryLogOutput(FLAGS_tera_zk_lib_log_path);
+}
+
+bool ClientZkAdapter::Init() {  // calls ZooKeeperAdapter::Init until it succeeds; false once retries exceed FLAGS_tera_zk_retry_max_times
+    pthread_once(&zk_init_once, InitZkLogOnce);
+    MutexLock lock(&mutex_);  // scoped to the whole function, retry sleeps included
+    LOG(INFO) << "try init zk ...";
+    int zk_errno = zk::ZE_OK;
+    int32_t retry_cnt = 0;
+    int wait_time = 60000;  // last argument of ZooKeeperAdapter::Init; presumably milliseconds - TODO confirm
+    while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list,
+                                   FLAGS_tera_zk_root_path,
+                                   FLAGS_tera_zk_timeout,
+                                   "", &zk_errno, wait_time)) {
+        if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) {  // retry budget exhausted: give up
+            LOG(ERROR) << "fail to init zk: " << zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
+        LOG(ERROR) << "init zk fail: " << zk::ZkErrnoToString(zk_errno)
+            << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: "
+            << retry_cnt;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;  // reset before the next attempt
+    }
+    LOG(INFO) << "init zk success";
+    return true;
+}
+
+bool ClientZkAdapter::RegisterClient(std::string* path) {  // creates ephemeral node kClientsNodePath/<GetLocalHostAddr()>-<pid>-<session id>; *path receives that node name
+    int64_t session_id = 0;
+    int zk_errno = zk::ZE_OK;
+    int32_t retry_cnt = 0;
+    LOG(INFO) << "try get client session";
+    while (!GetSessionId(&session_id, &zk_errno)) {  // step 1: obtain the session id, with bounded retries
+        if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to get client session : " 
+                       << zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
+        LOG(ERROR) << "get client session fail: " << zk::ZkErrnoToString(zk_errno)
+            << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: "
+            << retry_cnt;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;
+    }
+    std::string internal_path = utils::GetLocalHostAddr()
+            + "-" + std::to_string(getpid())
+            + "-" + std::to_string(session_id);
+    LOG(INFO) << "get client session success : " << internal_path;
+    zk_errno = zk::ZE_OK;
+    retry_cnt = 0;  // the node-creation step gets its own retry budget
+    LOG(INFO) << "try create client node : " << internal_path;
+    while (!CreateEphemeralNode(kClientsNodePath + "/" + internal_path, 
+                                "", 
+                                &zk_errno)) {  // step 2: create the node with an empty value
+        if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to create client node : " 
+                       << zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
+        LOG(ERROR) << "create client node fail: " << zk::ZkErrnoToString(zk_errno)
+            << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: "
+            << retry_cnt;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;
+    }
+    LOG(INFO) << "create client node success";
+    *path = internal_path;  // written only on success
+    return true;
+}
+
+bool ClientZkAdapter::IsClientAlive(const std::string& path) {  // returns the CheckExist result for kClientsNodePath/<path>; true if the check keeps failing
+    VLOG(12) << "try check client alive : " << path;
+    int32_t retry_cnt = 0;
+    int zk_errno = zk::ZE_OK;
+    bool ret = true;  // output argument of CheckExist
+    while (!CheckExist(kClientsNodePath + "/" + path, &ret, &zk_errno)) {
+        if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to check client alive : " 
+                       << zk::ZkErrnoToString(zk_errno);
+            // when the zk server is in error, treat the other client as alive
+            return true;
+        }
+        LOG(ERROR) << "check client alive fail: " << zk::ZkErrnoToString(zk_errno)
+            << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: "
+            << retry_cnt;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;  // reset before the next attempt
+    }
+    VLOG(12) << "check client alive success";
+    return ret;
+}
+
+bool ClientZkAdapter::ReadNode(const std::string& path, std::string* value) {  // ZooKeeperAdapter::ReadNode with bounded retries; false when they run out
+    VLOG(12) << "try read node : " << path;
+    int32_t retry_cnt = 0;
+    int zk_errno = zk::ZE_OK;
+    while (!ZooKeeperAdapter::ReadNode(path, value, &zk_errno)) {
+        if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) {  // retry budget exhausted: give up
+            LOG(ERROR) << "fail to read node : " 
+                       << zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
+        LOG(ERROR) << "read node fail: " << zk::ZkErrnoToString(zk_errno)
+            << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: "
+            << retry_cnt;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;  // reset before the next attempt
+    }
+    VLOG(12) << "read node success";
+    return true;
+}
+
+bool InsClientZkAdapter::Init() {  // creates the InsSDK client for FLAGS_tera_ins_addr_list; always returns true
+    ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list);  // NOTE(review): a second Init() call would overwrite (leak) the previous client
+    ins_sdk_->SetTimeoutTime(FLAGS_tera_sdk_ins_session_timeout);
+    return true;
+}
+
+bool InsClientZkAdapter::RegisterClient(std::string* path) {  // puts an empty value at <ins root>/kClientsNodePath/<addr>-<pid>-<session id>; single attempt, no retry
+    std::string internal_path = utils::GetLocalHostAddr()
+            + "-" + std::to_string(getpid())
+            + "-" + ins_sdk_->GetSessionID();  // ins_sdk_ is set by Init(); it is dereferenced here without a NULL check
+    LOG(INFO) << "get client session success : " << internal_path;
+    std::string client_path = FLAGS_tera_ins_root_path + kClientsNodePath 
+        + "/" + internal_path;
+    galaxy::ins::sdk::SDKError err;  // filled by Put but not inspected or logged here
+    bool ret = ins_sdk_->Put(client_path, "", &err);
+    if (ret) {
+        *path = internal_path;  // written only on success
+    }
+    return ret;
+}
+
+bool InsClientZkAdapter::IsClientAlive(const std::string& path) {  // alive iff ReadNode on kClientsNodePath/<path> succeeds
+    std::string client_path = kClientsNodePath + "/" + path;
+    std::string value;  // node content is read but not used
+    return ReadNode(client_path, &value);
+}
+
+bool InsClientZkAdapter::ReadNode(const std::string& path, std::string* value) {  // single ins Get of FLAGS_tera_ins_root_path + path; false on failure
+    std::string target_path = FLAGS_tera_ins_root_path + path;
+    galaxy::ins::sdk::SDKError err;
+    if (!ins_sdk_->Get(target_path, value, &err)) {  // no retry loop here, unlike ClientZkAdapter::ReadNode
+        LOG(ERROR) << "ins read " << target_path << " fail: " << err;
+        return false;
+    }
+    return true;
+}
+
 std::string ClusterFinder::MasterAddr(bool update) {
     std::string master_addr;
     if (update || master_addr_ == "") {
@@ -41,6 +205,21 @@ std::string ClusterFinder::MasterAddr(bool update) {
     return master_addr_;
 }
 
+std::string ClusterFinder::TimeoracleAddr(bool update) {  // returns the cached address; re-reads kTimeoracleNodePath when update is set or the cache is empty
+    std::string timeoracle_addr;
+    if (update || timeoracle_addr_ == "") {  // NOTE(review): timeoracle_addr_ is read here without holding mutex_
+        if (!ReadNode(kTimeoracleNodePath, &timeoracle_addr)) {
+            timeoracle_addr = "";  // failed read: the cached value below is left untouched
+        }
+    }
+    if (!timeoracle_addr.empty()) {
+        MutexLock lock(&mutex_);
+        timeoracle_addr_ = timeoracle_addr;
+        LOG(INFO) << "timeoracle addr: " << timeoracle_addr_;
+    }
+    return timeoracle_addr_;  // NOTE(review): read after the lock's scope has ended
+}
+
 std::string ClusterFinder::RootTableAddr(bool update) {
     std::string root_table_addr;
     if (update || root_table_addr_ == "") {
@@ -72,46 +251,54 @@ std::string ClusterFinder::ClusterId() {
 }
 
 ZkClusterFinder::ZkClusterFinder(const std::string& zk_root_path,
-                                 const std::string& zk_addr_list)
-    : zk_root_path_(zk_root_path), zk_addr_list_(zk_addr_list) {
-}
-
-static pthread_once_t zk_init_once = PTHREAD_ONCE_INIT;
-
-static void InitZkLogOnce() {
-    zk::ZooKeeperLightAdapter::SetLibraryLogOutput(FLAGS_tera_zk_lib_log_path);
+                                 const std::string& zk_addr_list,
+                                 ClientZkAdapterBase* zk_adapter)
+    : zk_root_path_(zk_root_path), 
+      zk_addr_list_(zk_addr_list), 
+      zk_adapter_(zk_adapter) {
 }
 
 bool ZkClusterFinder::ReadNode(const std::string& name, std::string* value) {
-    pthread_once(&zk_init_once, InitZkLogOnce);
+    if (zk_adapter_ == NULL) {
+        pthread_once(&zk_init_once, InitZkLogOnce);
 
-    int zk_errno = tera::zk::ZE_OK;
-    zk::ZooKeeperLightAdapter zk_adapter;
-    if (!zk_adapter.Init(zk_addr_list_, zk_root_path_, 1000 * 15, "", &zk_errno)) {
-        LOG(ERROR) << "Init zookeeper fail: " << tera::zk::ZkErrnoToString(zk_errno);
-        return false;
-    }
+        int zk_errno = tera::zk::ZE_OK;
+        zk::ZooKeeperLightAdapter zk_adapter;
+        if (!zk_adapter.Init(zk_addr_list_, zk_root_path_, 1000 * 15, "", &zk_errno)) {
+            LOG(ERROR) << "Init zookeeper fail: " << tera::zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
 
-    if (!zk_adapter.ReadNode(name, value, &zk_errno)) {
-        LOG(ERROR) << "zk read " << name << " fail: " << zk::ZkErrnoToString(zk_errno);
-        return false;
+        if (!zk_adapter.ReadNode(name, value, &zk_errno)) {
+            LOG(ERROR) << "zk read " << name << " fail: " << zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
+        return true;
+    } else {
+        return zk_adapter_->ReadNode(name, value);
     }
-    return true;
 }
 
 InsClusterFinder::InsClusterFinder(const std::string& ins_root_path,
-                                   const std::string& ins_addr_list)
-    : ins_root_path_(ins_root_path), ins_addr_list_(ins_addr_list) {
+                                   const std::string& ins_addr_list,
+                                   ClientZkAdapterBase* zk_adapter)
+    : ins_root_path_(ins_root_path), 
+      ins_addr_list_(ins_addr_list), 
+      zk_adapter_(zk_adapter) {
 }
 
 bool InsClusterFinder::ReadNode(const std::string& name, std::string* value) {
-    galaxy::ins::sdk::InsSDK ins_sdk(ins_addr_list_);
-    galaxy::ins::sdk::SDKError err;
-    if (!ins_sdk.Get(ins_root_path_ + name, value, &err)) {
-        LOG(ERROR) << "ins read " << name << " fail: " << err;
-        return false;
+    if (zk_adapter_ == NULL) {
+        galaxy::ins::sdk::InsSDK ins_sdk(ins_addr_list_);
+        galaxy::ins::sdk::SDKError err;
+        if (!ins_sdk.Get(ins_root_path_ + name, value, &err)) {
+            LOG(ERROR) << "ins read " << name << " fail: " << err;
+            return false;
+        }
+        return true;
+    } else {
+        return zk_adapter_->ReadNode(name, value);
     }
-    return true;
 }
 
 FakeZkClusterFinder::FakeZkClusterFinder(const std::string& fake_zk_path_prefix)
@@ -122,18 +309,84 @@ bool FakeZkClusterFinder::ReadNode(const std::string& name, std::string* value)
     return zk::FakeZkUtil::ReadNode(fake_zk_path_prefix_ + name, value);
 }
 
-ClusterFinder* NewClusterFinder() {
-    if (FLAGS_tera_zk_enabled) {
-        return new sdk::ZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list);
-    } else if (FLAGS_tera_ins_enabled) {
-        return new sdk::InsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list);
-    } else if (FLAGS_tera_mock_zk_enabled) {
+MockTimeoracleClusterFinder::MockTimeoracleClusterFinder(const std::string& mock_root_path) {  // stores the prefix that ReadNode prepends to node paths
+    mock_root_path_ = mock_root_path;
+}
+
+bool MockTimeoracleClusterFinder::ReadNode(const std::string& kpath, std::string* value) {
+    // Reads up to 1024 bytes from the start of file mock_root_path_ + kpath into *value; false on open/read error.
+    std::string path = mock_root_path_ + kpath;
+    int fd = ::open(path.c_str(), O_RDONLY);  // the file is only read, so do not demand write access
+    if (fd < 0) {
+        return false;
+    }
+    value->resize(1024);
+    ssize_t len = ::pread(fd, &(*value)[0], value->size(), 0);  // real buffer length; sizeof(char*) capped the read at pointer size
+    ::close(fd);
+    if (len < 0) {
+        value->clear();  // do not hand back the zero-filled scratch buffer
+        return false;
+    }
+    value->resize(len);  // trim to the number of bytes actually read
+    return true;
+}
+
+ClientZkAdapterBase* NewClientZkAdapter() {  // heap-allocates the adapter selected by flags; NULL when no branch matches
+    if (FLAGS_tera_coord_type.empty()) {  // only the legacy --tera_*_enabled flags can apply: log a migration hint
+        LOG(ERROR) << "Note: We don't recommend that use '--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord"
+                   << " replace by '--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]' flag is usually recommended.";
+    }
+
+    if (FLAGS_tera_coord_type == "zk"  // tera_coord_type decides; a legacy bool flag counts only when it is empty
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) {
+        return new sdk::ClientZkAdapter();
+    } else if (FLAGS_tera_coord_type == "ins"
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) {
+        return new sdk::InsClientZkAdapter();
+    } else if (FLAGS_tera_coord_type == "mock_zk"
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) {
+        return new sdk::MockClientZkAdapter();
+    } else if (FLAGS_tera_coord_type == "mock_ins"
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) {
+        return new sdk::MockInsClientZkAdapter();
+    }
+    return NULL;  // includes "fake_zk": no adapter class is created for it here
+}
+
+ClusterFinder* NewClusterFinder(ClientZkAdapterBase* zk_adapter) {
+    if (FLAGS_tera_coord_type.empty()) {
+        LOG(ERROR) << "Note: We don't recommend that use '--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord"
+                   << " replace by '--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]' flag is usually recommended.";
+    }
+    if (FLAGS_tera_coord_type == "zk" 
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) {
+        return new sdk::ZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list, zk_adapter);
+    } else if (FLAGS_tera_coord_type == "ins" 
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) {
+        return new sdk::InsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list, zk_adapter);
+    } else if (FLAGS_tera_coord_type == "mock_zk" 
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) {
         return new sdk::MockZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list);
-    } else if (FLAGS_tera_mock_ins_enabled) {
+    } else if (FLAGS_tera_coord_type == "mock_ins" 
+            || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) {
         return new sdk::MockInsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list);
-    } else {
+    } else if (FLAGS_tera_coord_type == "fake_zk" 
+            || FLAGS_tera_coord_type.empty()) {
         return new sdk::FakeZkClusterFinder(FLAGS_tera_fake_zk_path_prefix);
     }
+    return nullptr;
+}
+
+ClusterFinder* NewTimeoracleClusterFinder() {  // heap-allocates a finder from the timeoracle-mock / coord-type flags; nullptr if none match
+    if (FLAGS_tera_timeoracle_mock_enabled) {  // the mock flag takes precedence over tera_coord_type
+        return new sdk::MockTimeoracleClusterFinder(FLAGS_tera_timeoracle_mock_root_path);
+    } else if (FLAGS_tera_coord_type == "zk") {
+        return new sdk::ZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list);
+    } else if (FLAGS_tera_coord_type == "ins") {
+        return new sdk::InsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list);
+    }
+
+    return nullptr;  // NOTE(review): legacy --tera_zk_enabled/--tera_ins_enabled are not consulted here, unlike NewClusterFinder
+}
 
 }  // namespace sdk
diff --git a/src/sdk/sdk_zk.h b/src/sdk/sdk_zk.h
index dc199abe6..8ad026ebd 100644
--- a/src/sdk/sdk_zk.h
+++ b/src/sdk/sdk_zk.h
@@ -9,9 +9,95 @@
 #include <string>
 #include <common/mutex.h>
 
+#include "ins_sdk.h"
+#include "zk/zk_adapter.h"
+
+namespace galaxy{
+namespace ins{
+namespace sdk {
+    class InsSDK;
+}
+}
+}
+
 namespace tera {
 namespace sdk {
 
+class ClientZkAdapterBase : public zk::ZooKeeperLightAdapter {  // abstract interface implemented by the zk, ins and mock client adapters
+public:
+    virtual ~ClientZkAdapterBase() {};
+    virtual bool Init() = 0;  // set up the adapter; false on failure
+    virtual bool RegisterClient(std::string* session_str) = 0;  // on success *session_str receives the registered client name
+    virtual bool IsClientAlive(const std::string& path) = 0;  // path is a client name as produced by RegisterClient
+    virtual bool ReadNode(const std::string& path, std::string* value) = 0;  // reads a node's content into *value
+};
+
+class ClientZkAdapter : public ClientZkAdapterBase {  // zookeeper-backed adapter; each operation retries up to FLAGS_tera_zk_retry_max_times
+public:
+    ClientZkAdapter() {}
+    virtual ~ClientZkAdapter() {}
+    virtual bool Init();
+    virtual bool RegisterClient(std::string* session_str);  // creates an ephemeral node under kClientsNodePath
+    virtual bool IsClientAlive(const std::string& path);  // also returns true when the existence check cannot be completed
+    virtual bool ReadNode(const std::string& path, std::string* value);
+private:
+    mutable Mutex mutex_;  // taken in Init()
+};
+
+class MockClientZkAdapter : public ClientZkAdapter {  // stub: every override succeeds and returns a canned value
+public:
+    MockClientZkAdapter(): ClientZkAdapter() {}
+    virtual ~MockClientZkAdapter() {}
+    virtual bool Init() { return true; }  // does not call ClientZkAdapter::Init
+    virtual bool RegisterClient(std::string* session_str) { 
+        *session_str = "localhost";
+        return true;
+    }
+    virtual bool IsClientAlive(const std::string& path) {
+        return true;  // every client is reported alive
+    }
+    virtual bool ReadNode(const std::string& path, std::string* value) {
+        *value = "mock_zk_value";  // same value for every path
+        return true;
+    }
+};
+
+class InsClientZkAdapter : public ClientZkAdapterBase {  // adapter backed by a galaxy::ins::sdk::InsSDK client
+public:
+    InsClientZkAdapter() : ins_sdk_(NULL) {}  // the client is created later, in Init()
+    virtual ~InsClientZkAdapter() {
+        if (ins_sdk_ != NULL) {
+            delete ins_sdk_;
+        }
+    }
+    virtual bool Init ();
+    virtual bool RegisterClient(std::string* session_str);
+    virtual bool IsClientAlive(const std::string& path);
+    virtual bool ReadNode(const std::string& path, std::string* value);  // path is taken relative to FLAGS_tera_ins_root_path
+private:
+    galaxy::ins::sdk::InsSDK* ins_sdk_;  // owned: deleted in the destructor
+};
+
+class MockInsClientZkAdapter : public InsClientZkAdapter {  // stub: every override succeeds and returns a canned value
+public:
+    MockInsClientZkAdapter() : InsClientZkAdapter() {}
+    virtual ~MockInsClientZkAdapter() {}
+    virtual bool Init() { return true; }  // no InsSDK is created, so the base ins_sdk_ stays NULL
+    virtual bool RegisterClient(std::string* session_str) { 
+        *session_str = "localhost";
+        return true;
+    }
+    virtual bool IsClientAlive(const std::string& path) {
+        return true;  // every client is reported alive
+    }
+    virtual bool ReadNode(const std::string& path, std::string* value) {
+        *value = "mock_ins_value";  // same value for every path
+        return true;
+    }
+};
+
+ClientZkAdapterBase* NewClientZkAdapter();
+
 class ClusterFinder
 {
 public:
@@ -19,6 +105,7 @@ class ClusterFinder
     virtual ~ClusterFinder() {}
     std::string MasterAddr(bool update = false);
     std::string RootTableAddr(bool update = false);
+    std::string TimeoracleAddr(bool update = false);
     std::string ClusterId(); // cluster URI: <scheme>://<authority>/<path>
 
 protected:
@@ -30,12 +117,15 @@ class ClusterFinder
 private:
     mutable Mutex mutex_;
     std::string master_addr_;
+    std::string timeoracle_addr_;
     std::string root_table_addr_;
 };
 
 class ZkClusterFinder : public ClusterFinder {
 public:
-    ZkClusterFinder(const std::string& zk_root_path, const std::string& zk_addr_list);
+    ZkClusterFinder(const std::string& zk_root_path,
+                    const std::string& zk_addr_list,
+                    ClientZkAdapterBase* zk_adapter = NULL);
 protected:
     virtual bool ReadNode(const std::string& path, std::string* value);
     virtual std::string Name() { return "zk"; };
@@ -44,6 +134,7 @@ class ZkClusterFinder : public ClusterFinder {
 private:
     std::string zk_root_path_;
     std::string zk_addr_list_;
+    ClientZkAdapterBase* zk_adapter_;
 };
 
 class MockZkClusterFinder : public ZkClusterFinder {
@@ -56,7 +147,9 @@ class MockZkClusterFinder : public ZkClusterFinder {
 
 class InsClusterFinder : public ClusterFinder {
 public:
-    InsClusterFinder(const std::string& ins_root_path, const std::string& ins_addr_list);
+    InsClusterFinder(const std::string& ins_root_path,
+                     const std::string& ins_addr_list,
+                     ClientZkAdapterBase* zk_adapter = NULL);
 protected:
     virtual bool ReadNode(const std::string& path, std::string* value);
     virtual std::string Name() { return "ins"; }
@@ -65,6 +158,7 @@ class InsClusterFinder : public ClusterFinder {
 private:
     std::string ins_root_path_;
     std::string ins_addr_list_;
+    ClientZkAdapterBase* zk_adapter_;
 };
 
 class MockInsClusterFinder : public InsClusterFinder {
@@ -87,7 +181,24 @@ class FakeZkClusterFinder : public ClusterFinder {
     std::string fake_zk_path_prefix_;
 };
 
-ClusterFinder* NewClusterFinder();
+class MockTimeoracleClusterFinder : public ClusterFinder {  // finder whose ReadNode reads local files under mock_root_path_
+public:
+    MockTimeoracleClusterFinder(const std::string& mock_root_path);
+
+protected:
+    virtual bool ReadNode(const std::string& path, std::string* value);  // reads the file at mock_root_path_ + path
+
+    virtual std::string Name() { return "fakezk"; };
+
+    virtual std::string Authority() { return "localhost"; }
+
+    virtual std::string Path() { return mock_root_path_; }
+private:
+    std::string mock_root_path_;  // prefix prepended to every node path
+};
+
+ClusterFinder* NewTimeoracleClusterFinder();
+ClusterFinder* NewClusterFinder(ClientZkAdapterBase* zk_adapter = NULL);
 
 }  // namespace sdk
 }  // namespace tera
diff --git a/src/sdk/single_row_txn.cc b/src/sdk/single_row_txn.cc
index 0d63563e1..d55c31889 100644
--- a/src/sdk/single_row_txn.cc
+++ b/src/sdk/single_row_txn.cc
@@ -3,16 +3,18 @@
 // found in the LICENSE file.
 
 #include <functional>
+#include <memory>
 
 #include "common/thread_pool.h"
 #include "common/base/string_format.h"
 
 #include "io/coding.h"
+#include "sdk/global_txn_internal.h"
 #include "sdk/read_impl.h"
 #include "sdk/single_row_txn.h"
 #include "sdk/table_impl.h"
 #include "types.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 namespace tera {
 
@@ -27,9 +29,12 @@ SingleRowTxn::SingleRowTxn(Table* table, const std::string& row_key,
       reader_max_versions_(1),
       reader_start_timestamp_(kOldestTs),
       reader_end_timestamp_(kLatestTs),
+      start_timestamp_(0),
+      commit_timestamp_(0),
       mutation_buffer_(table, row_key),
       user_commit_callback_(NULL),
       user_commit_context_(NULL) {
+    start_timestamp_ = get_micros();
 }
 
 SingleRowTxn::~SingleRowTxn() {
@@ -185,6 +190,8 @@ void CommitCallbackWrapper(RowMutation* row_mu) {
 
 /// 提交事务
 ErrorCode SingleRowTxn::Commit() {
+    commit_timestamp_ = get_micros();
+    InternalNotify();
     if (mutation_buffer_.MutationNum() > 0) {
         if (user_commit_callback_ != NULL) {
             // use our callback wrapper
@@ -266,6 +273,34 @@ void SingleRowTxn::Serialize(RowMutationSequence* mu_seq) {
     }
 }
 
+void SingleRowTxn::Ack(Table* t,  // adds to this txn a DeleteColumns on the notify cell of (column_family, qualifier)
+                     const std::string& row_key, 
+                     const std::string& column_family, 
+                     const std::string& qualifier) {
+    std::unique_ptr<tera::RowMutation> mutation(t->NewRowMutation(row_key));  // temporary, destroyed on return
+    std::string notify_qulifier = PackNotifyName(column_family, qualifier);
+    mutation->DeleteColumns(kNotifyColumnFamily, notify_qulifier, start_timestamp_);
+    this->ApplyMutation(mutation.get());  // presumably copies the mutation into the txn buffer - TODO confirm
+}
+
+void SingleRowTxn::Notify(Table* t,  // only records the cell; the write itself is issued by InternalNotify()
+                        const std::string& row_key, 
+                        const std::string& column_family, 
+                        const std::string& qualifier) {
+    Cell cell(t, row_key, column_family, qualifier);
+    notify_cells_.push_back(cell);  // consumed by InternalNotify(), which Commit() calls
+}
+
+void SingleRowTxn::InternalNotify() {  // issues one Put into kNotifyColumnFamily per cell recorded by Notify(); called from Commit()
+    for (auto& cell : notify_cells_) {  // by reference: the loop only calls accessors, so skip the per-iteration Cell copy
+        std::unique_ptr<tera::RowMutation> mutation(cell.Table()->NewRowMutation(cell.RowKey()));
+        std::string notify_qulifier = PackNotifyName(cell.ColFamily(), cell.Qualifier());
+        mutation->Put(kNotifyColumnFamily, notify_qulifier, commit_timestamp_);
+        // applied on the cell's own table rather than this txn: a single row transaction may notify different rows
+        cell.Table()->ApplyMutation(mutation.get());
+    }
+}
+
 } // namespace tera
 
 /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/sdk/single_row_txn.h b/src/sdk/single_row_txn.h
index 3a57ea143..96b0fd104 100644
--- a/src/sdk/single_row_txn.h
+++ b/src/sdk/single_row_txn.h
@@ -17,6 +17,7 @@ class ThreadPool;
 namespace tera {
 
 class TableImpl;
+class Cell;
 
 class SingleRowTxn : public Transaction {
 public:
@@ -45,8 +46,33 @@ class SingleRowTxn : public Transaction {
     /// 提交事务
     virtual ErrorCode Commit();
 
-    /// 请忽略此接口
-    virtual int64_t GetStartTimestamp() { abort(); }
+    virtual int64_t GetStartTimestamp() { return start_timestamp_; }
+    
+    virtual int64_t GetCommitTimestamp() { return commit_timestamp_; }
+
+    virtual void Ack(Table* t, 
+                     const std::string& row_key, 
+                     const std::string& column_family, 
+                     const std::string& qualifier);
+
+    virtual void Notify(Table* t,
+                        const std::string& row_key, 
+                        const std::string& column_family, 
+                        const std::string& qualifier);
+
+    // not support
+    virtual void SetIsolation(const IsolationLevel& isolation_level) { abort(); }
+
+    // use default isolation level snapshot 
+    virtual IsolationLevel Isolation() { return IsolationLevel::kSnapshot; }
+
+    virtual void SetTimeout(int64_t timeout_ms) { 
+        mutation_buffer_.SetTimeOut(timeout_ms); 
+    }
+
+    virtual int64_t Timeout() {
+        return mutation_buffer_.TimeOut();
+    }
 
 public:
     /// 内部读操作回调
@@ -61,6 +87,8 @@ class SingleRowTxn : public Transaction {
     bool MarkHasRead();
 
     void MarkNoRead();
+
+    void InternalNotify();
 private:
     Table* table_;
     const std::string row_key_;
@@ -77,10 +105,15 @@ class SingleRowTxn : public Transaction {
     int64_t reader_start_timestamp_;
     int64_t reader_end_timestamp_;
 
+    int64_t start_timestamp_;
+    int64_t commit_timestamp_;
+
     RowMutationImpl mutation_buffer_;
     Callback user_commit_callback_;
     void* user_commit_context_;
 
+    std::vector<Cell> notify_cells_;
+
     mutable Mutex mu_;
 };
 
diff --git a/src/sdk/table_impl.cc b/src/sdk/table_impl.cc
index fc153676a..c87567abd 100644
--- a/src/sdk/table_impl.cc
+++ b/src/sdk/table_impl.cc
@@ -34,7 +34,7 @@
 #include "tera.h"
 #include "utils/crypt.h"
 #include "utils/string_util.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 DECLARE_string(tera_master_meta_table_name);
 DECLARE_int32(tera_sdk_delay_send_internal);
@@ -73,8 +73,6 @@ TableImpl::TableImpl(const std::string& table_name,
       commit_size_(FLAGS_tera_sdk_batch_size),
       write_commit_timeout_(FLAGS_tera_sdk_write_send_interval),
       read_commit_timeout_(FLAGS_tera_sdk_read_send_interval),
-      mutation_batch_seq_(0),
-      reader_batch_seq_(0),
       max_commit_pending_num_(FLAGS_tera_sdk_max_mutation_pending_num),
       max_reader_pending_num_(FLAGS_tera_sdk_max_reader_pending_num),
       meta_cond_(&meta_mutex_),
@@ -126,11 +124,11 @@ void OpStatCallback(Table* table, SdkTask* task) {
     if (task->Type() == SdkTask::MUTATION) {
         ((TableImpl*)table)->StatUserPerfCounter(task->Type(),
                                       ((RowMutationImpl*)task)->GetError().GetType(),
-                                      common::timer::get_micros() - ((RowMutationImpl*)task)->GetStartTime());
+                                      get_micros() - ((RowMutationImpl*)task)->GetStartTime());
     } else if (task->Type() == SdkTask::READ) {
         ((TableImpl*)table)->StatUserPerfCounter(task->Type(),
                                       ((RowReaderImpl*)task)->GetError().GetType(),
-                                      common::timer::get_micros() - ((RowReaderImpl*)task)->GetStartTime());
+                                      get_micros() - ((RowReaderImpl*)task)->GetStartTime());
     }
 }
 
@@ -148,13 +146,15 @@ void TableImpl::ApplyMutation(RowMutation* row_mu) {
         thread_pool_->AddTask(task);
         return;
     }
-    std::vector<RowMutationImpl*> mu_list;
-    mu_list.push_back(static_cast<RowMutationImpl*>(row_mu));
-    DistributeMutations(mu_list, true);
+    std::vector<SdkTask*> task_list;
+    task_list.push_back(static_cast<SdkTask*>((RowMutationImpl*)row_mu));
+    int64_t ts = get_micros();
+    DistributeTasks(task_list, true, SdkTask::MUTATION);
+    perf_counter_.hist_async_cost.Add(get_micros() - ts);
 }
 
 void TableImpl::ApplyMutation(const std::vector<RowMutation*>& row_mutations) {
-    std::vector<RowMutationImpl*> mu_list;
+    std::vector<SdkTask*> task_list;
     for (uint32_t i = 0; i < row_mutations.size(); i++) {
         perf_counter_.user_mu_cnt.Add(1);
         ((RowMutationImpl*)row_mutations[i])->Prepare(OpStatCallback);
@@ -169,9 +169,11 @@ void TableImpl::ApplyMutation(const std::vector<RowMutation*>& row_mutations) {
             thread_pool_->AddTask(task);
             continue;
         }
-        mu_list.push_back(static_cast<RowMutationImpl*>(row_mutations[i]));
+        task_list.push_back(static_cast<SdkTask*>((RowMutationImpl*)row_mutations[i]));
     }
-    DistributeMutations(mu_list, true);
+    int64_t ts = get_micros();
+    DistributeTasks(task_list, true, SdkTask::MUTATION);
+    perf_counter_.hist_async_cost.Add(get_micros() - ts);
 }
 
 bool TableImpl::Put(const std::string& row_key, const std::string& family,
@@ -427,6 +429,7 @@ void TableImpl::CommitScan(ScanTask* scan_task,
     if (impl->GetMaxVersion() != 0) {
         request->set_max_version(impl->GetMaxVersion());
     }
+    request->set_max_qualifiers(impl->GetMaxQualifiers());
     if (impl->GetBufferSize() != 0) {
         request->set_buffer_limit(impl->GetBufferSize());
     }
@@ -450,7 +453,7 @@ void TableImpl::CommitScan(ScanTask* scan_task,
         << ", start_key " << request->start()
         << ", end_key " << request->end()
         << ", scan to " << server_addr;
-    request->set_timestamp(common::timer::get_micros());
+    request->set_timestamp(get_micros());
     std::function<void (ScanTabletRequest*, ScanTabletResponse*, bool, int)> done =
         std::bind(&TableImpl::ScanCallBack, this, scan_task, _1, _2, _3, _4);
     tabletnode_client.ScanTablet(request, response, done);
@@ -460,7 +463,7 @@ void TableImpl::ScanCallBack(ScanTask* scan_task,
                              ScanTabletRequest* request,
                              ScanTabletResponse* response,
                              bool failed, int error_code) {
-    perf_counter_.rpc_s.Add(common::timer::get_micros() - request->timestamp());
+    perf_counter_.rpc_s.Add(get_micros() - request->timestamp());
     perf_counter_.rpc_s_cnt.Inc();
     ResultStreamImpl* stream = scan_task->stream;
 
@@ -548,202 +551,111 @@ bool TableImpl::OpenInternal(ErrorCode* err) {
     return true;
 }
 
-void TableImpl::DistributeMutations(const std::vector<RowMutationImpl*>& mu_list,
-                                    bool called_by_user) {
-    typedef std::map<std::string, std::vector<RowMutationImpl*> > TsMuMap;
-    TsMuMap ts_mu_list;
+void TableImpl::DistributeTasks(const std::vector<SdkTask*>& task_list,
+                                bool called_by_user,
+                                SdkTask::TYPE task_type) {
+    typedef std::map<std::string, std::vector<SdkTask*> > TsTaskMap;
+    TsTaskMap ts_task_list;
     int64_t sync_min_timeout = -1;
-    std::vector<RowMutationImpl*> sync_mu_list;
+    std::vector<SdkTask*> sync_task_list;
+    // pick the counters, pending limit, busy message and timeout handler for this task type
+    int64_t max_pending_counter = 0;
+    Counter* task_cnt = NULL;
+    Counter* pending_counter = NULL;
+    SdkTask::TimeoutFunc timeout_task;
+    std::string err_reason;
+    if (task_type == SdkTask::MUTATION) {
+        task_cnt = &(perf_counter_.mutate_cnt);
+        pending_counter = &(cur_commit_pending_counter_);
+        max_pending_counter = max_commit_pending_num_;
+        err_reason = "pending too much mutations, try it later.";
+        timeout_task = std::bind(&TableImpl::MutationTimeout, this, _1);
+    } else if (task_type == SdkTask::READ) {
+        task_cnt = &(perf_counter_.reader_cnt);
+        pending_counter = &(cur_reader_pending_counter_);
+        max_pending_counter = max_reader_pending_num_;
+        err_reason = "pending too much readers, try it later.";
+        timeout_task = std::bind(&TableImpl::ReaderTimeout, this, _1);
+    } else {
+        CHECK(false) << "unsupported task type: " << task_type;  // unlike assert(0), still fires under NDEBUG
+    }
 
-    // evaluate minimum timeout of sync requests
-    if (called_by_user) {
-        for (uint32_t i = 0; i < mu_list.size(); i++) {
-            RowMutationImpl* row_mutation = (RowMutationImpl*)mu_list[i];
-            if (!row_mutation->IsAsync()) {
-                sync_mu_list.push_back(row_mutation);
-                int64_t row_timeout = row_mutation->TimeOut() > 0 ? row_mutation->TimeOut() : timeout_;
-                if (row_timeout > 0 && (sync_min_timeout <= 0 || sync_min_timeout > row_timeout)) {
-                    sync_min_timeout = row_timeout;
-                }
+    for (uint32_t i = 0; called_by_user && i < task_list.size(); i++) {
+        SdkTask* task = (SdkTask*)task_list[i];
+        if (!task->IsAsync()) {
+            sync_task_list.push_back(task);
+            int64_t task_timeout = task->TimeOut() > 0 ? task->TimeOut() : timeout_;
+            if (task_timeout > 0 && (sync_min_timeout <= 0 || sync_min_timeout > task_timeout)) {
+                sync_min_timeout = task_timeout;
             }
         }
     }
 
-    for (uint32_t i = 0; i < mu_list.size(); i++) {
-        RowMutationImpl* row_mutation = (RowMutationImpl*)mu_list[i];
-        perf_counter_.mutate_cnt.Inc();
+    for (uint32_t i = 0; i < task_list.size(); i++) {
+        SdkTask* task = (SdkTask*)task_list[i];
+        task_cnt->Inc();
         if (called_by_user) {
-            row_mutation->SetId(next_task_id_.Inc());
+            task->SetId(next_task_id_.Inc());
 
-            int64_t row_timeout = -1;
-            if (!row_mutation->IsAsync()) {
-                row_timeout = sync_min_timeout;
+            int64_t task_timeout = -1;
+            if (!task->IsAsync()) {
+                task_timeout = sync_min_timeout;
             } else {
-                row_timeout = row_mutation->TimeOut() > 0 ? row_mutation->TimeOut() : timeout_;
+                task_timeout = task->TimeOut() > 0 ? task->TimeOut() : timeout_;
             }
-            SdkTask::TimeoutFunc task = std::bind(&TableImpl::MutationTimeout, this, _1);
-            task_pool_.PutTask(row_mutation, row_timeout, task);
+            perf_counter_.total_task_cnt.Inc();
+            task_pool_.PutTask(task, task_timeout, timeout_task);
         }
 
         // flow control
         if (called_by_user
-            && cur_commit_pending_counter_.Add(row_mutation->MutationNum()) > max_commit_pending_num_
-            && row_mutation->IsAsync()) {
+            && pending_counter->Inc() > max_pending_counter
+            && task->IsAsync()) {
             if (FLAGS_tera_sdk_async_blocking_enabled) {
-                while (cur_commit_pending_counter_.Get() > max_commit_pending_num_) {
+                while (pending_counter->Get() > max_pending_counter) {
                     usleep(100000);
                 }
             } else {
-                cur_commit_pending_counter_.Sub(row_mutation->MutationNum());
-                row_mutation->SetError(ErrorCode::kBusy, "pending too much mutations, try it later.");
-                ThreadPool::Task task =
-                    std::bind(&TableImpl::BreakRequest, this, row_mutation->GetId());
-                row_mutation->DecRef();
-                thread_pool_->AddTask(task);
+                pending_counter->Dec();
+                task->SetError(ErrorCode::kBusy, err_reason);
+                ThreadPool::Task break_task =
+                    std::bind(&TableImpl::BreakRequest, this, task->GetId());
+                task->DecRef();
+                thread_pool_->AddTask(break_task);
                 continue;
             }
         }
 
         std::string server_addr;
-        if (!GetTabletAddrOrScheduleUpdateMeta(row_mutation->RowKey(),
-                                               row_mutation, &server_addr)) {
+        if (!GetTabletAddrOrScheduleUpdateMeta(task->RowKey(),
+                                               task, &server_addr)) {
+            perf_counter_.meta_sched_cnt.Inc();
             continue;
         }
-
-        ts_mu_list[server_addr].push_back(row_mutation);
+        ts_task_list[server_addr].push_back(task);
     }
 
-    TsMuMap::iterator it = ts_mu_list.begin();
-    for (; it != ts_mu_list.end(); ++it) {
-        PackMutations(it->first, it->second);
+    TsTaskMap::iterator it = ts_task_list.begin();
+    for (; it != ts_task_list.end(); ++it) {
+        PackSdkTasks(it->first, it->second, task_type);
     }
-    // 从现在开始，所有异步的row_mutation都不可以再操作了，因为随时会被用户释放
 
+    // From this point on no asynchronous task (row_mutation / row_reader) may be touched again, because the user may release it at any time.
     // 不是用户调用的，立即返回
     if (!called_by_user) {
         return;
     }
 
     // 等待同步操作返回或超时
-    for (uint32_t i = 0; i < sync_mu_list.size(); i++) {
-        while (cur_commit_pending_counter_.Get() > max_commit_pending_num_) {
+    for (uint32_t i = 0; i < sync_task_list.size(); i++) {
+        while (pending_counter->Get() > max_pending_counter) {
             usleep(100000);
         }
-
-        RowMutationImpl* row_mutation = (RowMutationImpl*)sync_mu_list[i];
-        row_mutation->Wait();
-    }
-}
-
-void TableImpl::DistributeMutationsById(std::vector<int64_t>* mu_id_list) {
-    std::vector<RowMutationImpl*> mu_list;
-    for (uint32_t i = 0; i < mu_id_list->size(); ++i) {
-        int64_t mu_id = (*mu_id_list)[i];
-        SdkTask* task = task_pool_.GetTask(mu_id);
-        if (task == NULL) {
-            VLOG(10) << "mutation " << mu_id << " timeout when retry mutate";;
-            continue;
-        }
-        CHECK_EQ(task->Type(), SdkTask::MUTATION);
-        RowMutationImpl* row_mutation = (RowMutationImpl*)task;
-        mu_list.push_back(row_mutation);
-    }
-    DistributeMutations(mu_list, false);
-    delete mu_id_list;
-}
-
-void TableImpl::PackMutations(const std::string& server_addr,
-                              std::vector<RowMutationImpl*>& mu_list) {
-    MutexLock lock(&mutation_batch_mutex_);
-    TaskBatch* mutation_batch = NULL;
-    bool is_instant = false;
-    for (size_t i = 0; i < mu_list.size(); ++i) {
-        // find existing batch or create a new batch
-        if (mutation_batch == NULL) {
-            std::map<std::string, TaskBatch>::iterator it = mutation_batch_map_.find(server_addr);
-            if (it != mutation_batch_map_.end()) {
-                mutation_batch = &it->second;
-            } else {
-                mutation_batch = &mutation_batch_map_[server_addr];
-                mutation_batch->sequence_num = mutation_batch_seq_++;
-                mutation_batch->row_id_list = new std::vector<int64_t>;
-                ThreadPool::Task task = std::bind(&TableImpl::MutationBatchTimeout, this,
-                                                  server_addr, mutation_batch->sequence_num);
-                int64_t timer_id = thread_pool_->DelayTask(write_commit_timeout_, task);
-                mutation_batch->timer_id = timer_id;
-                mutation_batch->byte_size = 0;
-            }
-        }
-
-        // put mutation into the batch
-        RowMutationImpl* row_mutation = mu_list[i];
-        mutation_batch->row_id_list->push_back(row_mutation->GetId());
-        mutation_batch->byte_size += row_mutation->Size();
-        is_instant |= !row_mutation->IsAsync();
-        row_mutation->DecRef();
-
-        // commit the batch if:
-        // 1) batch_byte_size >= max_rpc_byte_size
-        // for the *LAST* batch, commit it if:
-        // 2) any mutation is sync (flush == true)
-        // 3) batch_row_num >= min_batch_row_num
-        if (mutation_batch->byte_size >= kMaxRpcSize ||
-            (i == mu_list.size() - 1 &&
-             (is_instant || mutation_batch->row_id_list->size() >= commit_size_))) {
-            std::vector<int64_t>* mu_id_list = mutation_batch->row_id_list;
-            uint64_t timer_id = mutation_batch->timer_id;
-            const bool non_block_cancel = true;
-            bool is_running = false;
-            if (!thread_pool_->CancelTask(timer_id, non_block_cancel, &is_running)) {
-                CHECK(is_running); // this delay task must be waiting for mutation_batch_mutex_
-            }
-            mutation_batch_map_.erase(server_addr);
-            mutation_batch_mutex_.Unlock();
-            CommitMutationsById(server_addr, *mu_id_list);
-            delete mu_id_list;
-            mutation_batch = NULL;
-            is_instant = false;
-            mutation_batch_mutex_.Lock();
-        }
+        SdkTask* task = (SdkTask*)sync_task_list[i];
+        task->Wait();
     }
 }
 
-void TableImpl::MutationBatchTimeout(std::string server_addr, uint64_t batch_seq) {
-    std::vector<int64_t>* mu_id_list = NULL;
-    {
-        MutexLock lock(&mutation_batch_mutex_);
-        std::map<std::string, TaskBatch>::iterator it =
-            mutation_batch_map_.find(server_addr);
-        if (it == mutation_batch_map_.end()) {
-            return;
-        }
-        TaskBatch* mutation_batch = &it->second;
-        if (mutation_batch->sequence_num != batch_seq) {
-            return;
-        }
-        mu_id_list = mutation_batch->row_id_list;
-        mutation_batch_map_.erase(it);
-    }
-    CommitMutationsById(server_addr, *mu_id_list);
-    delete mu_id_list;
-}
-
-void TableImpl::CommitMutationsById(const std::string& server_addr,
-                                    std::vector<int64_t>& mu_id_list) {
-    std::vector<RowMutationImpl*> mu_list;
-    for (size_t i = 0; i < mu_id_list.size(); i++) {
-        int64_t mu_id = mu_id_list[i];
-        SdkTask* task = task_pool_.GetTask(mu_id);
-        if (task == NULL) {
-            VLOG(10) << "mutation " << mu_id << " timeout";
-            continue;
-        }
-        CHECK_EQ(task->Type(), SdkTask::MUTATION);
-        mu_list.push_back((RowMutationImpl*)task);
-    }
-    CommitMutations(server_addr, mu_list);
-}
-
 void TableImpl::CommitMutations(const std::string& server_addr,
                                 std::vector<RowMutationImpl*>& mu_list) {
     tabletnode::TabletNodeClient tabletnode_client_async(server_addr);
@@ -776,7 +688,7 @@ void TableImpl::CommitMutations(const std::string& server_addr,
     request->set_is_instant(is_instant);
 
     VLOG(20) << "commit " << mu_list.size() << " mutations to " << server_addr;
-    request->set_timestamp(common::timer::get_micros());
+    request->set_timestamp(get_micros());
     std::function<void (WriteTabletRequest*, WriteTabletResponse*, bool, int)> done =
         std::bind(&TableImpl::MutateCallBack, this, mu_id_list, _1, _2, _3, _4);
     tabletnode_client_async.WriteTablet(request, response, done);
@@ -786,7 +698,7 @@ void TableImpl::MutateCallBack(std::vector<int64_t>* mu_id_list,
                                WriteTabletRequest* request,
                                WriteTabletResponse* response,
                                bool failed, int error_code) {
-    perf_counter_.rpc_w.Add(common::timer::get_micros() - request->timestamp());
+    perf_counter_.rpc_w.Add(get_micros() - request->timestamp());
     perf_counter_.rpc_w_cnt.Inc();
     if (failed) {
         if (error_code == sofa::pbrpc::RPC_ERROR_SERVER_SHUTDOWN ||
@@ -807,7 +719,7 @@ void TableImpl::MutateCallBack(std::vector<int64_t>* mu_id_list,
     }
 
     std::map<uint32_t, std::vector<int64_t>* > retry_times_list;
-    std::vector<RowMutationImpl*> not_in_range_list;
+    std::vector<SdkTask*> not_in_range_list;
     for (uint32_t i = 0; i < mu_id_list->size(); ++i) {
         const std::string& row = request->row_list(i).row_key();
         int64_t mu_id = (*mu_id_list)[i];
@@ -835,10 +747,10 @@ void TableImpl::MutateCallBack(std::vector<int64_t>* mu_id_list,
             }
 
             // only for flow control
-            cur_commit_pending_counter_.Sub(row_mutation->MutationNum());
-            int64_t perf_time = common::timer::get_micros();
+            cur_commit_pending_counter_.Dec();
+            int64_t perf_time = get_micros();
             row_mutation->RunCallback();
-            perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time);
+            perf_counter_.user_callback.Add(get_micros() - perf_time);
             perf_counter_.user_callback_cnt.Inc();
             continue;
         }
@@ -860,7 +772,7 @@ void TableImpl::MutateCallBack(std::vector<int64_t>* mu_id_list,
         if (err == kKeyNotInRange) {
             perf_counter_.mutate_range_cnt.Inc();
             row_mutation->IncRetryTimes();
-            not_in_range_list.push_back(row_mutation);
+            not_in_range_list.push_back(task);
         } else {
             row_mutation->IncRetryTimes();
             std::vector<int64_t>* retry_mu_id_list = NULL;
@@ -878,7 +790,7 @@ void TableImpl::MutateCallBack(std::vector<int64_t>* mu_id_list,
     }
 
     if (not_in_range_list.size() > 0) {
-        DistributeMutations(not_in_range_list, false);
+        DistributeTasks(not_in_range_list, false, SdkTask::MUTATION);
     }
     std::map<uint32_t, std::vector<int64_t>* >::iterator it;
     for (it = retry_times_list.begin(); it != retry_times_list.end(); ++it) {
@@ -894,6 +806,22 @@ void TableImpl::MutateCallBack(std::vector<int64_t>* mu_id_list,
     delete mu_id_list;
 }
 
+// Re-dispatches every mutation in mu_id_list that task_pool_ still returns, then deletes the list.
+void TableImpl::DistributeMutationsById(std::vector<int64_t>* mu_id_list) {
+    std::vector<SdkTask*> task_list;
+    for (uint32_t i = 0; i < mu_id_list->size(); ++i) {
+        SdkTask* task = task_pool_.GetTask((*mu_id_list)[i]);
+        if (task != NULL) {
+            CHECK_EQ(task->Type(), SdkTask::MUTATION);
+            task_list.push_back(task);
+        } else {
+            VLOG(10) << "mutation " << (*mu_id_list)[i] << " timeout when retry mutate";
+        }
+    }
+    DistributeTasks(task_list, false, SdkTask::MUTATION);
+    delete mu_id_list;
+}
+
 void TableImpl::MutationTimeout(SdkTask* task) {
     perf_counter_.mutate_timeout_cnt.Inc();
     CHECK_NOTNULL(task);
@@ -907,199 +835,33 @@ void TableImpl::MutationTimeout(SdkTask* task) {
         ScheduleUpdateMeta(row_mutation->RowKey(),
                            row_mutation->GetMetaTimeStamp());
     }
+
+    std::string err_reason;
     if (row_mutation->RetryTimes() == 0) {
         perf_counter_.mutate_queue_timeout_cnt.Inc();
-        std::string err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.",
-                                              row_mutation->GetCommitTimes(), timeout_);
-        row_mutation->SetError(ErrorCode::kTimeout, err_reason);
+        err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.",
+                                  row_mutation->GetCommitTimes(), timeout_);
     } else {
-        std::string err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. last error: %s",
-                                              row_mutation->GetCommitTimes(), row_mutation->RetryTimes(),
-                                              timeout_, StatusCodeToString(err).c_str());
-        row_mutation->SetError(ErrorCode::kSystem, err_reason);
+        err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. last error: %s",
+                                  row_mutation->GetCommitTimes(), row_mutation->RetryTimes(),
+                                  timeout_, StatusCodeToString(err).c_str());
     }
+    row_mutation->SetError(ErrorCode::kTimeout, err_reason);
     // only for flow control
-    cur_commit_pending_counter_.Sub(row_mutation->MutationNum());
-    int64_t perf_time = common::timer::get_micros();
+    cur_commit_pending_counter_.Dec();
+    int64_t perf_time = get_micros();
     row_mutation->RunCallback();
-    perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time);
+    perf_counter_.user_callback.Add(get_micros() - perf_time);
     perf_counter_.user_callback_cnt.Inc();
 }
 
-bool TableImpl::GetTabletLocation(std::vector<TabletInfo>* tablets,
-                                  ErrorCode* err) {
-    return false;
-}
-
-bool TableImpl::GetDescriptor(TableDescriptor* desc, ErrorCode* err) {
-    return false;
-}
-
 void TableImpl::DistributeReaders(const std::vector<RowReaderImpl*>& row_reader_list,
                                   bool called_by_user) {
-    typedef std::map<std::string, std::vector<RowReaderImpl*> > TsReaderMap;
-    TsReaderMap ts_reader_list;
-
-    int64_t sync_min_timeout = -1;
-    std::vector<RowReaderImpl*> sync_reader_list;
-
-    if (called_by_user) {
-        for (uint32_t i = 0; i < row_reader_list.size(); i++) {
-            RowReaderImpl* row_reader = (RowReaderImpl*)row_reader_list[i];
-            if (row_reader->IsAsync()) {
-                continue;
-            }
-            sync_reader_list.push_back(row_reader);
-            int64_t row_timeout = row_reader->TimeOut() > 0 ? row_reader->TimeOut() : timeout_;
-            if (row_timeout > 0 && (sync_min_timeout <= 0 || sync_min_timeout > row_timeout)) {
-                sync_min_timeout = row_timeout;
-            }
-        }
-    }
-
-    for (uint32_t i = 0; i < row_reader_list.size(); i++) {
-        perf_counter_.reader_cnt.Inc();
-        RowReaderImpl* row_reader = (RowReaderImpl*)row_reader_list[i];
-        if (called_by_user) {
-            row_reader->SetId(next_task_id_.Inc());
-
-            int64_t row_timeout = sync_min_timeout;
-            if (row_reader->IsAsync()) {
-                row_timeout = row_reader->TimeOut() > 0 ? row_reader->TimeOut() : timeout_;
-            }
-            SdkTask::TimeoutFunc task = std::bind(&TableImpl::ReaderTimeout, this, _1);
-            task_pool_.PutTask(row_reader, row_timeout, task);
-        }
-
-        // flow control
-        if (called_by_user
-            && cur_reader_pending_counter_.Inc() > max_reader_pending_num_
-            && row_reader->IsAsync()) {
-            if (FLAGS_tera_sdk_async_blocking_enabled) {
-                while (cur_reader_pending_counter_.Get() > max_reader_pending_num_) {
-                    usleep(100000);
-                }
-            } else {
-                cur_reader_pending_counter_.Dec();
-                row_reader->SetError(ErrorCode::kBusy, "pending too much readers, try it later.");
-                ThreadPool::Task task =
-                    std::bind(&TableImpl::BreakRequest, this, row_reader->GetId());
-                row_reader->DecRef();
-                thread_pool_->AddTask(task);
-                continue;
-            }
-        }
-
-        std::string server_addr;
-        if (!GetTabletAddrOrScheduleUpdateMeta(row_reader->RowName(), row_reader,
-                                               &server_addr)) {
-            continue;
-        }
-
-        std::vector<RowReaderImpl*>& ts_row_readers = ts_reader_list[server_addr];
-        ts_row_readers.push_back(row_reader);
-    }
-
-    TsReaderMap::iterator it = ts_reader_list.begin();
-    for (; it != ts_reader_list.end(); ++it) {
-        std::vector<RowReaderImpl*>& reader_list = it->second;
-        PackReaders(it->first, reader_list);
-    }
-    // 从现在开始，所有异步的row_reader都不可以再操作了，因为随时会被用户释放
-
-    // 不是用户调用的，立即返回
-    if (!called_by_user) {
-        return;
-    }
-
-    // 等待同步操作返回或超时
-    for (uint32_t i = 0; i < sync_reader_list.size(); i++) {
-        while (cur_reader_pending_counter_.Get() > max_reader_pending_num_) {
-            usleep(100000);
-        }
-
-        RowReaderImpl* row_reader = (RowReaderImpl*)sync_reader_list[i];
-        row_reader->Wait();
+    // readers go through the shared dispatcher, so hand them over as SdkTask pointers
+    std::vector<SdkTask*> task_list;
+    task_list.reserve(row_reader_list.size());
+    task_list.insert(task_list.end(), row_reader_list.begin(), row_reader_list.end());
-}
-
-void TableImpl::PackReaders(const std::string& server_addr,
-                            std::vector<RowReaderImpl*>& reader_list) {
-    MutexLock lock(&reader_batch_mutex_);
-    TaskBatch* reader_buffer = NULL;
-    std::map<std::string, TaskBatch>::iterator it = reader_batch_map_.find(server_addr);
-    if (it != reader_batch_map_.end()) {
-        reader_buffer = &it->second;
-    } else {
-        reader_buffer = &reader_batch_map_[server_addr];
-        reader_buffer->sequence_num = reader_batch_seq_++;
-        reader_buffer->row_id_list = new std::vector<int64_t>;
-        ThreadPool::Task task = std::bind(&TableImpl::ReaderBatchTimeout, this,
-                                          server_addr, reader_buffer->sequence_num);
-        uint64_t timer_id = thread_pool_->DelayTask(read_commit_timeout_, task);
-        reader_buffer->timer_id = timer_id;
-    }
-
-    bool is_instant = false;
-    for (size_t i = 0; i < reader_list.size(); ++i) {
-        RowReaderImpl* reader = reader_list[i];
-        reader_buffer->row_id_list->push_back(reader->GetId());
-        is_instant |= !reader->IsAsync();
-        reader->DecRef();
-    }
-
-    if (reader_buffer->row_id_list->size() >= commit_size_ || is_instant) {
-        std::vector<int64_t>* reader_id_list = reader_buffer->row_id_list;
-        uint64_t timer_id = reader_buffer->timer_id;
-        const bool non_block_cancel = true;
-        bool is_running = false;
-        if (!thread_pool_->CancelTask(timer_id, non_block_cancel, &is_running)) {
-            CHECK(is_running); // this delay task must be waiting for reader_batch_map_
-        }
-        reader_batch_map_.erase(server_addr);
-        reader_batch_mutex_.Unlock();
-        CommitReadersById(server_addr, *reader_id_list);
-        delete reader_id_list;
-        reader_buffer = NULL;
-        reader_batch_mutex_.Lock();
-    }
-}
-
-void TableImpl::ReaderBatchTimeout(std::string server_addr, uint64_t batch_seq) {
-    std::vector<int64_t>* reader_id_list = NULL;
-    {
-        MutexLock lock(&reader_batch_mutex_);
-        std::map<std::string, TaskBatch>::iterator it =
-            reader_batch_map_.find(server_addr);
-        if (it == reader_batch_map_.end()) {
-            return;
-        }
-        TaskBatch* reader_buffer = &it->second;
-        if (reader_buffer->sequence_num != batch_seq) {
-            return;
-        }
-        reader_id_list = reader_buffer->row_id_list;
-        reader_batch_map_.erase(it);
-    }
-    CommitReadersById(server_addr, *reader_id_list);
-    delete reader_id_list;
-}
-
-void TableImpl::CommitReadersById(const std::string server_addr,
-                                  std::vector<int64_t>& reader_id_list) {
-    std::vector<RowReaderImpl*> reader_list;
-    for (size_t i = 0; i < reader_id_list.size(); ++i) {
-        int64_t reader_id = reader_id_list[i];
-        SdkTask* task = task_pool_.GetTask(reader_id);
-        if (task == NULL) {
-            VLOG(10) << "reader " << reader_id << " timeout when commit read";;
-            continue;
-        }
-        CHECK_EQ(task->Type(), SdkTask::READ);
-        RowReaderImpl* reader = (RowReaderImpl*)task;
-        reader_list.push_back(reader);
-    }
-    CommitReaders(server_addr, reader_list);
+    DistributeTasks(task_list, called_by_user, SdkTask::READ);
 }
 
 void TableImpl::CommitReaders(const std::string server_addr,
@@ -1122,7 +884,7 @@ void TableImpl::CommitReaders(const std::string server_addr,
         row_reader->DecRef();
     }
     VLOG(20) << "commit " << reader_list.size() << " reads to " << server_addr;
-    request->set_timestamp(common::timer::get_micros());
+    request->set_timestamp(get_micros());
     std::function<void (ReadTabletRequest*, ReadTabletResponse*, bool, int)> done =
         std::bind(&TableImpl::ReaderCallBack, this, reader_id_list, _1, _2, _3, _4);
     tabletnode_client_async.ReadTablet(request, response, done);
@@ -1132,7 +894,7 @@ void TableImpl::ReaderCallBack(std::vector<int64_t>* reader_id_list,
                                ReadTabletRequest* request,
                                ReadTabletResponse* response,
                                bool failed, int error_code) {
-    perf_counter_.rpc_r.Add(common::timer::get_micros() - request->timestamp());
+    perf_counter_.rpc_r.Add(get_micros() - request->timestamp());
     perf_counter_.rpc_r_cnt.Inc();
     if (failed) {
         if (error_code == sofa::pbrpc::RPC_ERROR_SERVER_SHUTDOWN ||
@@ -1185,9 +947,9 @@ void TableImpl::ReaderCallBack(std::vector<int64_t>* reader_id_list,
             } else { // err == kSnapshotNotExist
                 row_reader->SetError(ErrorCode::kNotFound, "snapshot not found");
             }
-            int64_t perf_time = common::timer::get_micros();
+            int64_t perf_time = get_micros();
             row_reader->RunCallback();
-            perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time);
+            perf_counter_.user_callback.Add(get_micros() - perf_time);
             perf_counter_.user_callback_cnt.Inc();
             // only for flow control
             cur_reader_pending_counter_.Dec();
@@ -1273,25 +1035,161 @@ void TableImpl::ReaderTimeout(SdkTask* task) {
         ScheduleUpdateMeta(row_reader->RowName(),
                            row_reader->GetMetaTimeStamp());
     }
+
+    std::string err_reason;
     if (row_reader->RetryTimes() == 0) {
         perf_counter_.reader_queue_timeout_cnt.Inc();
-        std::string err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.",
-                                              row_reader->GetCommitTimes(), timeout_);
-        row_reader->SetError(ErrorCode::kTimeout, err_reason);
+        err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.",
+                                  row_reader->GetCommitTimes(), timeout_);
     } else {
-        std::string err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. last error: %s",
-                                              row_reader->GetCommitTimes(),  row_reader->RetryTimes(),
-                                              timeout_, StatusCodeToString(err).c_str());
-        row_reader->SetError(ErrorCode::kSystem, err_reason);
+        err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. last error: %s",
+                                  row_reader->GetCommitTimes(),  row_reader->RetryTimes(),
+                                  timeout_, StatusCodeToString(err).c_str());
     }
-    int64_t perf_time = common::timer::get_micros();
+    row_reader->SetError(ErrorCode::kTimeout, err_reason);
+    int64_t perf_time = get_micros();
     row_reader->RunCallback();
-    perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time);
+    perf_counter_.user_callback.Add(get_micros() - perf_time);
     perf_counter_.user_callback_cnt.Inc();
     // only for flow control
     cur_reader_pending_counter_.Dec();
 }
 
+// Appends task_list to the pending batch for server_addr (created on demand) and commits the batch
+// once it reaches kMaxRpcSize or, at the last task, when a sync task was packed or it holds >= commit_size_ ids.
+void TableImpl::PackSdkTasks(const std::string& server_addr,
+                             std::vector<SdkTask*>& task_list,
+                             SdkTask::TYPE task_type) {
+    Mutex* mutex = NULL;
+    std::map<std::string, TaskBatch*>* task_batch_map = NULL;
+    uint64_t commit_timeout = 10000;
+    uint32_t commit_size = commit_size_;
+    if (task_type == SdkTask::MUTATION) {
+        mutex = &mutation_batch_mutex_;
+        task_batch_map = &mutation_batch_map_;
+        commit_timeout = write_commit_timeout_;
+    } else if (task_type == SdkTask::READ) {
+        mutex = &reader_batch_mutex_;
+        task_batch_map = &reader_batch_map_;
+        commit_timeout = read_commit_timeout_;
+    } else {
+        CHECK(false) << "unsupported task type: " << task_type;  // unlike assert(0), still fires under NDEBUG
+    }
+
+    TaskBatch* task_batch = NULL;
+    bool is_instant = false;
+    MutexLock lock(mutex);
+    for (size_t i = 0; i < task_list.size(); ++i) {
+        // find existing batch or create a new batch
+        if (task_batch == NULL) {
+            std::map<std::string, TaskBatch*>::iterator it = task_batch_map->find(server_addr);
+            if (it != task_batch_map->end()) {
+                task_batch = it->second;
+            } else {
+                task_batch = new TaskBatch;
+                task_batch->type = task_type;
+                task_batch->mutex = mutex;
+                task_batch->task_batch_map = task_batch_map;
+                task_batch->byte_size = 0;
+                task_batch->server_addr = server_addr;
+                task_batch->row_id_list = new std::vector<int64_t>;
+                // publish the batch in the map and register TaskBatchTimeout as its timeout handler
+                task_batch->SetId(next_task_id_.Inc());
+                (*task_batch_map)[server_addr] = task_batch;
+                SdkTask::TimeoutFunc batch_timeout = std::bind(&TableImpl::TaskBatchTimeout, this, _1);
+                task_pool_.PutTask(task_batch, commit_timeout, batch_timeout);
+                task_batch->DecRef();
+            }
+        }
+
+        // put task into the batch
+        SdkTask* sdk_task = task_list[i];
+        task_batch->row_id_list->push_back(sdk_task->GetId());
+        task_batch->byte_size += sdk_task->Size();
+        is_instant |= !sdk_task->IsAsync();
+        sdk_task->DecRef();
+
+        // commit the batch if:
+        // 1) batch_byte_size >= max_rpc_byte_size
+        // for the *LAST* batch, commit it if:
+        // 2) any mutation is sync (flush == true)
+        // 3) batch_row_num >= min_batch_row_num
+        // 4) commit timeout (not checked here; TaskBatchTimeout commits the batch in that case)
+        if (task_batch->byte_size >= kMaxRpcSize ||
+            ((i == task_list.size() - 1) &&
+             (is_instant || (task_batch->row_id_list->size() >= commit_size)))) {
+            std::vector<int64_t>* task_id_list = task_batch->row_id_list;
+            task_batch->row_id_list = NULL;
+            task_batch_map->erase(server_addr);
+            mutex->Unlock();
+            // the commit runs with the batch lock released; it is re-taken before the next task
+            CommitTasksById(server_addr, *task_id_list, task_type);
+            delete task_id_list;
+            task_batch = NULL;
+            is_instant = false;
+            mutex->Lock();
+        }
+    }
+}
+
+// Timeout handler for a TaskBatch: if the batch is still the one registered for its server in the
+// batch map, detach its id list and commit it; the batch object itself is always deleted here.
+void TableImpl::TaskBatchTimeout(SdkTask* task) {
+    CHECK_NOTNULL(task);
+    CHECK_EQ(task->Type(), SdkTask::TASKBATCH);
+    TaskBatch* batch = (TaskBatch*)task;
+    batch->ExcludeOtherRef();  // NOTE(review): presumably waits out other reference holders - confirm
+
+    const std::string& addr = batch->server_addr;
+    const SdkTask::TYPE payload_type = batch->type;
+    std::map<std::string, TaskBatch*>* batch_map = batch->task_batch_map;
+    std::vector<int64_t>* pending_ids = NULL;
+    {
+        MutexLock lock(batch->mutex);
+        std::map<std::string, TaskBatch*>::iterator pos = batch_map->find(addr);
+        // the entry for this address may be a different batch, hence the id comparison
+        if (pos != batch_map->end() && batch->GetId() == pos->second->GetId()) {
+            pending_ids = batch->row_id_list;
+            batch->row_id_list = NULL;
+            batch_map->erase(pos);
+        }
+    }
+
+    if (pending_ids != NULL) {
+        CommitTasksById(addr, *pending_ids, payload_type);
+        delete pending_ids;
+    }
+    delete batch;
+}
+
+// Looks up each id in task_pool_ and forwards the tasks found to CommitMutations or CommitReaders
+// for server_addr, chosen by task_type; ids that GetTask no longer returns are logged and skipped.
+void TableImpl::CommitTasksById(const std::string& server_addr,
+                                std::vector<int64_t>& task_id_list,
+                                SdkTask::TYPE task_type) {
+    std::vector<RowMutationImpl*> mutations;
+    std::vector<RowReaderImpl*> readers;
+    for (size_t idx = 0; idx < task_id_list.size(); ++idx) {
+        SdkTask* found = task_pool_.GetTask(task_id_list[idx]);
+        if (found == NULL) {
+            VLOG(10) << "commit task, type " << task_type << ", id " << task_id_list[idx] << " timeout";
+            continue;
+        }
+        perf_counter_.total_commit_cnt.Inc();
+        CHECK_EQ(found->Type(), task_type);
+        if (task_type == SdkTask::MUTATION) {
+            mutations.push_back((RowMutationImpl*)found);
+        } else if (task_type == SdkTask::READ) {
+            readers.push_back((RowReaderImpl*)found);
+        }
+    }
+    if (task_type == SdkTask::MUTATION) {
+        CommitMutations(server_addr, mutations);
+    } else if (task_type == SdkTask::READ) {
+        CommitReaders(server_addr, readers);
+    }
+}
+
 bool TableImpl::GetTabletMetaForKey(const std::string& key, TabletMeta* meta) {
     MutexLock lock(&meta_mutex_);
     TabletMetaNode* node = GetTabletMetaNodeForKey(key);
@@ -1486,7 +1384,7 @@ void TableImpl::ScanMetaTableAsync(const std::string& key_start, const std::stri
 
     std::function<void (ScanTabletRequest*, ScanTabletResponse*, bool, int)> done =
         std::bind(&TableImpl::ScanMetaTableCallBack, this, key_start, key_end,
-                  expand_key_end, ::common::timer::get_micros(), _1, _2, _3, _4);
+                  expand_key_end, get_micros(), _1, _2, _3, _4);
     tabletnode_client_async.ScanTablet(request, response, done);
 }
 
@@ -1497,7 +1395,7 @@ void TableImpl::ScanMetaTableCallBack(std::string key_start,
                                       ScanTabletRequest* request,
                                       ScanTabletResponse* response,
                                       bool failed, int error_code) {
-    perf_counter_.get_meta.Add(::common::timer::get_micros() - start_time);
+    perf_counter_.get_meta.Add(get_micros() - start_time);
     perf_counter_.get_meta_cnt.Inc();
     if (failed) {
         if (error_code == sofa::pbrpc::RPC_ERROR_SERVER_SHUTDOWN ||
@@ -1699,8 +1597,8 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) {
     const std::string& server_addr = node.meta.server_addr();
     int64_t meta_timestamp = node.update_time;
 
-    std::vector<RowMutationImpl*> mutation_list;
-    std::vector<RowReaderImpl*> reader_list;
+    std::vector<SdkTask*> mutation_list;
+    std::vector<SdkTask*> reader_list;
 
     std::map<std::string, std::list<int64_t> >::iterator it =
         pending_task_id_list_.lower_bound(start_key);
@@ -1711,6 +1609,7 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) {
         std::list<int64_t>& task_id_list = it->second;
         for (std::list<int64_t>::iterator itask = task_id_list.begin();
                 itask != task_id_list.end(); ++itask) {
+            perf_counter_.meta_update_cnt.Inc();
             int64_t task_id = *itask;
             SdkTask* task = task_pool_.GetTask(task_id);
             if (task == NULL) {
@@ -1721,12 +1620,10 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) {
 
             switch (task->Type()) {
             case SdkTask::READ: {
-                RowReaderImpl* reader = (RowReaderImpl*)task;
-                reader_list.push_back(reader);
+                reader_list.push_back(task);
             } break;
             case SdkTask::MUTATION: {
-                RowMutationImpl* mutation = (RowMutationImpl*)task;
-                mutation_list.push_back(mutation);
+                mutation_list.push_back(task);
             } break;
             case SdkTask::SCAN: {
                 ScanTask* scan_task = (ScanTask*)task;
@@ -1743,10 +1640,10 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) {
     }
 
     if (mutation_list.size() > 0) {
-        PackMutations(server_addr, mutation_list);
+        PackSdkTasks(server_addr, mutation_list, SdkTask::MUTATION);
     }
     if (reader_list.size() > 0) {
-        PackReaders(server_addr, reader_list);
+        PackSdkTasks(server_addr, reader_list, SdkTask::READ);
     }
 }
 
@@ -2068,6 +1965,19 @@ void TableImpl::PerfCounter::DoDumpPerfCounterLog(const std::string& log_prefix)
         << " cost_90: " << hist_read_cost.Percentile(90)
         << " cost_99: " << hist_read_cost.Percentile(99);
     hist_read_cost.Clear();
+
+    LOG(INFO) << log_prefix << "[hist_async_cost]"
+        << " cost_ave: " << hist_async_cost.Average()
+        << " cost_50: " << hist_async_cost.Percentile(50)
+        << " cost_90: " << hist_async_cost.Percentile(90)
+        << " cost_99: " << hist_async_cost.Percentile(99);
+    hist_async_cost.Clear();
+
+    LOG(INFO) << log_prefix << "[total]"
+        << " meta_sched_cnt: " << meta_sched_cnt.Get()
+        << " meta_update_cnt: " << meta_update_cnt.Get()
+        << " total_task_cnt: " << total_task_cnt.Get()
+        << " total_commit_cnt: " << total_commit_cnt.Get();
 }
 
 void TableImpl::DelayTaskWrapper(ThreadPool::Task task, int64_t task_id) {
@@ -2148,6 +2058,15 @@ void TableImpl::StatUserPerfCounter(enum SdkTask::TYPE op, ErrorCode::ErrorCodeT
     }
 }
 
+bool TableImpl::GetTabletLocation(std::vector<TabletInfo>* tablets,
+                                  ErrorCode* err) {
+    return false;  // stub: always reports failure; neither tablets nor err is written
+}
+
+bool TableImpl::GetDescriptor(TableDescriptor* desc, ErrorCode* err) {
+    return false;  // stub: always reports failure; neither desc nor err is written
+}
+
 /// 创建事务
 Transaction* TableImpl::StartRowTransaction(const std::string& row_key) {
     return new SingleRowTxn((Table*)this, row_key, thread_pool_);
diff --git a/src/sdk/table_impl.h b/src/sdk/table_impl.h
index 088a2c206..6e0986b62 100644
--- a/src/sdk/table_impl.h
+++ b/src/sdk/table_impl.h
@@ -16,7 +16,7 @@
 #include "sdk/sdk_task.h"
 #include "sdk/sdk_zk.h"
 #include "tera.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 
 namespace tera {
 
@@ -261,10 +261,16 @@ class TableImpl : public Table {
         Counter user_read_fail;
         ::leveldb::Histogram hist_read_cost;
 
+        ::leveldb::Histogram hist_async_cost;
+        Counter meta_sched_cnt;
+        Counter meta_update_cnt;
+        Counter total_task_cnt;
+        Counter total_commit_cnt;
+
         void DoDumpPerfCounterLog(const std::string& log_prefix);
 
         PerfCounter() {
-            start_time = common::timer::get_micros();
+            start_time = get_micros();
         }
     };
 private:
@@ -274,22 +280,13 @@ class TableImpl : public Table {
                         std::vector<KeyValuePair>* kv_list,
                         ErrorCode* err);
 
-    // 将一批mutation根据rowkey分配给各个TS
-    void DistributeMutations(const std::vector<RowMutationImpl*>& mu_list,
-                            bool called_by_user);
+    void DistributeTasks(const std::vector<SdkTask*>& task_list,
+                         bool called_by_user,
+                         SdkTask::TYPE task_type);
 
     void DistributeMutationsById(std::vector<int64_t>* retry_mu_id_list);
 
-    // 分配完成后将mutation打包
-    void PackMutations(const std::string& server_addr,
-                       std::vector<RowMutationImpl*>& mu_list);
-
-    // mutation打包不满但到达最大等待时间
-    void MutationBatchTimeout(std::string server_addr, uint64_t batch_seq);
-
     // 通过异步RPC将mutation提交至TS
-    void CommitMutationsById(const std::string& server_addr,
-                             std::vector<int64_t>& mu_id_list);
     void CommitMutations(const std::string& server_addr,
                          std::vector<RowMutationImpl*>& mu_list);
 
@@ -306,21 +303,12 @@ class TableImpl : public Table {
     void DistributeReaders(const std::vector<RowReaderImpl*>& row_reader_list,
                            bool called_by_user);
 
-    void DistributeReadersById(std::vector<int64_t>* reader_id_list);
-
-    // 分配完成后将reader打包
-    void PackReaders(const std::string& server_addr,
-                     std::vector<RowReaderImpl*>& reader_list);
-
-    // reader打包不满但到达最大等待时间
-    void ReaderBatchTimeout(std::string server_addr, uint64_t batch_seq);
-
     // 通过异步RPC将reader提交至TS
-    void CommitReadersById(const std::string server_addr,
-                           std::vector<int64_t>& reader_id_list);
     void CommitReaders(const std::string server_addr,
                        std::vector<RowReaderImpl*>& reader_list);
 
+    void DistributeReadersById(std::vector<int64_t>* reader_id_list);
+
     // reader RPC回调
     void ReaderCallBack(std::vector<int64_t>* reader_id_list,
                         ReadTabletRequest* request,
@@ -330,6 +318,14 @@ class TableImpl : public Table {
     // reader到达用户设置的超时时间但尚未处理完
     void ReaderTimeout(SdkTask* sdk_task);
 
+    void PackSdkTasks(const std::string& server_addr,
+                      std::vector<SdkTask*>& task_list,
+                      SdkTask::TYPE task_type);
+    void TaskBatchTimeout(SdkTask* task);
+    void CommitTasksById(const std::string& server_addr,
+                         std::vector<int64_t>& task_id_list,
+                         SdkTask::TYPE task_type);
+
     void ScanTabletAsync(ScanTask* scan_task, bool called_by_user);
 
     void CommitScan(ScanTask* scan_task, const std::string& server_addr);
@@ -415,11 +411,22 @@ class TableImpl : public Table {
     TableImpl(const TableImpl&);
     void operator=(const TableImpl&);
 
-    struct TaskBatch {
-        uint64_t sequence_num;
-        uint64_t timer_id;
+    struct TaskBatch : public SdkTask {
         uint64_t byte_size;
+        std::string server_addr;
+        SdkTask::TYPE type;
+        Mutex* mutex;
+        std::map<std::string, TaskBatch*>* task_batch_map;
         std::vector<int64_t>* row_id_list;
+
+        TaskBatch() : SdkTask(SdkTask::TASKBATCH) {}
+        virtual bool IsAsync() { return false; }
+        virtual uint32_t Size() { return 0; }
+        virtual int64_t TimeOut() { return 0; }
+        virtual void Wait() {}
+        virtual void SetError(ErrorCode::ErrorCodeType err,
+                              const std::string& reason) {}
+        virtual const std::string& RowKey() { return server_addr; }
     };
 
     std::string name_;
@@ -432,10 +439,8 @@ class TableImpl : public Table {
     uint32_t commit_size_;
     uint64_t write_commit_timeout_;
     uint64_t read_commit_timeout_;
-    std::map<std::string, TaskBatch> mutation_batch_map_;
-    std::map<std::string, TaskBatch> reader_batch_map_;
-    uint64_t mutation_batch_seq_;
-    uint64_t reader_batch_seq_;
+    std::map<std::string, TaskBatch*> mutation_batch_map_;
+    std::map<std::string, TaskBatch*> reader_batch_map_;
     Counter cur_commit_pending_counter_;
     Counter cur_reader_pending_counter_;
     int64_t max_commit_pending_num_;
diff --git a/src/sdk/tera.cc b/src/sdk/tera.cc
index 0003f9a5f..d01bce0fe 100644
--- a/src/sdk/tera.cc
+++ b/src/sdk/tera.cc
@@ -41,6 +41,42 @@ static const char* strerr(ErrorCode::ErrorCodeType type) {
     case ErrorCode::kTxnFail:
         ret = "TransactionFail";
         break;
+    case ErrorCode::kGTxnDataTooLarge:
+        ret = "GlobalTransactionDataTooLarge";
+        break;
+    case ErrorCode::kGTxnNotSupport:
+        ret = "GlobalTransactionNotSupport";
+        break;
+    case ErrorCode::kGTxnSchemaError:
+        ret = "GlobalTransactionSchemaError";
+        break;
+    case ErrorCode::kGTxnOpAfterCommit:
+        ret = "GlobalTransactionOpAfterCommit";
+        break;
+    case ErrorCode::kGTxnPrimaryLost:
+        ret = "GlobalTransactionPrimaryLost";
+        break;
+    case ErrorCode::kGTxnWriteConflict:
+        ret = "GlobalTransactionWriteConflict";
+        break;
+    case ErrorCode::kGTxnLockConflict:
+        ret = "GlobalTransactionLockConflict";
+        break;
+    case ErrorCode::kGTxnOKButAckFailed:
+        ret = "GlobalTransactionOkButAckFailed";
+        break;
+    case ErrorCode::kGTxnOKButNotifyFailed:
+        ret = "GlobalTransactionOKButNotifyFailed";
+        break;
+    case ErrorCode::kGTxnPrewriteTimeout:
+        ret = "GlobalTransactionPrewriteTimeout";
+        break;
+    case ErrorCode::kGTxnPrimaryCommitTimeout:
+        ret = "GlobalTransactionPrimaryCommitTimeout";
+        break;
+    case ErrorCode::kGTxnTimestampLost:
+        ret = "GlobalTransactionTimestampLost";
+        break;
     default:
         ret = "UnkownError";
     }
diff --git a/src/sdk/tera_easy.cc b/src/sdk/tera_easy.cc
index c0758eb1d..6978ad9e5 100644
--- a/src/sdk/tera_easy.cc
+++ b/src/sdk/tera_easy.cc
@@ -13,8 +13,8 @@
 
 #include "common/thread_pool.h"
 #include "tera.h"
-#include "utils/atomic.h"
-#include "utils/counter.h"
+#include "common/atomic.h"
+#include "common/counter.h"
 
 DEFINE_int32(tera_easy_ttl, 90 * 24 * 3600, "ttl(s) of key-value writed by tera_easy");
 DEFINE_int32(tera_sdk_rpc_max_pending_num, 1024 * 1024, "max num of pending kv");
diff --git a/src/sdk/test/filter_utils_test.cc b/src/sdk/test/filter_utils_test.cc
index 19051ce6c..456d406e7 100644
--- a/src/sdk/test/filter_utils_test.cc
+++ b/src/sdk/test/filter_utils_test.cc
@@ -40,27 +40,21 @@ TEST(FilterUtils, DefaultValueConverter) {
     EXPECT_FALSE(DefaultValueConverter("", "", NULL));
 
     in = "8";
-    out_p = string("\x80\x0\x0\x0\x0\x0\x0\x7", 8);
+    out_p = string("\x08\x0\x0\x0\x0\x0\x0\x0", 8);
     type = "int64";
+
     EXPECT_TRUE(DefaultValueConverter(in, type, &out));
     EXPECT_EQ(out, out_p);
 
     in = "-8";
-    out_p = string("\x7F\xFF\xFF\xFF\xFF\xFF\xFF\xF7", 8);
+    out_p = string("\xF8\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8);
     type = "int64";
     EXPECT_TRUE(DefaultValueConverter(in, type, &out));
     EXPECT_EQ(out, out_p);
 
-    in = "8";
-    out_p = string("\x0\x0\x0\x0\x0\x0\x0\x8", 8);
-    type = "uint64";
-    EXPECT_TRUE(DefaultValueConverter(in, type, &out));
-    EXPECT_EQ(out, out_p);
-
     in = "-8";
     type = "string";
-    EXPECT_TRUE(DefaultValueConverter(in, type, &out));
-    EXPECT_TRUE(out == "-8");
+    EXPECT_FALSE(DefaultValueConverter(in, type, &out));
 
     type = "illegal";
     EXPECT_FALSE(DefaultValueConverter(in, type, &out));
diff --git a/src/sdk/test/global_txn_batch_op.cc b/src/sdk/test/global_txn_batch_op.cc
new file mode 100644
index 000000000..3e1d14af6
--- /dev/null
+++ b/src/sdk/test/global_txn_batch_op.cc
@@ -0,0 +1,440 @@
+#include <iostream>
+#include <memory>
+#include <regex>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <utility>
+#include <unordered_map>
+#include <functional>
+#include <signal.h>
+#include <unistd.h>
+
+#include <gflags/gflags.h>
+
+#include "tera.h"
+#include "version.h"
+
+DECLARE_string(flagfile);
+
+using std::vector;
+using std::string;
+using std::cout;
+using std::endl;
+using std::pair;
+using std::shared_ptr;
+using std::unique_ptr;
+using std::unordered_map;
+using std::function;
+
+using TxnPtr            = shared_ptr<tera::Transaction>;
+using RowMutationPtr    = shared_ptr<tera::RowMutation>;
+using ClientPtr         = shared_ptr<tera::Client>;
+using TablePtr          = shared_ptr<tera::Table>;
+
+// One cell address taken from an operation string: row key, column family, qualifier.
+struct RowkeyCfQu {
+    RowkeyCfQu() = default;
+    // By-value parameters are moved into the members instead of being copied again.
+    RowkeyCfQu(string rowkey, string cf, string qu)
+        : rowkey_(std::move(rowkey)),
+          cf_(std::move(cf)),
+          qu_(std::move(qu)) {}
+    string rowkey_, cf_, qu_;
+};
+// Parsed form of an operator string: one entry per '#'-separated table group.
+using OperatorStructure = vector<pair<string,                     //vector of Table operations, pair<tablename, rowkeys(vector)>
+                              vector<RowkeyCfQu>>>;               //vector of rowkey-cf-qus in a table
+
+static unordered_map<string, string>& GetHelpCommand() {
+    static unordered_map<string, string> usage_by_command;  // one shared instance, handed out by reference
+    return usage_by_command;
+}
+
+static void InitHelpCommand() {  // registers the usage text of the cas/get/put commands
+    auto& help_commands = GetHelpCommand();  // the shared command -> usage map
+    help_commands["cas"] = "Compare and set old_vals to new_vals across different Tables, Rows, and Columns atomically, usage:  \n"
+                           "    cas <Table1-rowkey1.cf1.qu1:rowkey2.cf2.qu2#Table2-rowkey3.cf3.qu3> <old_val1:oldval2:oldval3> <new_val1:new_val2:new_val3>";
+    help_commands["get"] = "Get values across different Tables, Rows, and Columns atomically, usage:                            \n"
+                           "    get <Table1-rowkey1.cf1.qu1:rowkey2.cf2.qu2#Table2-rowkey3.cf3.qu3>";
+    help_commands["put"] = "Put values across different Tables, Rows, and Columns atomically, usage:                            \n"
+                           "    put <Table1-rowkey1.cf1.qu1:rowkey2.cf2.qu2#Table2-rowkey3.cf3.qu3> <val1:val2:val3>";
+}
+
+// Prints the usage of one command, or of every command when str is empty or unknown.
+static void PrintHelp(const string& str = "") {
+    const auto& usages = GetHelpCommand();
+    const auto hit = str.empty() ? usages.end() : usages.find(str);
+    if (hit != usages.end()) {
+        cout << str << ": " << hit->second << endl;
+        return;
+    }
+    for (const auto& entry : usages) cout << entry.first << " " << entry.second << endl;
+}
+
+// Splits str on delimiter; empty fields are kept, but none is emitted after a trailing delimiter.
+static vector<string> split(const string& str, const char delimiter) {
+    vector<string> fields;
+    string::size_type begin = 0;
+    while (begin < str.size()) {
+        const string::size_type end = str.find(delimiter, begin);
+        if (end == string::npos) {
+            fields.push_back(str.substr(begin));  // last field: runs to the end of str
+            return fields;
+        }
+        fields.push_back(str.substr(begin, end - begin));
+        begin = end + 1;
+    }
+    return fields;
+}
+
+// Parses "<table>-<row>.<cf>[.<qu>][:<row>.<cf>[.<qu>]...][#<table>-...]" into opst.
+// num receives the number of cells parsed; returns 0 on success, -1 on malformed
+// input (opst and num then hold whatever was parsed before the error).
+static int64_t ParseOperatorStructure(const string& str, OperatorStructure& opst, size_t& num) {
+    opst.clear();
+    num = 0;
+
+    for (const string& table_part : split(str, '#')) {
+        // Each group must split on '-' into exactly two fields: table name and cells.
+        const vector<string> name_and_cells = split(table_part, '-');
+        if (name_and_cells.size() != 2) {
+            return -1;
+        }
+        opst.emplace_back(name_and_cells[0], vector<RowkeyCfQu>());
+        vector<RowkeyCfQu>& cells = opst.back().second;
+
+        // Each ':'-separated item is <row>.<cf> or <row>.<cf>.<qu>.
+        for (const string& cell : split(name_and_cells[1], ':')) {
+            const vector<string> parts = split(cell, '.');
+            const bool has_qualifier = (parts.size() == 3);
+            if (parts.size() != 2 && !has_qualifier) {
+                return -1;
+            }
+            cells.emplace_back(parts[0], parts[1], has_qualifier ? parts[2] : "");
+            ++num;
+        }
+    }
+    return 0;
+}
+
+// Opens each distinct table named in opst into tables (cleared first); 0 on success, -1 on failure.
+static int64_t OpenTables(ClientPtr client, const OperatorStructure& opst,
+                          unordered_map<string, TablePtr>& tables) {
+    tables.clear();
+    tera::ErrorCode ec;
+    for (const auto& table : opst) {
+        const string& tablename = table.first;
+        if (tables.count(tablename) != 0) continue;  // same table listed in an earlier group
+        TablePtr handle(client->OpenTable(tablename, &ec));
+        if (!handle) {
+            cout << "open table: " << tablename << " failed" << endl;
+            cout << ec.ToString() << endl;
+            return -1;  // the failed table is not stored, so tables never holds a null handle
+        }
+        tables.emplace(tablename, std::move(handle));
+    }
+    return 0;
+}
+
+static int64_t PutOp(ClientPtr client, const vector<string>& args) {  // "put": read current cells, write new values, commit; 0 on success, -1 on failure
+    if (args.size() != 4) {  // expects: program, command, cell spec, ':'-separated values
+        cout << "Arguments Error: " << args.size() << ", need 4" << endl;
+        PrintHelp(args[1]);
+        return -1;
+    }
+    
+    OperatorStructure opst;
+    size_t op_num = 0;  // number of cells named in the spec
+    if (ParseOperatorStructure(args[2], opst, op_num) != 0) {
+        cout << "Parse Arguments Error" << endl;
+        PrintHelp(args[1]);
+        return -1;
+    }
+
+    vector<string> val = split(args[3], ':');  // one value per cell, in spec order
+    if (op_num != val.size()) {
+        cout << "op size is not equal to val size" << endl;
+        return -1;
+    }
+
+    unordered_map<string, TablePtr> tables;
+    if (OpenTables(client, opst, tables) != 0) {
+        return -1;
+    }
+
+    TxnPtr g_txn(client->NewGlobalTransaction());
+    if (!g_txn) {
+        cout << "open txn failed" << endl;
+        return -1;
+    }
+
+    string result;  // ':'-joined values read before writing; printed only if the commit fails
+    for (auto& table : opst) {
+        const string& tablename = table.first;
+        const auto& row_cf_qu_list = table.second;
+        for (auto& row_cf_qu : row_cf_qu_list) {
+            const string& rowkey = row_cf_qu.rowkey_;
+            const string& cf     = row_cf_qu.cf_    ;
+            const string& qu     = row_cf_qu.qu_    ;
+
+            unique_ptr<tera::RowReader> reader(tables[tablename]->NewRowReader(rowkey));
+            reader->AddColumn(cf, qu);
+            g_txn->Get(reader.get());
+            if (reader->GetError().GetType() != tera::ErrorCode::kOK &&
+                reader->GetError().GetType() != tera::ErrorCode::kNotFound) {  // kNotFound is tolerated
+                std::cout << reader->GetError().ToString() << std::endl; 
+                return -1;
+            }
+
+            if (reader->Done()) {  // presumably no value came back for this cell -> empty field (confirm RowReader::Done semantics)
+                result += ":";
+            } else {
+                result += reader->Value() + ":";
+            }
+        }
+    }
+    
+    if (!result.empty()) result.pop_back();  // drop the trailing ':'
+
+    auto val_iter = val.begin();
+    for (auto& table : opst) {
+        const string& tablename = table.first;
+        const auto& row_cf_qu_list = table.second;
+        unordered_map<string, RowMutationPtr> row_mutations;  // one mutation per distinct row key within this table group
+
+        for (auto& row_cf_qu : row_cf_qu_list) {
+            const string& rowkey = row_cf_qu.rowkey_;
+            const string& cf     = row_cf_qu.cf_    ;
+            const string& qu     = row_cf_qu.qu_    ;
+
+            if (row_mutations.find(rowkey) == row_mutations.end()) {
+                RowMutationPtr row_mutation(tables[tablename]->NewRowMutation(rowkey));
+                row_mutations[rowkey] = row_mutation;
+            }
+            row_mutations[rowkey]->Put(cf, qu, *(val_iter++));  // values are consumed in the order the cells were parsed
+        }
+
+        for (auto mutation : row_mutations) {
+            g_txn->ApplyMutation(mutation.second.get());
+        }
+    }
+
+    
+    g_txn->Commit();
+    if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) {
+        std::cout << "commit failed: " << g_txn->GetError().ToString() << std::endl;
+        cout << result << endl;  // the values read before the attempted write
+        return -1;
+    }
+    std::cout << "commit success" << std::endl;
+
+    return 0;
+}
+
+static int64_t GetOp(ClientPtr client, const vector<string>& args) {  // "get": read the named cells through one global txn and print them ':'-joined; 0 on success, -1 on failure
+    if (args.size() != 3) {  // expects: program, command, cell spec
+        cout << "Arguments Error: " << args.size() << ", need 3" << endl;
+        PrintHelp(args[1]);
+        return -1;
+    }
+    
+    OperatorStructure opst;
+    size_t op_num = 0;  // filled by the parser; not used further in this function
+    if (ParseOperatorStructure(args[2], opst, op_num) != 0) {
+        cout << "Parse Arguments Error" << endl;
+        PrintHelp(args[1]);
+        return -1;
+    }
+
+    unordered_map<string, TablePtr> tables;
+    if (OpenTables(client, opst, tables) != 0) {
+        return -1;
+    }
+
+    TxnPtr g_txn(client->NewGlobalTransaction());  // only read from; Commit() is never called in this function
+    if (!g_txn) {
+        cout << "open txn failed" << endl;
+        return -1;
+    }
+
+    string result;  // ':'-joined cell values in spec order
+    for (auto& table : opst) {
+        const string& tablename = table.first;
+        const auto& row_cf_qu_list = table.second;
+        for (auto& row_cf_qu : row_cf_qu_list) {
+            const string& rowkey = row_cf_qu.rowkey_;
+            const string& cf     = row_cf_qu.cf_    ;
+            const string& qu     = row_cf_qu.qu_    ;
+
+            unique_ptr<tera::RowReader> reader(tables[tablename]->NewRowReader(rowkey));
+            reader->AddColumn(cf, qu);
+            g_txn->Get(reader.get());
+            if (reader->GetError().GetType() != tera::ErrorCode::kOK &&
+                reader->GetError().GetType() != tera::ErrorCode::kNotFound) {  // kNotFound is tolerated
+                std::cout << reader->GetError().ToString() << std::endl; 
+                return -1;
+            }
+
+            if (reader->Done()) {  // presumably no value came back for this cell -> empty field (confirm RowReader::Done semantics)
+                result += ":";
+            } else {
+                result += reader->Value() + ":";
+            }
+        }
+    }
+    
+    if (!result.empty()) result.pop_back();  // drop the trailing ':'
+    cout << result << endl;
+    return 0;
+}
+
+static int64_t CasOp(ClientPtr client, const vector<string>& args) {  // "cas": write new values only if the current ':'-joined values equal args[3]; 0 on success, -1 on failure
+    if (args.size() != 5) {  // expects: program, command, cell spec, old values, new values
+        cout << "Arguments Error: " << args.size() << ", need 5" << endl;
+        PrintHelp(args[1]);
+        return -1;
+    }
+    
+    OperatorStructure opst;
+    size_t op_num = 0;  // number of cells named in the spec
+    if (ParseOperatorStructure(args[2], opst, op_num) != 0) {
+        cout << "Parse Arguments Error" << endl;
+        PrintHelp(args[1]);
+        return -1;
+    }
+
+    unordered_map<string, TablePtr> tables;
+    if (OpenTables(client, opst, tables) != 0) {
+        return -1;
+    }
+
+    TxnPtr g_txn(client->NewGlobalTransaction());
+    if (!g_txn) {
+        cout << "open txn failed" << endl;
+        return -1;
+    }
+
+    string cur_val;  // ':'-joined values currently stored, in spec order
+    const string& old_val = args[3];
+    const string& new_val = args[4];
+    for (auto& table : opst) {
+        const string& tablename = table.first;
+        const auto& row_cf_qu_list = table.second;
+        for (auto& row_cf_qu : row_cf_qu_list) {
+            const string& rowkey = row_cf_qu.rowkey_;
+            const string& cf     = row_cf_qu.cf_    ;
+            const string& qu     = row_cf_qu.qu_    ;
+
+            unique_ptr<tera::RowReader> reader(tables[tablename]->NewRowReader(rowkey));
+            reader->AddColumn(cf, qu);
+            g_txn->Get(reader.get());
+            if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) {  // NOTE(review): PutOp/GetOp check the reader's error and tolerate kNotFound; this checks the txn's error - confirm intended
+                std::cout << g_txn->GetError().ToString() << std::endl; 
+                return -1;
+            }
+
+            if (reader->Done()) {  // presumably no value came back for this cell -> empty field (confirm RowReader::Done semantics)
+                cur_val += ":";
+            } else {
+                cur_val += reader->Value() + ":";
+            }
+        }
+    }
+
+    if (!cur_val.empty()) cur_val.pop_back();  // drop the trailing ':'
+
+    if (old_val != cur_val) {  // compared as whole ':'-joined strings, not field by field
+        cout << "cas failed: NotEqual" << endl;
+        return -1;
+    }
+
+    vector<string> new_val_list = split(new_val, ':');  // one new value per cell, in spec order
+    if (op_num != new_val_list.size()) {
+        cout << "op size is not equal to val size" << endl;
+        return -1;
+    }
+
+    auto val_iter = new_val_list.begin();
+    for (auto& table : opst) {
+        const string& tablename = table.first;
+        const auto& row_cf_qu_list = table.second;
+        unordered_map<string, RowMutationPtr> row_mutations;  // one mutation per distinct row key within this table group
+
+        for (auto& row_cf_qu : row_cf_qu_list) {
+            const string& rowkey = row_cf_qu.rowkey_;
+            const string& cf     = row_cf_qu.cf_    ;
+            const string& qu     = row_cf_qu.qu_    ;
+
+            if (row_mutations.find(rowkey) == row_mutations.end()) {
+                RowMutationPtr row_mutation(tables[tablename]->NewRowMutation(rowkey));
+                row_mutations[rowkey] = row_mutation;
+            }
+
+            row_mutations[rowkey]->Put(cf, qu, *(val_iter++));  // values are consumed in the order the cells were parsed
+        }
+
+        for (auto mutation : row_mutations) {
+            g_txn->ApplyMutation(mutation.second.get());
+        }
+    }
+
+    g_txn->Commit();
+    if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) {
+        std::cout << "cas failed: " << g_txn->GetError().ToString() << std::endl;
+        return -1;
+    } else {
+        std::cout << "cas success" << endl;
+    }
+
+    return 0;
+}
+
+static void SignalHandler(int){  // signal handler: the signal number is ignored
+    _exit(0);  // leave the process at once with status 0
+}
+
+// Entry point: serves "help"/"version" locally; otherwise creates a client and
+// dispatches args[1] to the put/get/cas handler, returning that handler's result.
+int main(int argc, char *argv[]) {
+    signal(SIGINT, SignalHandler);
+    signal(SIGTERM, SignalHandler);
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+
+    vector<string> args(argv, argv + argc);
+    InitHelpCommand();
+
+    if (args.size() < 2) {
+        PrintHelp();
+        return 0;
+    }
+    const string& command = args[1];
+    if (command == "help") {
+        // An empty argument makes PrintHelp list every command.
+        PrintHelp(args.size() > 2 ? args[2] : "");
+        return 0;
+    }
+    if (command == "version") {
+        PrintSystemVersion();
+        return 0;
+    }
+
+    using Handler = function<int64_t (ClientPtr client, const vector<string>& args)>;
+    unordered_map<string, Handler> handlers;
+    handlers["put"] = PutOp;
+    handlers["get"] = GetOp;
+    handlers["cas"] = CasOp;
+    const auto handler = handlers.find(command);
+    if (handler == handlers.end()) {
+        cout << "Wrong Command" << endl;
+        PrintHelp();
+        return -1;
+    }
+    tera::ErrorCode ec;
+    ClientPtr client(tera::Client::NewClient(FLAGS_flagfile, command, &ec));
+    if (!client) {
+        cout << "Create Client Failed: " << ec.ToString() << endl;
+        return -1;
+    }
+    return handler->second(client, args);
+}
diff --git a/src/sdk/test/global_txn_internal_test.cc b/src/sdk/test/global_txn_internal_test.cc
new file mode 100644
index 000000000..e3310aa3a
--- /dev/null
+++ b/src/sdk/test/global_txn_internal_test.cc
@@ -0,0 +1,789 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include <iostream>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "sdk/global_txn_internal.h"
+#include "sdk/read_impl.h"
+#include "sdk/sdk_zk.h"
+#include "sdk/sdk_utils.h"
+#include "sdk/table_impl.h"
+#include "sdk/test/mock_table.h"
+#include "tera.h"
+
+DECLARE_string(tera_coord_type);
+DECLARE_int32(tera_sdk_timeout);
+DECLARE_int32(tera_gtxn_all_puts_size_limit);
+
+namespace tera {
+
+class GlobalTxnInternalTest : public ::testing::Test {  // fixture for the TEST_F cases below
+public:
+    GlobalTxnInternalTest()
+        : start_ts_(100), thread_pool_(2), gtxn_internal_(Client::NewClient()) {
+        gtxn_internal_.SetStartTimestamp(start_ts_);
+    }
+    ~GlobalTxnInternalTest() {}
+
+    // Returns a new MockTable named `tablename`; the calling test deletes it.
+    Table* OpenTable(const std::string& tablename) {
+        FLAGS_tera_coord_type = "fake_zk";
+        return static_cast<tera::Table*>(new MockTable(tablename, &thread_pool_));
+    }
+
+    // Resets value_list to exactly ONE cell built from the arguments (no accumulation).
+    void MakeKvPair(const std::string& row,
+                    const std::string& cf,
+                    const std::string& qu,
+                    int64_t ts,
+                    const std::string& val,
+                    RowResult* value_list) {
+        value_list->clear_key_values();
+        KeyValuePair* kv = value_list->add_key_values();
+        kv->set_key(row);
+        kv->set_column_family(cf);
+        kv->set_qualifier(qu);
+        kv->set_timestamp(ts);
+        kv->set_value(val);
+    }
+
+    void SetSchema(Table* table, const TableSchema& table_schema) {
+        TableImpl* table_impl = static_cast<tera::TableImpl*>(table);
+        table_impl->table_schema_ = table_schema;  // overwrite the schema held by the table
+    }
+
+    // Replaces the reader's stored result with value_list and rebuilds *row from it.
+    void BuildResult(RowReaderImpl* reader_impl,
+                     const RowResult& value_list,
+                     RowReader::TRow *row) {
+        reader_impl->result_.clear_key_values();
+        reader_impl->SetResult(value_list);
+        row->clear();
+        reader_impl->ToMap(row);
+    }
+
+protected:  // TEST_F bodies are generated as subclasses and use these members directly
+    int64_t start_ts_;
+    common::ThreadPool thread_pool_;
+    GlobalTxnInternal gtxn_internal_;
+};
+
+TEST_F(GlobalTxnInternalTest, CheckTable) {
+    // CheckTable() must accept only the combination table<txn=true> plus a
+    // column family with gtxn=true; the three other combinations are rejected.
+    // NOTE: every descriptor below is named "t1", also those applied to t2..t4;
+    // only the txn/gtxn switches differ between the four cases.
+
+    ErrorCode status;
+    Table* t1 = OpenTable("t1");
+    Table* t2 = OpenTable("t2");
+    Table* t3 = OpenTable("t3");
+    Table* t4 = OpenTable("t4");
+    EXPECT_FALSE(t1 == NULL);
+    EXPECT_FALSE(t2 == NULL);
+    EXPECT_FALSE(t3 == NULL);
+    EXPECT_FALSE(t4 == NULL);
+
+    // case 1: table<txn=true>, cf<gtxn=true> -> accepted
+    TableDescriptor txn_gtxn_desc("t1");
+    txn_gtxn_desc.EnableTxn();
+    txn_gtxn_desc.AddLocalityGroup("lg0");
+    txn_gtxn_desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema txn_gtxn_schema;
+    TableDescToSchema(txn_gtxn_desc, &txn_gtxn_schema);
+    SetSchema(t1, txn_gtxn_schema);
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // case 2: table<txn=true>, cf<gtxn=false> -> rejected
+    TableDescriptor txn_only_desc("t1");
+    txn_only_desc.EnableTxn();
+    txn_only_desc.AddLocalityGroup("lg0");
+    txn_only_desc.AddColumnFamily("cf1")->DisableGlobalTransaction();
+
+    TableSchema txn_only_schema;
+    TableDescToSchema(txn_only_desc, &txn_only_schema);
+    SetSchema(t2, txn_only_schema);
+    EXPECT_FALSE(gtxn_internal_.CheckTable(t2, &status));
+
+    // case 3: table<txn=false>, cf<gtxn=true> -> rejected
+    TableDescriptor gtxn_only_desc("t1");
+    gtxn_only_desc.AddLocalityGroup("lg0");
+    gtxn_only_desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema gtxn_only_schema;
+    TableDescToSchema(gtxn_only_desc, &gtxn_only_schema);
+    SetSchema(t3, gtxn_only_schema);
+    EXPECT_FALSE(gtxn_internal_.CheckTable(t3, &status));
+
+    // case 4: table<txn=false>, cf<gtxn=false> -> rejected
+    TableDescriptor plain_desc("t1");
+    plain_desc.AddLocalityGroup("lg0");
+    plain_desc.AddColumnFamily("cf1")->DisableGlobalTransaction();
+
+    TableSchema plain_schema;
+    TableDescToSchema(plain_desc, &plain_schema);
+    SetSchema(t4, plain_schema);
+    EXPECT_FALSE(gtxn_internal_.CheckTable(t4, &status));
+
+    delete t1;
+    delete t2;
+    delete t3;
+    delete t4;
+}
+
+TEST_F(GlobalTxnInternalTest, IsLockedByOthers) {
+    // cell1 carries start_ts_; only a lock column older than that makes it "locked".
+    Table* t1_ptr = OpenTable("t1");
+    Cell cell1(t1_ptr, "row1", "cf1", "qu1", start_ts_, "val");
+
+    RowReader* reader = t1_ptr->NewRowReader("row1");
+    RowReaderImpl* reader_impl = (RowReaderImpl*)reader;
+    RowResult value_list;
+    RowReader::TRow row;
+
+    // lock column exists && ts (12) < start_ts_ (100) -> locked by others
+    MakeKvPair("row1", "cf1", PackLockName("qu1"), 12, "", &value_list);
+    BuildResult(reader_impl, value_list, &row);
+    EXPECT_TRUE(gtxn_internal_.IsLockedByOthers(row, cell1));
+
+    // no lock column (plain qualifier only) -> not locked
+    value_list.clear_key_values();
+    MakeKvPair("row1", "cf1", "qu1", 120, "", &value_list);
+    BuildResult(reader_impl, value_list, &row);
+    EXPECT_FALSE(gtxn_internal_.IsLockedByOthers(row, cell1));
+
+    // lock column exists && ts (120) > start_ts_ (100) -> not locked
+    value_list.clear_key_values();
+    MakeKvPair("row1", "cf1", PackLockName("qu1"), 120, "", &value_list);
+    BuildResult(reader_impl, value_list, &row);
+    EXPECT_FALSE(gtxn_internal_.IsLockedByOthers(row, cell1));
+
+    delete t1_ptr;
+    delete reader;  // release the reader obtained from NewRowReader()
+}
+
+TEST_F(GlobalTxnInternalTest, IsPrimary) {
+    Table* t1_ptr = OpenTable("t1");
+    EXPECT_FALSE(t1_ptr == NULL);
+    // cell1 names the same table/row/cf/qualifier as info2 below
+    Cell cell1(t1_ptr, "row1", "cf1", "qu1", start_ts_, "val");
+    // cell2 differs from info2 in the column family only
+    Cell cell2(t1_ptr, "row1", "cf2", "qu1", start_ts_, "val");
+
+    PrimaryInfo info2;
+    info2.set_table_name("t1");
+    info2.set_row_key("row1");
+    info2.set_column_family("cf1");
+    info2.set_qualifier("qu1");
+    info2.set_gtxn_start_ts(200);  // differs from start_ts_; cell1 must still be primary
+
+    EXPECT_TRUE(gtxn_internal_.IsPrimary(cell1, info2));
+    EXPECT_FALSE(gtxn_internal_.IsPrimary(cell2, info2));
+    delete t1_ptr;
+}
+
+TEST_F(GlobalTxnInternalTest, FindTable) {
+    const std::string t1 = "t1", cf2 = "cf2";
+
+    Table* t1_ptr = OpenTable(t1);
+    EXPECT_FALSE(t1_ptr == NULL);
+
+    TableDescriptor desc(t1);
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    ColumnFamilyDescriptor* cfd = desc.AddColumnFamily(cf2);
+    cfd->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1_ptr, schema);
+
+    // t1 is looked up after a successful CheckTable(t1)
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1_ptr, &status));
+
+    // the lookup result is tested for NULL before it is dereferenced, so a
+    // failed lookup is reported as a test failure instead of crashing
+    Table* t1_ptr1 = gtxn_internal_.FindTable(t1);
+    EXPECT_TRUE(t1_ptr1 != NULL && t1_ptr1->GetName() == t1_ptr->GetName());
+    delete t1_ptr;
+}
+
+TEST_F(GlobalTxnInternalTest, ConflictWithOtherWrite) {
+    Table* t1_ptr = OpenTable("t1");
+    RowReader* reader = t1_ptr->NewRowReader("row1");
+    RowReaderImpl* reader_impl = (RowReaderImpl*)reader;
+    RowResult value_list;
+    // reader result: one plain data cell, ts 12 < start_ts_ (100)
+    MakeKvPair("row1", "cf1", "qu1", 12, "", &value_list);
+    reader_impl->SetResult(value_list);
+    ErrorCode status;
+    std::vector<Write> ws;
+    // ws is empty
+    EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status));
+
+    // writes that target a different row
+    for (int i = 0; i < 3; ++i) {
+        Cell cell(t1_ptr, "row2", "cf" + std::to_string(i),
+                  "qu" + std::to_string(i), start_ts_, "val");
+        Write w(cell);
+        ws.push_back(w);
+    }
+    EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status));
+
+    // same row, but the target cf does not exist in the result
+    ws.clear();
+    for (int i = 0; i < 3; ++i) {
+        Cell cell(t1_ptr, "row1", "cf0", "qu" + std::to_string(i), start_ts_, "val");
+        Write w(cell);
+        ws.push_back(w);
+    }
+    EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status));
+
+    // same row and cf, but neither write_col nor lock_col exists
+    ws.clear();
+    for (int i = 0; i < 3; ++i) {
+        Cell cell(t1_ptr, "row1", "cf1", "qu" + std::to_string(i), start_ts_, "val");
+        Write w(cell);
+        ws.push_back(w);
+    }
+    EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status));
+
+    // same row, cf && write_col exists (latest_ts >= start_ts_)
+    value_list.clear_key_values();
+    // 120 > 100
+    MakeKvPair("row1", "cf1", PackWriteName("qu1"), 120, "", &value_list);
+    reader_impl->result_.clear_key_values();
+    reader_impl->SetResult(value_list);
+
+    EXPECT_TRUE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status));
+    EXPECT_TRUE(status.GetType() == ErrorCode::kGTxnWriteConflict);
+
+    // same row, cf && write_col exists (latest_ts < start_ts_)
+    // lock_col does not exist
+    value_list.clear_key_values();
+    // 20 < 100 less than start_ts
+    MakeKvPair("row1", "cf1", PackWriteName("qu1"), 20, "", &value_list);
+    reader_impl->result_.clear_key_values();
+    reader_impl->SetResult(value_list);
+
+    EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status));
+
+    // same row, cf && lock_col exists (ts 20 < start_ts_) -> lock conflict
+    // NOTE(review): MakeKvPair() resets value_list on every call, so the write cell
+    // built first is replaced and the reader ends up holding the lock cell only.
+    value_list.clear_key_values();
+    MakeKvPair("row1", "cf1", PackWriteName("qu1"), 20, "", &value_list);
+    MakeKvPair("row1", "cf1", PackLockName("qu1"), 20, "", &value_list);
+    reader_impl->result_.clear_key_values();
+    reader_impl->SetResult(value_list);
+
+    EXPECT_TRUE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status));
+    EXPECT_TRUE(status.GetType() == ErrorCode::kGTxnLockConflict);
+    delete t1_ptr;
+    delete reader;  // release the reader obtained from NewRowReader()
+}
+
+TEST_F(GlobalTxnInternalTest, IsGTxnColumnFamily) {
+    const std::string t1 = "t1", t2 = "t2", cf1 = "cf1", cf2 = "cf2";
+
+    // t1 is a txn table whose cf1 has gtxn disabled
+    // and whose cf2 has gtxn enabled
+    Table* t1_ptr = OpenTable(t1);
+    EXPECT_FALSE(t1_ptr == NULL);
+
+    TableDescriptor desc(t1);
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily(cf1)->DisableGlobalTransaction();
+    desc.AddColumnFamily(cf2)->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1_ptr, schema);
+
+    // before CheckTable(t1) every query answers false, even for cf2
+    EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t1, cf1));
+    EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t1, cf2));
+    EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t2, cf1));
+
+    // CheckTable(t1) succeeds for this schema
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1_ptr, &status));
+
+    // after CheckTable(t1): cf1 (gtxn=false) -> false, cf2 (gtxn=true) -> true
+    EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t1, cf1));
+    EXPECT_TRUE(gtxn_internal_.IsGTxnColumnFamily(t1, cf2));
+
+    // t2 was never passed to CheckTable(), so it still answers false
+    EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t2, cf1));
+
+    delete t1_ptr;
+}
+
+TEST_F(GlobalTxnInternalTest, SetInternalSdkTaskTimeout) {
+    Table* t1_ptr = OpenTable("t1");
+    RowReader* reader = t1_ptr->NewRowReader("row1");
+    RowReaderImpl* reader_impl = (RowReaderImpl*)reader;
+    // terminal_time_ starts at 0; with commit duration 1000 the reader timeout becomes 1000
+    EXPECT_TRUE(gtxn_internal_.terminal_time_ == 0);
+    gtxn_internal_.SetCommitDuration(1000);
+    EXPECT_TRUE(gtxn_internal_.terminal_time_ > 1000);
+    gtxn_internal_.SetInternalSdkTaskTimeout(reader);
+    EXPECT_TRUE(reader_impl->TimeOut() == 1000);
+    // after the 2s sleep: reader timeout is 1 and the txn reports a timeout
+    sleep(2);
+    gtxn_internal_.SetInternalSdkTaskTimeout(reader);
+    EXPECT_TRUE(reader_impl->TimeOut() == 1);
+    EXPECT_TRUE(gtxn_internal_.IsTimeOut() == true);
+    // flag reset + very large commit duration: reader timeout is FLAGS_tera_sdk_timeout
+    gtxn_internal_.is_timeout_ = false;
+    EXPECT_FALSE(gtxn_internal_.terminal_time_ == 0);
+    gtxn_internal_.SetCommitDuration(1000000);
+    EXPECT_TRUE(gtxn_internal_.terminal_time_ > 1000000);
+    gtxn_internal_.SetInternalSdkTaskTimeout(reader);
+    EXPECT_TRUE(reader_impl->TimeOut() == FLAGS_tera_sdk_timeout);
+    EXPECT_TRUE(gtxn_internal_.IsTimeOut() == false);
+    delete t1_ptr;  // table and reader are both released at the end of the test
+    delete reader;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyWritesSize0) {
+    // a mutation without any operation: rejected with kBadParam, reported size stays 0
+    Table* table = OpenTable("t1");
+    RowMutation* empty_mu = table->NewRowMutation("r1");
+    int64_t writes_size = 0;
+    EXPECT_FALSE(gtxn_internal_.VerifyWritesSize(empty_mu, &writes_size));
+    EXPECT_TRUE(writes_size == 0);
+    EXPECT_TRUE(empty_mu->GetError().GetType() == ErrorCode::kBadParam);
+    delete table;
+    delete empty_mu;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyWritesSize1) {
+    // seven operations against a size limit of 10: rejected with kGTxnDataTooLarge,
+    // while writes_size still reports the mutation's full Size()
+    Table* table = OpenTable("t1");
+    RowMutation* big_mu = table->NewRowMutation("r1");
+    big_mu->Put("cf0", "qu1", "value", (int64_t)(5));
+    big_mu->Put("cf0", "qu2", "value", (int64_t)(5));
+    big_mu->Put("cf0", "qu3", "value", (int64_t)(5));
+    big_mu->Put("cf0", "qu4", "value", (int64_t)(5));
+    big_mu->DeleteColumns("cf1", "qu5", (int64_t)(5));
+    big_mu->DeleteColumns("cf1", "qu6", (int64_t)(5));
+    big_mu->DeleteColumns("cf1", "qu7", (int64_t)(5));
+
+    int64_t writes_size = 0;
+    FLAGS_tera_gtxn_all_puts_size_limit = 10;
+    EXPECT_FALSE(gtxn_internal_.VerifyWritesSize(big_mu, &writes_size));
+    EXPECT_TRUE(static_cast<RowMutationImpl*>(big_mu)->Size() == writes_size);
+    EXPECT_TRUE(big_mu->GetError().GetType() == ErrorCode::kGTxnDataTooLarge);
+    delete table;
+    delete big_mu;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyWritesSize2) {
+    // a single Put checked against a size limit of 100000: accepted, error stays kOK
+    Table* table = OpenTable("t1");
+    RowMutation* small_mu = table->NewRowMutation("r1");
+    small_mu->Put("cf0", "qu1", "value", (int64_t)(5));
+
+    int64_t writes_size = 0;
+    FLAGS_tera_gtxn_all_puts_size_limit = 100000;
+    EXPECT_TRUE(gtxn_internal_.VerifyWritesSize(small_mu, &writes_size));
+    EXPECT_TRUE(static_cast<RowMutationImpl*>(small_mu)->Size() == writes_size);
+    EXPECT_TRUE(small_mu->GetError().GetType() == ErrorCode::kOK);
+
+    delete table;
+    delete small_mu;
+}
+
+TEST_F(GlobalTxnInternalTest, BadQualifier) {
+    // Table-driven check of BadQualifier(): each qualifier is listed under the
+    // answer the function has to give for it.
+    // BadQualifier() == false
+    const char* const accepted[] = {"", "aaaaaaaaaaaaaaa", "____NN_", "NN_"};
+    // BadQualifier() == true
+    const char* const rejected[] = {"_*_", "____*_", "______"};
+
+    for (const char* qualifier : accepted) {
+        EXPECT_FALSE(BadQualifier(qualifier));
+    }
+
+    for (const char* qualifier : rejected) {
+        EXPECT_TRUE(BadQualifier(qualifier));
+    }
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation0) {
+    // a mutation to which no operation was added is rejected with kBadParam
+    Table* table = OpenTable("t1");
+    RowMutation* empty_mu = table->NewRowMutation("r1");
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowMutation(empty_mu));
+    EXPECT_TRUE(empty_mu->GetError().GetType() == ErrorCode::kBadParam);
+    delete table;
+    delete empty_mu;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation1) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // all three cells go to cf1; the mutation is rejected with kBadParam
+    // (presumably because of the "qu1_N_" qualifier -- confirm against BadQualifier())
+    RowMutation* mu = t1->NewRowMutation("r1");
+    mu->Put("cf1", "qu1", "value", (int64_t)(5));
+    mu->Put("cf1", "qu1_N_", "value", (int64_t)(5));
+    mu->Put("cf1", "qu2", "value", (int64_t)(5));
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowMutation(mu));
+    EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kBadParam);
+
+    delete t1;
+    delete mu;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation2) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // the first cell targets "cf0", which is not in the schema set above, and
+    // "qu1_N_" is used as well; the mutation is rejected with kBadParam
+    RowMutation* mu = t1->NewRowMutation("r1");
+    mu->Put("cf0", "qu1", "value", (int64_t)(5));
+    mu->Put("cf1", "qu1_N_", "value", (int64_t)(5));
+    mu->Put("cf1", "qu2", "value", (int64_t)(5));
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowMutation(mu));
+    EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kBadParam);
+
+    delete t1;
+    delete mu;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation3) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // Put + DeleteColumns + DeleteColumn + DeleteFamily on cf1 is rejected with
+    // kGTxnNotSupport (presumably because of DeleteFamily -- confirm)
+    RowMutation* mu = t1->NewRowMutation("r1");
+    mu->Put("cf1", "qu1", "value", (int64_t)(5));
+    mu->DeleteColumns("cf1", "qu1", (int64_t)(5));
+    mu->DeleteColumn("cf1", "qu2", (int64_t)(5));
+    mu->DeleteFamily("cf1", (int64_t)(5));
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowMutation(mu));
+    EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kGTxnNotSupport);
+
+    delete t1;
+    delete mu;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation4) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // Put, DeleteColumns and DeleteColumn on the gtxn column family cf1 are all
+    // accepted: the verification succeeds and the mutation's error stays kOK
+    RowMutation* mu = t1->NewRowMutation("r1");
+    mu->Put("cf1", "qu1", "value", (int64_t)(5));
+    mu->DeleteColumns("cf1", "qu1", (int64_t)(5));
+    mu->DeleteColumn("cf1", "qu2", (int64_t)(5));
+    EXPECT_TRUE(gtxn_internal_.VerifyUserRowMutation(mu));
+    EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kOK);
+
+    delete t1;
+    delete mu;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowReader0) {
+    // a reader to which no column was added is rejected with kBadParam
+    Table* table = OpenTable("t1");
+    RowReader* bare_reader = table->NewRowReader("r1");
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowReader(bare_reader));
+    EXPECT_TRUE(bare_reader->GetError().GetType() == ErrorCode::kBadParam);
+    delete table;
+    delete bare_reader;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowReader1) {
+    // t1 has txn enabled but no column family with gtxn enabled, so CheckTable()
+    // fails and the reader is rejected with the same error type.
+    ErrorCode status;
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1");  // gtxn is deliberately NOT enabled on cf1
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    EXPECT_FALSE(gtxn_internal_.CheckTable(t1, &status));
+
+    RowReader* r = t1->NewRowReader("r1");
+    r->AddColumn("cf1", "qu");
+    bool ret = gtxn_internal_.VerifyUserRowReader(r);
+    EXPECT_FALSE(ret);
+    EXPECT_TRUE(r->GetError().GetType() == status.GetType());
+
+    delete t1;
+    delete r;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowReader2) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // a reader on which a snapshot was set (SetSnapshot(10)) is rejected
+    // with kBadParam
+    RowReader* r = t1->NewRowReader("r1");
+    r->AddColumn("cf1", "qu");
+    r->SetSnapshot(10);
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowReader(r));
+    EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam);
+
+    delete t1;
+    delete r;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowReader3) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // selecting a whole column family (AddColumnFamily, no qualifier) is
+    // rejected with kBadParam
+    RowReader* r = t1->NewRowReader("r1");
+    r->AddColumnFamily("cf1");
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowReader(r));
+    EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam);
+
+    delete t1;
+    delete r;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowReader4) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // "cf0" is not part of the schema set above; reading a column from it is
+    // rejected with kBadParam
+    RowReader* r = t1->NewRowReader("r1");
+    r->AddColumn("cf0", "qu");
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowReader(r));
+    EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam);
+
+    delete t1;
+    delete r;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowReader5) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // the qualifier "qu_*_" is rejected with kBadParam
+    // (presumably it fails BadQualifier() -- confirm)
+    RowReader* r = t1->NewRowReader("r1");
+    r->AddColumn("cf1", "qu_*_");
+    EXPECT_FALSE(gtxn_internal_.VerifyUserRowReader(r));
+    EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam);
+
+    delete t1;
+    delete r;
+}
+
+TEST_F(GlobalTxnInternalTest, VerifyUserRowReader6) {
+    // make t1 (txn table, cf1 with gtxn enabled) pass CheckTable() first
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // three ordinary qualifiers of the gtxn column family cf1 are accepted:
+    // the verification succeeds and the reader's error stays kOK
+    RowReader* r = t1->NewRowReader("r1");
+    r->AddColumn("cf1", "qu");
+    r->AddColumn("cf1", "q1");
+    r->AddColumn("cf1", "q2");
+    EXPECT_TRUE(gtxn_internal_.VerifyUserRowReader(r));
+    EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kOK);
+
+    delete t1;
+    delete r;
+}
+
+TEST_F(GlobalTxnInternalTest, PrimaryIsLocked1) {
+    // bad case b: reading the primary lock fails, here with a kSystem error
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    tera::PrimaryInfo info2;
+    info2.set_table_name("t1");
+    info2.set_row_key("row1");
+    info2.set_column_family("cf1");
+    info2.set_qualifier("qu1");
+    info2.set_gtxn_start_ts(100);
+    std::string info2_str;
+    info2.SerializeToString(&info2_str);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // hand the mock table one reader error; presumably its next read reports it
+    ErrorCode reader_failure;
+    reader_failure.SetFailed(ErrorCode::kSystem,"");
+    std::vector<ErrorCode> reader_errs(1, reader_failure);
+    (static_cast<MockTable*>(t1))->AddReaderErrors(reader_errs);
+
+    EXPECT_FALSE(gtxn_internal_.PrimaryIsLocked(info2, 12, &status));
+    EXPECT_TRUE(status.GetType() == ErrorCode::kSystem);
+
+    delete t1;
+}
+
+TEST_F(GlobalTxnInternalTest, PrimaryIsLocked2) {
+    // bad case a: reading the primary lock ends with kNotFound
+    Table* t1 = OpenTable("t1");
+    TableDescriptor desc("t1");
+    desc.EnableTxn();
+    desc.AddLocalityGroup("lg0");
+    desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+
+    TableSchema schema;
+    TableDescToSchema(desc, &schema);
+    SetSchema(t1, schema);
+
+    tera::PrimaryInfo info2;
+    info2.set_table_name("t1");
+    info2.set_row_key("row1");
+    info2.set_column_family("cf1");
+    info2.set_qualifier("qu1");
+    info2.set_gtxn_start_ts(100);
+    std::string info2_str;
+    info2.SerializeToString(&info2_str);
+
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status));
+
+    // hand the mock table one reader error; presumably its next read reports it
+    ErrorCode reader_failure;
+    reader_failure.SetFailed(ErrorCode::kNotFound,"");
+    std::vector<ErrorCode> reader_errs(1, reader_failure);
+    (static_cast<MockTable*>(t1))->AddReaderErrors(reader_errs);
+
+    EXPECT_FALSE(gtxn_internal_.PrimaryIsLocked(info2, 12, &status));
+
+    delete t1;
+}
+
+} // namespace tera
diff --git a/src/sdk/test/global_txn_test.cc b/src/sdk/test/global_txn_test.cc
new file mode 100644
index 000000000..c68e0cd2e
--- /dev/null
+++ b/src/sdk/test/global_txn_test.cc
@@ -0,0 +1,1265 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include <atomic>
+#include <iostream>
+#include <string>
+#include <thread>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "sdk/global_txn.h"
+#include "sdk/global_txn_internal.h"
+#include "sdk/read_impl.h"
+#include "sdk/table_impl.h"
+#include "sdk/sdk_zk.h"
+#include "sdk/test/mock_table.h"
+#include "tera.h"
+
+DECLARE_string(tera_coord_type);
+
+namespace tera {
+
+// Fixture shared by the GlobalTxn tests; holds the thread pool and the txn.
+class GlobalTxnTest : public ::testing::Test {
+public:
+    // Builds gtxn_ on a 2-thread pool, then sets its status to kOK/unreturned.
+    GlobalTxnTest() :
+        thread_pool_(2),
+        gtxn_(Client::NewClient(), &thread_pool_, (new sdk::MockTimeoracleClusterFinder(""))) {
+        gtxn_.status_.SetFailed(ErrorCode::kOK);
+        gtxn_.status_returned_ = false;
+    }
+    ~GlobalTxnTest() {}
+    // Overwrites the schema stored on `table` (cast down to TableImpl).
+    void SetSchema(Table* table, const TableSchema& table_schema) {
+        TableImpl* table_impl = static_cast<tera::TableImpl*>(table);
+        table_impl->table_schema_ = table_schema;
+    }
+    // Returns a freshly allocated MockTable named `tablename`; the caller owns it.
+    Table* OpenTable(const std::string& tablename) {
+        FLAGS_tera_coord_type = "fake_zk";
+        return static_cast<tera::Table*>(new MockTable(tablename, &thread_pool_));
+    }
+protected:  // TEST_F bodies are subclasses of the fixture and use these members
+    common::ThreadPool thread_pool_;  // declared first: its address is passed to gtxn_
+    GlobalTxn gtxn_;
+};
+
+TEST_F(GlobalTxnTest, Commit) {
+    // Synchronous path only: no user commit callback is installed.
+    gtxn_.user_commit_callback_ = NULL;
+
+    // Case 1: put_fail_cnt_ = 10 -> Commit() reports kGTxnOpAfterCommit.
+    gtxn_.put_fail_cnt_.Set(10);
+    gtxn_.has_commited_ = false;
+    gtxn_.status_returned_ = false;
+    gtxn_.finish_ = false;
+    EXPECT_TRUE(gtxn_.Commit().GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.has_commited_ == false);
+
+    // Case 2: has_commited_ already true -> same error, flag left as true.
+    gtxn_.put_fail_cnt_.Set(0);
+    gtxn_.has_commited_ = true;
+    gtxn_.status_returned_ = false;
+    gtxn_.finish_ = false;
+    EXPECT_TRUE(gtxn_.Commit().GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.has_commited_ == true);
+
+    // Case 3: no failed puts, not committed, no buffered writes -> kOK.
+    gtxn_.writes_.clear();
+    gtxn_.put_fail_cnt_.Set(0);
+    gtxn_.has_commited_ = false;
+    gtxn_.status_returned_ = false;
+    gtxn_.finish_ = false;
+    EXPECT_TRUE(gtxn_.Commit().GetType() == ErrorCode::kOK);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.has_commited_ == true);
+}
+
+TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback) {
+    // Reader error kNotFound: the test expects the txn to finish with
+    // status kGTxnPrimaryLost.
+    RowReaderImpl* primary_reader = new RowReaderImpl(NULL, "rowkey");
+    primary_reader->txn_ = new SingleRowTxn(NULL, "rowkey", NULL);
+    primary_reader->error_code_.SetFailed(ErrorCode::kNotFound, "");
+
+    // The callback receives the reader through its RowReader interface.
+    gtxn_.DoVerifyPrimaryLockedCallback(static_cast<RowReader*>(primary_reader));
+
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrimaryLost);
+}
+
+TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback1) {
+    // Reader error kTimeout: the test expects the txn to finish with
+    // status kGTxnPrimaryCommitTimeout.
+    RowReaderImpl* primary_reader = new RowReaderImpl(NULL, "rowkey");
+    primary_reader->txn_ = new SingleRowTxn(NULL, "rowkey", NULL);
+    primary_reader->error_code_.SetFailed(ErrorCode::kTimeout, "");
+
+    gtxn_.DoVerifyPrimaryLockedCallback(static_cast<RowReader*>(primary_reader));
+
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrimaryCommitTimeout);
+}
+
+TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback2) {
+    // Any other reader error (kSystem here): the test expects the txn to
+    // finish with that same error type as its status.
+    RowReaderImpl* primary_reader = new RowReaderImpl(NULL, "rowkey");
+    primary_reader->txn_ = new SingleRowTxn(NULL, "rowkey", NULL);
+    primary_reader->error_code_.SetFailed(ErrorCode::kSystem, "");
+
+    gtxn_.DoVerifyPrimaryLockedCallback(static_cast<RowReader*>(primary_reader));
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kSystem);
+}
+
+TEST_F(GlobalTxnTest, CheckPrimaryStatusAndCommmitSecondaries) {
+    // Case 1: the primary txn's mutation buffer carries kTimeout.
+    SingleRowTxn* timeout_txn = new SingleRowTxn(NULL, "rowkey", NULL);
+    timeout_txn->mutation_buffer_.SetError(ErrorCode::kTimeout,"");
+    gtxn_.finish_ = false;
+    gtxn_.status_returned_ = false;
+    gtxn_.CheckPrimaryStatusAndCommmitSecondaries(timeout_txn);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrimaryCommitTimeout);
+
+    // Case 2: the primary txn carries another error (kSystem).
+    SingleRowTxn* failed_txn = new SingleRowTxn(NULL, "rowkey", NULL);
+    failed_txn->mutation_buffer_.SetError(ErrorCode::kSystem, "");
+    gtxn_.finish_ = false;
+    gtxn_.status_returned_ = false;
+    gtxn_.CheckPrimaryStatusAndCommmitSecondaries(failed_txn);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kSystem);
+
+    // Case 3: the primary txn reports kOK (the "run next step" path).
+    SingleRowTxn* ok_txn = new SingleRowTxn(NULL, "rowkey", NULL);
+    ok_txn->mutation_buffer_.SetError(ErrorCode::kOK, "");
+    gtxn_.finish_ = false;
+    gtxn_.status_returned_ = false;
+
+    // Buffer exactly one write (test_t / r1) before the call.
+    const std::string table_name = "test_t";
+    Table* table = OpenTable(table_name);
+    Cell cell(table, "r1", "cf", "qu", 1, "val");
+    Write pending_write(cell);
+    gtxn_.writes_.clear();
+    gtxn_.SaveWrite(table_name, "r1", pending_write);
+
+    // No acks or notifies are registered for this run.
+    gtxn_.acks_.clear();
+    gtxn_.notifies_.clear();
+    gtxn_.CheckPrimaryStatusAndCommmitSecondaries(ok_txn);
+
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    EXPECT_TRUE(gtxn_.status_returned_ == true);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK);
+}
+
+TEST_F(GlobalTxnTest, SaveWrite) {
+    // Saving writes for one (table, row) key must keep a single writes_ entry.
+    const std::string tablename = "test_t";
+    const std::string row_key = "r1";
+    Table* t = OpenTable(tablename);
+    gtxn_.writes_.clear();
+
+    // First write: the (tablename, row_key) key becomes findable in writes_.
+    Cell cell(t, row_key, "cf", "qu", 1, "val");
+    Write w(cell);
+    gtxn_.SaveWrite(tablename, row_key, w);
+    GlobalTxn::TableWithRowkey twr(tablename, row_key);
+    EXPECT_TRUE(gtxn_.writes_.find(twr) != gtxn_.writes_.end());
+
+    // Saving the identical write again does not add a second entry.
+    gtxn_.SaveWrite(tablename, row_key, w);
+    EXPECT_TRUE(gtxn_.writes_.size() == 1);
+
+    // A value-less Cell at the same coordinates (a delete-type write) reuses it.
+    Cell cell2(t, row_key, "cf", "qu", 1);
+    Write w2(cell2);
+    gtxn_.SaveWrite(tablename, row_key, w2);
+    EXPECT_TRUE(gtxn_.writes_.size() == 1);
+    delete t;
+}
+
+TEST_F(GlobalTxnTest, DoAckCallback) {
+    Table* first_table = OpenTable("test_t1");
+    Table* second_table = OpenTable("test_t5");
+
+    // Scenario 1: acks_cnt_ = 2 and notifies_cnt_ = 0. finish_ must stay false
+    // after the first ack callback and become true after the second.
+    RowMutation* first_ack = first_table->NewRowMutation("r1");
+    RowMutation* second_ack = second_table->NewRowMutation("r1");
+    gtxn_.finish_ = false;
+    gtxn_.acks_cnt_.Set(2);
+    gtxn_.ack_done_cnt_.Set(0);
+    gtxn_.notifies_cnt_.Set(0);
+    gtxn_.DoAckCallback(first_ack);
+    EXPECT_TRUE(gtxn_.finish_ == false);
+    gtxn_.DoAckCallback(second_ack);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+
+    // Scenario 2: acks_cnt_ = 2 but notifies_cnt_ = 1, so finish_ must remain
+    // false even after both ack callbacks have run.
+    RowMutation* third_ack = first_table->NewRowMutation("r1");
+    RowMutation* fourth_ack = second_table->NewRowMutation("r1");
+    gtxn_.finish_ = false;
+    gtxn_.acks_cnt_.Set(2);
+    gtxn_.ack_done_cnt_.Set(0);
+    gtxn_.notifies_cnt_.Set(1);
+    gtxn_.DoAckCallback(third_ack);
+    EXPECT_TRUE(gtxn_.finish_ == false);
+    gtxn_.DoAckCallback(fourth_ack);
+    EXPECT_TRUE(gtxn_.finish_ == false);
+
+    delete first_table;
+    delete second_table;
+}
+
+TEST_F(GlobalTxnTest, DoNotifyCallback) {
+    Table* first_table = OpenTable("test_t11");
+    Table* second_table = OpenTable("test_t55");
+    RowMutation* first_notify = first_table->NewRowMutation("r1");
+    RowMutation* second_notify = second_table->NewRowMutation("r1");
+
+    // notifies_cnt_ = 2 with all_task_pushed_ set: finish_ must stay false
+    // after the first notify callback and become true after the second.
+    gtxn_.all_task_pushed_ = true;
+    gtxn_.finish_ = false;
+    gtxn_.notifies_cnt_.Set(2);
+    gtxn_.notify_done_cnt_.Set(0);
+    gtxn_.DoNotifyCallback(first_notify);
+    EXPECT_TRUE(gtxn_.finish_ == false);
+    gtxn_.DoNotifyCallback(second_notify);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+    delete first_table;
+    delete second_table;
+}
+
+// Free-function shim around GlobalTxn::Notify so it can be a std::thread entry.
+void NotifyWarpper(GlobalTxn* gtxn, Table* t,
+                   const std::string& row_key,
+                   const std::string& column_family,
+                   const std::string& qualifier) {
+    gtxn->Notify(t, row_key, column_family, qualifier);
+}
+
+TEST_F(GlobalTxnTest, Notify) {
+    // Calls GlobalTxn::Notify from many threads at once and checks how the
+    // requests are grouped in notifies_ and counted in notifies_cnt_.
+    const size_t notify_thread_cnt = 30;
+    std::vector<std::thread> threads;
+    threads.reserve(notify_thread_cnt);  // clear() keeps capacity, one reserve suffices
+    // Case 1: NULL table -> nothing is recorded.
+    gtxn_.notifies_.clear();
+    gtxn_.notifies_cnt_.Set(0);
+    EXPECT_TRUE(0 == gtxn_.notifies_.size());
+    EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0);
+    Table* t0 = NULL;
+    for (size_t i = 0; i < notify_thread_cnt; ++i) {
+        threads.emplace_back(NotifyWarpper, &gtxn_, t0, "", "", "");
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    threads.clear();
+    EXPECT_TRUE(0 == gtxn_.notifies_.size());
+    EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0);
+
+    // Case 2: same table, same row -> one entry holding every notify.
+    gtxn_.notifies_.clear();
+    gtxn_.notifies_cnt_.Set(0);
+    EXPECT_TRUE(0 == gtxn_.notifies_.size());
+    EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0);
+    Table* t1 = OpenTable("t1");  // NOTE(review): never deleted in this test
+    for (size_t i = 0; i < notify_thread_cnt; ++i) {
+        threads.emplace_back(NotifyWarpper, &gtxn_, t1, "r1", "", "");
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    threads.clear();
+    EXPECT_TRUE(1 == gtxn_.notifies_.size());
+    EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 1);
+    GlobalTxn::TableWithRowkey twr("t1", "r1");
+    EXPECT_TRUE(gtxn_.notifies_[twr].size() == notify_thread_cnt);
+
+    // Case 3: same table, distinct rows -> one single-element entry per row.
+    gtxn_.notifies_.clear();
+    gtxn_.notifies_cnt_.Set(0);
+    EXPECT_TRUE(0 == gtxn_.notifies_.size());
+    EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0);
+    for (size_t i = 0; i < notify_thread_cnt; ++i) {
+        threads.emplace_back(NotifyWarpper, &gtxn_, t1, "r" + std::to_string(i), "", "");
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    threads.clear();
+    EXPECT_TRUE(notify_thread_cnt == gtxn_.notifies_.size());
+    EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == notify_thread_cnt);
+    for (size_t i = 0; i < notify_thread_cnt; ++i) {
+        GlobalTxn::TableWithRowkey twr1("t1", "r" + std::to_string(i));
+        EXPECT_TRUE(gtxn_.notifies_[twr1].size() == 1);
+    }
+}
+
+// Free-function shim around GlobalTxn::Ack, usable as a std::thread entry point.
+void AckWarpper(GlobalTxn* gtxn, Table* t, const std::string& row_key,
+                const std::string& column_family,
+                const std::string& qualifier) {
+    gtxn->Ack(t, row_key, column_family, qualifier);
+}
+
+TEST_F(GlobalTxnTest, Ack) {
+    // Calls GlobalTxn::Ack from many threads at once and checks how the
+    // requests are grouped in acks_ and counted in acks_cnt_.
+    const size_t ack_thread_cnt = 30;
+    std::vector<std::thread> threads;
+    threads.reserve(ack_thread_cnt);  // clear() keeps capacity, one reserve suffices
+    // Case 1: NULL table -> nothing is recorded.
+    gtxn_.acks_.clear();
+    gtxn_.acks_cnt_.Set(0);
+    EXPECT_TRUE(0 == gtxn_.acks_.size());
+    EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0);
+    Table* t0 = NULL;
+    for (size_t i = 0; i < ack_thread_cnt; ++i) {
+        threads.emplace_back(AckWarpper, &gtxn_, t0, "", "", "");
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    threads.clear();
+    EXPECT_TRUE(0 == gtxn_.acks_.size());
+    EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0);
+
+    // Case 2: same table, same row -> one entry holding every ack.
+    gtxn_.acks_.clear();
+    gtxn_.acks_cnt_.Set(0);
+    EXPECT_TRUE(0 == gtxn_.acks_.size());
+    EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0);
+    Table* t1 = OpenTable("t1");  // NOTE(review): never deleted in this test
+    for (size_t i = 0; i < ack_thread_cnt; ++i) {
+        threads.emplace_back(AckWarpper, &gtxn_, t1, "r1", "", "");
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    threads.clear();
+    EXPECT_TRUE(1 == gtxn_.acks_.size());
+    EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 1);
+    GlobalTxn::TableWithRowkey twr("t1", "r1");
+    EXPECT_TRUE(gtxn_.acks_[twr].size() == ack_thread_cnt);
+
+    // Case 3: same table, distinct rows -> one single-element entry per row.
+    gtxn_.acks_.clear();
+    gtxn_.acks_cnt_.Set(0);
+    EXPECT_TRUE(0 == gtxn_.acks_.size());
+    EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0);
+    for (size_t i = 0; i < ack_thread_cnt; ++i) {
+        threads.emplace_back(AckWarpper, &gtxn_, t1, "r" + std::to_string(i), "", "");
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    threads.clear();
+    EXPECT_TRUE(ack_thread_cnt == gtxn_.acks_.size());
+    EXPECT_TRUE(gtxn_.acks_cnt_.Get() == ack_thread_cnt);
+    for (size_t i = 0; i < ack_thread_cnt; ++i) {
+        GlobalTxn::TableWithRowkey twr1("t1", "r" + std::to_string(i));
+        EXPECT_TRUE(gtxn_.acks_[twr1].size() == 1);
+    }
+}
+
+TEST_F(GlobalTxnTest, DoCommitSecondariesCallback0) {
+    // Every secondary mutation reports kOK and writes_cnt_ equals the number
+    // of callbacks, with zero acks/notifies: finish_ is expected to be set.
+    const size_t secondaries_thread_cnt = 10;
+    gtxn_.all_task_pushed_ = true;
+    gtxn_.status_.SetFailed(ErrorCode::kOK);
+    gtxn_.acks_cnt_.Set(0);
+    gtxn_.ack_done_cnt_.Set(0);
+    gtxn_.notifies_cnt_.Set(0);
+    gtxn_.notify_done_cnt_.Set(0);
+    gtxn_.writes_cnt_.Set(secondaries_thread_cnt);
+    std::vector<std::thread> threads;
+    threads.reserve(secondaries_thread_cnt);
+    for (size_t i = 0; i < secondaries_thread_cnt; ++i) {
+        RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey");
+        mu_impl->error_code_.SetFailed(ErrorCode::kOK, "");
+        threads.emplace_back(&GlobalTxn::DoCommitSecondariesCallback, &gtxn_,
+                             static_cast<RowMutation*>(mu_impl));
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    EXPECT_TRUE(gtxn_.finish_ == true);
+}
+
+TEST_F(GlobalTxnTest, DoCommitSecondariesCallback1) {
+    // writes_cnt_ is one more than the number of kOK callbacks delivered, so
+    // the last secondary has not come back and finish_ must stay false.
+    const size_t secondaries_thread_cnt = 50;
+    gtxn_.status_.SetFailed(ErrorCode::kOK);
+    gtxn_.acks_cnt_.Set(0);
+    gtxn_.ack_done_cnt_.Set(0);
+    gtxn_.notifies_cnt_.Set(0);
+    gtxn_.notify_done_cnt_.Set(0);
+    gtxn_.writes_cnt_.Set(secondaries_thread_cnt + 1);
+
+    std::vector<std::thread> threads;
+    threads.reserve(secondaries_thread_cnt);
+    for (size_t i = 0; i < secondaries_thread_cnt; ++i) {
+        RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey");
+        mu_impl->error_code_.SetFailed(ErrorCode::kOK, "");
+        threads.emplace_back(&GlobalTxn::DoCommitSecondariesCallback, &gtxn_,
+                             static_cast<RowMutation*>(mu_impl));
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    EXPECT_TRUE(gtxn_.finish_ == false);
+}
+
+TEST_F(GlobalTxnTest, DoCommitSecondariesCallback2) {
+    // Every secondary mutation reports kSystem. The test expects status_ to
+    // stay kOK regardless, and finish_ to be set once all have come back.
+    const size_t secondaries_thread_cnt = 10;
+    gtxn_.all_task_pushed_ = true;
+    gtxn_.status_.SetFailed(ErrorCode::kOK);
+    gtxn_.acks_cnt_.Set(0);
+    gtxn_.ack_done_cnt_.Set(0);
+    gtxn_.notifies_cnt_.Set(0);
+    gtxn_.notify_done_cnt_.Set(0);
+    gtxn_.writes_cnt_.Set(secondaries_thread_cnt);
+
+    std::vector<std::thread> threads;
+    threads.reserve(secondaries_thread_cnt);
+    for (size_t i = 0; i < secondaries_thread_cnt; ++i) {
+        RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey");
+        mu_impl->error_code_.SetFailed(ErrorCode::kSystem, "");
+        threads.emplace_back(&GlobalTxn::DoCommitSecondariesCallback, &gtxn_,
+                             static_cast<RowMutation*>(mu_impl));
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK);
+    EXPECT_TRUE(gtxn_.finish_ == true);
+}
+
+TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback3) {
+    // NOTE: despite its name this test drives DoCommitSecondariesCallback.
+    // All secondary mutations report kOK, but ack_done_cnt_ (9) is below
+    // acks_cnt_ (10): status_ must stay kOK and finish_ must stay false.
+    const size_t secondaries_thread_cnt = 30;
+    gtxn_.status_.SetFailed(ErrorCode::kOK);
+    gtxn_.acks_cnt_.Set(10);
+    gtxn_.ack_done_cnt_.Set(9);
+    gtxn_.notifies_cnt_.Set(10);
+    gtxn_.notify_done_cnt_.Set(10);
+    gtxn_.writes_cnt_.Set(secondaries_thread_cnt);
+
+    std::vector<std::thread> threads;
+    threads.reserve(secondaries_thread_cnt);
+    for (size_t i = 0; i < secondaries_thread_cnt; ++i) {
+        RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey");
+        mu_impl->error_code_.SetFailed(ErrorCode::kOK, "");
+        threads.emplace_back(&GlobalTxn::DoCommitSecondariesCallback, &gtxn_,
+                             static_cast<RowMutation*>(mu_impl));
+    }
+    for (auto& worker : threads) {
+        worker.join();
+    }
+    threads.clear();
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK);
+    EXPECT_TRUE(gtxn_.finish_ == false);
+}
+
+// Counts EmptyMutationCallback runs; the tests using it zero it first.
+std::atomic<int> g_callback_run_cnt(0);
+static void EmptyMutationCallback(RowMutation* mu) {  // logs, then counts the call
+    LOG(INFO) << "run empty mutation callback";
+    g_callback_run_cnt.fetch_add(1);
+}
+
+// ApplyMutation with has_commited_ == true and status_returned_ == false, on
+// a mutation that carries a callback: the callback is expected to run once.
+TEST_F(GlobalTxnTest, ApplyMutation0) {
+    g_callback_run_cnt = 0;
+    gtxn_.status_returned_ = false;
+    gtxn_.has_commited_ = true;
+
+    RowMutation* mu = static_cast<RowMutation*>(new RowMutationImpl(NULL, "rowkey"));
+    mu->SetCallBack(EmptyMutationCallback);
+    gtxn_.ApplyMutation(mu);
+    thread_pool_.Stop(true);  // presumably drains queued tasks before the checks - confirm
+    EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(gtxn_.status_returned_ == true);
+    EXPECT_TRUE(gtxn_.put_fail_cnt_.Get() == 0);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(g_callback_run_cnt == 1);
+}
+
+// ApplyMutation with has_commited_ == true and status_returned_ == false, on
+// a mutation without a callback: g_callback_run_cnt is expected to stay 0.
+TEST_F(GlobalTxnTest, ApplyMutation1) {
+    g_callback_run_cnt = 0;
+    gtxn_.status_returned_ = false;
+    gtxn_.has_commited_ = true;
+
+    RowMutation* mu = static_cast<RowMutation*>(new RowMutationImpl(NULL, "rowkey"));
+    gtxn_.ApplyMutation(mu);
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(gtxn_.status_returned_ == true);
+    EXPECT_TRUE(gtxn_.put_fail_cnt_.Get() == 0);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(g_callback_run_cnt == 0);
+}
+
+TEST_F(GlobalTxnTest, SetReaderStatusAndRunCallback0) {
+    // No callback on the reader: it must end up finished with the given error.
+    ErrorCode system_err;
+    system_err.SetFailed(ErrorCode::kSystem, "");
+    RowReader* reader = new RowReaderImpl(NULL, "rowkey");
+    gtxn_.SetReaderStatusAndRunCallback(static_cast<RowReaderImpl*>(reader), &system_err);
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(reader->GetError().GetType() == ErrorCode::kSystem);
+    EXPECT_TRUE(reader->IsFinished());
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, SetReaderStatusAndRunCallback1) {
+    ErrorCode system_err;
+    system_err.SetFailed(ErrorCode::kSystem, "");
+    RowReaderImpl* reader_impl = new RowReaderImpl(NULL, "rowkey");
+    reader_impl->SetCallBack([](RowReader* finished) {  // checks the error it is handed
+        EXPECT_TRUE(finished->GetError().GetType() == ErrorCode::kSystem);
+        delete finished;  // the callback frees the reader
+    });
+    gtxn_.SetReaderStatusAndRunCallback(reader_impl, &system_err);
+    thread_pool_.Stop(true);
+}
+
+TEST_F(GlobalTxnTest, Get0) {
+    // Get() with has_commited_ already true must report kGTxnOpAfterCommit.
+    RowReader* reader = new RowReaderImpl(NULL, "rowkey");
+    gtxn_.has_commited_ = true;
+    EXPECT_TRUE(gtxn_.Get(reader).GetType() == ErrorCode::kGTxnOpAfterCommit);
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(reader->GetError().GetType() == ErrorCode::kGTxnOpAfterCommit);
+    EXPECT_TRUE(reader->IsFinished());
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, Get1) {
+    // A reader that fails VerifyUserRowReader must make Get() return kBadParam.
+    Table* table = OpenTable("t1");
+
+    // Schema for "t1": txn enabled, cf1 flagged for global transactions.
+    TableDescriptor table_desc("t1");
+    table_desc.EnableTxn();
+    table_desc.AddLocalityGroup("lg0");
+    table_desc.AddColumnFamily("cf1")->EnableGlobalTransaction();
+    TableSchema table_schema;
+    TableDescToSchema(table_desc, &table_schema);
+    SetSchema(table, table_schema);
+
+    // CheckTable must accept the table (original note: sets it into tables_).
+    ErrorCode status;
+    EXPECT_TRUE(gtxn_.gtxn_internal_->CheckTable(table, &status));
+
+    // A reader created with only a row key is expected to fail verification.
+    RowReader* reader = table->NewRowReader("r1");
+    EXPECT_FALSE(gtxn_.gtxn_internal_->VerifyUserRowReader(reader));
+
+    gtxn_.has_commited_ = false;
+    EXPECT_TRUE(gtxn_.Get(reader).GetType() == ErrorCode::kBadParam);
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(reader->GetError().GetType() == ErrorCode::kBadParam);
+    EXPECT_TRUE(reader->IsFinished());
+    delete reader;
+    delete table;
+}
+
+TEST_F(GlobalTxnTest, DoGetCellReaderCallback0) {
+    // Context constructed with 2 and holding two cells; the cf1 cell reader
+    // reports kNotFound. Only not_found_cnt moves; the user reader stays open.
+    Table* table = OpenTable("t1");
+    RowReader* user_reader = table->NewRowReader("r1");
+    RowReaderImpl* user_reader_impl = static_cast<RowReaderImpl*>(user_reader);
+    InternalReaderContext* ctx = new InternalReaderContext(2, user_reader_impl, &gtxn_);
+    user_reader->SetContext(ctx);
+    Cell* cf1_cell = new Cell(table, "r1", "cf1", "qu");
+    Cell* cf2_cell = new Cell(table, "r1", "cf2", "qu");
+    ctx->cell_map[cf1_cell] = 0;
+    ctx->cell_map[cf2_cell] = 0;
+    // Per-cell reader for cf1, preloaded with the error under test.
+    RowReader* cell_reader = table->NewRowReader("r1");
+    cell_reader->SetContext(new CellReaderContext(cf1_cell, ctx));
+    static_cast<RowReaderImpl*>(cell_reader)->error_code_.SetFailed(ErrorCode::kNotFound, "");
+    gtxn_.DoGetCellReaderCallback(cell_reader);
+    EXPECT_TRUE(ctx->not_found_cnt == 1);
+    EXPECT_TRUE(ctx->fail_cell_cnt == 0);
+    EXPECT_TRUE(ctx->active_cell_cnt == 1);
+    thread_pool_.Stop(true);
+    EXPECT_FALSE(user_reader_impl->IsFinished());
+}
+
+TEST_F(GlobalTxnTest, DoGetCellReaderCallback1) {
+    // Context constructed with 2 and holding two cells; the cf1 cell reader
+    // reports kOK. The test still expects not_found_cnt == 1 and no failure.
+    Table* table = OpenTable("t1");
+    RowReader* user_reader = table->NewRowReader("r1");
+    RowReaderImpl* user_reader_impl = static_cast<RowReaderImpl*>(user_reader);
+    InternalReaderContext* ctx = new InternalReaderContext(2, user_reader_impl, &gtxn_);
+    user_reader->SetContext(ctx);
+    Cell* cf1_cell = new Cell(table, "r1", "cf1", "qu");
+    Cell* cf2_cell = new Cell(table, "r1", "cf2", "qu");
+    ctx->cell_map[cf1_cell] = 0;
+    ctx->cell_map[cf2_cell] = 0;
+    // Per-cell reader for cf1; presumably it carries no value - confirm.
+    RowReader* cell_reader = table->NewRowReader("r1");
+    cell_reader->SetContext(new CellReaderContext(cf1_cell, ctx));
+    static_cast<RowReaderImpl*>(cell_reader)->error_code_.SetFailed(ErrorCode::kOK, "");
+    gtxn_.DoGetCellReaderCallback(cell_reader);
+    EXPECT_TRUE(ctx->fail_cell_cnt == 0);
+    EXPECT_TRUE(ctx->not_found_cnt == 1);
+    EXPECT_TRUE(ctx->active_cell_cnt == 1);
+    thread_pool_.Stop(true);
+    EXPECT_FALSE(user_reader_impl->IsFinished());
+}
+
+TEST_F(GlobalTxnTest, DoGetCellReaderCallback2) {
+    // Context constructed with 2 and holding two cells; the cf1 cell reader
+    // reports kSystem. Only fail_cell_cnt moves; the user reader stays open.
+    Table* table = OpenTable("t1");
+    RowReader* user_reader = table->NewRowReader("r1");
+    RowReaderImpl* user_reader_impl = static_cast<RowReaderImpl*>(user_reader);
+    InternalReaderContext* ctx = new InternalReaderContext(2, user_reader_impl, &gtxn_);
+    user_reader->SetContext(ctx);
+    Cell* cf1_cell = new Cell(table, "r1", "cf1", "qu");
+    Cell* cf2_cell = new Cell(table, "r1", "cf2", "qu");
+    ctx->cell_map[cf1_cell] = 0;
+    ctx->cell_map[cf2_cell] = 0;
+    // Per-cell reader for cf1, preloaded with the error under test.
+    RowReader* cell_reader = table->NewRowReader("r1");
+    cell_reader->SetContext(new CellReaderContext(cf1_cell, ctx));
+    static_cast<RowReaderImpl*>(cell_reader)->error_code_.SetFailed(ErrorCode::kSystem, "");
+    gtxn_.DoGetCellReaderCallback(cell_reader);
+    EXPECT_TRUE(ctx->fail_cell_cnt == 1);
+    EXPECT_TRUE(ctx->not_found_cnt == 0);
+    EXPECT_TRUE(ctx->active_cell_cnt == 1);
+    thread_pool_.Stop(true);
+    EXPECT_FALSE(user_reader_impl->IsFinished());
+}
+
+TEST_F(GlobalTxnTest, DoGetCellReaderCallback3) {
+    // Context constructed with 1 and a single cell: once that cell's reader
+    // reports kSystem the user reader is expected to be finished.
+    Table* table = OpenTable("t1");
+    RowReader* user_reader = table->NewRowReader("r1");
+    RowReaderImpl* user_reader_impl = static_cast<RowReaderImpl*>(user_reader);
+    InternalReaderContext* ctx = new InternalReaderContext(1, user_reader_impl, &gtxn_);
+    user_reader->SetContext(ctx);
+    Cell* only_cell = new Cell(table, "r1", "cf1", "qu");
+    ctx->cell_map[only_cell] = 0;
+
+    // Per-cell reader, preloaded with the error under test.
+    RowReader* cell_reader = table->NewRowReader("r1");
+    cell_reader->SetContext(new CellReaderContext(only_cell, ctx));
+    static_cast<RowReaderImpl*>(cell_reader)->error_code_.SetFailed(ErrorCode::kSystem, "");
+    gtxn_.DoGetCellReaderCallback(cell_reader);
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(user_reader_impl->IsFinished());
+}
+
+TEST_F(GlobalTxnTest, MergeCellToRow) {
+    // MergeCellToRow with a kSystem status, for the only cell of a context
+    // constructed with 1, is expected to finish the user reader.
+    Table* table = OpenTable("t1");
+    RowReader* user_reader = table->NewRowReader("r1");
+    RowReaderImpl* user_reader_impl = static_cast<RowReaderImpl*>(user_reader);
+    InternalReaderContext* ctx = new InternalReaderContext(1, user_reader_impl, &gtxn_);
+    user_reader->SetContext(ctx);
+    Cell* only_cell = new Cell(table, "r1", "cf1", "qu");
+    ctx->cell_map[only_cell] = 0;
+
+    RowReader* cell_reader = table->NewRowReader("r1");
+    cell_reader->SetContext(new CellReaderContext(only_cell, ctx));
+    ErrorCode system_err;
+    system_err.SetFailed(ErrorCode::kSystem, "");
+    gtxn_.MergeCellToRow(cell_reader, system_err);
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(user_reader_impl->IsFinished());
+}
+
+TEST_F(GlobalTxnTest, GetCellCallback) {
+    // GetCellCallback on the only cell's context, with the cell reader set to
+    // kSystem, is expected to finish the user reader.
+    Table* table = OpenTable("t1");
+    RowReader* user_reader = table->NewRowReader("r1");
+    RowReaderImpl* user_reader_impl = static_cast<RowReaderImpl*>(user_reader);
+    InternalReaderContext* ctx = new InternalReaderContext(1, user_reader_impl, &gtxn_);
+    user_reader->SetContext(ctx);
+    Cell* only_cell = new Cell(table, "r1", "cf1", "qu");
+    ctx->cell_map[only_cell] = 0;
+
+    RowReader* cell_reader = table->NewRowReader("r1");
+    cell_reader->SetContext(new CellReaderContext(only_cell, ctx));
+    static_cast<RowReaderImpl*>(cell_reader)->error_code_.SetFailed(ErrorCode::kSystem, "");
+    // The CellReaderContext is fetched back from the reader it was attached to.
+    gtxn_.GetCellCallback((CellReaderContext*)cell_reader->GetContext());
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(user_reader_impl->IsFinished());
+}
+
+TEST_F(GlobalTxnTest, RollForward) {
+    // The mock table is given a kNotFound reader error (the primary write
+    // cell cannot be found); RollForward must report kGTxnPrimaryLost.
+    Table* table = OpenTable("t1");
+    Cell cell(table, "r1", "cf1", "qu");
+    gtxn_.gtxn_internal_->tables_["t1"] =
+        std::pair<Table*, std::set<std::string>>(table, std::set<std::string>());
+
+    tera::PrimaryInfo primary;
+    primary.set_table_name("t1");
+    primary.set_row_key("r1");
+    primary.set_column_family("cf1");
+    primary.set_qualifier("qu");
+    primary.set_gtxn_start_ts(12);
+
+    ErrorCode not_found;
+    not_found.SetFailed(ErrorCode::kNotFound,"");
+    std::vector<ErrorCode> reader_errs(1, not_found);
+    (static_cast<MockTable*>(table))->AddReaderErrors(reader_errs);
+    ErrorCode status;
+    gtxn_.RollForward(cell, primary, 0, &status);
+    EXPECT_TRUE(ErrorCode::kGTxnPrimaryLost == status.GetType());
+}
+
+TEST_F(GlobalTxnTest, CleanLock0) {
+    // The cell matches the primary (same table/row/cf/qualifier). The single
+    // mocked mutation error, kSystem, is expected to come back in `status`.
+    Table* table = OpenTable("t1");
+    Cell cell(table, "r1", "cf1", "qu");
+    gtxn_.gtxn_internal_->tables_["t1"] =
+        std::pair<Table*, std::set<std::string>>(table, std::set<std::string>());
+
+    tera::PrimaryInfo primary;
+    primary.set_table_name("t1");
+    primary.set_row_key("r1");
+    primary.set_column_family("cf1");
+    primary.set_qualifier("qu");
+    primary.set_gtxn_start_ts(12);
+
+    ErrorCode mutation_err;
+    mutation_err.SetFailed(ErrorCode::kSystem,"");
+    std::vector<ErrorCode> mu_errs(1, mutation_err);
+    (static_cast<MockTable*>(table))->AddMutationErrors(mu_errs);
+
+    // status starts out as kOK.
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    gtxn_.CleanLock(cell, primary, &status);
+    EXPECT_TRUE(mutation_err.GetType() == status.GetType());
+}
+
+TEST_F(GlobalTxnTest, CleanLock1) {
+    // Cell (row r1) differs from the primary (row r2). Mock mutation errors
+    // are {kSystem for the primary, kOK for the cell}; kSystem must win.
+    Table* table = OpenTable("t1");
+    Cell cell(table, "r1", "cf1", "qu");
+    gtxn_.gtxn_internal_->tables_["t1"] =
+        std::pair<Table*, std::set<std::string>>(table, std::set<std::string>());
+
+    tera::PrimaryInfo primary;
+    primary.set_table_name("t1");
+    primary.set_row_key("r2"); // row differs from the cell's
+    primary.set_column_family("cf1");
+    primary.set_qualifier("qu");
+    primary.set_gtxn_start_ts(12);
+
+    ErrorCode first_err;
+    ErrorCode second_err;
+    first_err.SetFailed(ErrorCode::kSystem,"");
+    second_err.SetFailed(ErrorCode::kOK,"");
+    std::vector<ErrorCode> mu_errs;
+    mu_errs.push_back(first_err);
+    mu_errs.push_back(second_err);
+    (static_cast<MockTable*>(table))->AddMutationErrors(mu_errs);
+
+    // status starts out as kOK.
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    gtxn_.CleanLock(cell, primary, &status);
+    EXPECT_TRUE(first_err.GetType() == status.GetType());
+    EXPECT_TRUE(second_err.GetType() != status.GetType());
+}
+
+TEST_F(GlobalTxnTest, CleanLock2) {
+    // Cell (row r1) differs from the primary (row r2). Mock mutation errors
+    // are {kOK for the primary, kSystem for the cell}; kSystem must win.
+    Table* table = OpenTable("t1");
+    Cell cell(table, "r1", "cf1", "qu");
+    gtxn_.gtxn_internal_->tables_["t1"] =
+        std::pair<Table*, std::set<std::string>>(table, std::set<std::string>());
+
+    tera::PrimaryInfo primary;
+    primary.set_table_name("t1");
+    primary.set_row_key("r2"); // row differs from the cell's
+    primary.set_column_family("cf1");
+    primary.set_qualifier("qu");
+    primary.set_gtxn_start_ts(12);
+
+    ErrorCode first_err;
+    ErrorCode second_err;
+    first_err.SetFailed(ErrorCode::kOK,"");
+    second_err.SetFailed(ErrorCode::kSystem,"");
+    std::vector<ErrorCode> mu_errs;
+    mu_errs.push_back(first_err);
+    mu_errs.push_back(second_err);
+    (static_cast<MockTable*>(table))->AddMutationErrors(mu_errs);
+
+    // status starts out as kOK.
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    gtxn_.CleanLock(cell, primary, &status);
+    EXPECT_TRUE(first_err.GetType() != status.GetType());
+    EXPECT_TRUE(second_err.GetType() == status.GetType());
+}
+
+TEST_F(GlobalTxnTest, CleanLock3) {
+    // Cell (row r1) differs from the primary (row r2). Mock mutation errors
+    // are {kTimeout, kSystem}; the later one (kSystem) is expected in status.
+    Table* table = OpenTable("t1");
+    Cell cell(table, "r1", "cf1", "qu");
+    gtxn_.gtxn_internal_->tables_["t1"] =
+        std::pair<Table*, std::set<std::string>>(table, std::set<std::string>());
+
+    tera::PrimaryInfo primary;
+    primary.set_table_name("t1");
+    primary.set_row_key("r2"); // row differs from the cell's
+    primary.set_column_family("cf1");
+    primary.set_qualifier("qu");
+    primary.set_gtxn_start_ts(12);
+
+    ErrorCode first_err;
+    ErrorCode second_err;
+    first_err.SetFailed(ErrorCode::kTimeout,"");
+    second_err.SetFailed(ErrorCode::kSystem,"");
+    std::vector<ErrorCode> mu_errs;
+    mu_errs.push_back(first_err);
+    mu_errs.push_back(second_err);
+    (static_cast<MockTable*>(table))->AddMutationErrors(mu_errs);
+
+    // status starts out as kOK.
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    gtxn_.CleanLock(cell, primary, &status);
+    EXPECT_TRUE(first_err.GetType() != status.GetType());
+    EXPECT_TRUE(second_err.GetType() == status.GetType());
+}
+
+// Test helper: appends one cell (key, cf, qu, timestamp, value) to *result.
+void AddKeyValueToResult(const std::string& key, const std::string& cf, const std::string& qu,
+        int64_t timestamp, const std::string& value, RowResult* result) {
+    KeyValuePair& entry = *result->add_key_values();
+    entry.set_key(key);
+    entry.set_column_family(cf);
+    entry.set_qualifier(qu);
+    entry.set_timestamp(timestamp);
+    entry.set_value(value);
+}
+
+TEST_F(GlobalTxnTest, EncodeWriteValue) {
+    // round trip: DecodeWriteValue must return what EncodeWriteValue packed
+    int decoded_type;
+    int64_t decoded_ts;
+    std::string encoded = EncodeWriteValue(1, 100);
+    DecodeWriteValue(encoded, &decoded_type, &decoded_ts);
+    EXPECT_TRUE(1 == decoded_type);
+    EXPECT_TRUE(100 == decoded_ts);
+}
+
+TEST_F(GlobalTxnTest, DecodeWriteValue) {
+    // same round trip, with a type value (99) bigger than any mutation type
+    int decoded_type;
+    int64_t decoded_ts;
+    std::string encoded = EncodeWriteValue(99, 1000000);
+    DecodeWriteValue(encoded, &decoded_type, &decoded_ts);
+
+    EXPECT_TRUE(99 == decoded_type);
+    EXPECT_TRUE(1000000 == decoded_ts);
+}
+
+TEST_F(GlobalTxnTest, FindValueFromResultRow0) {
+    // the success case: with start_ts = 14 the lookup must yield ts=9 / "v1"
+    Table* table = OpenTable("t1");
+    RowReader* reader = table->NewRowReader("r1");
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(reader);
+    gtxn_.start_ts_ = 14;
+
+    // build RowReader::TRow (cf must exist before call FindValueFromResultRow):
+    // two data versions of cf1:qu1 ...
+    RowResult result;
+    AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result);
+    AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result);
+    // ... and two entries in the "qu1_W_" column
+    AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result);
+    AddKeyValueToResult("r1", "cf1", "qu1_W_", 12, EncodeWriteValue(0, 9), &result);
+    reader_impl->SetResult(result);
+    RowReader::TRow row;
+    reader->ToMap(&row);
+
+    // dump the row map (cf -> qualifier -> ts/value) to stdout
+    for (auto cf_it = row.begin(); cf_it != row.end(); ++cf_it) {
+        std::cout << cf_it->first << "\n";
+        for (auto qu_it = cf_it->second.begin(); qu_it != cf_it->second.end(); ++qu_it) {
+            std::cout << "\t" << qu_it->first << "\n";
+            for (auto v_it = qu_it->second.begin(); v_it != qu_it->second.end(); ++v_it) {
+                std::cout << "\t\tts=" << v_it->first << ",v=" << v_it->second << "\n";
+            }
+        }
+    }
+
+    // build target_cell and run test
+    Cell target_cell(table, "r1", "cf1", "qu1");
+    EXPECT_TRUE(gtxn_.FindValueFromResultRow(row, &target_cell));
+    EXPECT_TRUE(9 == target_cell.Timestamp());
+    EXPECT_TRUE("v1" == target_cell.Value());
+
+    // release the table and the reader
+    delete table;
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, FindValueFromResultRow1) {
+    // the not found case: the RowResult handed to the reader is empty,
+    // so the lookup is expected to report failure
+    Table* table = OpenTable("t1");
+    RowReader* reader = table->NewRowReader("r1");
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(reader);
+    gtxn_.start_ts_ = 11;
+
+    // build RowReader::TRow from the empty result
+    RowResult empty_result;
+    reader_impl->SetResult(empty_result);
+    RowReader::TRow row;
+    reader->ToMap(&row);
+
+    // build target_cell and run test
+    Cell target_cell(table, "r1", "cf1", "qu1");
+    const bool found = gtxn_.FindValueFromResultRow(row, &target_cell);
+    EXPECT_FALSE(found);
+
+    // release the table and the reader
+    delete table;
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, FindValueFromResultRow2) {
+    // the write column is not found: only data versions of cf1:qu1 are
+    // present, there is no "qu1_W_" entry at all
+    Table* table = OpenTable("t1");
+    RowReader* reader = table->NewRowReader("r1");
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(reader);
+    gtxn_.start_ts_ = 11;
+
+    // build RowReader::TRow; cf must exist before call FindValueFromResultRow
+    RowResult result;
+    AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result);
+    AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result);
+    reader_impl->SetResult(result);
+    RowReader::TRow row;
+    reader->ToMap(&row);
+
+    // build target_cell
+    Cell target_cell(table, "r1", "cf1", "qu1");
+
+    // run test
+    const bool found = gtxn_.FindValueFromResultRow(row, &target_cell);
+    EXPECT_FALSE(found);
+
+    // release the table and the reader
+    delete table;
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, FindValueFromResultRow3) {
+    // the right version is not found: the "qu1_W_" entry at ts 12 is
+    // EncodeWriteValue(1, 9), making ts = 9 (v1) deleted before the lookup
+    Table* table = OpenTable("t1");
+    RowReader* reader = table->NewRowReader("r1");
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(reader);
+    gtxn_.start_ts_ = 11;
+
+    // build RowReader::TRow; cf must exist before call FindValueFromResultRow
+    RowResult result;
+    AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result);
+    AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result);
+    AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result);
+    AddKeyValueToResult("r1", "cf1", "qu1_W_", 12, EncodeWriteValue(1, 9), &result);
+    reader_impl->SetResult(result);
+    RowReader::TRow row;
+    reader->ToMap(&row);
+
+    // build target_cell
+    Cell target_cell(table, "r1", "cf1", "qu1");
+
+    // run test
+    const bool found = gtxn_.FindValueFromResultRow(row, &target_cell);
+    EXPECT_FALSE(found);
+
+    // release the table and the reader
+    delete table;
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, FindValueFromResultRow4) {
+    // the right version is not found: only one "qu1_W_" entry (ts 15) is left,
+    // maybe the older ones were cleaned by gc before the lookup
+    Table* table = OpenTable("t1");
+    RowReader* reader = table->NewRowReader("r1");
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(reader);
+    gtxn_.start_ts_ = 11;
+
+    // build RowReader::TRow; cf must exist before call FindValueFromResultRow
+    RowResult result;
+    AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result);
+    AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result);
+    AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result);
+    reader_impl->SetResult(result);
+    RowReader::TRow row;
+    reader->ToMap(&row);
+
+    // build target_cell
+    Cell target_cell(table, "r1", "cf1", "qu1");
+
+    // run test
+    const bool found = gtxn_.FindValueFromResultRow(row, &target_cell);
+    EXPECT_FALSE(found);
+
+    // release the table and the reader
+    delete table;
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, FindValueFromResultRow5) {
+    // the right version is not found: the "qu1_W_" entry at ts 12 refers to
+    // ts 9, but that data version is absent (maybe cleaned by gc)
+    Table* table = OpenTable("t1");
+    RowReader* reader = table->NewRowReader("r1");
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(reader);
+    gtxn_.start_ts_ = 11;
+
+    // build RowReader::TRow; cf must exist before call FindValueFromResultRow
+    RowResult result;
+    AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result);
+    AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result);
+    AddKeyValueToResult("r1", "cf1", "qu1_W_", 12, EncodeWriteValue(0, 9), &result);
+    reader_impl->SetResult(result);
+    RowReader::TRow row;
+    reader->ToMap(&row);
+
+    // build target_cell
+    Cell target_cell(table, "r1", "cf1", "qu1");
+
+    // run test
+    const bool found = gtxn_.FindValueFromResultRow(row, &target_cell);
+    EXPECT_FALSE(found);
+
+    // release the table and the reader
+    delete table;
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, SetLastStatus) {
+    ErrorCode last;
+    last.SetFailed(ErrorCode::kOK, "");
+    gtxn_.status_returned_ = false;  // nothing returned yet: the passed status is taken over
+    gtxn_.SetLastStatus(&last);
+    EXPECT_TRUE(gtxn_.status_returned_);
+    EXPECT_TRUE(last.GetType() == gtxn_.status_.GetType());
+    // a status was already returned: the kOK recorded above must be kept
+    last.SetFailed(ErrorCode::kTimeout, "");
+    gtxn_.status_returned_ = true;
+    gtxn_.SetLastStatus(&last);
+    EXPECT_TRUE(gtxn_.status_returned_);
+    EXPECT_TRUE(ErrorCode::kOK == gtxn_.status_.GetType());
+}
+
+static bool g_callback_run_flag = false;  // set by the callbacks of the RunUserCallback tests
+
+TEST_F(GlobalTxnTest, RunUserCallback0) {
+    g_callback_run_flag = false;  // reset, the flag is shared between tests
+    gtxn_.SetCommitCallback([](Transaction*) { g_callback_run_flag = true; });
+    gtxn_.RunUserCallback();  // expected to invoke the commit callback set above
+    EXPECT_TRUE(g_callback_run_flag);
+}
+
+static void WaitWapper(GlobalTxn* gtxn) {  // task body queued on the thread pool by RunUserCallback1
+    gtxn->WaitForComplete();  // presumably blocks until the transaction completes -- TODO confirm
+    g_callback_run_flag = true;  // tells the test that the wait has returned
+}
+
+TEST_F(GlobalTxnTest, RunUserCallback1) {
+    g_callback_run_flag = false;
+    thread_pool_.AddTask([this]() { WaitWapper(&gtxn_); });  // waiter runs on the pool
+    gtxn_.RunUserCallback();  // no commit callback is set in this test
+    EXPECT_TRUE(gtxn_.finish_);
+    thread_pool_.Stop(true);
+    EXPECT_TRUE(g_callback_run_flag);  // the waiter got past WaitForComplete()
+}
+
+TEST_F(GlobalTxnTest, BackoffAndMaybeCleanupLock0) {
+    // the lock ts (9) is smaller than start_ts (11) and the lock value is bad
+    // for parse as primary info, so the primary can't be found
+    Table* table = OpenTable("t1");
+    RowReader* reader = table->NewRowReader("r1");
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(reader);
+    gtxn_.start_ts_ = 11;
+
+    // build RowReader::TRow with a single "qu1_L_" entry
+    RowResult result;
+    AddKeyValueToResult("r1", "cf1", "qu1_L_", 9, "primary info", &result);
+    reader_impl->SetResult(result);
+    RowReader::TRow row;
+    reader->ToMap(&row);
+
+    // build target_cell
+    Cell target_cell(table, "r1", "cf1", "qu1");
+
+    // run test
+    bool try_clean = false;
+    ErrorCode status;
+    gtxn_.BackoffAndMaybeCleanupLock(row, target_cell, try_clean, &status);
+    EXPECT_TRUE(ErrorCode::kGTxnPrimaryLost == status.GetType());
+
+    // release the table and the reader
+    delete table;
+    delete reader;
+}
+
+TEST_F(GlobalTxnTest, RunAfterPrewriteFailed0) {
+    Table* t = OpenTable("t1");
+    Cell cell(t, "r1", "cf", "qu", 1, "val");
+    Write w(cell);
+    std::vector<Write> ws(1, w);  // the single write handed to the context
+    PrewriteContext* ctx = new PrewriteContext(&ws, &gtxn_, w.TableName(), w.RowKey());
+    ctx->status.SetFailed(ErrorCode::kOK, "");  // a context status of kOK ...
+    gtxn_.RunAfterPrewriteFailed(ctx);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK);  // ... leaves the txn status kOK
+    delete t;  // free the table, as RunAfterPrewriteFailed2 does
+}
+
+TEST_F(GlobalTxnTest, RunAfterPrewriteFailed1) {
+    Table* t = OpenTable("t1");
+    Cell cell(t, "r1", "cf", "qu", 1, "val");
+    Write w(cell);
+    std::vector<Write> ws(1, w);  // the single write handed to the context
+    PrewriteContext* ctx = new PrewriteContext(&ws, &gtxn_, w.TableName(), w.RowKey());
+    ctx->status.SetFailed(ErrorCode::kTimeout, "");  // a context status of kTimeout ...
+    gtxn_.RunAfterPrewriteFailed(ctx);
+    EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrewriteTimeout);  // ... is mapped
+    delete t;  // free the table, as RunAfterPrewriteFailed2 does
+}
+
+TEST_F(GlobalTxnTest, RunAfterPrewriteFailed2) {
+    // global timeout flag set: the txn status must become kGTxnPrewriteTimeout
+    Table* table = OpenTable("t1");
+    Cell cell(table, "r1", "cf", "qu", 1, "val");
+    Write write(cell);
+    std::vector<Write> writes(1, write);
+    PrewriteContext* ctx = new PrewriteContext(&writes, &gtxn_, write.TableName(), write.RowKey());
+    gtxn_.gtxn_internal_->is_timeout_ = true;
+    gtxn_.RunAfterPrewriteFailed(ctx);
+    EXPECT_TRUE(ErrorCode::kGTxnPrewriteTimeout == gtxn_.status_.GetType());
+    delete table;
+}
+
+TEST_F(GlobalTxnTest, DoPrewriteCallback0) {
+    // case a. global timeout
+    Table* table = OpenTable("t1");
+    SingleRowTxn* row_txn = static_cast<SingleRowTxn*>(table->StartRowTransaction("r1"));
+    Cell cell(table, "r1", "cf", "qu", 1, "val");
+    Write write(cell);
+    std::vector<Write> writes(1, write);
+    // attach the prewrite context to the row transaction
+    PrewriteContext* ctx = new PrewriteContext(&writes, &gtxn_, write.TableName(), write.RowKey());
+    row_txn->SetContext(ctx);
+    // run test with the global timeout flag set
+    gtxn_.gtxn_internal_->is_timeout_ = true;
+    gtxn_.DoPrewriteCallback(row_txn);
+    EXPECT_TRUE(ErrorCode::kGTxnPrewriteTimeout == gtxn_.status_.GetType());
+    delete table;
+}
+
+TEST_F(GlobalTxnTest, DoPrewriteCallback1) {
+    // case b. this operator timeout
+    Table* table = OpenTable("t1");
+    SingleRowTxn* row_txn = static_cast<SingleRowTxn*>(table->StartRowTransaction("r1"));
+    Cell cell(table, "r1", "cf", "qu", 1, "val");
+    Write write(cell);
+    std::vector<Write> writes(1, write);
+    // attach the prewrite context to the row transaction
+    PrewriteContext* ctx = new PrewriteContext(&writes, &gtxn_, write.TableName(), write.RowKey());
+    row_txn->SetContext(ctx);
+    // run test: no global timeout, but the row transaction itself timed out
+    row_txn->mutation_buffer_.SetError(ErrorCode::kTimeout,"");
+    gtxn_.gtxn_internal_->is_timeout_ = false;
+    gtxn_.DoPrewriteCallback(row_txn);
+    EXPECT_TRUE(ErrorCode::kGTxnPrewriteTimeout == gtxn_.status_.GetType());
+    delete table;
+}
+
+TEST_F(GlobalTxnTest, DoPrewriteCallback2) {
+    // this operator fails with a non-timeout error (kSystem)
+    Table* table = OpenTable("t1");
+    SingleRowTxn* row_txn = static_cast<SingleRowTxn*>(table->StartRowTransaction("r1"));
+    Cell cell(table, "r1", "cf", "qu", 1, "val");
+    Write write(cell);
+    std::vector<Write> writes(1, write);
+    // attach the prewrite context to the row transaction
+    PrewriteContext* ctx = new PrewriteContext(&writes, &gtxn_, write.TableName(), write.RowKey());
+    row_txn->SetContext(ctx);
+    // run test: no global timeout, the row transaction reports kSystem
+    row_txn->mutation_buffer_.SetError(ErrorCode::kSystem,"");
+    gtxn_.gtxn_internal_->is_timeout_ = false;
+    gtxn_.DoPrewriteCallback(row_txn);
+    EXPECT_TRUE(ErrorCode::kSystem == gtxn_.status_.GetType());
+    delete table;
+}
+
+TEST_F(GlobalTxnTest, VerifyPrimaryLocked) {
+    // the mocked reader reports kNotFound: the status must become kGTxnPrimaryLost
+    Table* table = OpenTable("t1");
+    Cell cell(table, "r1", "cf", "qu", 1, "val");
+    Write primary(cell);
+    gtxn_.primary_write_ = &primary;
+
+    ErrorCode not_found;
+    not_found.SetFailed(ErrorCode::kNotFound,"");
+    std::vector<ErrorCode> reader_errs(1, not_found);
+    static_cast<MockTable*>(table)->AddReaderErrors(reader_errs);
+
+    gtxn_.VerifyPrimaryLocked();
+    EXPECT_TRUE(ErrorCode::kGTxnPrimaryLost == gtxn_.status_.GetType());
+}
+
+
+} // namespace tera
diff --git a/src/sdk/test/global_txn_test_tool.cc b/src/sdk/test/global_txn_test_tool.cc
new file mode 100644
index 000000000..889e442fa
--- /dev/null
+++ b/src/sdk/test/global_txn_test_tool.cc
@@ -0,0 +1,754 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// 
+// Author: baorenyi@baidu.com
+
+#include "sdk/test/global_txn_test_tool.h"
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <unordered_map>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "common/base/string_ext.h"
+#include "common/file/file_path.h"
+#include "sdk/sdk_utils.h"
+#include "sdk/client_impl.h"
+#include "utils/config_utils.h"
+#include "common/timer.h"
+#include "version.h"
+
+DECLARE_string(tera_gtxn_test_flagfile);
+DEFINE_string(gtxn_test_conf_dir, "../conf/", "gtxn test conf dir");
+DEFINE_string(gtxn_test_case_dir, "../cases/", "gtxn test cases dir");
+DEFINE_string(case_number, "", "gtxn test case number");
+DEFINE_bool(ignore_bad_case, false, "gtxn test ignore bad case");
+DEFINE_bool(gtxn_test_async_mode, false, "gtxn test async mode");
+DEFINE_bool(gtxn_test_debug_opened, false, "gtxn test debug opened");
+DEFINE_int32(gtxn_test_thread_pool_size, 20, "gtxn test thread pool size");
+DEFINE_bool(gtxn_test_drop_table_before, true, "gtxn test set drop tables before test");
+
+namespace tera {
+/**
+ * cases/ directory format
+ *
+ * CONF_ROOT/cases/1/schemas/table_1 [table schema file]
+ * ....
+ * CONF_ROOT/cases/1/schemas/table_x
+ *
+ * CONF_ROOT/cases/1/T_1/op_list   [operations list]
+ *      Format of op_list:
+ *          
+ *          TABLES:table_1,table_2,table_3
+ *          GET table_1 r1 cf1 qu1
+ *          PUT table_2 r2 cf2 qu2 valuex
+ *          DEL table_3 r3 cf3 qu3
+ *
+ * CONF_ROOT/cases/1/T_1/gtxn.flag [option]
+ * CONF_ROOT/cases/1/T_1/result_list  [set result list]
+ *
+ * CONF_ROOT/cases/1/T_2/op_list
+ * CONF_ROOT/cases/1/T_2/gtxn.flag [option]
+ * CONF_ROOR/cases/1/T_2/result_list
+ *
+ **/
+// Scans FLAGS_gtxn_test_case_dir: every numeric case directory (or only
+// FLAGS_case_number when set) gets its <case>/schemas/ files loaded into
+// case_desc_map_ and its <case>/T_<id>/ directories registered via CaseRegister.
+// Returns false on a listing failure, a non-positive id or an empty case.
+bool GlobalTxnTestTool::LoadTestConf() {
+    // list cases
+    const std::string case_dir = FLAGS_gtxn_test_case_dir;
+    std::vector<std::string> file_list;
+    if (IsEmpty(case_dir) || !ListCurrentDir(case_dir, &file_list)) {
+        LOG(ERROR) << "list cases failed, dir:" << case_dir;
+        return false;
+    }
+    for (auto it = file_list.begin(); it != file_list.end(); ++it) {
+        if (FLAGS_case_number != "" && (*it) != FLAGS_case_number) {
+            continue;
+        }
+        const std::string dir_name = case_dir + (*it);
+        if (!IsDir(dir_name)) {
+            continue;
+        }
+        int case_num = atoi((*it).c_str());
+        if (case_num <= 0) {
+            LOG(ERROR) << "load case failed, dir:" << dir_name;
+            return false;
+        }
+        // list cases/x/schemas/
+        std::vector<std::string> schema_files;
+        const std::string schema_dir = dir_name + "/schemas/";
+        if (IsEmpty(schema_dir) || !ListCurrentDir(schema_dir, &schema_files)) {
+            LOG(ERROR) << "list case(" << dir_name << ") schemas failed";
+            return false;
+        }
+        int schema_cnt = 0;
+        for (auto sit = schema_files.begin(); sit != schema_files.end(); ++sit) {
+            const std::string schema_file = schema_dir + (*sit);
+            if (IsDir(schema_file)) {
+                continue;
+            }
+            // load schemas; on success the descriptor is kept in case_desc_map_
+            TableDescriptor* desc = new TableDescriptor();
+            if (LoadDescriptor(schema_file, desc)) {
+                case_desc_map_[case_num].push_back(desc);
+                ++schema_cnt;
+            } else {
+                delete desc;
+                LOG(ERROR) << "load schema failed, schema_file:" << schema_file;
+                // NOTE(review): later schemas are skipped, yet the case loads if one succeeded
+                break;
+            }
+        }
+        if (schema_cnt == 0) {
+            LOG(ERROR) << "schemafile not found";
+            return false;
+        }
+
+        // mark cases/x/T_xx/
+        std::vector<std::string> txn_list;
+        if (!ListCurrentDir(dir_name, &txn_list)) {
+            LOG(ERROR) << "find txn dir failed, dir:" << dir_name;
+            return false;
+        }
+        int reg_cnt = 0;
+        for (auto tit = txn_list.begin(); tit != txn_list.end(); ++tit) {
+            const std::string txn_dir = dir_name + "/" + (*tit);
+            if (!IsDir(txn_dir) || *tit == "schemas") {
+                continue;
+            }
+            // only names that START with "T_" are transactions; the id follows the prefix
+            if ((*tit).compare(0, 2, "T_") != 0) {
+                continue;
+            }
+            int gtxn_id = atoi(((*tit).substr(2)).c_str());
+            if (gtxn_id <= 0) {
+                LOG(ERROR) << "mark gtxn conf failed, dir:" << txn_dir;
+                return false;
+            }
+            CaseRegister(case_num, gtxn_id);
+            ++reg_cnt;
+        }
+        if (reg_cnt == 0) {
+            LOG(ERROR) << "transaction not found";
+            return false;
+        }
+    }
+    return true;
+}
+
+void GlobalTxnTestTool::CaseRegister(const int case_num, const int gtxn_id) {
+    // remember the (case, transaction) pair; RunTest iterates over case_list_
+    case_list_.push_back(CasePair(case_num, gtxn_id));
+}
+
+// Parses schema_file into *table_desc; logs and returns false when parsing fails.
+bool GlobalTxnTestTool::LoadDescriptor(const std::string& schema_file,
+                                       TableDescriptor* table_desc) {
+    ErrorCode parse_err;
+    const bool parsed = ParseTableSchemaFile(schema_file, table_desc, &parse_err);
+    if (!parsed) {
+        LOG(ERROR) << "fail to parse input table schema." << schema_file;
+    }
+    return parsed;
+}
+
+GlobalTxnTestTool::GlobalTxnTestTool(Client* client):
+    thread_pool_(FLAGS_gtxn_test_thread_pool_size),  // sized by --gtxn_test_thread_pool_size
+    client_(client) {  // kept as passed in; OpenTestTables opens tables through it
+}
+
+// Resets the counters, then queues RunTestInternal on the thread pool for every
+// registered (case, gtxn) pair -- limited to case_number unless it is -1.
+void GlobalTxnTestTool::RunTest(tera::Client* client, int case_number) {
+    do_cnt_.Set(0);
+    done_cnt_.Set(0);
+    done_fail_cnt_.Set(0);
+    for (const CasePair& case_pair : case_list_) {
+        const int case_num = case_pair.first;
+        const int gtxn_id = case_pair.second;
+        if (case_number != -1 && case_num != case_number) {
+            continue;
+        }
+        const std::string case_dir = FLAGS_gtxn_test_case_dir;
+        const std::string op_list_file = case_dir + std::to_string(case_num)
+                                       + "/T_" + std::to_string(gtxn_id) + "/op_list";
+        // line 0 is the "TABLES:" header, every following line is one operation
+        std::vector<std::string> op_list;
+        std::ifstream ifile(op_list_file);
+        std::string line;
+        int line_cnt = 0;
+        for (; std::getline(ifile, line); ++line_cnt) {
+            if (line_cnt > 0) {
+                op_list.push_back(line);
+                continue;
+            }
+            const std::size_t found = line.find("TABLES:");
+            if (found == std::string::npos) {
+                continue;
+            }
+            std::vector<std::string> tables;
+            SplitString(line.substr(found + 7), ",", &tables);
+            if (!OpenTestTables(tables)) {
+                return;
+            }
+        }
+        ifile.close();
+        if (line_cnt < 1) {
+            LOG(ERROR) << "no operations in op_list";
+        }
+        do_cnt_.Inc();
+        ThreadPool::Task task = std::bind(&GlobalTxnTestTool::RunTestInternal,
+                                          this, client, case_num, gtxn_id, op_list);
+        thread_pool_.AddTask(task);
+    }
+}
+
+// Runs one transaction of a case. The transaction is created under mu_ with
+// FLAGS_tera_gtxn_test_flagfile pointing at this transaction's gtxn.flag; op_list
+// is then replayed synchronously or, in async mode, handed to the DoOpAsync chain.
+void GlobalTxnTestTool::RunTestInternal(tera::Client* client, const int case_num, const int gtxn_id, 
+                                        const std::vector<std::string>& op_list) {
+    const std::string flag_file = std::string(FLAGS_gtxn_test_case_dir) + std::to_string(case_num)
+                                + "/T_" + std::to_string(gtxn_id) + "/gtxn.flag";
+    tera::Transaction* gtxn = nullptr;
+    {
+        MutexLock lock(&mu_);
+        FLAGS_tera_gtxn_test_flagfile = flag_file;
+        gtxn = client->NewGlobalTransaction();
+    }
+    if (FLAGS_gtxn_test_async_mode) {
+        if (op_list.empty()) {
+            LOG(ERROR) << "not set operators";
+            delete gtxn;
+            done_cnt_.Inc();
+            return;
+        }
+        GTxnTestContext* ctx = new GTxnTestContext();
+        ctx->tool = this;
+        ctx->gtxn = gtxn;
+        ctx->op_list = op_list;
+        ctx->case_num = case_num;
+        ctx->gtxn_id = gtxn_id;
+        ctx->it = ctx->op_list.begin();
+        const std::string& first_op = *(ctx->it);
+        VLOG(12) << "OPERATION:" << first_op;
+        OpType op_type;
+        std::vector<std::string> op_args;
+        if (!ParseOp(first_op, &op_type, &op_args)) {
+            LOG(ERROR) << "parse op failed";
+            delete ctx->gtxn;
+            delete ctx;
+            done_cnt_.Inc();
+            return;
+        }
+        DoOpAsync(ctx, op_type, op_args);
+        return;
+    }
+
+    // sync mode: run every operation in order, stop at the first failure
+    std::vector<std::string> result;
+    for (const std::string& op_str : op_list) {
+        VLOG(12) << "OPERATION:" << op_str;
+        OpType op_type;
+        std::vector<std::string> op_args;
+        if (!ParseOp(op_str, &op_type, &op_args)
+            || !DoOp(gtxn, op_type, op_args, &result)) {
+            LOG(ERROR) << gtxn->GetError().ToString();
+            delete gtxn;
+            done_cnt_.Inc();
+            return;
+        }
+    }
+    gtxn->Commit();
+    result.push_back(std::to_string(gtxn->GetError().GetType()));
+    if(!CheckResult(case_num, gtxn_id, result)) {
+        done_fail_cnt_.Inc();
+    }
+    delete gtxn;
+    done_cnt_.Inc();
+}
+
+// Opens (under mu_) every listed table not yet cached in tables_; false on the first failure.
+bool GlobalTxnTestTool::OpenTestTables(const std::vector<std::string>& tables) {
+    ErrorCode err;
+    MutexLock lock(&mu_);
+    for (const std::string& tablename : tables) {
+        if (tables_.find(tablename) == tables_.end()) {
+            Table* opened = client_->OpenTable(tablename, &err);
+            if (opened == NULL) {
+                return false;
+            }
+            tables_[tablename] = opened;
+        }
+    }
+    return true;
+}
+
+// Executes one parsed operation of an async test transaction. PUT/DEL are applied
+// at once and followed by the next operation (or by Commit after the last one);
+// GET is issued asynchronously and the chain resumes in DoOpAsyncCallback.
+// A malformed operation (unknown table, wrong argument count) aborts the
+// transaction like a ParseOp failure: ctx/gtxn are freed and done_cnt_ is bumped.
+void GlobalTxnTestTool::DoOpAsync(GTxnTestContext* ctx, 
+                                  const OpType& op_type,
+                                  const std::vector<std::string>& op_args) {
+    // shared failure path: release the transaction and count it as done
+    auto abort_txn = [this, ctx](const char* reason) {
+        LOG(ERROR) << reason;
+        delete ctx->gtxn;
+        delete ctx;
+        done_cnt_.Inc();
+    };
+    auto table_it = op_args.empty() ? tables_.end() : tables_.find(op_args[0]);
+    const bool args_ok = (op_type == OpType::PUT && op_args.size() == 5)
+        || ((op_type == OpType::GET || op_type == OpType::DEL) && op_args.size() == 4);
+    if (!args_ok || table_it == tables_.end()) {
+        abort_txn("bad operation: unknown table or wrong argument count");
+        return;
+    }
+    Table* table = table_it->second;
+    const std::string& row = op_args[1];
+    const std::string& cf = op_args[2];
+    const std::string& qu = op_args[3];
+    if (op_type == OpType::GET) {
+        tera::RowReader* r = table->NewRowReader(row);
+        r->AddColumn(cf, qu);
+        r->SetCallBack([] (RowReader* r) {
+            ((GTxnTestContext*)r->GetContext())->tool->DoOpAsyncCallback(r);
+        });
+        r->SetContext(ctx);
+        ctx->gtxn->Get(r);  // the chain continues in DoOpAsyncCallback
+        return;
+    }
+    tera::RowMutation* m = table->NewRowMutation(row);
+    if (op_type == OpType::PUT) {
+        m->Put(cf, qu, op_args[4]);
+    } else {
+        m->DeleteColumns(cf, qu);
+    }
+    ctx->gtxn->ApplyMutation(m);
+    ctx->result.push_back(std::string(op_type == OpType::PUT ? "PUT: " : "DEL: ")
+                          + std::to_string(ctx->gtxn->GetError().GetType()));
+    delete m;
+    // the mutation has been applied: run the next operation, or commit after the last one
+    if (++ctx->it != ctx->op_list.end()) {
+        const std::string& op_str = *(ctx->it);
+        VLOG(12) << "OPERATION:" << op_str;
+        OpType next_op_type;
+        std::vector<std::string> next_op_args;
+        if (!ParseOp(op_str, &next_op_type, &next_op_args)) {
+            abort_txn("parse op failed");
+            return;
+        }
+        DoOpAsync(ctx, next_op_type, next_op_args);
+    } else {
+        ctx->gtxn->SetCommitCallback([] (Transaction* t) {
+            ((GTxnTestContext*)t->GetContext())->tool->DoCommitCallback(t);
+        });
+        ctx->gtxn->SetContext(ctx);
+        ctx->gtxn->Commit();
+    }
+}
+
+// Completion callback of an async GET: records the outcome in ctx->result,
+// deletes the reader, then runs the next operation or commits after the last one.
+void GlobalTxnTestTool::DoOpAsyncCallback(RowReader* r) {
+    GTxnTestContext* ctx = (GTxnTestContext*)r->GetContext();
+    if (r->GetError().GetType() != ErrorCode::kOK) {
+        // kNotFound and every other error are recorded in the same form
+        ctx->result.push_back("GET: " + std::to_string(r->GetError().GetType()));
+    } else {
+        // one result line per returned cell
+        for (; !r->Done(); r->Next()) {
+            ctx->result.push_back("GET: " + std::to_string(r->GetError().GetType()) + " "
+                                  + std::to_string(r->Timestamp()) + ":" + r->Value());
+        }
+    }
+    delete r;
+    ++ctx->it;
+    if (ctx->it == ctx->op_list.end()) {
+        // that was the last operation: commit; the callback forwards to DoCommitCallback
+        ctx->gtxn->SetCommitCallback([] (Transaction* t) {
+            ((GTxnTestContext*)t->GetContext())->tool->DoCommitCallback(t);
+        });
+        ctx->gtxn->SetContext(ctx);
+        ctx->gtxn->Commit();
+        return;
+    }
+    const std::string& op_str = *(ctx->it);
+    VLOG(12) << "OPERATION:" << op_str;
+    OpType next_op_type;
+    std::vector<std::string> next_op_args;
+    if (!ParseOp(op_str, &next_op_type, &next_op_args)) {
+        LOG(ERROR) << "parse op failed";
+        delete ctx->gtxn;
+        delete ctx;
+        done_cnt_.Inc();
+        return;
+    }
+    DoOpAsync(ctx, next_op_type, next_op_args);
+}
+
+// Commit callback (async mode): records the commit status, checks the results, frees ctx and t.
+void GlobalTxnTestTool::DoCommitCallback(Transaction* t) {
+    GTxnTestContext* test_ctx = (GTxnTestContext*)t->GetContext();
+    test_ctx->result.push_back(std::to_string(t->GetError().GetType()));
+    if (!CheckResult(test_ctx->case_num, test_ctx->gtxn_id, test_ctx->result)) {
+        done_fail_cnt_.Inc();
+    }
+    delete test_ctx;
+    delete t;
+    done_cnt_.Inc();
+}
+
+// Synchronously executes one parsed operation on gtxn and appends its outcome
+// to *result. Returns false for an unknown table, an argument count that does
+// not fit the operation, or a GET failing with anything but kNotFound.
+bool GlobalTxnTestTool::DoOp(tera::Transaction* gtxn, 
+                             const OpType& op_type,
+                             const std::vector<std::string>& op_args,
+                             std::vector<std::string>* result) {
+    // op_args layout: table, row, cf, qualifier [, value for PUT]
+    const size_t argc = op_args.size();
+    if (argc < 4) {
+        return false;
+    }
+    auto table_it = tables_.find(op_args[0]);
+    if (table_it == tables_.end()) {
+        return false;
+    }
+    Table* table = table_it->second;
+    const std::string& row = op_args[1];
+    const std::string& cf = op_args[2];
+    const std::string& qu = op_args[3];
+
+    if (op_type == OpType::PUT && argc == 5) {
+        std::unique_ptr<tera::RowMutation> m(table->NewRowMutation(row));
+        m->Put(cf, qu, op_args[4]);
+        gtxn->ApplyMutation(m.get());
+        result->push_back("PUT: " + std::to_string(gtxn->GetError().GetType()));
+        return true;
+    }
+    if (op_type == OpType::DEL && argc == 4) {
+        std::unique_ptr<tera::RowMutation> m(table->NewRowMutation(row));
+        m->DeleteColumns(cf, qu);
+        gtxn->ApplyMutation(m.get());
+        result->push_back("DEL: " + std::to_string(gtxn->GetError().GetType()));
+        return true;
+    }
+    if (op_type != OpType::GET || argc != 4) {
+        return false;
+    }
+
+    std::unique_ptr<tera::RowReader> r(table->NewRowReader(row));
+    r->AddColumn(cf, qu);
+    gtxn->Get(r.get());
+    if (r->GetError().GetType() != ErrorCode::kOK) {
+        // record the error; only kNotFound still counts as a successful operation
+        result->push_back("GET: " + std::to_string(r->GetError().GetType()));
+        return r->GetError().GetType() == ErrorCode::kNotFound;
+    }
+    for (; !r->Done(); r->Next()) {
+        result->push_back("GET: " + std::to_string(r->GetError().GetType()) + " "
+                          + std::to_string(r->Timestamp()) + ":" + r->Value());
+    }
+    return true;
+}
+
+// Parses "<PUT|GET|DEL> arg..." into *op_type and the trimmed arguments appended
+// to *op_args; a line without tokens or an unknown operation is logged and rejected.
+bool GlobalTxnTestTool::ParseOp(const std::string& op_str, 
+             OpType* op_type, std::vector<std::string>* op_args) {
+    std::vector<std::string> args;
+    SplitString(op_str, " ", &args);
+    // args[0] is only read when SplitString produced at least one token
+    const std::string op_name = args.empty() ? std::string() : TrimString(args[0]);
+    if (op_name != "PUT" && op_name != "GET" && op_name != "DEL") {
+        LOG(ERROR) << "operation type not support :[" << op_name << "]";
+        return false;
+    }
+    *op_type = (op_name == "PUT") ? OpType::PUT
+             : (op_name == "GET") ? OpType::GET : OpType::DEL;
+    for (size_t i = 1; i < args.size(); ++i) {
+        op_args->push_back(TrimString(args[i]));
+    }
+    return true;
+}
+
+// Debug helper: prints the lines of op_list_file to stdout under an "OpList:"
+// heading; logs an error when no line could be read.
+void GlobalTxnTestTool::DebugOpList(const std::string& op_list_file) {
+    // read the whole file first, then echo it
+    std::vector<std::string> lines;
+    std::ifstream input(op_list_file);
+    for (std::string line; std::getline(input, line); ) {
+        lines.push_back(line);
+    }
+    input.close();
+    if (lines.empty()) {
+        LOG(ERROR) << "no operators in op_list";
+    }
+    std::cout << "OpList:" << std::endl;
+    for (size_t i = 0; i < lines.size(); ++i) {
+        std::cout << lines[i] << std::endl;
+    }
+    std::cout << "-------------------------------------------" << std::endl;
+}
+
+// Debug helper: prints the trimmed lines of flag_file to stdout under a "FLAGS:"
+// heading, leaving out lines that start with '#'; logs when nothing was read.
+void GlobalTxnTestTool::DebugFlagFile(const std::string& flag_file) {
+    std::vector<std::string> flag_lines;
+    std::ifstream input(flag_file);
+    for (std::string line; std::getline(input, line); ) {
+        flag_lines.push_back(line);
+    }
+    input.close();
+    if (flag_lines.empty()) {
+        LOG(ERROR) << "no flags in gtxn.flag";
+    }
+    std::cout << "FLAGS:" << std::endl;
+    for (const std::string& raw : flag_lines) {
+        const std::string flag = TrimString(raw);
+        // empty lines are still printed, only '#' comment lines are dropped
+        const bool is_comment = !flag.empty() && flag[0] == '#';
+        if (!is_comment) {
+            std::cout << flag << std::endl;
+        }
+    }
+    std::cout << "-------------------------------------------" << std::endl;
+}
+
+// Verifies the output of one transaction against its expected result file
+// "<gtxn_test_case_dir><case_num>/T_<gtxn_id>/result_list".
+//
+// Holds mu_ for the whole call. With --gtxn_test_debug_opened the op list,
+// flag file and actual results are echoed to stdout first.
+// Returns true only when the expected file is non-empty, has as many lines
+// as `result`, and every line matches after trimming; every mismatch is
+// printed before the call returns false.
+bool GlobalTxnTestTool::CheckResult(const int case_num, const int gtxn_id, 
+                                    const std::vector<std::string>& result) {
+    MutexLock lock(&mu_);
+    const std::string txn_dir = FLAGS_gtxn_test_case_dir + std::to_string(case_num)
+                              + "/T_" + std::to_string(gtxn_id);
+    std::cout << "===========================================" << std::endl;
+    std::cout << "CASE:" << case_num << " GTXN_ID:" << gtxn_id << std::endl;
+    if (FLAGS_gtxn_test_debug_opened) {
+        DebugOpList(txn_dir + "/op_list");
+        DebugFlagFile(txn_dir + "/gtxn.flag");
+        std::cout << "Result Printing:" << std::endl;
+        for (size_t i = 0; i < result.size(); ++i) {
+            std::cout << "RESULT:" << result[i] << std::endl;
+        }
+        std::cout << "-------------------------------------------" << std::endl;
+    }
+
+    VLOG(12) << "case:" << case_num
+             << " gtxn_id:" << gtxn_id << " Printing";
+    for (size_t i = 0; i < result.size(); ++i) {
+        VLOG(12) << "RESULT:" << result[i];
+    }
+
+    // Load the expected lines; the stream is closed when the scope ends.
+    std::vector<std::string> expected;
+    {
+        std::ifstream input(txn_dir + "/result_list");
+        for (std::string text; std::getline(input, text); ) {
+            expected.push_back(text);
+        }
+    }
+    if (expected.empty()) {
+        LOG(ERROR) << "no results in result_list";
+        return false;
+    }
+
+    if (expected.size() != result.size()) {
+        std::cout << "\tERROR[expect_line_count: " << expected.size() << " actual_line_count: " << result.size() << "]\n";
+        return false;
+    }
+
+    // Line-by-line comparison, ignoring surrounding whitespace.
+    int mismatches = 0;
+    for (size_t i = 0; i < result.size(); ++i) {
+        if (TrimString(result[i]) != TrimString(expected[i])) {
+            std::cout << "\tERROR[expect: (" << expected[i] << ") actual: (" << result[i] << ")]\n";
+            ++mismatches;
+        }
+    }
+    if (mismatches > 0) {
+        std::cout << "FAILED :" << mismatches << std::endl;
+        return false;
+    }
+    std::cout << "SUCCEED" << std::endl;
+    return true;
+}
+
+// Creates every distinct table needed by the selected test case.
+//
+// case_num == -1 selects all loaded cases. When several descriptors share a
+// table name, the first one encountered wins. Returns false as soon as one
+// CreateTable call fails or reports a non-kOK error; true otherwise.
+bool GlobalTxnTestTool::InitTestTables(int case_num) {
+    ErrorCode err;
+    std::unordered_map<std::string, TableDescriptor*> pending;
+    for (const auto& case_entry : case_desc_map_) {
+        const bool selected = (case_num == -1) || (case_num == case_entry.first);
+        if (!selected) {
+            continue;
+        }
+        for (TableDescriptor* desc : case_entry.second) {
+            // insert() keeps an existing entry, so the first descriptor wins.
+            pending.insert(std::make_pair(desc->TableName(), desc));
+        }
+    }
+
+    for (const auto& item : pending) {
+        const bool created = client_->CreateTable(*(item.second), &err)
+                             && err.GetType() == ErrorCode::kOK;
+        if (!created) {
+            LOG(ERROR) << "create table " << item.first << " failed";
+            return false;
+        }
+        VLOG(12) << "create table " << item.first << " ok";
+    }
+    return true;
+}
+
+// Disables and drops every distinct table used by the selected test case.
+//
+// case_num == -1 selects all loaded cases. For each table: disable it, poll
+// ShowTablesInfo once per second until the number of tablets reported as
+// kTabletDisable/kTableOffLine equals the tablet count seen right after the
+// disable request, then drop it. Returns false on the first failing step.
+bool GlobalTxnTestTool::DropTestTables(int case_num) {
+    ErrorCode err;
+    std::unordered_map<std::string, TableDescriptor*> targets;
+    for (const auto& case_entry : case_desc_map_) {
+        if (case_num != -1 && case_entry.first != case_num) {
+            continue;
+        }
+        for (TableDescriptor* desc : case_entry.second) {
+            // insert() keeps an existing entry, so the first descriptor wins.
+            targets.insert(std::make_pair(desc->TableName(), desc));
+        }
+    }
+
+    // ShowTablesInfo is only reachable through the implementation class.
+    tera::ClientImpl* impl = static_cast<tera::ClientImpl*>(client_);
+    for (const auto& target : targets) {
+        const std::string& name = target.first;
+        if (!client_->DisableTable(name, &err)) {
+            LOG(ERROR) << "disable table failed, table: " << name;
+            return false;
+        }
+
+        TableMeta table_meta;
+        TabletMetaList tablet_list;
+        if (!impl->ShowTablesInfo(name, &table_meta, &tablet_list, &err)) {
+            LOG(ERROR) << "table not exist: " << name;
+            return false;
+        }
+        const uint64_t tablet_num = tablet_list.meta_size();
+
+        // Poll until every tablet has left service.
+        for (;;) {
+            if (!impl->ShowTablesInfo(name, &table_meta, &tablet_list, &err)) {
+                LOG(ERROR) << "table not exist: " << name;
+                return false;
+            }
+            uint64_t stopped = 0;
+            for (int32_t i = 0; i < tablet_list.meta_size(); ++i) {
+                const TabletMeta& tablet = tablet_list.meta(i);
+                const bool out_of_service = tablet.status() == kTabletDisable
+                                            || tablet.status() == kTableOffLine;
+                if (out_of_service) {
+                    ++stopped;
+                }
+            }
+            if (stopped == tablet_num) {
+                break;  // disable finished
+            }
+            sleep(1);
+        }
+
+        if (!client_->DropTable(name, &err)) {
+            LOG(ERROR) << "drop table " << name << " failed";
+            return false;
+        }
+    }
+    return true;
+}
+
+// Blocks, polling once per second, until done_cnt_ has caught up with do_cnt_.
+void GlobalTxnTestTool::Wait() {
+    for (;;) {
+        if (do_cnt_.Get() <= done_cnt_.Get()) {
+            return;
+        }
+        sleep(1);
+    }
+}
+
+// Runs every registered case in ascending case-number order, one at a time.
+//
+// Per case: optionally drop its tables (--gtxn_test_drop_table_before),
+// create them, run the case and wait for all of its transactions.
+// If table creation fails, or done_fail_cnt_ is positive after the case, the
+// run stops unless --ignore_bad_case is set, in which case it moves on to the
+// next case.
+void GlobalTxnTestTool::RunCaseOneByOne() {
+    // Collapse (case, txn) pairs into a sorted set of unique case numbers.
+    std::set<int> case_numbers;
+    for (size_t i = 0; i < case_list_.size(); ++i) {
+        case_numbers.insert(case_list_[i].first);
+    }
+
+    for (std::set<int>::const_iterator it = case_numbers.begin();
+         it != case_numbers.end(); ++it) {
+        const int case_num = *it;
+        LOG(INFO) << "GlobalTxnTest Case " << case_num << " Begin";
+        if (FLAGS_gtxn_test_drop_table_before) {
+            // Best effort: the outcome of the drop is intentionally not checked.
+            DropTestTables(case_num);
+        }
+
+        if (!InitTestTables(case_num)) {
+            LOG(ERROR) << "GlobalTxnTest Case " << case_num
+                       << " InitTestTables Failed";
+            if (!FLAGS_ignore_bad_case) {
+                break;
+            }
+            continue;
+        }
+
+        RunTest(client_, case_num);
+        Wait();
+        LOG(INFO) << "GlobalTxnTest Case " << case_num << " Finish";
+
+        if (done_fail_cnt_.Get() > 0 && !FLAGS_ignore_bad_case) {
+            break;
+        }
+    }
+}
+
+} // namespace tera
+
+
+// Entry point of the global-transaction test tool.
+//
+// Parses gflags, handles the "version" sub-command, requires both
+// --gtxn_test_conf_dir and --gtxn_test_case_dir, creates a client from
+// "<conf_dir>/tera.flag", loads the test configuration and runs all cases.
+// Returns 0 on a completed run (or after printing the version) and -1 when
+// a required flag is missing, the client cannot be created, or the test
+// configuration fails to load.
+int main(int argc, char *argv[]) {
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+
+    const bool version_requested = argc > 1 && std::string(argv[1]) == "version";
+    if (version_requested) {
+        PrintSystemVersion();
+        return 0;
+    }
+
+    if (FLAGS_gtxn_test_conf_dir.empty()) {
+        LOG(ERROR) << "not set \"--gtxn_test_conf_dir\"";
+        return -1;
+    }
+    if (FLAGS_gtxn_test_case_dir.empty()) {
+        LOG(ERROR) << "not set \"--gtxn_test_case_dir\"";
+        return -1;
+    }
+
+    tera::ErrorCode error_code;
+    const std::string client_flagfile = FLAGS_gtxn_test_conf_dir + "/tera.flag";
+    tera::Client* client = tera::Client::NewClient(client_flagfile, &error_code);
+    if (!client) {
+        return -1;
+    }
+
+    tera::GlobalTxnTestTool gtxn_test_tool(client);
+    // Load the case descriptions before running anything.
+    if (!gtxn_test_tool.LoadTestConf()) {
+        return -1;
+    }
+    gtxn_test_tool.RunCaseOneByOne();
+    return 0;
+}
diff --git a/src/sdk/test/global_txn_test_tool.h b/src/sdk/test/global_txn_test_tool.h
new file mode 100644
index 000000000..7acf12644
--- /dev/null
+++ b/src/sdk/test/global_txn_test_tool.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef  TERA_SDK_TEST_GLOBAL_TXN_TEST_TOOL_H_
+#define  TERA_SDK_TEST_GLOBAL_TXN_TEST_TOOL_H_
+
+#include <string>
+
+#include "common/thread_pool.h"
+#include "common/counter.h"
+#include "tera.h"
+
+namespace tera {
+
+// Kind of operation requested by one line of a test-case op_list file;
+// produced by GlobalTxnTestTool::ParseOp from the tokens "GET", "PUT", "DEL".
+enum OpType { GET, PUT, DEL };
+
+class GlobalTxnTestTool;
+
+
+// State bundle for one transaction of one test case; it is the context
+// argument of GlobalTxnTestTool::DoOpAsync().
+// NOTE(review): the code that fills and consumes this struct is not part of
+// this header, so the per-field notes are inferred from names and types --
+// confirm against global_txn_test_tool.cc.
+struct GTxnTestContext {
+    GlobalTxnTestTool* tool;                   // owning test tool
+    tera::Transaction* gtxn;                   // transaction being exercised
+    std::vector<std::string> op_list;          // presumably the raw op_list lines
+    std::vector<std::string> result;           // presumably the collected result lines
+    std::vector<std::string>::iterator it;     // presumably the cursor into op_list
+    int case_num;                              // test case number
+    int gtxn_id;                               // transaction id within the case
+};
+
+// Driver for file-based global-transaction test cases: loads case
+// descriptions, creates/drops the tables they need, runs the operations and
+// compares the produced result lines with the expected ones.
+class GlobalTxnTestTool {
+public:
+    GlobalTxnTestTool(Client* client);
+    ~GlobalTxnTestTool(){}
+
+    // Loads the test configuration; main() aborts when this returns false.
+    bool LoadTestConf();
+
+    // Creates each distinct table described for `case_num` (-1 = all cases).
+    // Returns false on the first table that cannot be created.
+    bool InitTestTables(int case_num = -1);
+
+    // Disables each distinct table of `case_num` (-1 = all cases), waits until
+    // its tablets are reported disabled/offline, then drops it.
+    // Returns false on the first failing step.
+    bool DropTestTables(int case_num = -1);
+
+    // Runs the transactions of `case_num`.
+    // NOTE(review): implementation not visible here; -1 presumably means all cases.
+    void RunTest(tera::Client* client, int case_num = -1);
+
+    // Blocks (polling once per second) until done_cnt_ has caught up with do_cnt_.
+    void Wait();
+
+    // Runs every registered case sequentially: optional drop, create tables,
+    // RunTest, Wait; stops at the first bad case unless --ignore_bad_case is set.
+    void RunCaseOneByOne();
+private:
+    void RunTestInternal(tera::Client* client, const int case_num, const int gtxn_id, 
+                         const std::vector<std::string>& op_list);
+
+    void CaseRegister(const int case_num, const int gtxn_id);
+
+    bool LoadDescriptor(const std::string& schema_file, TableDescriptor* schema);
+
+    // Prints the lines of an op_list file to stdout (debug aid).
+    void DebugOpList(const std::string& op_list_file);
+
+    // Prints the non-comment lines of a gtxn.flag file to stdout (debug aid).
+    void DebugFlagFile(const std::string& flag_file);
+    
+    // Compares `result` with the case's result_list file, line by line after
+    // trimming; prints every mismatch and returns whether they are identical.
+    bool CheckResult(const int case_num, const int gtxn_id, 
+                     const std::vector<std::string>& result);
+
+    // Splits an op_list line into its operation type and trimmed arguments;
+    // returns false for an unsupported operation name.
+    bool ParseOp(const std::string& op_str, 
+                 OpType* op_type, std::vector<std::string>* op_args);
+
+    bool DoOp(tera::Transaction* gtxn, 
+              const OpType& op_type, 
+              const std::vector<std::string>& op_args,
+              std::vector<std::string>* result); 
+
+    void DoOpAsync(GTxnTestContext* ctx, const OpType& op_type, 
+                   const std::vector<std::string>& op_args);
+
+    void DoOpAsyncCallback(tera::RowReader* r);
+
+    void DoCommitCallback(tera::Transaction* t);
+
+    bool OpenTestTables(const std::vector<std::string>& tables);
+
+private:
+    // (case number, transaction id) pairs; RunCaseOneByOne reads the case numbers.
+    typedef std::pair<int, int> CasePair;
+    std::vector<CasePair> case_list_;
+    // case number -> descriptors of the tables that case needs.
+    typedef std::map<int, std::vector<TableDescriptor*>> CaseDescMap;
+    CaseDescMap case_desc_map_;
+    std::map<std::string, Table*> tables_;
+    mutable Mutex mu_;          // held by CheckResult for its whole duration
+    common::ThreadPool thread_pool_;
+    Client* client_;
+    Counter do_cnt_;            // compared against done_cnt_ by Wait()
+    Counter done_cnt_;
+    Counter done_fail_cnt_;     // > 0 makes RunCaseOneByOne treat the case as bad
+};
+
+} // namespace tera
+
+#endif  // TERA_SDK_TEST_GLOBAL_TXN_TEST_TOOL_H_
diff --git a/src/sdk/test/global_txn_testutils.cc b/src/sdk/test/global_txn_testutils.cc
new file mode 100644
index 000000000..c615489d7
--- /dev/null
+++ b/src/sdk/test/global_txn_testutils.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "common/base/string_ext.h"
+#include "common/this_thread.h"
+#include "sdk/test/global_txn_testutils.h"
+#include "utils/config_utils.h"
+#include "common/timer.h"
+
+// Flags that script a test transaction; GlobalTxnTestHelper::LoadTxnConf()
+// reads them after loading the per-transaction flag file.
+//
+// start_ts is used as the transaction's start timestamp. The seven stage
+// flags from begin_commit_ts to begin_other_commit_ts are copied, in
+// declaration order, into the helper's stage table: each one is the number of
+// seconds to wait after the previous stage was reached, and -1 makes
+// GlobalTxnTestHelper::Wait() terminate the process at that stage.
+// get_wait_ts_list is a comma-separated list of seconds, counted from the
+// helper's creation, at which successive GET operations may proceed.
+DEFINE_bool(tera_gtxn_test_opened, false, "for test gtxn opened");
+DEFINE_bool(tera_gtxn_test_isolation_snapshot, true, "true means Snapshot, false means ReadCommitedSnapshot");
+DEFINE_string(tera_gtxn_test_flagfile, "", "gtxn test flagfile");
+DEFINE_int64(start_ts, 1, "start ts");
+DEFINE_int64(begin_commit_ts, 0, "time to wait before begin commit");
+DEFINE_int64(begin_prewrite_ts, 0, "time to wait before prewrite");
+DEFINE_int64(end_prewrite_ts, 0, "time to wait from before prewrite");
+DEFINE_int64(commit_ts, 1, "time to wait from end prewrite");
+DEFINE_int64(begin_primary_commit_ts, 0, "time to wait before primary commit");
+DEFINE_int64(end_primary_commit_ts, 0, "time to wait from primary commit");
+DEFINE_int64(begin_other_commit_ts, 0, "time to wait before other commit");
+DEFINE_string(get_wait_ts_list, "", "timestamp list for wait to get");
+
+namespace tera {
+
+// Converts the second-granularity entries of --get_wait_ts_list to milliseconds.
+constexpr int64_t kMillisPerSecond = 1000L;
+
+// Builds a helper bound to the flag file `conffile`; the file itself is only
+// read later, by LoadTxnConf(). The creation time is recorded as the origin
+// for the --get_wait_ts_list schedule used by GetWait().
+//
+// ts_ is value-initialised to all zeros: Wait() is public and reads/writes
+// the stage table, so it must not contain indeterminate values if it is
+// reached before LoadTxnConf() has filled it.
+GlobalTxnTestHelper::GlobalTxnTestHelper(const std::string& conffile) : 
+    pos_(0), get_pos_(0), conf_file_(conffile), 
+    start_ts_(0), prewrite_start_ts_(0), commit_ts_(0), 
+    ts_(), helper_create_time_(get_millis()) {
+}
+
+// Loads the transaction's flag file and derives the scripted timeline:
+//   * fills the stage table ts_[0..7] from the stage flags,
+//   * splits --get_wait_ts_list into get_ts_list_,
+//   * computes the prewrite start timestamp (equal to start_ts under snapshot
+//     isolation, otherwise start + begin_commit + begin_prewrite),
+//   * computes the commit timestamp, forced to be strictly greater than the
+//     prewrite start timestamp,
+// and finally performs the stage-0 Wait().
+void GlobalTxnTestHelper::LoadTxnConf() {
+    utils::LoadFlagFile(conf_file_);
+
+    const int64_t stage_flags[8] = {
+        FLAGS_start_ts,
+        FLAGS_begin_commit_ts,
+        FLAGS_begin_prewrite_ts,
+        FLAGS_end_prewrite_ts,
+        FLAGS_commit_ts,
+        FLAGS_begin_primary_commit_ts,
+        FLAGS_end_primary_commit_ts,
+        FLAGS_begin_other_commit_ts,
+    };
+    for (int stage = 0; stage < 8; ++stage) {
+        ts_[stage] = stage_flags[stage];
+    }
+    start_ts_ = FLAGS_start_ts;
+
+    VLOG(13) << "split get wait ts list begin...";
+    SplitString(FLAGS_get_wait_ts_list, ",", &get_ts_list_);
+    for (size_t i = 0; i < get_ts_list_.size(); ++i) {
+        VLOG(13) << get_ts_list_[i];
+    }
+    VLOG(13) << "split get wait ts list done";
+
+    // Point on the timeline at which prewrite begins.
+    const int64_t prewrite_point =
+        FLAGS_start_ts + FLAGS_begin_commit_ts + FLAGS_begin_prewrite_ts;
+    // ReadCommitedSnapshot reads as of the prewrite point; Snapshot reads as
+    // of the transaction start.
+    prewrite_start_ts_ = FLAGS_tera_gtxn_test_isolation_snapshot ? start_ts_
+                                                                 : prewrite_point;
+
+    commit_ts_ = prewrite_point + FLAGS_end_prewrite_ts + FLAGS_commit_ts;
+    if (commit_ts_ <= prewrite_start_ts_) {
+        commit_ts_ = prewrite_start_ts_ + 1;
+    }
+    Wait(ts_[0]);
+}
+
+// Returns the scripted start timestamp: --start_ts once LoadTxnConf() has
+// run, 0 before that.
+int64_t GlobalTxnTestHelper::GetStartTs() {
+    return start_ts_;
+}
+
+// Returns the timestamp computed by LoadTxnConf() for the prewrite phase:
+// the start timestamp under snapshot isolation, otherwise
+// start_ts + begin_commit_ts + begin_prewrite_ts. 0 before LoadTxnConf().
+int64_t GlobalTxnTestHelper::GetPrewriteStartTs() {
+    return prewrite_start_ts_;
+}
+
+// Returns the commit timestamp computed by LoadTxnConf(); it is always
+// strictly greater than the prewrite start timestamp. 0 before LoadTxnConf().
+int64_t GlobalTxnTestHelper::GetCommitTs() {
+    return commit_ts_;
+}
+
+// Delays the calling GET operation according to --get_wait_ts_list.
+//
+// With an empty list nothing is delayed and the GET counter is not advanced.
+// Otherwise the n-th call waits until (helper creation time + n-th list
+// entry, in seconds); calls beyond the end of the list, or whose deadline
+// has already passed, proceed immediately. `start_ts` is only used to tag
+// the log lines.
+void GlobalTxnTestHelper::GetWait(int64_t start_ts) {
+    if (get_ts_list_.empty()) {
+        // don't wait
+        VLOG(13) << "[gtxn_helper] [" << start_ts << "] will do get operater immediate";
+        return;
+    }
+
+    // Remaining delay for this GET; stays 0 once the schedule is exhausted.
+    int64_t wait_time = 0;
+    const bool scheduled = get_pos_ < get_ts_list_.size();
+    if (scheduled) {
+        int64_t now_millis = tera::get_millis();
+        int64_t def_wait_time = stol(get_ts_list_[get_pos_]) * kMillisPerSecond;
+        wait_time = helper_create_time_ + def_wait_time - now_millis;
+        VLOG(13) << "get_pos_:" << get_pos_
+                 << " now_millis:" << now_millis
+                 << " def_wait_time:" << def_wait_time
+                 << " size:" << get_ts_list_.size()
+                 << " wait_time:" << wait_time;
+    }
+
+    if (wait_time > 0) {
+        VLOG(13) << "[gtxn_helper] [" << start_ts << "] will do get operater("
+                 << (get_pos_ + 1) << ") after" << wait_time << " ms.";
+        ThisThread::Sleep(wait_time);
+    } else {
+        VLOG(13) << "[gtxn_helper] [" << start_ts << "] will do get operater("
+                 << (get_pos_ + 1) << ") immediate";
+    }
+    get_pos_++;
+}
+
+// Advances the scripted transaction to its next stage, sleeping until that
+// stage is due.
+//
+// Each call consumes one stage (0 = begin txn ... 7 = begin other commit).
+// For stages 1-7 the stage's table entry is the number of seconds to wait
+// after the previous stage was reached:
+//   * -1 terminates the process immediately (simulates a crash at this stage),
+//   * if the deadline is more than 10us away the thread sleeps until then,
+//   * a non-zero wait whose deadline has already passed is treated as a
+//     timeout and terminates the process.
+// On return the stage's table entry is overwritten with the current time in
+// microseconds so that the next stage can be scheduled relative to it.
+// `start_ts` is only used to tag the log lines.
+//
+// Calling this more often than there are stages logs "overflow position" and
+// terminates the process.
+void GlobalTxnTestHelper::Wait(int64_t start_ts) {
+    const int kStageCount = static_cast<int>(sizeof(ts_) / sizeof(ts_[0]));
+    int wait_position = pos_++;
+    if (wait_position < 0 || wait_position >= kStageCount) {
+        // Must be checked before ts_ is touched: the stage table has no slot
+        // for this position, and the switch below would only detect the
+        // overflow after the table had already been read out of bounds.
+        LOG(ERROR) << "overflow position";
+        _Exit(0);
+    }
+    int64_t* info = ts_;
+    int64_t now_micros = tera::get_micros();
+    if (wait_position == 0) {
+        PrintLog(start_ts, "begin txn", info[wait_position + 1]);
+    } else {
+        if (info[wait_position] == -1) {
+            ExitNow(start_ts, wait_position);
+        }
+        // Deadline = configured delay (seconds -> us) + time the previous
+        // stage was reached (stored in its slot by the previous call).
+        int64_t should_wait = info[wait_position] * 1000000L + info[wait_position - 1];
+        if (should_wait - now_micros > 10) {
+            ThisThread::Sleep((should_wait - now_micros) / 1000L);
+        } else if (info[wait_position] == 0) {
+            // nothing to do
+        } else if (should_wait < now_micros) {
+            LOG(ERROR) << "[gtxn_helper] [" << start_ts << "] txn run timeout, exited";
+            _Exit(0);
+        }
+        switch (wait_position) {
+            case 1:
+                PrintLog(start_ts, "begin commit", info[wait_position + 1]);
+                break;
+            case 2:
+                PrintLog(start_ts, "begin prewrite", info[wait_position + 1]);
+                break;
+            case 3:
+                PrintLog(start_ts, "end prewrite", info[wait_position + 1]);
+                break;
+            case 4:
+                PrintLog(start_ts, "begin real commit", info[wait_position + 1]);
+                break;
+            case 5:
+                PrintLog(start_ts, "begin primary commit", info[wait_position + 1]);
+                break;
+            case 6:
+                PrintLog(start_ts, "end primary commit", info[wait_position + 1]);
+                break;
+            case 7:
+                // Last stage: there is no following entry to announce.
+                PrintLog(start_ts, "begin other commit");
+                break;
+            default:
+                // Unreachable after the range check above; kept as a safety net.
+                LOG(ERROR) << "overflow position";
+                _Exit(0);
+        }
+    }
+    info[wait_position] = tera::get_micros();
+    return;
+}
+
+// Logs the stage at which the scripted transaction is abandoned and ends the
+// process at once via _Exit(0). Used to simulate a transaction that stops
+// at an arbitrary stage.
+void GlobalTxnTestHelper::ExitNow(int64_t start_ts, int position) {
+    VLOG(13) << "[gtxn_helper] [" << start_ts << "] exit @ position=" << position;
+    _Exit(0); // for simulate test gtxn stop at anywhere
+}
+
+// Emits a verbose-log line for the stage `log_str` of transaction `start_ts`.
+// `next_wait_time` is the configured delay (seconds) before the following
+// stage; -1 means no further stage will run and is reported as such.
+void GlobalTxnTestHelper::PrintLog(int64_t start_ts, 
+                                   const std::string& log_str, 
+                                   int64_t next_wait_time) {
+    const bool has_next_step = (next_wait_time != -1);
+    if (has_next_step) {
+        VLOG(13) << "[gtxn_helper] [" << start_ts << "] " << log_str
+                 << ", next step will begin after [" << next_wait_time << "s]";
+        return;
+    }
+    VLOG(13) << "[gtxn_helper] [" << start_ts << "] " << log_str << ", txn will be done.";
+}
+
+} // namespace tera
+
diff --git a/src/sdk/test/global_txn_testutils.h b/src/sdk/test/global_txn_testutils.h
new file mode 100644
index 000000000..278ef8e68
--- /dev/null
+++ b/src/sdk/test/global_txn_testutils.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef  TERA_SDK_TEST_GLOBAL_TXN_TESTUTILS_H_
+#define  TERA_SDK_TEST_GLOBAL_TXN_TESTUTILS_H_
+
+// <stdint.h> and <vector> are included explicitly: the class below uses
+// int64_t and std::vector, and this header must compile on its own.
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+namespace tera {
+
+// Scripts the timeline of a single test transaction from a flag file: it
+// provides the timestamps the transaction should use and blocks (or
+// terminates the process) at well-defined stages so that interleavings of
+// several transactions can be reproduced.
+class GlobalTxnTestHelper {
+public:
+    // `conffile` is the flag file read later by LoadTxnConf().
+    GlobalTxnTestHelper(const std::string& conffile);
+    ~GlobalTxnTestHelper(){}
+    // Timestamps derived by LoadTxnConf(); all are 0 before it has run.
+    int64_t GetStartTs();
+    int64_t GetPrewriteStartTs();
+    int64_t GetCommitTs();
+    // Advances to the next scripted stage, sleeping until it is due.
+    // `start_ts` only tags the log output.
+    void Wait(int64_t start_ts);
+    // Delays the next GET according to the configured GET schedule.
+    // `start_ts` only tags the log output.
+    void GetWait(int64_t start_ts);
+    // Loads the flag file, derives the timestamps and performs the first Wait().
+    void LoadTxnConf();
+private:
+    
+    // Terminates the process to simulate a transaction dying at `position`.
+    void ExitNow(int64_t start_ts, int position);
+    void PrintLog(int64_t start_ts, 
+                  const std::string& log_str, 
+                  int64_t next_wait_time = -1);
+    int pos_;                               // next stage index consumed by Wait()
+    size_t get_pos_;                        // next entry of get_ts_list_ used by GetWait()
+    std::string conf_file_;
+    int64_t start_ts_;
+    int64_t prewrite_start_ts_;
+    int64_t commit_ts_;
+    int64_t ts_[8];                         // per-stage delay; replaced by the reach time once passed
+    std::vector<std::string> get_ts_list_;  // GET schedule, seconds since creation
+    int64_t helper_create_time_;            // creation time in milliseconds
+};
+
+} // namespace tera
+
+#endif  // TERA_SDK_TEST_GLOBAL_TXN_TESTUTILS_H_
diff --git a/src/sdk/test/mock_table.h b/src/sdk/test/mock_table.h
new file mode 100644
index 000000000..5d1a75e3b
--- /dev/null
+++ b/src/sdk/test/mock_table.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef  TERA_SDK_TEST_MOCK_TABLE_H_
+#define  TERA_SDK_TEST_MOCK_TABLE_H_
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "sdk/read_impl.h"
+#include "sdk/mutate_impl.h"
+#include "sdk/table_impl.h"
+
+namespace tera {
+
+// One scripted outcome for MockTable::Get(): the row result to hand back and
+// the error status to report alongside it.
+struct MockReaderResult {
+    RowResult result;
+    ErrorCode status;
+};
+
+// Test double for TableImpl whose ApplyMutation()/Get() complete immediately
+// with outcomes scripted in advance through the Add*() methods.
+//
+// Outcomes are consumed strictly in the order they were added, one per call.
+// Asking for more outcomes than were scripted is a test-setup error and
+// aborts via CHECK instead of reading past the end of the script.
+class MockTable: public TableImpl {
+public:
+    MockTable(const std::string& table_name, 
+              common::ThreadPool* thread_pool, 
+              sdk::ClusterFinder* cluster = NULL)
+        : TableImpl(table_name, thread_pool, cluster),
+          reader_pos_(0),
+          mu_pos_(0) {
+    }
+
+    // Completes `row_mu` with the next scripted mutation error and runs its
+    // callback.
+    void ApplyMutation(RowMutation* row_mu) {
+        RowMutationImpl* mu = static_cast<RowMutationImpl*>(row_mu);
+        const size_t pos = static_cast<size_t>(mu_pos_++);
+        CHECK_LT(pos, mu_err_.size())
+            << "MockTable: no scripted mutation error left";
+        mu->SetError(mu_err_[pos].GetType(),"");
+        mu->RunCallback();
+    }
+
+    // Completes `reader` with the next scripted outcome and runs its callback.
+    // Full results (AddReaderResult) take precedence: once any have been
+    // added, the plain error script (AddReaderErrors) is no longer consulted.
+    void Get(RowReader* reader) {
+        RowReaderImpl* r = static_cast<RowReaderImpl*>(reader);
+        const size_t pos = static_cast<size_t>(reader_pos_++);
+        if (reader_result_.size() > 0) {
+            CHECK_LT(pos, reader_result_.size())
+                << "MockTable: no scripted reader result left";
+            MockReaderResult& scripted = reader_result_[pos];
+            r->SetResult(scripted.result);
+            r->SetError(scripted.status.GetType(), "");
+        } else {
+            CHECK_LT(pos, reader_err_.size())
+                << "MockTable: no scripted reader error left";
+            r->SetError(reader_err_[pos].GetType(), "");
+        }
+        r->RunCallback();
+    }
+
+    // Appends full (result + status) outcomes for subsequent Get() calls.
+    void AddReaderResult(const std::vector<MockReaderResult>& results) {
+        reader_result_.insert(reader_result_.end(),
+                results.begin(), results.end());    
+    }
+    
+    // Appends error-only outcomes for subsequent Get() calls.
+    void AddReaderErrors(const std::vector<ErrorCode>& errs) {
+        reader_err_.insert(reader_err_.end(), errs.begin(), errs.end());
+    }
+
+    // Appends outcomes for subsequent ApplyMutation() calls.
+    void AddMutationErrors(const std::vector<ErrorCode>& errs) {
+        mu_err_.insert(mu_err_.end(), errs.begin(), errs.end());
+    }
+private:
+    std::vector<ErrorCode> reader_err_;
+    std::vector<ErrorCode> mu_err_;
+    std::vector<MockReaderResult> reader_result_;
+    int reader_pos_;    // index of the next Get() outcome
+    int mu_pos_;        // index of the next ApplyMutation() outcome
+};
+
+} // namespace tera
+
+#endif  // TERA_SDK_TEST_MOCK_TABLE_H_
diff --git a/src/sdk/test/scan_impl_test.cc b/src/sdk/test/scan_impl_test.cc
index abef2d305..475e2ff1c 100644
--- a/src/sdk/test/scan_impl_test.cc
+++ b/src/sdk/test/scan_impl_test.cc
@@ -49,21 +49,6 @@ class ScanDescImplTest : public ::testing::Test, public ScanDescImpl {
     TableSchema table_schema_;
 };
 
-TEST_F(ScanDescImplTest, GetCfType) {
-    string cf_name, type;
-
-    cf_name = "cf0";
-    EXPECT_TRUE(GetCfType(cf_name, &type));
-    EXPECT_EQ(type, "int32");
-
-    cf_name = "cf2";
-    EXPECT_TRUE(GetCfType(cf_name, &type));
-    EXPECT_EQ(type, "binary");
-
-    cf_name = "cf100";
-    EXPECT_FALSE(GetCfType(cf_name, &type));
-}
-
 TEST_F(ScanDescImplTest, ParseValueCompareFilter) {
     string filter_str;
     Filter filter;
@@ -76,21 +61,19 @@ TEST_F(ScanDescImplTest, ParseValueCompareFilter) {
     filter_str = "qualifier10";
     EXPECT_FALSE(ParseValueCompareFilter(filter_str, &filter));
 
-    filter_str = "cf0==-10";
+    filter_str = "int64cf0==-10";
     EXPECT_TRUE(ParseValueCompareFilter(filter_str, &filter));
     EXPECT_EQ(filter.type(), BinComp);
     EXPECT_EQ(filter.bin_comp_op(), EQ);
     EXPECT_EQ(filter.field(), ValueFilter);
     EXPECT_EQ(filter.content(), "cf0");
 
-    filter_str = "cf1>1";
+    filter_str = "int64cf1>1";
     EXPECT_TRUE(ParseValueCompareFilter(filter_str, &filter));
     EXPECT_EQ(filter.bin_comp_op(), GT);
 
     filter_str = "cf2==hello";
-    EXPECT_TRUE(ParseValueCompareFilter(filter_str, &filter));
-    EXPECT_EQ(filter.bin_comp_op(), EQ);
-    EXPECT_EQ(filter.ref_value(), "hello");
+    EXPECT_FALSE(ParseValueCompareFilter(filter_str, &filter));
 }
 
 TEST_F(ScanDescImplTest, ParseSubFilterString) {
@@ -104,33 +87,15 @@ TEST_F(ScanDescImplTest, ParseSubFilterString) {
     filter_str = "qual@ifier10";
     EXPECT_FALSE(ParseSubFilterString(filter_str, &filter));
 
-    filter_str = "cf0 == -10";
+    filter_str = "int64cf0 == -10";
     EXPECT_TRUE(ParseSubFilterString(filter_str, &filter));
     EXPECT_EQ(filter.type(), BinComp);
     EXPECT_EQ(filter.bin_comp_op(), EQ);
     EXPECT_EQ(filter.field(), ValueFilter);
     EXPECT_EQ(filter.content(), "cf0");
 
-    filter_str = "cf1 > 1";
+    filter_str = "int64cf1 > 1";
     EXPECT_TRUE(ParseSubFilterString(filter_str, &filter));
     EXPECT_EQ(filter.bin_comp_op(), GT);
 }
-
-TEST_F(ScanDescImplTest, ParseFilterString) {
-    string filter_str;
-
-    filter_str = "cf0 < 10 AND cf1 >100 AND cf2 == world";
-    SetFilterString(filter_str);
-    EXPECT_TRUE(ParseFilterString());
-    EXPECT_EQ(filter_list_.filter_size(), 3);
-
-    filter_str = "cf < 10 AND cf1 >100 AND cf2 == world";
-    SetFilterString(filter_str);
-    EXPECT_FALSE(ParseFilterString());
-
-    filter_str = "cf0 < 10 OR cf1 >100 AND cf2 == world";
-    SetFilterString(filter_str);
-    EXPECT_FALSE(ParseFilterString());
-}
-
 } // namespace tera
diff --git a/src/sdk/test/sdk_test.cc b/src/sdk/test/sdk_test.cc
new file mode 100644
index 000000000..7177bdc3a
--- /dev/null
+++ b/src/sdk/test/sdk_test.cc
@@ -0,0 +1,16 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+DECLARE_bool(tera_sdk_tso_client_enabled);
+DECLARE_bool(tera_sdk_client_for_gtxn);
+
+// Test runner for the SDK unit tests. Before any test runs it forces
+// FLAGS_tera_sdk_client_for_gtxn on and FLAGS_tera_sdk_tso_client_enabled
+// off. This function does not call gflags' command-line parser itself.
+int main(int argc, char* argv[]) {
+    FLAGS_tera_sdk_client_for_gtxn = true;
+    FLAGS_tera_sdk_tso_client_enabled = false;
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/src/sdk/test/sdk_timeout_manager_test.cc b/src/sdk/test/sdk_timeout_manager_test.cc
new file mode 100644
index 000000000..84ea5a4c1
--- /dev/null
+++ b/src/sdk/test/sdk_timeout_manager_test.cc
@@ -0,0 +1,244 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <atomic>
+#include <thread>
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+#include "sdk_task.h"
+#include "common/counter.h"
+#include "common/timer.h"
+
+using std::string;
+using namespace std::placeholders;
+// Tunables for the tests below: size of the thread pool handed to the
+// SdkTimeoutManager, number of concurrent producer/consumer threads in the
+// performance tests, and how long (seconds) producers run in those tests.
+DEFINE_int32(thread_num, 10, "thread number for TimeoutManager ThreadPool");
+DEFINE_int32(perf_test_thead_num, 10, "thread number of put/pop");
+DEFINE_int32(perf_test_duration, 2, "seconds for performance test");
+namespace tera {
+
+// ANSI escape sequence used to colour the performance summaries.
+#define YELLOW  "\033[33m"      /* Yellow */
+
+// Number of times TimeoutFunc has run; cleared in SetUp().
+static Counter callback_called_times = Counter();
+// Source of task ids for AddTaskFunc/PopTaskFunc; cleared in SetUp().
+static Counter task_counter = Counter();
+
+// Minimal concrete SdkTask (constructed as a READ task) with no-op or
+// constant implementations of every member, so that the timeout manager can
+// be exercised without a real read or mutation.
+class TestTask : public SdkTask {
+public:
+    // Backing storage for RowKey(), which returns a reference.
+    std::string dummy_key;
+
+    TestTask() : SdkTask(SdkTask::READ) {}
+    virtual ~TestTask() {}
+
+    bool IsAsync() { return false; }
+    uint32_t Size() { return 0; }
+    int64_t TimeOut() { return 0; }
+    void Wait() {}
+    void SetError(ErrorCode::ErrorCodeType err,
+                  const std::string& reason) {}
+    const std::string& RowKey() { return dummy_key; }
+};
+
+// Fixture that gives each test a fresh SdkTimeoutManager backed by a
+// thread pool of --thread_num threads, and resets the global counters.
+// NOTE(review): the members are private yet used from TEST_F bodies, and the
+// tests also reach into SdkTimeoutManager internals -- presumably the test
+// build disables access control; confirm in the build flags.
+class SdkTimeoutManagerTest : public ::testing::Test {
+public:
+    SdkTimeoutManagerTest() : thread_pool_(FLAGS_thread_num), timeout_manager_(NULL) {}
+
+    // Creates the manager under test and zeroes both global counters.
+    virtual void SetUp() {
+        timeout_manager_ = new SdkTimeoutManager(&thread_pool_);
+        ASSERT_TRUE(timeout_manager_ != NULL);
+        callback_called_times.Clear();
+        task_counter.Clear();
+    }
+    // Destroys the manager created in SetUp().
+    virtual void TearDown() {
+        delete timeout_manager_;
+    }
+
+private:
+    common::ThreadPool thread_pool_;
+    SdkTimeoutManager* timeout_manager_ = NULL;
+};
+
+// Timeout callback used by every test: only counts how often it was invoked.
+static void TimeoutFunc(SdkTask* task) {
+    callback_called_times.Add(1);
+}
+
+// TimeoutFunc wrapped in the callable type expected by PutTask().
+static SdkTask::TimeoutFunc timeout_func = std::bind(TimeoutFunc, _1);
+
+// Single-threaded functional + throughput check: put LOOP_CNT tasks with
+// distinct ids, verify the per-shard bookkeeping is consistent, then pop
+// every task again and report put/pop rates.
+TEST_F(SdkTimeoutManagerTest, PutTaskPopTaskTest) {
+    const int32_t LOOP_CNT = 10000;
+    int64_t put_start_time = get_micros();
+    bool succ = true;
+    for (int32_t i = 0; i < LOOP_CNT; ++i) {
+        TestTask* sdk_task = new TestTask();
+        // Ids are assigned in descending order: LOOP_CNT .. 1.
+        sdk_task->SetId(LOOP_CNT - i);
+        succ &= timeout_manager_->PutTask(sdk_task, 5000, timeout_func);
+    }
+    EXPECT_TRUE(succ);
+    int64_t put_done_time = get_micros();
+
+    // Every shard must hold the same number of entries in both of its
+    // indexes, and all shards together must hold every task.
+    uint32_t task_cnt = 0;
+    for (uint32_t i = 0; i < SdkTimeoutManager::kShardNum; ++i) {
+        uint32_t shard_due_cnt = timeout_manager_->map_shard_[i].due_time_map.size();
+        EXPECT_EQ(shard_due_cnt, timeout_manager_->map_shard_[i].id_hash_map.size());
+        task_cnt += shard_due_cnt;
+    }
+    EXPECT_EQ(task_cnt, LOOP_CNT);
+
+    int64_t pop_start_time = get_micros();
+    for (uint32_t shard_idx = 0; shard_idx < SdkTimeoutManager::kShardNum; ++shard_idx) {
+        SdkTimeoutManager::DueTimeMap& due_time_map =
+                timeout_manager_->map_shard_[shard_idx].due_time_map;
+        uint32_t shard_task_cnt = due_time_map.size();
+        uint32_t shard_pop_cnt = 0;
+        // Drain the shard by repeatedly popping its first entry.
+        while (!due_time_map.empty()) {
+            SdkTask* task = timeout_manager_->PopTask((*due_time_map.begin())->GetId());
+            EXPECT_TRUE(task != NULL);
+            shard_pop_cnt += 1;
+            delete static_cast<TestTask*>(task);
+        }
+        EXPECT_EQ(shard_pop_cnt, shard_task_cnt);
+    }
+    int64_t pop_done_time = get_micros();
+
+    // "+ 1" keeps the divisor non-zero when a phase finishes within 1us.
+    std::cout << YELLOW << "SdkTimeoutManager performance(single thread): "
+        << "\n\t\tPutTask: " << int(LOOP_CNT / ((put_done_time - put_start_time + 1) / 1000000.0))
+        << "\n\t\tPopTask: " << int(LOOP_CNT / ((pop_done_time - pop_start_time + 1) / 1000000.0))
+        << std::endl;
+}
+
+// Verifies that timed-out tasks get their timeout callback exactly once and
+// that PutTask() rejects a task whose id is already registered.
+TEST_F(SdkTimeoutManagerTest, CheckTimeout) {
+    const int32_t LOOP_CNT = 10000;
+    std::vector<TestTask*> tasks;
+    tasks.reserve(LOOP_CNT);
+    bool succ = true;
+    for (int32_t i = 0; i < LOOP_CNT; ++i) {
+        TestTask* sdk_task = new TestTask();
+        sdk_task->SetId(i + 1);
+        succ &= timeout_manager_->PutTask(sdk_task, 500, timeout_func);
+        tasks.push_back(sdk_task);
+    }
+    // All ids are distinct, so every PutTask above must have been accepted.
+    EXPECT_TRUE(succ);
+    // waiting until all SdkTasks have been check timeout and their TimeoutFunc been put to thread pool to execute
+    for (uint32_t shard = 0; shard < SdkTimeoutManager::kShardNum; ++shard) {
+        while (!timeout_manager_->map_shard_[shard].due_time_map.empty()){
+            usleep(timeout_manager_->timeout_precision_);
+        }
+    }
+    // waiting another 250ms until all TimeoutFunc in thread_pool have been done
+    usleep(250000);
+    EXPECT_EQ(callback_called_times.Get(), LOOP_CNT);
+
+    // Id 100 has timed out and left the manager, so it can be registered
+    // again -- but only once while it is pending.
+    TestTask* sdk_task = new TestTask();
+    sdk_task->SetId(100);
+    EXPECT_TRUE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+    tasks.push_back(sdk_task);
+    EXPECT_FALSE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+
+    // A different task object with the same id is rejected as well.
+    sdk_task = new TestTask();
+    sdk_task->SetId(100);
+    EXPECT_FALSE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+    tasks.push_back(sdk_task);
+
+    // ... and still rejected shortly afterwards, while the first is pending.
+    usleep(1000);
+    sdk_task = new TestTask();
+    sdk_task->SetId(100);
+    EXPECT_FALSE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+    tasks.push_back(sdk_task);
+    // waiting until all SdkTasks have been check timeout and their TimeoutFunc been put to thread pool to execute
+    for (uint32_t shard = 0; shard < SdkTimeoutManager::kShardNum; ++shard) {
+        while (!timeout_manager_->map_shard_[shard].due_time_map.empty()){
+            usleep(timeout_manager_->timeout_precision_);
+        }
+    }
+    // waiting another 250ms until all TimeoutFunc in thread_pool have been done
+    usleep(250000);
+    // Only the single accepted re-registration adds one more callback.
+    EXPECT_EQ(callback_called_times.Get(), 1 + LOOP_CNT);
+    for (std::size_t i = 0; i < tasks.size(); ++i) {
+        delete tasks[i];
+    }
+}
+
+// Stop flag for the producer threads. It is written by the test thread while
+// producers read it concurrently, so it is atomic: with a plain bool this
+// would be a data race and the loop would not be guaranteed to observe the
+// update.
+static std::atomic<bool> add_task_run(true);
+
+// Producer loop for the performance tests: keeps registering new tasks with
+// fresh ids (taken from task_counter) until add_task_run is cleared.
+static void AddTaskFunc(SdkTimeoutManager* mgr, int64_t timeout) {
+   while (add_task_run) {
+        SdkTask* task = new TestTask();
+        task->SetId(task_counter.Add(1));
+        mgr->PutTask(task, timeout, timeout_func);
+    }
+}
+
+// Consumer loop for the performance tests: claims task ids by counting
+// task_counter down and pops/deletes the matching task until the claimed id
+// is no longer positive.
+// NOTE(review): "Sub(1) + 1" presumably recovers the value before the
+// decrement (i.e. Sub returns the new value) -- confirm against Counter.
+static void PopTaskFunc(SdkTimeoutManager* mgr) {
+    int64_t task_id;
+    while ((task_id = task_counter.Sub(1) + 1) > 0) {
+        SdkTask* task = mgr->PopTask(task_id);
+        delete static_cast<TestTask*>(task);
+    }
+}
+
+// Multi-threaded throughput measurement: --perf_test_thead_num producers put
+// tasks for --perf_test_duration seconds, then the same number of consumers
+// pop them all; both rates are printed. The test makes no assertions.
+TEST_F(SdkTimeoutManagerTest, PutPopPerformance) {
+    std::vector<std::thread> threads;
+    threads.reserve(FLAGS_perf_test_thead_num);
+    add_task_run = true;
+    // Timeout is set one second beyond the producer phase.
+    // NOTE(review): this only makes sense if PutTask takes milliseconds -- confirm.
+    int64_t timeout = FLAGS_perf_test_duration * 1000 + 1000;
+    for (int32_t i = 0; i < FLAGS_perf_test_thead_num; ++i) {
+        threads.emplace_back(std::thread(std::bind(&AddTaskFunc, timeout_manager_, timeout)));
+    }
+    sleep(FLAGS_perf_test_duration);
+    add_task_run = false;
+    for (std::size_t i = 0; i < threads.size(); ++i) {
+        threads[i].join();
+    }
+    threads.clear();
+    // Snapshot the total before the consumers count task_counter back down.
+    int64_t task_cnt = task_counter.Get();
+
+    int64_t pop_start_time = get_micros();
+    for (int i = 0; i < FLAGS_perf_test_thead_num; ++i) {
+        threads.emplace_back(std::thread(std::bind(PopTaskFunc, timeout_manager_)));
+    }
+    for (std::size_t i = 0; i < threads.size(); ++i) {
+        threads[i].join();
+    }
+    int64_t pop_end_time = get_micros();
+    std::cout << YELLOW
+        << "SdkTimeoutManager performance(" << FLAGS_perf_test_thead_num <<" put/pop threads): "
+        << "\n\t\tPutTask: " << task_cnt / FLAGS_perf_test_duration
+        << "\n\t\tPopTask: " << int(task_cnt / ((pop_end_time - pop_start_time) / 1000000.0))
+        << std::endl;
+}
+
+// Throughput measurement of timeout detection: producers register tasks with
+// a minimal timeout for --perf_test_duration seconds while the manager fires
+// their callbacks; put, pending and callback rates are printed. The test
+// makes no assertions and uses its own thread pool and manager instead of
+// the fixture's.
+TEST_F(SdkTimeoutManagerTest, CheckTimeoutPerformance) {
+    common::ThreadPool thread_pool(FLAGS_thread_num);
+    SdkTimeoutManager* timeout_mgr = new SdkTimeoutManager(&thread_pool);
+
+    std::vector<std::thread> threads;
+    threads.reserve(FLAGS_perf_test_thead_num);
+    add_task_run = true;
+    // timeout set to 1us
+    // NOTE(review): the other tests pass values that look like milliseconds
+    // to PutTask, so the unit stated above may be wrong -- confirm.
+    int64_t timeout = 1;
+    int64_t start_time = get_micros();
+    for (int32_t i = 0; i < FLAGS_perf_test_thead_num; ++i) {
+        threads.emplace_back(std::thread(std::bind(&AddTaskFunc, timeout_mgr, timeout)));
+    }
+    sleep(FLAGS_perf_test_duration);
+    add_task_run = false;
+    int64_t end_time = get_micros();
+    for (std::size_t i = 0; i < threads.size(); ++i) {
+        threads[i].join();
+    }
+    threads.clear();
+    // Tasks registered but whose callback has not run yet count as pending.
+    int64_t callback_run_cnt = callback_called_times.Get();
+    int64_t pending_cnt = task_counter.Get() - callback_run_cnt;
+    delete timeout_mgr;
+
+    std::cout << YELLOW
+        << "SdkTimeoutManager performance@CheckTimeout("
+        << FLAGS_perf_test_thead_num <<" put threads, "
+        << FLAGS_thread_num << "TimeoutFunc run threads): "
+        << "\n\t\tPutTask: " << task_counter.Get() / FLAGS_perf_test_duration
+        << "\n\t\tPending: " << pending_cnt / FLAGS_perf_test_duration
+        << "\n\t\tCheckTimeout: " <<callback_run_cnt / FLAGS_perf_test_duration << ","
+        << int(task_counter.Get() / ((end_time - start_time) / 1000000.0))
+        << std::endl;
+}
+
+
+} // namespace tera
diff --git a/src/sdk/test/sdk_utils_test.cc b/src/sdk/test/sdk_utils_test.cc
index 9a032cf48..559a7baf2 100644
--- a/src/sdk/test/sdk_utils_test.cc
+++ b/src/sdk/test/sdk_utils_test.cc
@@ -1,221 +1,117 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
 
-#include "sdk/sdk_utils.h"
+#include "sdk_utils.h"
 
-#include "sdk/scan_impl.h"
 #include "gtest/gtest.h"
 
 namespace tera {
-namespace sdk {
 
-TEST(SdkUtils, CheckName) {
-    EXPECT_TRUE(CheckName("hel_lo"));
-    EXPECT_TRUE(CheckName(""));
-    EXPECT_FALSE(CheckName("0hel_lo"));
-    EXPECT_FALSE(CheckName("h.el_lo"));
-    EXPECT_FALSE(CheckName("h el_lo"));
-    EXPECT_FALSE(CheckName("he#l_lo"));
+class SdkUtilsTest : public ::testing::Test {  // empty gtest fixture: no shared state or set-up
+public:
+    SdkUtilsTest() {}
+    ~SdkUtilsTest() {}  // NOTE(review): the cases in this file use TEST, not TEST_F, so this fixture looks unused
+};
+
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor0) {
+    // Every cf has notify disabled: the call succeeds and the lg count is unchanged.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("lg0");
+    tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1");
+    cfd1->DisableNotify();
+    tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2");
+    cfd2->DisableNotify();
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num, schema.LocalityGroupNum());
 }
 
-TEST(SdkUtils, ParseCfNameType) {
-    string in, name, type;
-
-    in = "cf";
-    ASSERT_TRUE(ParseCfNameType(in, &name, &type));
-    ASSERT_TRUE(name == "cf");
-    ASSERT_TRUE(type == "");
-    ASSERT_TRUE(ParseCfNameType(in, NULL, &type));
-
-    in = "";
-    ASSERT_TRUE(ParseCfNameType(in, &name, &type));
-    ASSERT_TRUE(name == "");
-    ASSERT_TRUE(type == "");
-    ASSERT_TRUE(ParseCfNameType(in, &name, NULL));
-
-    in = "cf<int>";
-    ASSERT_TRUE(ParseCfNameType(in, &name, &type));
-    ASSERT_TRUE(name == "cf");
-    ASSERT_TRUE(type == "int");
-    ASSERT_TRUE(ParseCfNameType(in, NULL, NULL));
-
-    in = "<int>";
-    ASSERT_TRUE(ParseCfNameType(in, &name, &type));
-    ASSERT_TRUE(name == "");
-    ASSERT_TRUE(type == "int");
-
-    in = "cf<";
-    ASSERT_FALSE(ParseCfNameType(in, &name, &type));
-
-    in = "cf1int>";
-    ASSERT_FALSE(ParseCfNameType(in, &name, &type));
-
-    in = "<>";
-    ASSERT_FALSE(ParseCfNameType(in, &name, &type));
-}
-
-TEST(SdkUtils, CommaInBracket) {
-    string test;
-
-    test = "0123,{67,90,23},6,89,1{3,567,}01{345}789,12";
-    EXPECT_TRUE(CommaInBracket(test, 8));
-    EXPECT_TRUE(CommaInBracket(test, 13));
-    EXPECT_TRUE(CommaInBracket(test, 23));
-    EXPECT_TRUE(CommaInBracket(test, 27));
-    EXPECT_TRUE(CommaInBracket(test, 34));
-
-    EXPECT_FALSE(CommaInBracket(test, 2));
-    EXPECT_FALSE(CommaInBracket(test, 4));
-    EXPECT_FALSE(CommaInBracket(test, 15));
-    EXPECT_FALSE(CommaInBracket(test, 20));
-    EXPECT_FALSE(CommaInBracket(test, 37));
-}
-
-TEST(SdkUtils, SplitCfSchema) {
-    string schema;
-    std::vector<string> cfs;
-
-    schema = "cf1";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 1);
-
-    schema = "cf1,cf2,cf3";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 3);
-
-    schema = "cf2{prop1,prop2}";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 1);
-
-    schema = "cf1,cf2{prop1,prop2},cf3{prop2}";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 3);
-
-    schema = "cf1{prop1,prop2,prop3},cf2,cf3{prop1,prop2,prop3}";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 3);
-
-    schema = "cf1{prop1,prop2,prop3},cf2{prop1,prop2,prop3},cf3";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 3);
-
-    schema = "cf1,cf2{prop1,prop2,prop3},cf3{prop1,prop2,prop3}";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 3);
-
-    schema = "cf1{prop1,prop2,prop3},cf2{prop1,prop2,prop3},cf3{prop1,prop2,prop3}";
-    SplitCfSchema(schema, &cfs);
-    EXPECT_EQ(cfs.size(), 3);
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor1) {
+    // cf1 has notify enabled, cf2 disabled: the call succeeds and adds exactly one lg.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("lg0");
+    tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1");
+    cfd1->EnableNotify();
+    tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2");
+    cfd2->DisableNotify();
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num + 1, schema.LocalityGroupNum());
 }
 
-TEST(SdkUtils, ParseProperty) {
-    string schema;
-    PropertyList prop_list;
-    string name;
-
-    schema = "name{prop1,prop2=value2,prop3=value3}";
-    ASSERT_TRUE(ParseProperty(schema, &name, &prop_list));
-    ASSERT_TRUE(name == "name");
-    ASSERT_EQ(prop_list.size(), 3);
-    ASSERT_TRUE(prop_list[0].first == "prop1");
-    ASSERT_TRUE(prop_list[0].second == "");
-    ASSERT_TRUE(prop_list[1].first == "prop2");
-    ASSERT_TRUE(prop_list[1].second == "value2");
-    ASSERT_TRUE(prop_list[2].first == "prop3");
-    ASSERT_TRUE(prop_list[2].second == "value3");
-
-    schema = "{prop1,prop2=value2}";
-    ASSERT_TRUE(ParseProperty(schema, &name, &prop_list));
-    ASSERT_TRUE(name == "");
-    ASSERT_EQ(prop_list.size(), 2);
-    ASSERT_TRUE(prop_list[0].first == "prop1");
-    ASSERT_TRUE(prop_list[0].second == "");
-    ASSERT_TRUE(prop_list[1].first == "prop2");
-    ASSERT_TRUE(prop_list[1].second == "value2");
-
-    schema = "name";
-    ASSERT_TRUE(ParseProperty(schema, &name, &prop_list));
-    ASSERT_TRUE(name == "name");
-    ASSERT_EQ(prop_list.size(), 0);
-
-    schema = "";
-    ASSERT_TRUE(ParseProperty(schema, &name, &prop_list));
-    ASSERT_TRUE(name == "");
-    ASSERT_EQ(prop_list.size(), 0);
-
-    schema = "nameprop1,prop2=value2,prop3=value3}";
-    ASSERT_FALSE(ParseProperty(schema, &name, &prop_list));
-
-    schema = "name{prop1,pr'op2=value2,prop3=value3}";
-    ASSERT_FALSE(ParseProperty(schema, &name, &prop_list));
-
-    schema = "name{0prop1,prop2=value2,prop3=value3}";
-    ASSERT_FALSE(ParseProperty(schema, &name, &prop_list));
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor2) {
+    // cf1 has notify disabled, cf2 enabled: the call succeeds and adds exactly one lg.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("lg0");
+    tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1");
+    cfd1->DisableNotify();
+    tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2");
+    cfd2->EnableNotify();
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num + 1, schema.LocalityGroupNum());
 }
 
-TEST(SdkUtils, ParseScanSchema) {
-    ScanDescriptor desc("row1");
-    ScanDescImpl* impl;
-    string schema;
-
-    schema = "SELECT cf0,cf1:qu2";
-    ASSERT_TRUE(ParseScanSchema(schema, &desc));
-    impl = desc.GetImpl();
-    ASSERT_EQ(impl->GetSizeofColumnFamilyList(), 2);
-    ASSERT_TRUE(impl->GetFilterString() == "");
-
-    schema = "SELECT cf0,cf1:qu2 WHERE cf0 < 10 AND cf1 > 23";
-    ASSERT_TRUE(ParseScanSchema(schema, &desc));
-    impl = desc.GetImpl();
-    ASSERT_EQ(impl->GetSizeofColumnFamilyList(), 2);
-    ASSERT_TRUE(impl->GetFilterString() == "cf0 < 10 AND cf1 > 23");
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor3) {
+    // Every cf has notify enabled: the call succeeds and still adds only one lg.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("lg0");
+    tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1");
+    cfd1->EnableNotify();
+    tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2");
+    cfd2->EnableNotify();
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num + 1, schema.LocalityGroupNum());
 }
 
-TEST(SdkUtils, BuildSchema) {
-    string schema = "lg0:cf1,cf2|lg3:cf3,cf4,cf5";
-
-    TableDescriptor table_desc("unittest");
-    ParseSchema(schema, &table_desc);
-
-    string schema_t;
-    BuildSchema(&table_desc, &schema_t);
-    EXPECT_TRUE(schema == schema_t);
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor4) {
+    // An lg named 'notify' already exists but no cf enables notify: succeeds, lg count unchanged.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("notify");
+    schema.AddColumnFamily("cf1", "notify");
+    schema.AddColumnFamily("cf2", "notify");
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num, schema.LocalityGroupNum());
 }
 
-TEST(SdkUtils, HasInvalidCharInSchema) {
-    EXPECT_FALSE(HasInvalidCharInSchema(""));
-    EXPECT_FALSE(HasInvalidCharInSchema("table:splitsize=3,lg0:compress=none"));
-
-    EXPECT_TRUE(HasInvalidCharInSchema("\n \t`~!@#$%^&*()-+{}[]\\|;\"'.<>?/"));
-    EXPECT_TRUE(HasInvalidCharInSchema("table:splitsize=3;lg0:compress=none"));
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor5) {
+    // An lg named 'notify' already exists and cf1 enables notify: the call fails, lg count unchanged.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("notify");
+    tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1", "notify");
+    cfd1->EnableNotify();
+    schema.AddColumnFamily("cf2", "notify");
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_FALSE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num, schema.LocalityGroupNum());
 }
 
-TEST(SdkUtils, PrefixType) {
-    EXPECT_TRUE(PrefixType("compress") == "lg");
-    EXPECT_TRUE(PrefixType("storage") == "lg");
-    EXPECT_TRUE(PrefixType("blocksize") == "lg");
-    EXPECT_TRUE(PrefixType("ttl") == "cf");
-    EXPECT_TRUE(PrefixType("maxversions") == "cf");
-    EXPECT_TRUE(PrefixType("minversions") == "cf");
-    EXPECT_TRUE(PrefixType("diskquota") == "cf");
-    EXPECT_TRUE(PrefixType("splitsize") == "unknown"); // only support lg && cf
-    EXPECT_TRUE(PrefixType("anythingother") == "unknown");
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor6) {
+    // A cf named '_N_' already exists but no cf enables notify: succeeds, lg count unchanged.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("lg0");
+    schema.AddColumnFamily("_N_");
+    schema.AddColumnFamily("cf2");
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num, schema.LocalityGroupNum());
 }
 
-TEST(SdkUtils, ParsePrefixPropertyValue) {
-    string prefix;
-    string property;
-    string value;
-    EXPECT_TRUE(ParsePrefixPropertyValue("lg123:compress=none", prefix, property, value));
-
-    EXPECT_FALSE(ParsePrefixPropertyValue(":ttl=3", prefix, property, value));
-    EXPECT_FALSE(ParsePrefixPropertyValue("cf123:=3", prefix, property, value));
-    EXPECT_FALSE(ParsePrefixPropertyValue("cf123:ttl=", prefix, property, value));
-    EXPECT_FALSE(ParsePrefixPropertyValue("ttl", prefix, property, value));
-    EXPECT_FALSE(ParsePrefixPropertyValue("cf123:ttl", prefix, property, value));
-    EXPECT_FALSE(ParsePrefixPropertyValue("cf123:ttl:3", prefix, property, value));
+TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor7) {
+    // A cf named '_N_' already exists and enables notify: the call fails, lg count unchanged.
+    tera::TableDescriptor schema("t1");
+    schema.AddLocalityGroup("lg0");
+    tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("_N_");
+    cfd1->EnableNotify();
+    schema.AddColumnFamily("cf2");
+    auto before_num = schema.LocalityGroupNum();
+    EXPECT_FALSE(ExtendNotifyLgToDescriptor(&schema));
+    EXPECT_EQ(before_num, schema.LocalityGroupNum());
 }
 
-} // namespace sdk
 } // namespace tera
diff --git a/src/sdk/timeoracle_client_impl.cc b/src/sdk/timeoracle_client_impl.cc
new file mode 100644
index 000000000..7f0e16b6e
--- /dev/null
+++ b/src/sdk/timeoracle_client_impl.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "sdk/timeoracle_client_impl.h"
+#include <mutex>
+#include <memory>
+
+#include "common/timer.h"
+
+namespace tera {
+namespace timeoracle {
+
+TimeoracleClientImpl::TimeoracleClientImpl(ThreadPool* thread_pool,
+                                           sdk::ClusterFinder* cluster_finder,
+                                           int32_t rpc_timeout) :
+    RpcClient<TimeoracleServer::Stub>(cluster_finder->TimeoracleAddr()),  // cluster_finder is dereferenced here without a null check
+    thread_pool_(thread_pool),
+    rpc_timeout_(rpc_timeout),
+    update_timestamp_(0),  // 0 until refresh_timeoracle_address() stamps its first refresh
+    cluster_finder_(cluster_finder) {}  // kept for later address lookups; never deleted by this class
+
+void TimeoracleClientImpl::refresh_timeoracle_address(int64_t last_timestamp) {
+    std::lock_guard<std::mutex> guard(mutex_);  // one refresh at a time
+    if (last_timestamp > 0 && last_timestamp < update_timestamp_) {
+        return;  // a refresh already happened after the given timestamp
+    }
+    // A non-positive last_timestamp (what both GetTimestamp overloads pass) always forces a lookup.
+    LOG(INFO) << "TimeoracleClientImpl try to update cluster, before is " << GetConnectAddr();
+    std::string fresh_addr = cluster_finder_->TimeoracleAddr(true);
+    ResetClient(fresh_addr);
+    LOG(INFO) << "TimeoracleClientImpl update cluster, current is " << GetConnectAddr();
+    update_timestamp_ = get_micros();
+}
+
+int64_t TimeoracleClientImpl::GetTimestamp(uint32_t count) {
+    // Returns the response's start timestamp, or 0 when the RPC fails or the status is not OK.
+    GetTimestampRequest request;
+    GetTimestampResponse response;
+    request.set_count(count);
+
+    // An empty std::function is passed as the completion callback.
+    std::function<void (const GetTimestampRequest*, GetTimestampResponse*, bool, int)> done;
+    const bool rpc_ok = SendMessageWithRetry(&TimeoracleServer::Stub::GetTimestamp,
+                                             &request,
+                                             &response,
+                                             done,
+                                             "GetTimestamp",
+                                             rpc_timeout_,
+                                             thread_pool_);
+    if (!rpc_ok) {
+        // Rpc Failed
+        refresh_timeoracle_address(0);
+        return 0;
+    }
+
+    if (response.status() != kTimeoracleOk) {
+        // Internal Error
+        return 0;
+    }
+    return response.start_timestamp();
+}
+
+bool TimeoracleClientImpl::GetTimestamp(uint32_t count, std::function<void (int64_t)> callback) {
+    // Callback variant: both messages are heap-allocated here and released in OnRpcFinished.
+    GetTimestampRequest* request = new GetTimestampRequest();
+    GetTimestampResponse* response = new GetTimestampResponse();
+    request->set_count(count);
+    int64_t start_time = get_micros();
+    // Pre-bind the start time and user callback; the last four arguments are filled in at completion.
+    std::function<void (const GetTimestampRequest*, GetTimestampResponse*, bool, int)> done
+        = std::bind(&TimeoracleClientImpl::OnRpcFinished, this, start_time, callback,
+                    std::placeholders::_1, std::placeholders::_2,
+                    std::placeholders::_3, std::placeholders::_4);
+
+    const bool sent = SendMessageWithRetry(&TimeoracleServer::Stub::GetTimestamp,
+                                           request,
+                                           response,
+                                           done,
+                                           "GetTimestamp",
+                                           rpc_timeout_,
+                                           thread_pool_);
+    if (!sent) {
+        // Rpc Failed. NOTE(review): confirm who frees request/response and whether callback fires here.
+        refresh_timeoracle_address(0);
+    }
+    return sent;
+}
+
+void TimeoracleClientImpl::OnRpcFinished(int64_t start_time,
+                                         std::function<void (int64_t)> callback,
+                                         const GetTimestampRequest* request,
+                                         GetTimestampResponse* response,
+                                         bool rpc_error,
+                                         int  error_code){
+    // Completion handler bound by the callback variant of GetTimestamp.
+    // Take ownership of both messages so they are deleted on every return path.
+    std::unique_ptr<const GetTimestampRequest> request_owner(request);
+    std::unique_ptr<GetTimestampResponse> response_owner(response);
+
+    // RPC-level failure: report 0 first, then re-resolve the server address.
+    if (rpc_error) {
+        LOG(ERROR) << "RpcRequest failed for GetTimestamp, errno=" << error_code;
+        callback(0);
+        refresh_timeoracle_address(start_time);
+        return;
+    }
+
+    // The RPC itself succeeded: hand over the start timestamp,
+    // or 0 when the response status is anything other than kTimeoracleOk.
+    const int64_t start_ts = response->start_timestamp();
+    const bool status_ok = (response->status() == kTimeoracleOk);
+
+    callback(status_ok ? start_ts : 0);
+}
+
+} // namespace timeoracle
+} // namespace tera
diff --git a/src/sdk/timeoracle_client_impl.h b/src/sdk/timeoracle_client_impl.h
new file mode 100644
index 000000000..e47fe9995
--- /dev/null
+++ b/src/sdk/timeoracle_client_impl.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_SDK_TIMEORACLE_CLIENT_IMPL_H_
+#define TERA_SDK_TIMEORACLE_CLIENT_IMPL_H_
+
+#include <memory>
+#include <mutex>
+#include <gflags/gflags.h>
+#include <sofa/pbrpc/pbrpc.h>
+
+#include "proto/timeoracle_rpc.pb.h"
+#include "proto/rpc_client.h"
+#include "sdk/sdk_zk.h"
+
+DECLARE_int32(tera_rpc_timeout_period);
+
+namespace tera {
+namespace timeoracle {
+
+class TimeoracleClientImpl : public RpcClient<TimeoracleServer::Stub> {  // client for the TimeoracleServer GetTimestamp RPC
+public:
+    TimeoracleClientImpl(ThreadPool* thread_pool,
+            sdk::ClusterFinder* cluster_finder,
+            int32_t rpc_timeout = FLAGS_tera_rpc_timeout_period);
+
+    ~TimeoracleClientImpl() {}
+    // Returns the response's start timestamp, or 0 if the RPC fails or the status is not kTimeoracleOk.
+    int64_t GetTimestamp(uint32_t count);
+    // Callback variant: returns whether the request was sent; the timestamp (0 on error) goes to `callback`.
+    bool GetTimestamp(uint32_t count, std::function<void (int64_t)> callback);
+
+private:
+    void refresh_timeoracle_address(int64_t last_timestamp);  // re-resolves the server address via cluster_finder_
+    // Completion handler for the callback variant; takes ownership of request and response.
+    void OnRpcFinished(int64_t start_time,
+                       std::function<void (int64_t)> callback,
+                       const GetTimestampRequest* request,
+                       GetTimestampResponse* response,
+                       bool rpc_error,
+                       int  error_code);
+
+private:
+    ThreadPool* thread_pool_;  // passed to every SendMessageWithRetry call
+    int32_t rpc_timeout_;
+
+    std::mutex                                              mutex_;  // held for the whole of refresh_timeoracle_address()
+    int64_t                                                 update_timestamp_;  // get_micros() at the last refresh, 0 before any
+    sdk::ClusterFinder*                                     cluster_finder_;  // not owned by this class
+};
+
+} // namespace timeoracle
+} // namespace tera
+
+#endif // TERA_SDK_TIMEORACLE_CLIENT_IMPL_H_
diff --git a/src/tabletnode/remote_tabletnode.cc b/src/tabletnode/remote_tabletnode.cc
index 2d95a0e5a..87f1a71de 100644
--- a/src/tabletnode/remote_tabletnode.cc
+++ b/src/tabletnode/remote_tabletnode.cc
@@ -5,14 +5,18 @@
 #include "tabletnode/remote_tabletnode.h"
 
 #include <functional>
+#include <memory>
 
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 
+#include "common/metric/metric_counter.h"
+#include "common/metric/ratio_subscriber.h"
+#include "common/metric/prometheus_subscriber.h"
 #include "tabletnode/tabletnode_impl.h"
-#include "utils/counter.h"
+#include "tabletnode/tabletnode_metric_name.h"
 #include "utils/network_utils.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 
 DECLARE_int32(tera_tabletnode_ctrl_thread_num);
 DECLARE_int32(tera_tabletnode_write_thread_num);
@@ -22,14 +26,93 @@ DECLARE_int32(tera_tabletnode_manual_compact_thread_num);
 DECLARE_int32(tera_request_pending_limit);
 DECLARE_int32(tera_scan_request_pending_limit);
 
-extern tera::Counter read_pending_counter;
-extern tera::Counter write_pending_counter;
-extern tera::Counter scan_pending_counter;
-extern tera::Counter compact_pending_counter;
-
 namespace tera {
 namespace tabletnode {
 
+// Read and write also register SubscriberType::SUM, which is used for calculating SLA; scan registers QPS only.
+tera::MetricCounter read_request_counter(kRequestCountMetric, kApiLabelRead, 
+                                         {SubscriberType::QPS, SubscriberType::SUM});
+tera::MetricCounter write_request_counter(kRequestCountMetric, kApiLabelWrite, 
+                                          {SubscriberType::QPS, SubscriberType::SUM});
+tera::MetricCounter scan_request_counter(kRequestCountMetric, kApiLabelScan, {SubscriberType::QPS});
+
+tera::MetricCounter read_pending_counter(kPendingCountMetric, kApiLabelRead, {SubscriberType::LATEST}, false);
+tera::MetricCounter write_pending_counter(kPendingCountMetric, kApiLabelWrite, {SubscriberType::LATEST}, false);
+tera::MetricCounter scan_pending_counter(kPendingCountMetric, kApiLabelScan, {SubscriberType::LATEST}, false);
+tera::MetricCounter compact_pending_counter(kPendingCountMetric, kApiLabelCompact, {SubscriberType::LATEST}, false);
+
+// Read and write also register SubscriberType::SUM, which is used for calculating SLA; scan registers QPS only.
+tera::MetricCounter read_reject_counter(kRejectCountMetric, kApiLabelRead, 
+                                        {SubscriberType::QPS, SubscriberType::SUM});
+tera::MetricCounter write_reject_counter(kRejectCountMetric, kApiLabelWrite, 
+                                         {SubscriberType::QPS, SubscriberType::SUM});
+tera::MetricCounter scan_reject_counter(kRejectCountMetric, kApiLabelScan, {SubscriberType::QPS});
+
+tera::MetricCounter finished_read_request_counter(kFinishedRequestCountMetric, kApiLabelRead, {SubscriberType::QPS});
+tera::MetricCounter finished_write_request_counter(kFinishedRequestCountMetric, kApiLabelWrite, {SubscriberType::QPS});
+tera::MetricCounter finished_scan_request_counter(kFinishedRequestCountMetric, kApiLabelScan, {SubscriberType::QPS});
+
+// These three metrics are not auto-registered with a subscriber; the ratio subscribers below consume them.
+tera::MetricCounter read_delay(kRequestDelayMetric, kApiLabelRead, {});
+tera::MetricCounter write_delay(kRequestDelayMetric, kApiLabelWrite, {});
+tera::MetricCounter scan_delay(kRequestDelayMetric, kApiLabelScan, {});
+
+tera::AutoSubscriberRegister rand_read_delay_per_request(std::unique_ptr<Subscriber>(new tera::RatioSubscriber(  // read: delay SUM paired with finished-count SUM
+    MetricId("tera_ts_read_delay_us_per_request"),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRequestDelayMetric, kApiLabelRead), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kFinishedRequestCountMetric, kApiLabelRead), SubscriberType::SUM)))));
+
+tera::AutoSubscriberRegister write_delay_per_request(std::unique_ptr<Subscriber>(new tera::RatioSubscriber(  // write: delay SUM paired with finished-count SUM
+    MetricId("tera_ts_write_delay_us_per_request"),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRequestDelayMetric, kApiLabelWrite), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kFinishedRequestCountMetric, kApiLabelWrite), SubscriberType::SUM)))));
+
+tera::AutoSubscriberRegister scan_delay_per_request(std::unique_ptr<Subscriber>(new tera::RatioSubscriber(  // scan: delay SUM paired with finished-count SUM
+    MetricId("tera_ts_scan_delay_us_per_request"),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kRequestDelayMetric, kApiLabelScan), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new tera::PrometheusSubscriber(MetricId(kFinishedRequestCountMetric, kApiLabelScan), SubscriberType::SUM)))));
+
+void ReadDoneWrapper::Run() {
+    if (response_->has_detail()) {  // only responses that carry a detail section are measured
+        const int64_t finish_us = get_micros();
+        const int64_t elapsed_us = finish_us - start_micros_;
+        if (elapsed_us <= 0) {
+            LOG(ERROR) << "now us: "<< finish_us << " start_us: "<< start_micros_;
+        }
+        finished_read_request_counter.Add(response_->detail().status_size());
+        read_delay.Add(elapsed_us);  // still added when the elapsed time is non-positive
+    }
+    delete this;  // the wrapper frees itself once it has run
+}
+
+void WriteDoneWrapper::Run() {
+    const int finished_rows = response_->row_status_list_size();
+    if (finished_rows != 0) {  // only responses with at least one row status are measured
+        const int64_t finish_us = get_micros();
+        const int64_t elapsed_us = finish_us - start_micros_;
+        if (elapsed_us <= 0) {
+            LOG(ERROR) << "now us: "<< finish_us << " start_us: "<< start_micros_;
+        }
+        finished_write_request_counter.Add(finished_rows);
+        write_delay.Add(elapsed_us);  // still added when the elapsed time is non-positive
+    }
+    delete this;  // the wrapper frees itself once it has run
+}
+
+void ScanDoneWrapper::Run() {
+    if (response_->has_results()) {  // only responses that carry results are measured
+        const int64_t finish_us = get_micros();
+        const int64_t elapsed_us = finish_us - start_micros_;
+        if (elapsed_us <= 0) {
+            LOG(ERROR) << "now us: "<< finish_us << " start_us: "<< start_micros_;
+        }
+        // The finished counter advances by the number of key-values returned, not by one per scan.
+        finished_scan_request_counter.Add(response_->results().key_values_size());
+        scan_delay.Add(elapsed_us);
+    }
+    delete this;  // the wrapper frees itself once it has run
+}
+
 enum RpcType {
     RPC_READ = 1,
     RPC_SCAN = 2
@@ -105,11 +188,16 @@ void RemoteTabletNode::ReadTablet(google::protobuf::RpcController* controller,
                                   const ReadTabletRequest* request,
                                   ReadTabletResponse* response,
                                   google::protobuf::Closure* done) {
+    int64_t start_micros = get_micros();
+    done = ReadDoneWrapper::NewInstance(start_micros, response, done);
     VLOG(8) << "accept RPC (ReadTablet): [" << request->tablet_name() << "] " << tera::utils::GetRemoteAddress(controller);
     static uint32_t last_print = time(NULL);
+    int32_t row_num = request->row_info_list_size();
+    read_request_counter.Add(row_num);
     if (read_pending_counter.Get() > FLAGS_tera_request_pending_limit) {
         response->set_sequence_id(request->sequence_id());
         response->set_status(kTabletNodeIsBusy);
+        read_reject_counter.Add(row_num);
         done->Run();
         uint32_t now_time = time(NULL);
         if (now_time > last_print) {
@@ -118,9 +206,7 @@ void RemoteTabletNode::ReadTablet(google::protobuf::RpcController* controller,
         }
         VLOG(8) << "finish RPC (ReadTablet)";
     } else {
-        int32_t row_num = request->row_info_list_size();
         read_pending_counter.Add(row_num);
-        int64_t start_micros = get_micros();
         ReadRpcTimer* timer = new ReadRpcTimer(request, response, done, start_micros);
         RpcTimerList::Instance()->Push(timer);
 
@@ -136,11 +222,16 @@ void RemoteTabletNode::WriteTablet(google::protobuf::RpcController* controller,
                                    const WriteTabletRequest* request,
                                    WriteTabletResponse* response,
                                    google::protobuf::Closure* done) {
+    int64_t start_micros = get_micros();
+    done = WriteDoneWrapper::NewInstance(start_micros, response, done);
     VLOG(8) << "accept RPC (WriteTablet): [" << request->tablet_name() << "] " << tera::utils::GetRemoteAddress(controller);
     static uint32_t last_print = time(NULL);
+    int32_t row_num = request->row_list_size();
+    write_request_counter.Add(row_num);
     if (write_pending_counter.Get() > FLAGS_tera_request_pending_limit) {
         response->set_sequence_id(request->sequence_id());
         response->set_status(kTabletNodeIsBusy);
+        write_reject_counter.Add(row_num);
         done->Run();
         uint32_t now_time = time(NULL);
         if (now_time > last_print) {
@@ -149,9 +240,7 @@ void RemoteTabletNode::WriteTablet(google::protobuf::RpcController* controller,
         }
         VLOG(8) << "finish RPC (WriteTablet)";
     } else {
-        int32_t row_num = request->row_list_size();
         write_pending_counter.Add(row_num);
-        int64_t start_micros = get_micros();
         WriteRpcTimer* timer = new WriteRpcTimer(request, response, done, start_micros);
         RpcTimerList::Instance()->Push(timer);
         ThreadPool::Task callback =
@@ -165,10 +254,13 @@ void RemoteTabletNode::ScanTablet(google::protobuf::RpcController* controller,
                                   const ScanTabletRequest* request,
                                   ScanTabletResponse* response,
                                   google::protobuf::Closure* done) {
+    done = ScanDoneWrapper::NewInstance(get_micros(), response, done);
     VLOG(8) << "accept RPC (ScanTablet): [" << request->table_name() << "] " << tera::utils::GetRemoteAddress(controller);
+    scan_request_counter.Inc();
     if (scan_pending_counter.Get() > FLAGS_tera_scan_request_pending_limit) {
         response->set_sequence_id(request->sequence_id());
         response->set_status(kTabletNodeIsBusy);
+        scan_reject_counter.Inc();
         done->Run();
         VLOG(8) << "finish RPC (ScanTablet)";
     } else {
@@ -254,6 +346,18 @@ void RemoteTabletNode::SplitTablet(google::protobuf::RpcController* controller,
     ctrl_thread_pool_->AddTask(callback);
 }
 
+void RemoteTabletNode::ComputeSplitKey(google::protobuf::RpcController* controller,
+                                       const SplitTabletRequest* request,
+                                       SplitTabletResponse* response,
+                                       google::protobuf::Closure* done) {
+    // Log the incoming request, then queue DoComputeSplitKey on the control thread pool.
+    const uint64_t seq_id = request->sequence_id();
+    LOG(INFO) << "accept RPC (ComputeSplitKey) id: " << seq_id << ", src: " << tera::utils::GetRemoteAddress(controller);
+    ThreadPool::Task task = std::bind(&RemoteTabletNode::DoComputeSplitKey, this,
+                                      controller, request, response, done);
+    ctrl_thread_pool_->AddTask(task);
+}
+
 void RemoteTabletNode::CompactTablet(google::protobuf::RpcController* controller,
                                    const CompactTabletRequest* request,
                                    CompactTabletResponse* response,
@@ -322,7 +426,7 @@ void RemoteTabletNode::DoReadTablet(google::protobuf::RpcController* controller,
         int64_t read_timeout = request->client_timeout_ms() * 1000; // ms -> us
         int64_t detal = get_micros() - start_micros;
         if (detal > read_timeout) {
-            VLOG(5) << "timeout, drop read request for:" << request->tablet_name()
+            LOG(WARNING) << "timeout, drop read request for:" << request->tablet_name()
                 << ", detal(in us):" << detal
                 << ", read_timeout(in us):" << read_timeout;
             is_read_timeout = true;
@@ -335,6 +439,7 @@ void RemoteTabletNode::DoReadTablet(google::protobuf::RpcController* controller,
         response->set_sequence_id(request->sequence_id());
         response->set_success_num(0);
         response->set_status(kTableIsBusy);
+        read_reject_counter.Inc();
         done->Run();
     }
 
@@ -431,6 +536,16 @@ void RemoteTabletNode::DoSplitTablet(google::protobuf::RpcController* controller
     LOG(INFO) << "finish RPC (SplitTablet) id: " << id;
 }
 
+void RemoteTabletNode::DoComputeSplitKey(google::protobuf::RpcController* controller,
+                                         const SplitTabletRequest* request,
+                                         SplitTabletResponse* response,
+                                         google::protobuf::Closure* done) {
+    const uint64_t seq_id = request->sequence_id();  // read once up front; the final log does not touch request again
+    LOG(INFO) << "run RPC (ComputeSplitKey) id: " << seq_id;
+    tabletnode_impl_->ComputeSplitKey(request, response, done);
+    LOG(INFO) << "finish RPC (ComputeSplitKey) id: " << seq_id;
+}
+
 void RemoteTabletNode::DoCompactTablet(google::protobuf::RpcController* controller,
                                      const CompactTabletRequest* request,
                                      CompactTabletResponse* response,
diff --git a/src/tabletnode/remote_tabletnode.h b/src/tabletnode/remote_tabletnode.h
index 93e692121..936a3ff12 100644
--- a/src/tabletnode/remote_tabletnode.h
+++ b/src/tabletnode/remote_tabletnode.h
@@ -7,6 +7,7 @@
 
 #include "common/base/scoped_ptr.h"
 #include "common/thread_pool.h"
+#include "common/request_done_wrapper.h"
 
 #include "proto/tabletnode_rpc.pb.h"
 #include "tabletnode/rpc_schedule.h"
@@ -17,6 +18,82 @@ namespace tabletnode {
 
 class TabletNodeImpl;
 
+
+// Completion closure for a ReadTablet RPC. Holds the start time and response next to the
+// wrapped `done`; Run() is defined out of line.
+class ReadDoneWrapper final : public RequestDoneWrapper {
+public:
+    static google::protobuf::Closure* NewInstance(int64_t start_micros,
+                                                  ReadTabletResponse* response,
+                                                  google::protobuf::Closure* done) {
+        return new ReadDoneWrapper(start_micros, response, done);
+    }
+
+    void Run() override;
+
+    virtual ~ReadDoneWrapper() {}
+
+private:
+    // Not publicly constructible: instances are only created on the heap by NewInstance().
+    ReadDoneWrapper(int64_t start_micros,
+                    ReadTabletResponse* response,
+                    google::protobuf::Closure* done)
+        : RequestDoneWrapper(done), start_micros_(start_micros), response_(response) {}
+
+    int64_t start_micros_;          // value given to NewInstance(); microseconds per the name
+    ReadTabletResponse* response_;  // raw pointer supplied by the caller
+};
+
+// Completion closure for a WriteTablet RPC. Holds the start time and response next to the
+// wrapped `done`; Run() is defined out of line.
+class WriteDoneWrapper final : public RequestDoneWrapper {
+public:
+    static google::protobuf::Closure* NewInstance(int64_t start_micros,
+                                                  WriteTabletResponse* response,
+                                                  google::protobuf::Closure* done) {
+        return new WriteDoneWrapper(start_micros, response, done);
+    }
+
+    void Run() override;
+
+    virtual ~WriteDoneWrapper() {}
+
+private:
+    // Not publicly constructible: instances are only created on the heap by NewInstance().
+    WriteDoneWrapper(int64_t start_micros,
+                     WriteTabletResponse* response,
+                     google::protobuf::Closure* done)
+        : RequestDoneWrapper(done), start_micros_(start_micros), response_(response) {}
+
+    int64_t start_micros_;           // value given to NewInstance(); microseconds per the name
+    WriteTabletResponse* response_;  // raw pointer supplied by the caller
+};
+
+// Completion closure for a ScanTablet RPC. Holds the start time and response next to the
+// wrapped `done`; Run() is defined out of line.
+class ScanDoneWrapper final : public RequestDoneWrapper {
+public:
+    static google::protobuf::Closure* NewInstance(int64_t start_micros,
+                                                  ScanTabletResponse* response,
+                                                  google::protobuf::Closure* done) {
+        return new ScanDoneWrapper(start_micros, response, done);
+    }
+
+    void Run() override;
+
+    virtual ~ScanDoneWrapper() {}
+
+private:
+    // Not publicly constructible: instances are only created on the heap by NewInstance().
+    ScanDoneWrapper(int64_t start_micros,
+                    ScanTabletResponse* response,
+                    google::protobuf::Closure* done)
+        : RequestDoneWrapper(done), start_micros_(start_micros), response_(response) {}
+
+    int64_t start_micros_;          // value given to NewInstance(); microseconds per the name
+    ScanTabletResponse* response_;  // raw pointer supplied by the caller
+};
+
 class RemoteTabletNode : public TabletNodeServer {
 public:
     explicit RemoteTabletNode(TabletNodeImpl* tabletnode_impl);
@@ -72,6 +149,11 @@ class RemoteTabletNode : public TabletNodeServer {
                      SplitTabletResponse* response,
                      google::protobuf::Closure* done);
 
+    void ComputeSplitKey(google::protobuf::RpcController* controller,
+                     const SplitTabletRequest* request,
+                     SplitTabletResponse* response,
+                     google::protobuf::Closure* done);
+
     void CompactTablet(google::protobuf::RpcController* controller,
                        const CompactTabletRequest* request,
                        CompactTabletResponse* response,
@@ -139,6 +221,10 @@ class RemoteTabletNode : public TabletNodeServer {
                        const SplitTabletRequest* request,
                        SplitTabletResponse* response,
                        google::protobuf::Closure* done);
+    void DoComputeSplitKey(google::protobuf::RpcController* controller,
+                       const SplitTabletRequest* request,
+                       SplitTabletResponse* response,
+                       google::protobuf::Closure* done);
 
     void DoMergeTablet(google::protobuf::RpcController* controller,
                        const MergeTabletRequest* request,
diff --git a/src/tabletnode/rpc_schedule_policy.cc b/src/tabletnode/rpc_schedule_policy.cc
index 2c43156ab..99a897dee 100644
--- a/src/tabletnode/rpc_schedule_policy.cc
+++ b/src/tabletnode/rpc_schedule_policy.cc
@@ -8,7 +8,7 @@
 
 #include "glog/logging.h"
 
-#include "utils/timer.h"
+#include "common/timer.h"
 
 namespace tera {
 namespace tabletnode {
diff --git a/src/tabletnode/tabletnode_entry.cc b/src/tabletnode/tabletnode_entry.cc
index 37ac8409f..a81628b14 100644
--- a/src/tabletnode/tabletnode_entry.cc
+++ b/src/tabletnode/tabletnode_entry.cc
@@ -9,6 +9,7 @@
 
 #include "common/base/string_ext.h"
 #include "common/base/string_number.h"
+#include "common/metric/collector_report.h"
 #include "common/net/ip_address.h"
 #include "common/this_thread.h"
 #include "common/thread_attributes.h"
@@ -19,20 +20,21 @@
 #include "proto/tabletnode.pb.h"
 #include "tabletnode/remote_tabletnode.h"
 #include "tabletnode/tabletnode_impl.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 #include "utils/rpc_timer_list.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "utils/utils_cmd.h"
 
 DECLARE_string(tera_tabletnode_port);
 DECLARE_int32(tera_garbage_collect_period);
-DECLARE_bool(tera_zk_enabled);
 DECLARE_bool(tera_tabletnode_cpu_affinity_enabled);
 DECLARE_string(tera_tabletnode_cpu_affinity_set);
 DECLARE_bool(tera_tabletnode_hang_detect_enabled);
 DECLARE_int32(tera_tabletnode_hang_detect_threshold);
 DECLARE_int32(tera_tabletnode_rpc_server_max_inflow);
 DECLARE_int32(tera_tabletnode_rpc_server_max_outflow);
+DECLARE_bool(tera_metric_http_server_enable);
+DECLARE_int32(tera_metric_http_server_listen_port);
 
 std::string GetTeraEntryName() {
     return "tabletnode";
@@ -47,7 +49,8 @@ namespace tabletnode {
 
 TabletNodeEntry::TabletNodeEntry()
     : tabletnode_impl_(NULL),
-      remote_tabletnode_(NULL) {
+      remote_tabletnode_(NULL),
+      metric_http_server_(new tera::MetricHttpServer()) {
     sofa::pbrpc::RpcServerOptions rpc_options;
     rpc_options.max_throughput_in = FLAGS_tera_tabletnode_rpc_server_max_inflow;
     rpc_options.max_throughput_out = FLAGS_tera_tabletnode_rpc_server_max_outflow;
@@ -78,14 +81,23 @@ bool TabletNodeEntry::StartServer() {
         return false;
     }
     LOG(INFO) << "finish starting RPC server";
+
+    // Start the metric HTTP server when enabled; a failed start is logged and ignored.
+    if (FLAGS_tera_metric_http_server_enable) {
+        if (!metric_http_server_->Start(FLAGS_tera_metric_http_server_listen_port)) {
+            LOG(WARNING) << "Start metric http server failed. Ignore";
+        }
+    } else {
+        LOG(INFO) << "Metric http server is disabled.";
+    }
     return true;
 }
 
 void TabletNodeEntry::ShutdownServer() {
+    metric_http_server_->Stop();
     tabletnode_impl_->Exit();
-    LOG(INFO) << "shut down server";
-    rpc_server_->Stop();
     LOG(INFO) << "TabletNodeEntry stop done!";
+    _exit(0);
 }
 
 bool TabletNodeEntry::Run() {
@@ -99,20 +111,17 @@ bool TabletNodeEntry::Run() {
         tabletnode_impl_->GarbageCollect();
     }
 
+    CollectorReportPublisher::GetInstance().Refresh();
     tabletnode_impl_->RefreshSysInfo();
     tabletnode_impl_->GetSysInfo().DumpLog();
     LOG(INFO) << "[ThreadPool schd/task/cnt] "
         << remote_tabletnode_->ProfilingLog();
 
-    LOG(INFO) << "[Cache HitRate/Cnt/Size] table_cache "
-        << tabletnode_impl_->TableCacheProfileInfo()
-        << ", block_cache " << tabletnode_impl_->BlockCacheProfileInfo();
-
     int64_t now_time = get_micros();
     int64_t earliest_rpc_time = now_time;
     RpcTimerList::Instance()->TopTime(&earliest_rpc_time);
     double max_delay = (now_time - earliest_rpc_time) / 1000.0;
-    VLOG(5) << "pending rpc max delay: "
+    LOG(INFO) << "pending rpc max delay: "
             << std::fixed<< std::setprecision(2) << max_delay;
     if (FLAGS_tera_tabletnode_hang_detect_enabled &&
         max_delay > FLAGS_tera_tabletnode_hang_detect_threshold) {
diff --git a/src/tabletnode/tabletnode_entry.h b/src/tabletnode/tabletnode_entry.h
index a27a89747..ec87acc2b 100644
--- a/src/tabletnode/tabletnode_entry.h
+++ b/src/tabletnode/tabletnode_entry.h
@@ -10,6 +10,7 @@
 #include <sofa/pbrpc/pbrpc.h>
 
 #include "common/base/scoped_ptr.h"
+#include "common/metric/metric_http_server.h"
 #include "tera_entry.h"
 
 namespace tera {
@@ -37,6 +38,7 @@ class TabletNodeEntry : public TeraEntry {
     scoped_ptr<TabletNodeImpl> tabletnode_impl_;
     RemoteTabletNode* remote_tabletnode_;
     scoped_ptr<sofa::pbrpc::RpcServer> rpc_server_;
+    scoped_ptr<tera::MetricHttpServer> metric_http_server_;
 };
 
 } // namespace tabletnode
diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc
index aed9d27f8..42f720723 100644
--- a/src/tabletnode/tabletnode_impl.cc
+++ b/src/tabletnode/tabletnode_impl.cc
@@ -14,6 +14,10 @@
 
 #include "db/filename.h"
 #include "db/table_cache.h"
+#include "common/metric/cache_collector.h"
+#include "common/metric/prometheus_subscriber.h"
+#include "common/metric/ratio_collector.h"
+#include "common/metric/metric_counter.h"
 #include "common/thread.h"
 #include "io/io_utils.h"
 #include "io/utils_leveldb.h"
@@ -28,12 +32,13 @@
 #include "proto/proto_helper.h"
 #include "proto/tabletnode_client.h"
 #include "tabletnode/tablet_manager.h"
+#include "tabletnode/tabletnode_metric_name.h"
 #include "tabletnode/tabletnode_zk_adapter.h"
 #include "types.h"
 #include "utils/config_utils.h"
-#include "utils/counter.h"
+#include "common/counter.h"
 #include "utils/string_util.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "utils/utils_cmd.h"
 
 DECLARE_string(tera_tabletnode_port);
@@ -84,6 +89,7 @@ DECLARE_string(tera_leveldb_env_type);
 DECLARE_string(tera_local_addr);
 DECLARE_bool(tera_ins_enabled);
 DECLARE_bool(tera_mock_ins_enabled);
+DECLARE_string(tera_coord_type);
 
 DECLARE_bool(tera_io_cache_path_vanish_allowed);
 DECLARE_int64(tera_tabletnode_tcm_cache_size);
@@ -92,20 +98,49 @@ DECLARE_string(flagfile);
 
 using namespace std::placeholders;
 
-extern tera::Counter range_error_counter;
-extern tera::Counter rand_read_delay;
-
 static const int GC_LOG_LEVEL = FLAGS_tera_tabletnode_gc_log_level;
 
+namespace leveldb {
+extern tera::Counter snappy_before_size_counter;
+extern tera::Counter snappy_after_size_counter;
+}
+
 namespace tera {
 namespace tabletnode {
+using tera::SubscriberType;
+
+tera::MetricCounter read_error_counter(kErrorCountMetric, kApiLabelRead,
+                                       {SubscriberType::QPS, SubscriberType::SUM});
+tera::MetricCounter write_error_counter(kErrorCountMetric, kApiLabelWrite,
+                                        {SubscriberType::QPS, SubscriberType::SUM});
+tera::MetricCounter scan_error_counter(kErrorCountMetric, kApiLabelScan,
+                                        {SubscriberType::QPS, SubscriberType::SUM});
+
+tera::MetricCounter read_range_error_counter(kRangeErrorMetric, kApiLabelRead, {SubscriberType::QPS});
+tera::MetricCounter write_range_error_counter(kRangeErrorMetric, kApiLabelWrite, {SubscriberType::QPS});
+tera::MetricCounter scan_range_error_counter(kRangeErrorMetric, kApiLabelScan, {SubscriberType::QPS});
+
+TabletNodeImpl::CacheMetrics::CacheMetrics(leveldb::Cache* block_cache, leveldb::TableCache* table_cache)
+    : block_cache_hitrate_(kBlockCacheHitRateMetric,
+        std::unique_ptr<Collector>(new LRUCacheCollector(block_cache, CacheCollectType::kHitRate))),
+      block_cache_entries_(kBlockCacheEntriesMetric,
+        std::unique_ptr<Collector>(new LRUCacheCollector(block_cache, CacheCollectType::kEntries))),
+      block_cache_charge_(kBlockCacheChargeMetric,
+        std::unique_ptr<Collector>(new LRUCacheCollector(block_cache, CacheCollectType::kCharge))),
+      table_cache_hitrate_(kTableCacheHitRateMetric,
+        std::unique_ptr<Collector>(new TableCacheCollector(table_cache, CacheCollectType::kHitRate))),
+      table_cache_entries_(kTableCacheEntriesMetric,
+        std::unique_ptr<Collector>(new TableCacheCollector(table_cache, CacheCollectType::kEntries))),
+      table_cache_charge_(kTableCacheChargeMetric,
+        std::unique_ptr<Collector>(new TableCacheCollector(table_cache, CacheCollectType::kCharge))) {}
 
 TabletNodeImpl::TabletNodeImpl()
     : status_(kNotInited),
       tablet_manager_(new TabletManager()),
       zk_adapter_(NULL),
       release_cache_timer_id_(kInvalidTimerId),
-      thread_pool_(new ThreadPool(FLAGS_tera_tabletnode_impl_thread_max_num)) {
+      thread_pool_(new ThreadPool(FLAGS_tera_tabletnode_impl_thread_max_num)),
+      cache_metrics_(NULL) {
     if (FLAGS_tera_local_addr == "") {
         local_addr_ = utils::GetLocalHostName()+ ":" + FLAGS_tera_tabletnode_port;
     } else {
@@ -157,24 +192,49 @@ TabletNodeImpl::~TabletNodeImpl() {
 }
 
+// Creates the coordinator adapter chosen by --tera_coord_type (or, when that flag is empty,
+// by the legacy --tera_*_enabled flags). Returns false if the flag value is not recognized.
 bool TabletNodeImpl::Init() {
-    if (FLAGS_tera_zk_enabled) {
+    if (FLAGS_tera_coord_type.empty()) {
+        LOG(ERROR) << "Note: We don't recommend that use '"
+                   << "--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord"
+                   << " replace by '--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]'"
+                   << " flag is usually recommended.";
+    }
+    if (FLAGS_tera_coord_type == "zk" ||
+            (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) {
         zk_adapter_.reset(new TabletNodeZkAdapter(this, local_addr_));
-    } else if(FLAGS_tera_ins_enabled) {
+    } else if (FLAGS_tera_coord_type == "ins" ||
+            (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) {
         LOG(INFO) << "ins mode!";
         zk_adapter_.reset(new InsTabletNodeZkAdapter(this, local_addr_));
-    } else if (FLAGS_tera_mock_zk_enabled) {
+    } else if (FLAGS_tera_coord_type == "mock_zk" ||
+            (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) {
         LOG(INFO) << "mock zk mode!";
         zk_adapter_.reset(new MockTabletNodeZkAdapter(this, local_addr_));
-    } else if (FLAGS_tera_mock_ins_enabled) {
+    } else if (FLAGS_tera_coord_type == "mock_ins" ||
+            (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) {
         LOG(INFO) << "mock ins mode!";
         zk_adapter_.reset(new MockInsTabletNodeZkAdapter(this, local_addr_));
-    } else {
+    } else if (FLAGS_tera_coord_type == "fake_zk" ||
+            FLAGS_tera_coord_type.empty()) {
         LOG(INFO) << "fake zk mode!";
         zk_adapter_.reset(new FakeTabletNodeZkAdapter(this, local_addr_));
+    } else {
+        // Unrecognized --tera_coord_type: no adapter was created, so bail out instead of
+        // scheduling TabletNodeZkAdapterBase::Init on a NULL adapter below.
+        LOG(ERROR) << "unknown tera_coord_type: " << FLAGS_tera_coord_type;
+        return false;
     }
 
     SetTabletNodeStatus(kIsIniting);
     thread_pool_->AddTask(std::bind(&TabletNodeZkAdapterBase::Init, zk_adapter_.get()));
+
+    // register cache metrics
+    cache_metrics_.reset(new CacheMetrics(ldb_block_cache_, ldb_table_cache_));
+    // register snappy metrics
+    snappy_ratio_metric_.reset(new AutoCollectorRegister(kSnappyCompressionRatioMetric, std::unique_ptr<Collector>(
+        new RatioCollector(&leveldb::snappy_before_size_counter, &leveldb::snappy_after_size_counter, true))));
+
     return true;
 }
 
@@ -208,6 +261,8 @@ void TabletNodeImpl::InitCacheSystem() {
 }
 
 bool TabletNodeImpl::Exit() {
+    cache_metrics_.reset(NULL);
+
     std::vector<io::TabletIO*> tablet_ios;
     tablet_manager_->GetAllTablets(&tablet_ios);
 
@@ -309,6 +364,11 @@ void TabletNodeImpl::LoadTablet(const LoadTabletRequest* request,
         CHECK(i < 2) << "parent_tablets should less than 2: " << i;
         parent_tablets.push_back(request->parent_tablets(i));
     }
+    std::set<std::string> ignore_err_lgs;
+    for (int i = 0; i < request->ignore_err_lgs_size(); ++i) {
+        VLOG(10) << "oops lg:" << request->ignore_err_lgs(i);
+        ignore_err_lgs.insert(request->ignore_err_lgs(i));
+    }
 
     io::TabletIO* tablet_io = NULL;
     StatusCode status = kTabletNodeOk;
@@ -324,7 +384,7 @@ void TabletNodeImpl::LoadTablet(const LoadTabletRequest* request,
         ///TODO: User per user memery_cache according to user quota.
         tablet_io->SetMemoryCache(m_memory_cache);
         if (!tablet_io->Load(schema, request->path(), parent_tablets,
-                             snapshots, rollbacks, ldb_logger_,
+                             ignore_err_lgs, snapshots, rollbacks, ldb_logger_,
                              ldb_block_cache_, ldb_table_cache_, &status)) {
             tablet_io->DecRef();
             LOG(ERROR) << "fail to load tablet: " << request->path()
@@ -466,28 +526,50 @@ void TabletNodeImpl::ReadTablet(int64_t start_micros,
                                 const ReadTabletRequest* request,
                                 ReadTabletResponse* response,
                                 google::protobuf::Closure* done) {
+    bool is_timeout = false;
     int32_t row_num = request->row_info_list_size();
     uint64_t snapshot_id = request->snapshot_id() == 0 ? 0 : request->snapshot_id();
     uint32_t read_success_num = 0;
 
+    int64_t client_timeout_ms = std::numeric_limits<int64_t>::max() / 2;
+    if (request->has_client_timeout_ms()) {
+        client_timeout_ms = request->client_timeout_ms();
+    }
+    int64_t end_time_ms = start_micros / 1000 + client_timeout_ms;
+    VLOG(20) << "start_ms: " << start_micros / 1000 << ", client_timeout_ms: " << client_timeout_ms
+             << " end_ms: " << end_time_ms;
+
     for (int32_t i = 0; i < row_num; i++) {
+        int64_t time_remain_ms = end_time_ms - GetTimeStampInMs();
         StatusCode row_status = kTabletNodeOk;
         io::TabletIO* tablet_io = tablet_manager_->GetTablet(
             request->tablet_name(), request->row_info_list(i).key(), &row_status);
         if (tablet_io == NULL) {
-            range_error_counter.Inc();
+            read_error_counter.Inc();
+            read_range_error_counter.Inc();
             response->mutable_detail()->add_status(kKeyNotInRange);
         } else {
+            VLOG(20) << "time_remain_ms: " << time_remain_ms;
             if (tablet_io->ReadCells(request->row_info_list(i),
                                      response->mutable_detail()->add_row_result(),
-                                     snapshot_id, &row_status)) {
+                                     snapshot_id, &row_status, time_remain_ms)) {
                 read_success_num++;
             } else {
+                if (row_status != kKeyNotExist && row_status != kRPCTimeout) {
+                    read_error_counter.Inc();
+                }
                 response->mutable_detail()->mutable_row_result()->RemoveLast();
             }
             tablet_io->DecRef();
             response->mutable_detail()->add_status(row_status);
         }
+
+        if (row_status == kRPCTimeout) {
+            is_timeout = true;
+            LOG(WARNING) << "seq_id: " << request->sequence_id() << " timeout,"
+                    << " client_timeout_ms: " << request->client_timeout_ms();
+            break;  // stop processing the remaining rows of this request
+        }
     }
 
     VLOG(10) << "seq_id: " << request->sequence_id()
@@ -495,15 +577,14 @@ void TabletNodeImpl::ReadTablet(int64_t start_micros,
         << ", read_suc: " << read_success_num;
     response->set_sequence_id(request->sequence_id());
     response->set_success_num(read_success_num);
-    response->set_status(kTabletNodeOk);
-    done->Run();
 
-    int64_t now_ms = get_micros();
-    int64_t used_ms =  now_ms - start_micros;
-    if (used_ms <= 0) {
-        LOG(ERROR) << "now ms: "<< now_ms << " start_ms: "<< start_micros;
+    if (is_timeout) {
+        response->set_status(kRPCTimeout);
+    } else {
+        response->set_status(kTabletNodeOk);
     }
-    rand_read_delay.Add(used_ms);
+
+    done->Run();
 }
 
 void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request,
@@ -527,12 +608,12 @@ void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request,
         return;
     }
 
-    Counter* row_done_counter = new Counter;
+    std::shared_ptr<Counter> row_done_counter(new Counter);
     for (int32_t i = 0; i < row_num; i++) {
         io::TabletIO* tablet_io = tablet_manager_->GetTablet(
             request->tablet_name(), request->row_list(i).row_key(), &status);
         if (tablet_io == NULL) {
-            range_error_counter.Inc();
+            write_range_error_counter.Inc();
         }
         it = tablet_task_map.find(tablet_io);
         WriteTabletTask* tablet_task = NULL;
@@ -579,6 +660,7 @@ void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request,
 
 void TabletNodeImpl::WriteTabletFail(WriteTabletTask* tablet_task, StatusCode status) {
     int32_t row_num = tablet_task->row_status_vec.size();
+    write_error_counter.Add(row_num);
     for (int32_t i = 0; i < row_num; i++) {
         tablet_task->row_status_vec[i] = status;
     }
@@ -600,7 +682,6 @@ void TabletNodeImpl::WriteTabletCallback(WriteTabletTask* tablet_task,
             RpcTimerList::Instance()->Erase(tablet_task->timer);
             delete tablet_task->timer;
         }
-        delete tablet_task->row_done_counter;
     }
 
     delete tablet_task;
@@ -806,12 +887,14 @@ void TabletNodeImpl::ScanTablet(const ScanTabletRequest* request,
                                             request->start(), &status);
 
     if (tablet_io == NULL) {
-        range_error_counter.Inc();
+        scan_range_error_counter.Inc();
         response->set_status(status);
         done->Run();
     } else {
         response->set_end(tablet_io->GetEndKey());
-        tablet_io->ScanRows(request, response, done);
+        if (!tablet_io->ScanRows(request, response, done)) {
+            scan_error_counter.Inc();
+        }
         tablet_io->DecRef();
     }
 }
@@ -837,6 +920,18 @@ void TabletNodeImpl::SplitTablet(const SplitTabletRequest* request,
         done->Run();
         return;
     }
+    // Refuse to split unless the master declares it will write the child tablets to the
+    // meta table itself (an unset flag probably means the request came from an old master).
+    if (!request->has_master_update_meta() || !request->master_update_meta()) {
+        LOG(ERROR) << kSms <<"SplitRequest without master_update_meta, maybe "
+                "request from old master, refuse split!" << *tablet_io;
+        response->set_status(kTableNotSupport);
+        // Drop our tablet reference and stop here, as ComputeSplitKey's early exits do;
+        // falling through would attempt the split and run `done` a second time.
+        tablet_io->DecRef();
+        done->Run();
+        return;
+    }
 
     if (!tablet_io->Split(&split_key, &status)) {
         LOG(ERROR) << "fail to split tablet: " << tablet_io->GetTablePath()
@@ -852,10 +943,6 @@ void TabletNodeImpl::SplitTablet(const SplitTabletRequest* request,
         done->Run();
         return;
     }
-    uint64_t tablet_size = 0;
-    tablet_io->GetDataSize(&tablet_size);
-    int64_t first_half_size = tablet_size / 2;
-    int64_t second_half_size = tablet_size / 2;
     LOG(INFO) << "split tablet: " << tablet_io->GetTablePath()
         << " [" << DebugString(tablet_io->GetStartKey())
         << ", " << DebugString(tablet_io->GetEndKey())
@@ -888,11 +975,58 @@ void TabletNodeImpl::SplitTablet(const SplitTabletRequest* request,
                 << ", " << DebugString(request->key_range().key_end())
                 << "], status: " << StatusCodeToString(status);
     }
+    response->set_status(kTabletNodeOk);
+    response->add_split_keys(split_key);
+    done->Run();
+}
 
-    UpdateMetaTableAsync(request, response, done, path, split_key, schema,
-                         first_half_size, second_half_size, request->tablet_meta());
+// Asks the tablet covering the request's key range for a split key and returns it in
+// response->split_keys. `done` is run exactly once on every path.
+void TabletNodeImpl::ComputeSplitKey(const SplitTabletRequest* request,
+                                     SplitTabletResponse* response,
+                                     google::protobuf::Closure* done) {
+    response->set_sequence_id(request->sequence_id());
+
+    std::string split_key = request->split_key();
+    StatusCode status = kTabletNodeOk;
+    io::TabletIO* tablet_io = tablet_manager_->GetTablet(
+        request->tablet_name(), request->key_range().key_start(),
+        request->key_range().key_end(), &status);
+    if (tablet_io == NULL) {
+        LOG(WARNING) << "split fail to get tablet: " << request->tablet_name()
+            << " [" << DebugString(request->key_range().key_start())
+            << ", " << DebugString(request->key_range().key_end())
+            << "], status: " << StatusCodeToString(status);
+        response->set_status(kKeyNotInRange);
+        done->Run();
+        return;
+    }
+
+    if (!tablet_io->Split(&split_key, &status)) {
+        LOG(ERROR) << "fail to split tablet: " << tablet_io->GetTablePath()
+            << " [" << DebugString(tablet_io->GetStartKey())
+            << ", " << DebugString(tablet_io->GetEndKey())
+            << "], split_key: " << DebugString(split_key) << ". status: " << StatusCodeToString(status);
+        if (status == kTableNotSupport) {
+            response->set_status(kTableNotSupport);
+        } else {
+            response->set_status((StatusCode)tablet_io->GetStatus());
+        }
+        tablet_io->DecRef();
+        done->Run();
+        return;
+    }
+    LOG(INFO) << "split tablet: " << tablet_io->GetTablePath()
+        << " [" << DebugString(tablet_io->GetStartKey())
+        << ", " << DebugString(tablet_io->GetEndKey())
+        << "], split key: " << DebugString(split_key);
+    response->set_status(kTabletNodeOk);
+    response->add_split_keys(split_key);
+    tablet_io->DecRef();
+    done->Run();
 }
 
+
 bool TabletNodeImpl::CheckInKeyRange(const KeyList& key_list,
                                      const std::string& key_start,
                                      const std::string& key_end) {
@@ -954,7 +1088,7 @@ void TabletNodeImpl::LeaveSafeMode() {
 
 void TabletNodeImpl::ExitService() {
     LOG(FATAL) << "master kick me!";
-    exit(1);
+    _exit(1);
 }
 
 void TabletNodeImpl::SetTabletNodeStatus(const TabletNodeStatus& status) {
@@ -971,96 +1105,6 @@ void TabletNodeImpl::SetRootTabletAddr(const std::string& root_tablet_addr) {
     root_tablet_addr_ = root_tablet_addr;
 }
 
-void TabletNodeImpl::UpdateMetaTableAsync(const SplitTabletRequest* rpc_request,
-         SplitTabletResponse* rpc_response, google::protobuf::Closure* rpc_done,
-         const std::string& path, const std::string& key_split,
-         const TableSchema& schema, int64_t first_size, int64_t second_size,
-         const TabletMeta& meta) {
-    WriteTabletRequest* request = new WriteTabletRequest;
-    WriteTabletResponse* response = new WriteTabletResponse;
-    request->set_sequence_id(this_sequence_id_++);
-    request->set_tablet_name(FLAGS_tera_master_meta_table_name);
-    request->set_is_sync(true);
-    request->set_is_instant(true);
-
-    TabletMeta tablet_meta;
-    tablet_meta.CopyFrom(meta);
-    tablet_meta.set_server_addr(local_addr_);
-    tablet_meta.clear_parent_tablets();
-    tablet_meta.add_parent_tablets(leveldb::GetTabletNumFromPath(path));
-
-    std::string meta_key, meta_value;
-    VLOG(5) << "update meta for split tablet: " << path
-        << " [" << DebugString(rpc_request->key_range().key_start())
-        << ", " << DebugString(rpc_request->key_range().key_end()) << "]";
-
-    CHECK(2 == rpc_request->child_tablets_size());
-    // first write 2nd half
-    tablet_meta.set_path(leveldb::GetChildTabletPath(path, rpc_request->child_tablets(0)));
-    tablet_meta.set_size(second_size);
-    tablet_meta.mutable_key_range()->set_key_start(key_split);
-    tablet_meta.mutable_key_range()->set_key_end(rpc_request->key_range().key_end());
-    MakeMetaTableKeyValue(tablet_meta, &meta_key, &meta_value);
-    RowMutationSequence* mu_seq = request->add_row_list();
-    mu_seq->set_row_key(meta_key);
-    Mutation* mutation = mu_seq->add_mutation_sequence();
-    mutation->set_type(kPut);
-    mutation->set_value(meta_value);
-    VLOG(5) << "write meta: key [" << DebugString(meta_key)
-        << "], value_size: " << meta_value.size();
-
-    // then write 1st half
-    // update root_tablet_addr in fake zk mode
-    if (!FLAGS_tera_zk_enabled) {
-        zk_adapter_->GetRootTableAddr(&root_tablet_addr_);
-    }
-    TabletNodeClient meta_tablet_client(root_tablet_addr_);
-
-    tablet_meta.set_path(leveldb::GetChildTabletPath(path, rpc_request->child_tablets(1)));
-    tablet_meta.set_size(first_size);
-    tablet_meta.mutable_key_range()->set_key_start(rpc_request->key_range().key_start());
-    tablet_meta.mutable_key_range()->set_key_end(key_split);
-    MakeMetaTableKeyValue(tablet_meta, &meta_key, &meta_value);
-    mu_seq = request->add_row_list();
-    mu_seq->set_row_key(meta_key);
-    mutation = mu_seq->add_mutation_sequence();
-    mutation->set_type(kPut);
-    mutation->set_value(meta_value);
-    VLOG(5) << "write meta: key [" << DebugString(meta_key)
-        << "], value_size: " << meta_value.size();
-
-    std::function<void (WriteTabletRequest*, WriteTabletResponse*, bool, int)> done =
-        std::bind(&TabletNodeImpl::UpdateMetaTableCallback, this, rpc_request,
-                   rpc_response, rpc_done, _1, _2, _3, _4);
-    meta_tablet_client.WriteTablet(request, response, done);
-}
-
-
-void TabletNodeImpl::UpdateMetaTableCallback(const SplitTabletRequest* rpc_request,
-         SplitTabletResponse* rpc_response, google::protobuf::Closure* rpc_done,
-         WriteTabletRequest* request, WriteTabletResponse* response, bool failed,
-         int error_code) {
-    if (failed) {
-        rpc_response->set_status(kMetaTabletError);
-    } else if (response->status() != kTabletNodeOk) {
-        LOG(ERROR) << "fail to update meta for tablet: "
-            << request->tablet_name() << " ["
-            << DebugString(rpc_request->key_range().key_start())
-            << ", " << DebugString(rpc_request->key_range().key_end())
-            << "], status: " << StatusCodeToString(response->status());
-        rpc_response->set_status(kMetaTabletError);
-    } else {
-        LOG(INFO) << "split tablet success: " << rpc_request->tablet_name()
-            << " [" << DebugString(rpc_request->key_range().key_start())
-            << ", " << DebugString(rpc_request->key_range().key_end()) << "]";
-        rpc_response->set_status(kTabletNodeOk);
-    }
-
-    delete request;
-    delete response;
-    rpc_done->Run();
-}
-
 /*
  * all cached tablets/files:
  * ------------------------------------------
@@ -1191,22 +1235,6 @@ std::string TabletNodeImpl::GetSessionId() {
     return session_id_;
 }
 
-std::string TabletNodeImpl::BlockCacheProfileInfo() {
-    std::stringstream ss;
-    ss << ldb_block_cache_->HitRate(true);
-    ss << " " << ldb_block_cache_->Entries();
-    ss << " " << ldb_block_cache_->TotalCharge();
-    return ss.str();
-}
-
-std::string TabletNodeImpl::TableCacheProfileInfo() {
-    std::stringstream ss;
-    ss << ldb_table_cache_->HitRate(true);
-    ss << " " << ldb_table_cache_->TableEntries();
-    ss << " " << ldb_table_cache_->ByteSize();
-    return ss.str();
-}
-
 TabletNodeSysInfo& TabletNodeImpl::GetSysInfo() {
     return sysinfo_;
 }
diff --git a/src/tabletnode/tabletnode_impl.h b/src/tabletnode/tabletnode_impl.h
index ed19d4ad6..b4d327a2d 100644
--- a/src/tabletnode/tabletnode_impl.h
+++ b/src/tabletnode/tabletnode_impl.h
@@ -6,8 +6,10 @@
 #define TERA_TABLETNODE_TABLETNODE_IMPL_H_
 
 #include <string>
+#include <memory>
 
 #include "common/base/scoped_ptr.h"
+#include "common/metric/collector_report_publisher.h"
 #include "common/thread_pool.h"
 
 #include "io/tablet_io.h"
@@ -38,7 +40,7 @@ class TabletNodeImpl {
         std::vector<const RowMutationSequence*> row_mutation_vec;
         std::vector<StatusCode> row_status_vec;
         std::vector<int32_t> row_index_vec;
-        Counter* row_done_counter;
+        std::shared_ptr<Counter> row_done_counter;
 
         const WriteTabletRequest* request;
         WriteTabletResponse* response;
@@ -46,7 +48,7 @@ class TabletNodeImpl {
         WriteRpcTimer* timer;
 
         WriteTabletTask(const WriteTabletRequest* req, WriteTabletResponse* resp,
-                   google::protobuf::Closure* d, WriteRpcTimer* t, Counter* c)
+                   google::protobuf::Closure* d, WriteRpcTimer* t, std::shared_ptr<Counter> c)
             : row_done_counter(c), request(req), response(resp), done(d), timer(t) {}
     };
 
@@ -112,6 +114,9 @@ class TabletNodeImpl {
     void SplitTablet(const SplitTabletRequest* request,
                      SplitTabletResponse* response,
                      google::protobuf::Closure* done);
+    void ComputeSplitKey(const SplitTabletRequest* request,
+                     SplitTabletResponse* response,
+                     google::protobuf::Closure* done);
 
     void EnterSafeMode();
     void LeaveSafeMode();
@@ -125,10 +130,6 @@ class TabletNodeImpl {
     void SetSessionId(const std::string& session_id);
     std::string GetSessionId();
 
-    std::string BlockCacheProfileInfo();
-
-    std::string TableCacheProfileInfo();
-
     TabletNodeSysInfo& GetSysInfo();
 
     void RefreshSysInfo();
@@ -157,15 +158,6 @@ class TabletNodeImpl {
                          const std::string& key_start,
                          const std::string& key_end);
 
-    void UpdateMetaTableAsync(const SplitTabletRequest* request,
-             SplitTabletResponse* response, google::protobuf::Closure* done,
-             const std::string& path, const std::string& key_split,
-             const TableSchema& schema, int64_t first_size, int64_t second_size,
-             const TabletMeta& meta);
-    void UpdateMetaTableCallback(const SplitTabletRequest* rpc_request,
-             SplitTabletResponse* rpc_response, google::protobuf::Closure* rpc_done,
-             WriteTabletRequest* request, WriteTabletResponse* response,
-             bool failed, int error_code);
 
     void InitCacheSystem();
 
@@ -206,6 +198,22 @@ class TabletNodeImpl {
     leveldb::Cache* ldb_block_cache_;
     leveldb::Cache* m_memory_cache;
     leveldb::TableCache* ldb_table_cache_;
+    
+    // metrics for caches: one AutoCollectorRegister per exported cache metric
+    struct CacheMetrics {  // NOTE(review): constructor body is not in this header; confirm each member's metric name there
+        tera::AutoCollectorRegister block_cache_hitrate_;  // presumably block cache hit rate -- TODO confirm
+        tera::AutoCollectorRegister block_cache_entries_;  // presumably block cache entry count -- TODO confirm
+        tera::AutoCollectorRegister block_cache_charge_;  // presumably block cache total charge -- TODO confirm
+        
+        tera::AutoCollectorRegister table_cache_hitrate_;  // presumably table cache hit rate -- TODO confirm
+        tera::AutoCollectorRegister table_cache_entries_;  // presumably table cache entry count -- TODO confirm
+        tera::AutoCollectorRegister table_cache_charge_;  // presumably table cache size -- TODO confirm
+        
+        CacheMetrics(leveldb::Cache* block_cache, leveldb::TableCache* table_cache);  // takes the two caches the metrics are built from
+    };
+    
+    scoped_ptr<CacheMetrics> cache_metrics_;
+    scoped_ptr<tera::AutoCollectorRegister> snappy_ratio_metric_;
 };
 
 } // namespace tabletnode
diff --git a/src/tabletnode/tabletnode_metric_name.h b/src/tabletnode/tabletnode_metric_name.h
new file mode 100644
index 000000000..bca35a3dd
--- /dev/null
+++ b/src/tabletnode/tabletnode_metric_name.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_TABLETNODE_TABLETNODE_METRIC_NAME_H_
+#define TERA_TABLETNODE_TABLETNODE_METRIC_NAME_H_ 
+ 
+#include <string>
+
+#include "common/metric/hardware_collectors.h"
+ 
+namespace tera {
+namespace tabletnode {
+
+// api labels
+const char* const kApiLabelRead = "api:read";
+const char* const kApiLabelWrite = "api:write";
+const char* const kApiLabelScan = "api:scan";
+const char* const kApiLabelCompact = "api:compact"; 
+
+// env labels
+const char* const kEnvLabelDfs = "env:dfs";
+const char* const kEnvLabelSsd = "env:ssd";
+const char* const kEnvLabelPosix = "env:posix";
+const char* const kEnvLabelOther = "env:other";
+
+// metric names
+const char* const kRequestCountMetric = "tera_ts_request_count";
+const char* const kPendingCountMetric = "tera_ts_pending_count";
+const char* const kRejectCountMetric = "tera_ts_reject_count";
+const char* const kErrorCountMetric = "tera_ts_error_count";
+const char* const kRangeErrorMetric = "tera_ts_range_error_count";
+
+const char* const kRowDelayMetric = "tera_ts_row_delay_us_total";
+const char* const kRowCountMetric = "tera_ts_row_count";
+const char* const kRowThroughPutMetric = "tera_ts_row_through_put";
+const char* const kLowLevelReadMetric = "tera_ts_low_level_read";
+
+const char* const kRequestDelayMetric = "tera_ts_request_delay_us_total";
+const char* const kFinishedRequestCountMetric = "tera_ts_finished_request_count";
+
+// cache metric names
+const char* const kBlockCacheHitRateMetric = "tera_ts_block_cache_hit_percentage";
+const char* const kBlockCacheEntriesMetric = "tera_ts_block_cache_entry_count";
+const char* const kBlockCacheChargeMetric = "tera_ts_block_cache_charge_bytes";
+
+const char* const kTableCacheHitRateMetric = "tera_ts_table_cache_hit_percentage";
+const char* const kTableCacheEntriesMetric = "tera_ts_table_cache_entry_count";
+const char* const kTableCacheChargeMetric = "tera_ts_table_cache_charge_bytes";
+
+// env metric names
+const char* const kDfsReadBytesThroughPut = "tera_ts_dfs_read_bytes_through_put";
+const char* const kDfsWriteBytesThroughPut = "tera_ts_dfs_write_bytes_through_put";
+const char* const kDfsReadDelayMetric = "tera_ts_dfs_read_delay_us_total";
+const char* const kDfsWriteDelayMetric = "tera_ts_dfs_write_delay_us_total";
+const char* const kDfsSyncDelayMetric = "tera_ts_dfs_sync_delay_us_total";
+const char* const kDfsReadCountMetric = "tera_ts_dfs_read_count";
+const char* const kDfsWriteCountMetric = "tera_ts_dfs_write_count";
+const char* const kDfsSyncCountMetric = "tera_ts_dfs_sync_count";
+const char* const kDfsReadDelayPerRequestMetric = "tera_ts_dfs_read_delay_us_per_request";
+const char* const kDfsWriteDelayPerRequestMetric = "tera_ts_dfs_write_delay_us_per_request";
+const char* const kDfsSyncDelayPerRequestMetric = "tera_ts_dfs_sync_delay_us_per_request";
+const char* const kDfsFlushCountMetric = "tera_ts_dfs_flush_count";
+const char* const kDfsListCountMetric = "tera_ts_dfs_list_count";
+const char* const kDfsOtherCountMetric = "tera_ts_dfs_other_count";
+const char* const kDfsExistsCountMetric = "tera_ts_dfs_exists_count";
+const char* const kDfsOpenCountMetric = "tera_ts_dfs_open_count";
+const char* const kDfsCloseCountMetric = "tera_ts_dfs_close_count";
+const char* const kDfsDeleteCountMetric = "tera_ts_dfs_delete_count";
+const char* const kDfsTellCountMetric = "tera_ts_dfs_tell_count";
+const char* const kDfsInfoCountMetric = "tera_ts_dfs_info_count";
+const char* const kDfsReadHangMetric = "tera_ts_dfs_read_hang_total";
+const char* const kDfsWriteHangMetric = "tera_ts_dfs_write_hang_total";
+const char* const kDfsSyncHangMetric = "tera_ts_dfs_sync_hang_total";
+const char* const kDfsFlushHangMetric = "tera_ts_dfs_flush_hang_total";
+const char* const kDfsListHangMetric = "tera_ts_dfs_list_hang_total";
+const char* const kDfsOtherHangMetric = "tera_ts_dfs_other_hang_total";
+const char* const kDfsExistsHangMetric = "tera_ts_dfs_exists_hang_total";
+const char* const kDfsOpenHangMetric = "tera_ts_dfs_open_hang_total";
+const char* const kDfsCloseHangMetric = "tera_ts_dfs_close_hang_total";
+const char* const kDfsDeleteHangMetric = "tera_ts_dfs_delete_hang_total";
+const char* const kDfsTellHangMetric = "tera_ts_dfs_tell_hang_total";
+const char* const kDfsInfoHangMetric = "tera_ts_dfs_info_hang_total";
+
+const char* const kSsdReadCountMetric = "tera_ts_ssd_read_count";
+const char* const kSsdReadThroughPutMetric = "tera_ts_ssd_read_through_put";
+const char* const kSsdWriteCountMetric = "tera_ts_ssd_write_count";
+const char* const kSsdWriteThroughPutMetric = "tera_ts_ssd_write_through_put";
+
+const char* const kPosixReadThroughPutMetric = "tera_ts_posix_read_through_put";
+const char* const kPosixWriteThroughPutMetric = "tera_ts_posix_write_through_put";
+const char* const kPosixReadCountMetric = "tera_ts_posix_read_count";
+const char* const kPosixWriteCountMetric = "tera_ts_posix_write_count";
+const char* const kPosixSyncCountMetric = "tera_ts_posix_sync_count";
+const char* const kPosixListCountMetric = "tera_ts_posix_list_count";
+const char* const kPosixExistsCountMetric = "tera_ts_posix_exists_count";
+const char* const kPosixOpenCountMetric = "tera_ts_posix_open_count";
+const char* const kPosixCloseCountMetric = "tera_ts_posix_close_count";
+const char* const kPosixDeleteCountMetric = "tera_ts_posix_delete_count";
+const char* const kPosixTellCountMetric = "tera_ts_posix_tell_count";
+const char* const kPosixSeekCountMetric = "tera_ts_posix_seek_count";
+const char* const kPosixInfoCountMetric = "tera_ts_posix_info_count";
+const char* const kPosixOtherCountMetric = "tera_ts_posix_other_count";
+
+const char* const kRawkeyCompareCountMetric = "tera_ts_rawkey_compare_count";
+const char* const kSnappyCompressionRatioMetric = "tera_ts_snappy_compression_percentage";
+} // end namespace tabletnode
+} // end namespace tera 
+ 
+#endif // TERA_TABLETNODE_TABLETNODE_METRIC_NAME_H_
+ 
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/tabletnode/tabletnode_sysinfo.cc b/src/tabletnode/tabletnode_sysinfo.cc
index b3c09520d..30b325df9 100644
--- a/src/tabletnode/tabletnode_sysinfo.cc
+++ b/src/tabletnode/tabletnode_sysinfo.cc
@@ -4,8 +4,7 @@
 //
 // Author: Xu Peilin (xupeilin@baidu.com)
 
-#include "tabletnode_sysinfo.h"
-
+#include <cmath>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <unistd.h>
@@ -16,17 +15,20 @@
 #include <sstream>
 #include <string>
 
+#include "tabletnode/tabletnode_sysinfo.h"
 #include "common/base/string_number.h"
 #include "proto/proto_helper.h"
-#include "utils/timer.h"
+#include "tabletnode/tabletnode_metric_name.h"
+#include "common/timer.h"
 #include "utils/tprinter.h"
 #include "utils/utils_cmd.h"
+#include "common/metric/collector_report_publisher.h"
+#include "common/metric/ratio_subscriber.h"
+#include "common/metric/prometheus_subscriber.h"
 
-DEFINE_int32(tera_tabletnode_sysinfo_mem_collect_interval, 10, "interval of mem checking(s)");
-DEFINE_int32(tera_tabletnode_sysinfo_net_collect_interval, 5, "interval of net checking(s)");
-DEFINE_int32(tera_tabletnode_sysinfo_cpu_collect_interval, 5, "interval of cpu checking(s)");
 DECLARE_bool(tera_tabletnode_dump_running_info);
 DECLARE_string(tera_tabletnode_running_info_dump_file);
+DECLARE_int64(tera_tabletnode_sysinfo_check_interval);
 
 namespace leveldb {
 extern tera::Counter rawkey_compare_counter;
@@ -49,9 +51,6 @@ extern tera::Counter posix_seek_counter;
 extern tera::Counter posix_info_counter;
 extern tera::Counter posix_other_counter;
 
-extern tera::Counter snappy_before_size_counter;
-extern tera::Counter snappy_after_size_counter;
-
 extern tera::Counter dfs_read_counter;
 extern tera::Counter dfs_write_counter;
 extern tera::Counter dfs_read_delay_counter;
@@ -87,17 +86,127 @@ extern tera::Counter ssd_write_counter;
 extern tera::Counter ssd_write_size_counter;
 }
 
-tera::Counter rand_read_delay;
-extern tera::Counter row_read_delay;
-tera::Counter range_error_counter;
-tera::Counter read_pending_counter;
-tera::Counter write_pending_counter;
-tera::Counter scan_pending_counter;
-tera::Counter compact_pending_counter;
 
 namespace tera {
 namespace tabletnode {
 
+// dfs metrics
+tera::AutoCollectorRegister dfs_read_size_metric(kDfsReadBytesThroughPut,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_read_size_counter, true)), {SubscriberType::THROUGHPUT});
+tera::AutoCollectorRegister dfs_write_size_metric(kDfsWriteBytesThroughPut,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_write_size_counter, true)), {SubscriberType::THROUGHPUT});
+tera::AutoCollectorRegister dfs_read_delay_metric(kDfsReadDelayMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_read_delay_counter, true)), {});
+tera::AutoCollectorRegister dfs_write_delay_metric(kDfsWriteDelayMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_write_delay_counter, true)), {});
+tera::AutoCollectorRegister dfs_sync_delay_metric(kDfsSyncDelayMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_sync_delay_counter, true)), {});
+tera::AutoCollectorRegister dfs_read_metric(kDfsReadCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_read_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_write_metric(kDfsWriteCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_write_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_sync_metric(kDfsSyncCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_sync_counter, true)), {SubscriberType::QPS});
+
+tera::AutoSubscriberRegister dfs_read_delay_avg_subscriber (std::unique_ptr<Subscriber>(new RatioSubscriber(
+    MetricId(kDfsReadDelayPerRequestMetric),
+    std::unique_ptr<Subscriber>(new PrometheusSubscriber(MetricId(kDfsReadDelayMetric), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new PrometheusSubscriber(MetricId(kDfsReadCountMetric), SubscriberType::SUM)))));
+
+tera::AutoSubscriberRegister dfs_write_delay_avg_subscriber (std::unique_ptr<Subscriber>(new RatioSubscriber(
+    MetricId(kDfsWriteDelayPerRequestMetric),
+    std::unique_ptr<Subscriber>(new PrometheusSubscriber(MetricId(kDfsWriteDelayMetric), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new PrometheusSubscriber(MetricId(kDfsWriteCountMetric), SubscriberType::SUM)))));
+
+tera::AutoSubscriberRegister dfs_sync_delay_avg_subscriber (std::unique_ptr<Subscriber>(new RatioSubscriber(
+    MetricId(kDfsSyncDelayPerRequestMetric),
+    std::unique_ptr<Subscriber>(new PrometheusSubscriber(MetricId(kDfsSyncDelayMetric), SubscriberType::SUM)),
+    std::unique_ptr<Subscriber>(new PrometheusSubscriber(MetricId(kDfsSyncCountMetric), SubscriberType::SUM)))));
+
+tera::AutoCollectorRegister dfs_flush_metric(kDfsFlushCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_flush_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_list_metric(kDfsListCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_list_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_exists_metric(kDfsExistsCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_exists_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_open_metric(kDfsOpenCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_open_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_close_metric(kDfsCloseCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_close_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_delete_metric(kDfsDeleteCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_delete_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_tell_metric(kDfsTellCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_tell_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_info_metric(kDfsInfoCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_info_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_other_metric(kDfsOtherCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_other_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister dfs_read_hang_metric(kDfsReadHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_read_hang_counter, false)));
+tera::AutoCollectorRegister dfs_write_hang_metric(kDfsWriteHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_write_hang_counter, false)));
+tera::AutoCollectorRegister dfs_sync_hang_metric(kDfsSyncHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_sync_hang_counter, false)));
+tera::AutoCollectorRegister dfs_flush_hang_metric(kDfsFlushHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_flush_hang_counter, false)));
+tera::AutoCollectorRegister dfs_list_hang_metric(kDfsListHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_list_hang_counter, false)));
+tera::AutoCollectorRegister dfs_exists_hang_metric(kDfsExistsHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_exists_hang_counter, false)));
+tera::AutoCollectorRegister dfs_open_hang_metric(kDfsOpenHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_open_hang_counter, false)));
+tera::AutoCollectorRegister dfs_close_hang_metric(kDfsCloseHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_close_hang_counter, false)));
+tera::AutoCollectorRegister dfs_delete_hang_metric(kDfsDeleteHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_delete_hang_counter, false)));
+tera::AutoCollectorRegister dfs_tell_hang_metric(kDfsTellHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_tell_hang_counter, false)));
+tera::AutoCollectorRegister dfs_info_hang_metric(kDfsInfoHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_info_hang_counter, false)));
+tera::AutoCollectorRegister dfs_other_hang_metric(kDfsOtherHangMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::dfs_other_hang_counter, false)));
+// ssd metrics
+tera::AutoCollectorRegister ssd_read_through_put_metric(kSsdReadThroughPutMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::ssd_read_size_counter, true)), {SubscriberType::THROUGHPUT});
+tera::AutoCollectorRegister ssd_write_through_put_metric(kSsdWriteThroughPutMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::ssd_write_size_counter, true)), {SubscriberType::THROUGHPUT});
+tera::AutoCollectorRegister ssd_read_metric(kSsdReadCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::ssd_read_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister ssd_write_metric(kSsdWriteCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::ssd_write_counter, true)), {SubscriberType::QPS});
+// local metrics
+tera::AutoCollectorRegister posix_read_size_metric(kPosixReadThroughPutMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_read_size_counter, true)), {SubscriberType::THROUGHPUT});
+tera::AutoCollectorRegister posix_write_size_metric(kPosixWriteThroughPutMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_write_size_counter, true)), {SubscriberType::THROUGHPUT});
+tera::AutoCollectorRegister posix_read_metric(kPosixReadCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_read_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_write_metric(kPosixWriteCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_write_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_sync_metric(kPosixSyncCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_sync_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_list_metric(kPosixListCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_list_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_exists_metric(kPosixExistsCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_exists_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_open_metric(kPosixOpenCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_open_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_close_metric(kPosixCloseCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_close_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_delete_metric(kPosixDeleteCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_delete_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_tell_metric(kPosixTellCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_tell_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_seek_metric(kPosixSeekCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_seek_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_info_metric(kPosixInfoCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_info_counter, true)), {SubscriberType::QPS});
+tera::AutoCollectorRegister posix_other_metric(kPosixOtherCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::posix_other_counter, true)), {SubscriberType::QPS});
+
+tera::AutoCollectorRegister rawkey_compare_metric(kRawkeyCompareCountMetric,
+    std::unique_ptr<Collector>(new CounterCollector(&leveldb::rawkey_compare_counter, true)), {SubscriberType::QPS});
+
 class TabletNodeSysInfoDumper {
 public:
     TabletNodeSysInfoDumper(const std::string& filename) :
@@ -135,29 +244,16 @@ class TabletNodeSysInfoDumper {
     FILE* fp_;
 };
 
-TabletNodeSysInfo::TabletNodeSysInfo()
-    : mem_check_ts_(0),
-      net_check_ts_(0),
-      io_check_ts_(0),
-      net_tx_total_(0),
-      net_rx_total_(0),
-      cpu_check_ts_(0),
-      tablet_check_ts_(0) {
+TabletNodeSysInfo::TabletNodeSysInfo() {
+    last_check_ts_ = get_micros();
 }
 
 TabletNodeSysInfo::TabletNodeSysInfo(const TabletNodeInfo& info)
-    : info_(info),
-      mem_check_ts_(0),
-      net_check_ts_(0),
-      io_check_ts_(0),
-      net_tx_total_(0),
-      net_rx_total_(0),
-      cpu_check_ts_(0),
-      tablet_check_ts_(0) {
+    : info_(info) {
+    last_check_ts_ = get_micros();
 }
 
-TabletNodeSysInfo::~TabletNodeSysInfo() {
-}
+TabletNodeSysInfo::~TabletNodeSysInfo() {}
 
 void TabletNodeSysInfo::AddExtraInfo(const std::string& name, int64_t value) {
     MutexLock lock(&mutex_);
@@ -176,32 +272,79 @@ void TabletNodeSysInfo::SetTimeStamp(int64_t ts) {
     info_.set_timestamp(ts);
 }
 
+struct DBSize {  // Data size of one tablet, filled by TabletIO::GetDataSize() in CollectTabletNodeInfo().
+    uint64_t size;  // first out-param of GetDataSize(); later stored via TabletMeta::set_size()
+    std::vector<uint64_t> lg_size;  // second out-param of GetDataSize(); each entry is appended via add_lg_size() (presumably one per locality group -- confirm)
+};
+
 void TabletNodeSysInfo::CollectTabletNodeInfo(TabletManager* tablet_manager,
                                               const string& server_addr) {
+    std::vector<io::TabletIO*> tablet_ios;
+    std::vector<TabletStatus> db_status_vec;
+    std::vector<DBSize> db_size_vec;
+
+    int64_t ts = get_micros();
+    bool need_check = false;
+    if (ts - last_check_ts_ > FLAGS_tera_tabletnode_sysinfo_check_interval) {
+        last_check_ts_ = ts;
+        need_check = true;
+    }
+    tablet_manager->GetAllTablets(&tablet_ios);
+    std::vector<io::TabletIO*>::iterator it = tablet_ios.begin();
+    while (it != tablet_ios.end()) {
+        io::TabletIO* tablet_io = *it;
+        if (tablet_io->ShouldForceUnloadOnError()) {
+            LOG(WARNING) << *tablet_io << ", has internal error triggered unload";
+            StatusCode status;
+            if (!tablet_io->Unload(&status)) {
+                LOG(ERROR) << *tablet_io << ", Unload tablet failed, status: "
+                    << StatusCodeToString(status);
+            }
+            if (!tablet_manager->RemoveTablet(tablet_io->GetTableName(),
+                        tablet_io->GetStartKey(), tablet_io->GetEndKey(), &status)) {
+                LOG(ERROR) << *tablet_io << ", remove from TabletManager failed, status: "
+                    << StatusCodeToString(status);
+            }
+            tablet_io->DecRef();
+            it = tablet_ios.erase(it);
+            continue;
+        }
+
+        // check whether the db status indicates corruption
+        TabletStatus tablet_status = static_cast<TabletStatus>(kTabletReady);
+        tablet_io->GetDBStatus(&tablet_status, need_check);
+        db_status_vec.push_back(tablet_status);
+
+        DBSize db_size;
+        tablet_io->GetDataSize(&db_size.size, &db_size.lg_size);
+        db_size_vec.push_back(db_size);
+
+        ++it;
+    }
+
     MutexLock lock(&mutex_);
-    int64_t cur_ts = get_micros();
-    int64_t interval = cur_ts - tablet_check_ts_;
-    tablet_check_ts_ = cur_ts;
+    std::shared_ptr<CollectorReport> latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    int64_t interval = latest_report->interval_ms;
+    if (interval <= 0) {
+        // may happen on the first report, when all metric values are still 0;
+        // use any non-zero interval to avoid division by zero
+        VLOG(16) << "Metric Report interval is 0";
+        interval = 1000;
+    }
 
     tablet_list_.Clear();
     int64_t total_size = 0;
-    int64_t low_read_cell = 0;
-    int64_t scan_rows = 0;
     int64_t scan_kvs = 0;
-    int64_t scan_size = 0;
-    int64_t read_rows = 0;
     int64_t read_kvs = 0;
-    int64_t read_size = 0;
-    int64_t write_rows = 0;
     int64_t write_kvs = 0;
-    int64_t write_size = 0;
     int64_t busy_cnt = 0;
+    int64_t db_corruption_cnt = 0;
+
+    for (uint32_t i = 0; i < tablet_ios.size(); i++) {
+        io::TabletIO* tablet_io = tablet_ios[i];
+        TabletStatus tablet_status = db_status_vec[i];
+        DBSize db_size = db_size_vec[i];
 
-    std::vector<io::TabletIO*> tablet_ios;
-    tablet_manager->GetAllTablets(&tablet_ios);
-    std::vector<io::TabletIO*>::iterator it = tablet_ios.begin();
-    for (; it != tablet_ios.end(); ++it) {
-        io::TabletIO* tablet_io = *it;
         TabletMeta* tablet_meta = tablet_list_.add_meta();
         tablet_meta->set_status(TabletStatus(tablet_io->GetStatus()));
         tablet_meta->set_server_addr(server_addr);
@@ -210,274 +353,185 @@ void TabletNodeSysInfo::CollectTabletNodeInfo(TabletManager* tablet_manager,
         tablet_meta->mutable_key_range()->set_key_start(tablet_io->GetStartKey());
         tablet_meta->mutable_key_range()->set_key_end(tablet_io->GetEndKey());
 
-        std::vector<uint64_t> lgsize;
-        uint64_t size;
-        tablet_io->GetDataSize(&size, &lgsize);
-        tablet_meta->set_size(size);
-        for (size_t i = 0; i < lgsize.size(); ++i) {
-            tablet_meta->add_lg_size(lgsize[i]);
+        tablet_meta->set_size(db_size.size);
+        for (size_t i = 0; i < db_size.lg_size.size(); ++i) {
+            tablet_meta->add_lg_size(db_size.lg_size[i]);
         }
         tablet_meta->set_compact_status(tablet_io->GetCompactStatus());
         total_size += tablet_meta->size();
 
         TabletCounter* counter = tablet_list_.add_counter();
-        tablet_io->GetAndClearCounter(counter);
-        low_read_cell += counter->low_read_cell();
-        scan_rows += counter->scan_rows();
+        const std::string& label_str = tablet_io->GetMetricLabel();
+        counter->set_low_read_cell(latest_report->FindMetricValue(kLowReadCellMetricName, label_str));
+        counter->set_scan_rows(latest_report->FindMetricValue(kScanRowsMetricName, label_str));
+        counter->set_scan_kvs(latest_report->FindMetricValue(kScanKvsMetricName, label_str));
+        counter->set_scan_size(latest_report->FindMetricValue(kScanThroughPutMetricName, label_str));
+        counter->set_read_rows(latest_report->FindMetricValue(kReadRowsMetricName, label_str));
+        counter->set_read_kvs(latest_report->FindMetricValue(kReadKvsMetricName, label_str));
+        counter->set_read_size(latest_report->FindMetricValue(kReadThroughPutMetricName, label_str));
+        counter->set_write_rows(latest_report->FindMetricValue(kWriteRowsMetricName, label_str));
+        counter->set_write_kvs(latest_report->FindMetricValue(kWriteKvsMetricName, label_str));
+        counter->set_write_size(latest_report->FindMetricValue(kWriteThroughPutMetricName, label_str));
+        counter->set_is_on_busy(tablet_io->IsBusy());
+        double write_workload = 0;
+        tablet_io->Workload(&write_workload);
+        counter->set_write_workload(write_workload);
+        counter->set_db_status(tablet_status); // set runtime counter
+
         scan_kvs += counter->scan_kvs();
-        scan_size += counter->scan_size();
-        read_rows += counter->read_rows();
         read_kvs += counter->read_kvs();
-        read_size += counter->read_size();
-        write_rows += counter->write_rows();
         write_kvs += counter->write_kvs();
-        write_size += counter->write_size();
 
         if (counter->is_on_busy()) {
             busy_cnt++;
         }
+        if (counter->db_status() == kTabletCorruption) {
+            db_corruption_cnt++;
+        }
         tablet_io->DecRef();
     }
-    info_.set_low_read_cell(low_read_cell * 1000000 / interval);
-    info_.set_scan_rows(scan_rows * 1000000 / interval);
-    info_.set_scan_kvs(scan_kvs * 1000000 / interval);
-    info_.set_scan_size(scan_size * 1000000 / interval);
-    info_.set_read_rows(read_rows * 1000000 / interval);
-    info_.set_read_kvs(read_kvs * 1000000 / interval);
-    info_.set_read_size(read_size * 1000000 / interval);
-    info_.set_write_rows(write_rows * 1000000 / interval);
-    info_.set_write_kvs(write_kvs * 1000000 / interval);
-    info_.set_write_size(write_size * 1000000 / interval);
+
+    int64_t low_read_cell =
+        latest_report->FindMetricValue(kLowLevelReadMetric);
+    int64_t read_rows =
+        latest_report->FindMetricValue(kRowCountMetric, kApiLabelRead);
+    int64_t read_size =
+        latest_report->FindMetricValue(kRowThroughPutMetric, kApiLabelRead);
+    int64_t write_rows =
+        latest_report->FindMetricValue(kRowCountMetric, kApiLabelWrite);
+    int64_t write_size =
+        latest_report->FindMetricValue(kRowThroughPutMetric, kApiLabelWrite);
+    int64_t scan_rows =
+        latest_report->FindMetricValue(kRowCountMetric, kApiLabelScan);
+    int64_t scan_size =
+        latest_report->FindMetricValue(kRowThroughPutMetric, kApiLabelScan);
+
+    info_.set_low_read_cell(low_read_cell * 1000 / interval);
+    info_.set_scan_rows(scan_rows * 1000 / interval);
+    info_.set_scan_kvs(scan_kvs * 1000 / interval);
+    info_.set_scan_size(scan_size * 1000 / interval);
+    info_.set_read_rows(read_rows * 1000 / interval);
+    info_.set_read_kvs(read_kvs * 1000 / interval);
+    info_.set_read_size(read_size * 1000 / interval);
+    info_.set_write_rows(write_rows * 1000 / interval);
+    info_.set_write_kvs(write_kvs * 1000 / interval);
+    info_.set_write_size(write_size * 1000 / interval);
     info_.set_tablet_onbusy(busy_cnt);
+    info_.set_tablet_corruption(db_corruption_cnt);
 
     // refresh tabletnodeinfo
     info_.set_load(total_size);
     info_.set_tablet_total(tablet_ios.size());
 
     int64_t tmp;
-    tmp = leveldb::dfs_read_size_counter.Clear() * 1000000 / interval;
+    tmp = latest_report->FindMetricValue(kDfsReadBytesThroughPut) * 1000 / interval;
     info_.set_dfs_io_r(tmp);
-    tmp = leveldb::dfs_write_size_counter.Clear() * 1000000 / interval;
+    tmp = latest_report->FindMetricValue(kDfsWriteBytesThroughPut) * 1000 / interval;
     info_.set_dfs_io_w(tmp);
-    tmp = leveldb::posix_read_size_counter.Clear() * 1000000 / interval;
+    tmp = latest_report->FindMetricValue(kPosixReadThroughPutMetric) * 1000 / interval;
     info_.set_local_io_r(tmp);
-    tmp = leveldb::posix_write_size_counter.Clear() * 1000000 / interval;
+    tmp = latest_report->FindMetricValue(kPosixWriteThroughPutMetric) * 1000 / interval;
     info_.set_local_io_w(tmp);
 
-    info_.set_read_pending(read_pending_counter.Get());
-    info_.set_write_pending(write_pending_counter.Get());
-    info_.set_scan_pending(scan_pending_counter.Get());
+    int64_t read_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelRead);
+    int64_t write_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelWrite);
+    int64_t scan_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelScan);
+    int64_t compact_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelCompact);
+
+    info_.set_read_pending(read_pending);
+    info_.set_write_pending(write_pending);
+    info_.set_scan_pending(scan_pending);
 
     // collect extra infos
     info_.clear_extra_info();
     ExtraTsInfo* einfo = info_.add_extra_info();
-    if (read_rows == 0) {
-        tmp = 0;
-    } else {
-        tmp = rand_read_delay.Clear() / read_rows;
-    }
-    einfo->set_name("rand_read_delay");
-    einfo->set_value(tmp / 1000);
 
-    einfo = info_.add_extra_info();
-    if (read_rows == 0) {
-        tmp = 0;
-    } else {
-        tmp = row_read_delay.Clear() / read_rows;
-    }
-    einfo->set_name("row_read_delay");
-    einfo->set_value(tmp / 1000);
+    int64_t range_error_sum =
+            latest_report->FindMetricValue(kRangeErrorMetric, kApiLabelRead) +
+            latest_report->FindMetricValue(kRangeErrorMetric, kApiLabelWrite) +
+            latest_report->FindMetricValue(kRangeErrorMetric, kApiLabelScan);
 
-    einfo = info_.add_extra_info();
-    tmp = range_error_counter.Clear() * 1000000 / interval;
+    tmp = range_error_sum * 1000 / interval;
     einfo->set_name("range_error");
     einfo->set_value(tmp);
 
     einfo = info_.add_extra_info();
-    tmp = read_pending_counter.Get();
     einfo->set_name("read_pending");
-    einfo->set_value(tmp);
+    einfo->set_value(read_pending);
 
     einfo = info_.add_extra_info();
-    tmp = write_pending_counter.Get();
     einfo->set_name("write_pending");
-    einfo->set_value(tmp);
+    einfo->set_value(write_pending);
 
     einfo = info_.add_extra_info();
-    tmp = scan_pending_counter.Get();
     einfo->set_name("scan_pending");
-    einfo->set_value(tmp);
+    einfo->set_value(scan_pending);
 
     einfo = info_.add_extra_info();
-    tmp = compact_pending_counter.Get();
     einfo->set_name("compact_pending");
+    einfo->set_value(compact_pending);
+
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kRejectCountMetric, kApiLabelRead) * 1000 / interval;
+    einfo->set_name("read_reject");
     einfo->set_value(tmp);
-}
 
-// return the number of ticks(jiffies) that this process
-// has been scheduled in user and kernel mode.
-static long long ProcessCpuTick() {
-    const int PATH_MAX_LEN = 64;
-    char path[PATH_MAX_LEN];
-    sprintf(path, "/proc/%d/stat", getpid());
-    FILE *fp = fopen(path, "r");
-    if (fp == NULL) {
-        return 0;
-    }
-    long long utime = 0, stime = 0;
-    if (fscanf(fp, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %lld %lld",
-               &utime, &stime) < 2) {
-        LOG(ERROR) << "get cpu tick from /proc/" << getpid() << "/stat failed.";
-    }
-    fclose(fp);
-    return utime + stime;
-}
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kRejectCountMetric, kApiLabelWrite) * 1000 / interval;
+    einfo->set_name("write_reject");
+    einfo->set_value(tmp);
 
-// return number of cpu(cores)
-static int GetCpuCount() {
-#if defined(_SC_NPROCESSORS_ONLN)
-    return sysconf(_SC_NPROCESSORS_ONLN);
-#else
-    FILE *fp = fopen("/proc/stat", "r");
-    if (fp == NULL) {
-        return 1;
-    }
-    const int LINE_MAX_LEN = 256; // enough in here
-    char *aline = (char*)malloc(LINE_MAX_LEN);
-    if (aline == NULL) {
-        LOG(ERROR) << "[HardWare System Info] malloc failed.";
-        return 1;
-    }
-    const int HEADER_MAX_LEN = 10;
-    char header[HEADER_MAX_LEN];
-    int i=0;
-    size_t len=0;
-    getline(&aline, &len, fp); // drop the first line
-    while (getline(&aline, &len, fp)) {
-        i++;
-        sscanf(aline, "%s", header);
-        if (!strncmp(header, "intr", HEADER_MAX_LEN)) {
-            break;
-        }
-    }
-    fclose(fp);
-    free(aline);
-    return i-1 > 0 ? i-1 : 1;
-#endif
-}
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kRejectCountMetric, kApiLabelScan) * 1000 / interval;
+    einfo->set_name("scan_reject");
+    einfo->set_value(tmp);
 
-// irix_on == 1 --> irix mode on
-// irix_on == 0 --> irix mode off
-//
-// return this process's the percentage of CPU usage ( %CPU ).
-//
-// NOTE: the first time call this function would get 0 as result.
-static float GetCpuUsage(int is_irix_on) {
-    static int cpu_count = 1; // assume cpu count is not variable when process is running
-    static unsigned long hertz = 0;
-    if (hertz == 0) {
-        hertz = sysconf(_SC_CLK_TCK);
-        cpu_count = GetCpuCount();
-    }
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kRequestCountMetric, kApiLabelRead) * 1000 / interval;
+    einfo->set_name("read_request");
+    einfo->set_value(tmp);
 
-    static struct timeval oldtimev;
-    struct timeval timev;
-    gettimeofday(&timev, NULL);
-    float et = (timev.tv_sec - oldtimev.tv_sec)
-        + (float)(timev.tv_usec - oldtimev.tv_usec) / 1000000.0;
-    oldtimev.tv_sec = timev.tv_sec;
-    oldtimev.tv_usec = timev.tv_usec;
-
-    float frame_etscale;
-    if (is_irix_on) {
-        frame_etscale = 100.0f / ((float)hertz * et);
-    } else {
-        frame_etscale = 100.0f / ((float)hertz * et * cpu_count);
-    }
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kRequestCountMetric, kApiLabelWrite) * 1000 / interval;
+    einfo->set_name("write_request");
+    einfo->set_value(tmp);
 
-    static unsigned long oldtick;
-    unsigned long newtick;
-    newtick = ProcessCpuTick();
-    float u = (newtick - (float)oldtick) * frame_etscale;
-    oldtick = newtick;
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kRequestCountMetric, kApiLabelScan) * 1000 / interval;
+    einfo->set_name("scan_request");
+    einfo->set_value(tmp);
 
-    const float MAX_CPU_USAGE = 99.9f;
-    if (u > MAX_CPU_USAGE ) {
-        u = MAX_CPU_USAGE;
-    }
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kErrorCountMetric, kApiLabelRead) * 1000 / interval;
+    einfo->set_name("read_error");
+    einfo->set_value(tmp);
 
-    // rounding cpu usage to 1 decimal places
-    const int USAGE_STR_MAX_LEN = 5;
-    char usage_str[USAGE_STR_MAX_LEN];
-    sprintf(usage_str, "%.1f\n", u);
-    sscanf(usage_str, "%f", &u);
-    return u;
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kErrorCountMetric, kApiLabelWrite) * 1000 / interval;
+    einfo->set_name("write_error");
+    einfo->set_value(tmp);
+
+    einfo = info_.add_extra_info();
+    tmp = latest_report->FindMetricValue(kErrorCountMetric, kApiLabelScan) * 1000 / interval;
+    einfo->set_name("scan_error");
+    einfo->set_value(tmp);
 }
 
 void TabletNodeSysInfo::CollectHardwareInfo() {
     MutexLock lock(&mutex_);
-    int pid = getpid();
-    FILE* f;
-    std::ostringstream ss;
-    ss << "/proc/" << pid << "/";
-    int64_t cur_ts = get_micros();
-
-    int64_t interval = cur_ts - mem_check_ts_;
-    if (interval / 1000000 > FLAGS_tera_tabletnode_sysinfo_mem_collect_interval) {
-        mem_check_ts_ = cur_ts;
-        int64_t mem;
-        f = fopen((ss.str() + "statm").data(), "r");
-        if (f == NULL) {
-            return;
-        }
-        fscanf(f, "%*d %ld", &mem);
-        mem = mem * 4 * 1024;
-        fclose(f);
-        info_.set_mem_used(mem);
+    std::shared_ptr<CollectorReport> latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport();
 
-        VLOG(15) << "[HardWare System Info] Memory: " << mem * 4;
-        return;
-    }
+    int64_t cpu_usage = latest_report->FindMetricValue(kInstCpuMetricName);
+    info_.set_cpu_usage(static_cast<float>(cpu_usage));
 
-    interval = cur_ts - net_check_ts_;
-    if (interval / 1000000 > FLAGS_tera_tabletnode_sysinfo_net_collect_interval) {
-        net_check_ts_ = cur_ts;
-        int64_t net_rx = 0, net_tx = 0;
-        f = fopen((ss.str() + "net/dev").data(), "r");
-        if (f == NULL) {
-            return;
-        }
-        int ret = fseek(f, 327, SEEK_SET);
-        CHECK_EQ(ret, 0);
-        for (int i = 0; i < 10; i++) {
-            while (':' != fgetc(f));
-            ret = fscanf(f, "%ld%*d%*d%*d%*d%*d%*d%*d%ld", &net_rx, &net_tx);
-            if (ret >= 2 && net_rx > 0 && net_tx > 0) {
-                break;
-            }
-        }
-        fclose(f);
-
-        int64_t tmp;
-        tmp = (net_rx - net_rx_total_) * 1000000 / interval;
-        info_.set_net_rx(tmp);
-        tmp = (net_tx - net_tx_total_) * 1000000 / interval;
-        info_.set_net_tx(tmp);
-        net_rx_total_ = net_rx;
-        net_tx_total_ = net_tx;
-
-        VLOG(15) << "[HardWare System Info] Network RX/TX: " << net_rx << " / " << net_tx;
-        return;
-    }
+    int64_t mem_usage = latest_report->FindMetricValue(kInstMemMetricName);
+    info_.set_mem_used(mem_usage);
 
-    interval = cur_ts - cpu_check_ts_;
-    if (interval / 1000000 > FLAGS_tera_tabletnode_sysinfo_cpu_collect_interval) {
-        cpu_check_ts_ = cur_ts;
-        float cpu_usage = GetCpuUsage(0);
-        info_.set_cpu_usage(cpu_usage);
-        VLOG(15) << "[HardWare System Info] %CPU: "<< cpu_usage;
-        return;
-    }
+    int64_t net_rx_usage = latest_report->FindMetricValue(kInstNetRXMetricName);
+    info_.set_net_rx(net_rx_usage);
+
+    int64_t net_tx_usage = latest_report->FindMetricValue(kInstNetTXMetricName);
+    info_.set_net_tx(net_tx_usage);
 }
 
 void TabletNodeSysInfo::GetTabletNodeInfo(TabletNodeInfo* info) {
@@ -502,11 +556,17 @@ void TabletNodeSysInfo::SetStatus(StatusCode status) {
 
 void TabletNodeSysInfo::DumpLog() {
     MutexLock lock(&mutex_);
-
+    std::shared_ptr<CollectorReport> latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport();
+    // interval_ms may be <= 0 (e.g. first report); fall back to 1000 so the "* 1000 / interval" rates below cannot divide by zero
+    int64_t interval = latest_report->interval_ms > 0 ? latest_report->interval_ms : 1000;
     TabletNodeSysInfoDumper dumper(FLAGS_tera_tabletnode_running_info_dump_file);
 
-    double snappy_ratio = (double)leveldb::snappy_before_size_counter.Clear()
-                          / leveldb::snappy_after_size_counter.Clear();
+    double snappy_ratio = latest_report->FindMetricValue(kSnappyCompressionRatioMetric);
+    if (snappy_ratio > 0) {
+        snappy_ratio /= 100.0;
+    }
+
+    int64_t rawkey_compare_count = latest_report->FindMetricValue(kRawkeyCompareCountMetric);
 
     if (FLAGS_tera_tabletnode_dump_running_info) {
         dumper.DumpData("low_level", info_.low_read_cell());
@@ -517,7 +577,7 @@ void TabletNodeSysInfo::DumpLog() {
         dumper.DumpData("scan", info_.scan_rows());
         dumper.DumpData("sspeed", info_.scan_size());
         dumper.DumpData("snappy", snappy_ratio);
-        dumper.DumpData("rowcomp", leveldb::rawkey_compare_counter.Get());
+        dumper.DumpData("rowcomp", rawkey_compare_count);
     }
 
     LOG(INFO) << "[SysInfo]"
@@ -529,7 +589,7 @@ void TabletNodeSysInfo::DumpLog() {
         << " scan " << info_.scan_rows()
         << " sspeed " << utils::ConvertByteToString(info_.scan_size())
         << " snappy " << snappy_ratio
-        << " rawcomp " << leveldb::rawkey_compare_counter.Clear();
+        << " rawcomp " << rawkey_compare_count;
 
     // hardware info
     if (FLAGS_tera_tabletnode_dump_running_info) {
@@ -549,15 +609,19 @@ void TabletNodeSysInfo::DumpLog() {
         << " cpu_usage " << info_.cpu_usage() << "%";
 
     // net and io info
+    int64_t ssd_read_count = latest_report->FindMetricValue(kSsdReadCountMetric);
+    int64_t ssd_read_size = latest_report->FindMetricValue(kSsdReadThroughPutMetric);
+    int64_t ssd_write_count = latest_report->FindMetricValue(kSsdWriteCountMetric);
+    int64_t ssd_write_size = latest_report->FindMetricValue(kSsdWriteThroughPutMetric);
     if (FLAGS_tera_tabletnode_dump_running_info) {
         dumper.DumpData("dfs_r", info_.dfs_io_r());
         dumper.DumpData("dfs_w", info_.dfs_io_w());
         dumper.DumpData("local_r", info_.local_io_r());
         dumper.DumpData("local_w", info_.local_io_w());
-        dumper.DumpData("ssd_r_counter", leveldb::ssd_read_counter.Get());
-        dumper.DumpData("ssd_r_size", leveldb::ssd_read_size_counter.Get());
-        dumper.DumpData("ssd_w_counter", leveldb::ssd_write_counter.Get());
-        dumper.DumpData("ssd_w_size", leveldb::ssd_write_size_counter.Get());
+        dumper.DumpData("ssd_r_counter", ssd_read_count);
+        dumper.DumpData("ssd_r_size", ssd_read_size);
+        dumper.DumpData("ssd_w_counter", ssd_write_count);
+        dumper.DumpData("ssd_w_size", ssd_write_size);
     }
 
     LOG(INFO) << "[IO]"
@@ -569,10 +633,79 @@ void TabletNodeSysInfo::DumpLog() {
         << utils::ConvertByteToString(info_.local_io_r())
         << " local_w " << info_.local_io_w() << " "
         << utils::ConvertByteToString(info_.local_io_w())
-        << " ssd_r " << leveldb::ssd_read_counter.Clear() << " "
-        << utils::ConvertByteToString(leveldb::ssd_read_size_counter.Clear())
-        << " ssd_w " << leveldb::ssd_write_counter.Clear() << " "
-        << utils::ConvertByteToString(leveldb::ssd_write_size_counter.Clear());
+        << " ssd_r " << ssd_read_count << " "
+        << utils::ConvertByteToString(ssd_read_size)
+        << " ssd_w " << ssd_write_count << " "
+        << utils::ConvertByteToString(ssd_write_size);
+
+    // cache info
+    double block_cache_hitrate = static_cast<double>(latest_report->FindMetricValue(kBlockCacheHitRateMetric)) / 100.0;
+    if (block_cache_hitrate < 0.0) {
+        block_cache_hitrate = NAN;
+    }
+    int64_t block_cache_entries = latest_report->FindMetricValue(kBlockCacheEntriesMetric);
+    int64_t block_cache_charge = latest_report->FindMetricValue(kBlockCacheChargeMetric);
+    double table_cache_hitrate = static_cast<double>(latest_report->FindMetricValue(kTableCacheHitRateMetric)) / 100.0;
+    if (table_cache_hitrate < 0.0) {
+        table_cache_hitrate = NAN;
+    }
+    int64_t table_cache_entries = latest_report->FindMetricValue(kTableCacheEntriesMetric);
+    int64_t table_cache_charge = latest_report->FindMetricValue(kTableCacheChargeMetric);
+    if (FLAGS_tera_tabletnode_dump_running_info) {
+        dumper.DumpData("block_cache_hitrate", block_cache_hitrate);
+        dumper.DumpData("block_cache_entry", block_cache_entries);
+        dumper.DumpData("block_cache_bytes", block_cache_charge);
+        dumper.DumpData("table_cache_hitrate", table_cache_hitrate);
+        dumper.DumpData("table_cache_entry", table_cache_entries);
+        dumper.DumpData("table_cache_bytes", table_cache_charge);
+    }
+    LOG(INFO) << "[Cache HitRate/Cnt/Size] table_cache "
+              << table_cache_hitrate << " "
+              << table_cache_entries << " "
+              << table_cache_charge
+              << ", block_cache "
+              << block_cache_hitrate << " "
+              << block_cache_entries << " "
+              << block_cache_charge;
+    
+    int64_t finished_read_request = 
+        latest_report->FindMetricValue(kFinishedRequestCountMetric, kApiLabelRead);
+    int64_t finished_write_request = 
+        latest_report->FindMetricValue(kFinishedRequestCountMetric, kApiLabelWrite);
+    int64_t finished_scan_request = 
+        latest_report->FindMetricValue(kFinishedRequestCountMetric, kApiLabelScan);
+    LOG(INFO) << "[Finished Requests] "
+              << "read: " << finished_read_request * 1000 / interval
+              << ", write: " << finished_write_request * 1000 / interval
+              << ", scan: " << finished_scan_request * 1000 / interval;
+
+    int64_t read_request_delay =
+        (finished_read_request == 0 ? 0 : latest_report->FindMetricValue(kRequestDelayMetric, kApiLabelRead) / finished_read_request);
+    int64_t write_request_delay =
+        (finished_write_request == 0 ? 0 : latest_report->FindMetricValue(kRequestDelayMetric, kApiLabelWrite) / finished_write_request);
+    int64_t scan_request_delay =
+        (finished_scan_request == 0 ? 0 : latest_report->FindMetricValue(kRequestDelayMetric, kApiLabelScan) / finished_scan_request);
+    LOG(INFO) << "[Requests Delay In Ms] "
+              << "read: " << read_request_delay / 1000.0
+              << ", write: " << write_request_delay / 1000.0
+              << ", scan: " << scan_request_delay / 1000.0;
+
+    int64_t read_rows =
+        latest_report->FindMetricValue(kRowCountMetric, kApiLabelRead);
+    int64_t write_rows =
+        latest_report->FindMetricValue(kRowCountMetric, kApiLabelWrite);
+    int64_t scan_rows =
+        latest_report->FindMetricValue(kRowCountMetric, kApiLabelScan);
+    int64_t row_read_delay = 
+        (read_rows == 0 ? 0 : latest_report->FindMetricValue(kRowDelayMetric, kApiLabelRead) / read_rows);
+    int64_t row_write_delay = 
+        (write_rows == 0 ? 0 : latest_report->FindMetricValue(kRowDelayMetric, kApiLabelWrite) / write_rows);
+    int64_t row_scan_delay = 
+        (scan_rows == 0 ? 0 : latest_report->FindMetricValue(kRowDelayMetric, kApiLabelScan) / scan_rows);
+    LOG(INFO) << "[Row Delay In Ms] "
+              << "row_read_delay: " << row_read_delay / 1000.0
+              << ", row_write_delay: " << row_write_delay / 1000.0
+              << ", row_scan_delay: " << row_scan_delay / 1000.0;
 
     // extra info
     std::ostringstream ss;
@@ -587,102 +720,138 @@ void TabletNodeSysInfo::DumpLog() {
     LOG(INFO) << ss.str();
 
     // DFS info
-    double rdelay = leveldb::dfs_read_counter.Get() ?
-        leveldb::dfs_read_delay_counter.Clear()/1000/leveldb::dfs_read_counter.Get()
-        : 0;
-    double wdelay = leveldb::dfs_write_counter.Get() ?
-        leveldb::dfs_write_delay_counter.Clear()/1000/leveldb::dfs_write_counter.Get()
-        : 0;
-    double sdelay = leveldb::dfs_sync_counter.Get() ?
-        leveldb::dfs_sync_delay_counter.Clear()/1000/leveldb::dfs_sync_counter.Get()
-        : 0;
+    int64_t dfs_read_delay = latest_report->FindMetricValue(kDfsReadDelayMetric);
+    int64_t dfs_write_delay = latest_report->FindMetricValue(kDfsWriteDelayMetric);
+    int64_t dfs_sync_delay = latest_report->FindMetricValue(kDfsSyncDelayMetric);
+    int64_t dfs_read_count = latest_report->FindMetricValue(kDfsReadCountMetric);
+    int64_t dfs_write_count = latest_report->FindMetricValue(kDfsWriteCountMetric);
+    int64_t dfs_sync_count = latest_report->FindMetricValue(kDfsSyncCountMetric);
+    int64_t dfs_flush_count = latest_report->FindMetricValue(kDfsFlushCountMetric);
+    int64_t dfs_list_count = latest_report->FindMetricValue(kDfsListCountMetric);
+    int64_t dfs_other_count = latest_report->FindMetricValue(kDfsOtherCountMetric);
+    int64_t dfs_exists_count = latest_report->FindMetricValue(kDfsExistsCountMetric);
+    int64_t dfs_open_count = latest_report->FindMetricValue(kDfsOpenCountMetric);
+    int64_t dfs_close_count = latest_report->FindMetricValue(kDfsCloseCountMetric);
+    int64_t dfs_delete_count = latest_report->FindMetricValue(kDfsDeleteCountMetric);
+    int64_t dfs_tell_count = latest_report->FindMetricValue(kDfsTellCountMetric);
+    int64_t dfs_info_count = latest_report->FindMetricValue(kDfsInfoCountMetric);
+    int64_t dfs_read_hang = latest_report->FindMetricValue(kDfsReadHangMetric);
+    int64_t dfs_write_hang = latest_report->FindMetricValue(kDfsWriteHangMetric);
+    int64_t dfs_sync_hang = latest_report->FindMetricValue(kDfsSyncHangMetric);
+    int64_t dfs_flush_hang = latest_report->FindMetricValue(kDfsFlushHangMetric);
+    int64_t dfs_list_hang = latest_report->FindMetricValue(kDfsListHangMetric);
+    int64_t dfs_other_hang = latest_report->FindMetricValue(kDfsOtherHangMetric);
+    int64_t dfs_exists_hang = latest_report->FindMetricValue(kDfsExistsHangMetric);
+    int64_t dfs_open_hang = latest_report->FindMetricValue(kDfsOpenHangMetric);
+    int64_t dfs_close_hang = latest_report->FindMetricValue(kDfsCloseHangMetric);
+    int64_t dfs_delete_hang = latest_report->FindMetricValue(kDfsDeleteHangMetric);
+    int64_t dfs_tell_hang = latest_report->FindMetricValue(kDfsTellHangMetric);
+    int64_t dfs_info_hang = latest_report->FindMetricValue(kDfsInfoHangMetric);
+    double rdelay = dfs_read_count ? static_cast<double>(dfs_read_delay) / 1000.0 / dfs_read_count : 0;
+    double wdelay = dfs_write_count ? static_cast<double>(dfs_write_delay) / 1000.0 / dfs_write_count : 0;
+    double sdelay = dfs_sync_count ? static_cast<double>(dfs_sync_delay) / 1000.0 / dfs_sync_count : 0;
 
     if (FLAGS_tera_tabletnode_dump_running_info) {
-        dumper.DumpData("dfs_read", leveldb::dfs_read_counter.Get());
-        dumper.DumpData("dfs_read_hang", leveldb::dfs_read_hang_counter.Get());
+        dumper.DumpData("dfs_read", dfs_read_count);
+        dumper.DumpData("dfs_read_hang", dfs_read_hang);
         dumper.DumpData("dfs_rdealy", rdelay);
-        dumper.DumpData("dfs_write", leveldb::dfs_write_counter.Get());
-        dumper.DumpData("dfs_write_hang", leveldb::dfs_write_hang_counter.Get());
+        dumper.DumpData("dfs_write", dfs_write_count);
+        dumper.DumpData("dfs_write_hang", dfs_write_hang);
         dumper.DumpData("dfs_wdelay", wdelay);
-        dumper.DumpData("dfs_sync", leveldb::dfs_sync_counter.Get());
-        dumper.DumpData("dfs_sync_hang", leveldb::dfs_sync_hang_counter.Get());
+        dumper.DumpData("dfs_sync", dfs_sync_count);
+        dumper.DumpData("dfs_sync_hang", dfs_sync_hang);
         dumper.DumpData("dfs_sdelay", sdelay);
-        dumper.DumpData("dfs_flush", leveldb::dfs_flush_counter.Get());
-        dumper.DumpData("dfs_flush_hang", leveldb::dfs_flush_hang_counter.Get());
-        dumper.DumpData("dfs_list", leveldb::dfs_list_counter.Get());
-        dumper.DumpData("dfs_list_hang", leveldb::dfs_list_hang_counter.Get());
-        dumper.DumpData("dfs_info", leveldb::dfs_info_counter.Get());
-        dumper.DumpData("dfs_info_hang", leveldb::dfs_info_hang_counter.Get());
-        dumper.DumpData("dfs_exists", leveldb::dfs_exists_counter.Get());
-        dumper.DumpData("dfs_exists_hang", leveldb::dfs_exists_hang_counter.Get());
-        dumper.DumpData("dfs_open", leveldb::dfs_open_counter.Get());
-        dumper.DumpData("dfs_open_hang", leveldb::dfs_open_hang_counter.Get());
-        dumper.DumpData("dfs_close", leveldb::dfs_close_counter.Get());
-        dumper.DumpData("dfs_close_hang", leveldb::dfs_close_hang_counter.Get());
-        dumper.DumpData("dfs_delete", leveldb::dfs_delete_counter.Get());
-        dumper.DumpData("dfs_delete_hang", leveldb::dfs_delete_hang_counter.Get());
-        dumper.DumpData("dfs_tell", leveldb::dfs_tell_counter.Get());
-        dumper.DumpData("dfs_tell_hang", leveldb::dfs_tell_hang_counter.Get());
-        dumper.DumpData("dfs_other", leveldb::dfs_other_counter.Get());
-        dumper.DumpData("dfs_other_hang", leveldb::dfs_other_hang_counter.Get());
+        dumper.DumpData("dfs_flush", dfs_flush_count);
+        dumper.DumpData("dfs_flush_hang", dfs_flush_hang);
+        dumper.DumpData("dfs_list", dfs_list_count);
+        dumper.DumpData("dfs_list_hang", dfs_list_hang);
+        dumper.DumpData("dfs_info", dfs_info_count);
+        dumper.DumpData("dfs_info_hang", dfs_info_hang);
+        dumper.DumpData("dfs_exists", dfs_exists_count);
+        dumper.DumpData("dfs_exists_hang", dfs_exists_hang);
+        dumper.DumpData("dfs_open", dfs_open_count);
+        dumper.DumpData("dfs_open_hang", dfs_open_hang);
+        dumper.DumpData("dfs_close", dfs_close_count);
+        dumper.DumpData("dfs_close_hang", dfs_close_hang);
+        dumper.DumpData("dfs_delete", dfs_delete_count);
+        dumper.DumpData("dfs_delete_hang", dfs_delete_hang);
+        dumper.DumpData("dfs_tell", dfs_tell_count);
+        dumper.DumpData("dfs_tell_hang", dfs_tell_hang);
+        dumper.DumpData("dfs_other", dfs_other_count);
+        dumper.DumpData("dfs_other_hang", dfs_other_hang);
     }
 
-    LOG(INFO) << "[Dfs] read " << leveldb::dfs_read_counter.Clear() << " "
-        << leveldb::dfs_read_hang_counter.Get() << " "
+    LOG(INFO) << "[Dfs] read " << dfs_read_count << " "
+        << dfs_read_hang << " "
         << "rdelay " << rdelay << " "
-        << "write " << leveldb::dfs_write_counter.Clear() << " "
-        << leveldb::dfs_write_hang_counter.Get() << " "
+        << "rdelay_total " << dfs_read_delay << " "
+        << "write " << dfs_write_count << " "
+        << dfs_write_hang << " "
         << "wdelay " << wdelay << " "
-        << "sync " << leveldb::dfs_sync_counter.Clear() << " "
-        << leveldb::dfs_sync_hang_counter.Get() << " "
+        << "wdelay_total " << dfs_write_delay << " "
+        << "sync " << dfs_sync_count << " "
+        << dfs_sync_hang << " "
         << "sdelay " << sdelay << " "
-        << "flush " << leveldb::dfs_flush_counter.Clear() << " "
-        << leveldb::dfs_flush_hang_counter.Get() << " "
-        << "list " << leveldb::dfs_list_counter.Clear() << " "
-        << leveldb::dfs_list_hang_counter.Get() << " "
-        << "info " << leveldb::dfs_info_counter.Clear() << " "
-        << leveldb::dfs_info_hang_counter.Get() << " "
-        << "exists " << leveldb::dfs_exists_counter.Clear() << " "
-        << leveldb::dfs_exists_hang_counter.Get() << " "
-        << "open " << leveldb::dfs_open_counter.Clear() << " "
-        << leveldb::dfs_open_hang_counter.Get() << " "
-        << "close " << leveldb::dfs_close_counter.Clear() << " "
-        << leveldb::dfs_close_hang_counter.Get() << " "
-        << "delete " << leveldb::dfs_delete_counter.Clear() << " "
-        << leveldb::dfs_delete_hang_counter.Get() << " "
-        << "tell " << leveldb::dfs_tell_counter.Clear() << " "
-        << leveldb::dfs_tell_hang_counter.Get() << " "
-        << "other " << leveldb::dfs_other_counter.Clear() << " "
-        << leveldb::dfs_other_hang_counter.Get();
+        << "sdelay_total " << dfs_sync_delay << " "
+        << "flush " << dfs_flush_count << " "
+        << dfs_flush_hang << " "
+        << "list " << dfs_list_count << " "
+        << dfs_list_hang << " "
+        << "info " << dfs_info_count << " "
+        << dfs_info_hang << " "
+        << "exists " << dfs_exists_count << " "
+        << dfs_exists_hang << " "
+        << "open " << dfs_open_count << " "
+        << dfs_open_hang << " "
+        << "close " << dfs_close_count << " "
+        << dfs_close_hang << " "
+        << "delete " << dfs_delete_count << " "
+        << dfs_delete_hang << " "
+        << "tell " << dfs_tell_count << " "
+        << dfs_tell_hang << " "
+        << "other " << dfs_other_count << " "
+        << dfs_other_hang;
 
     // local info
+    int64_t posix_read_count = latest_report->FindMetricValue(kPosixReadCountMetric);
+    int64_t posix_write_count = latest_report->FindMetricValue(kPosixWriteCountMetric);
+    int64_t posix_sync_count = latest_report->FindMetricValue(kPosixSyncCountMetric);
+    int64_t posix_list_count = latest_report->FindMetricValue(kPosixListCountMetric);
+    int64_t posix_info_count = latest_report->FindMetricValue(kPosixInfoCountMetric);
+    int64_t posix_exists_count = latest_report->FindMetricValue(kPosixExistsCountMetric);
+    int64_t posix_open_count = latest_report->FindMetricValue(kPosixOpenCountMetric);
+    int64_t posix_close_count = latest_report->FindMetricValue(kPosixCloseCountMetric);
+    int64_t posix_delete_count = latest_report->FindMetricValue(kPosixDeleteCountMetric);
+    int64_t posix_tell_count = latest_report->FindMetricValue(kPosixTellCountMetric);
+    int64_t posix_seek_count = latest_report->FindMetricValue(kPosixSeekCountMetric);
+    int64_t posix_other_count = latest_report->FindMetricValue(kPosixOtherCountMetric);
     if (FLAGS_tera_tabletnode_dump_running_info) {
-        dumper.DumpData("local_read", leveldb::posix_read_counter.Get());
-        dumper.DumpData("local_write", leveldb::posix_write_counter.Get());
-        dumper.DumpData("local_sync", leveldb::posix_sync_counter.Get());
-        dumper.DumpData("local_list", leveldb::posix_list_counter.Get());
-        dumper.DumpData("local_info", leveldb::posix_info_counter.Get());
-        dumper.DumpData("local_exists", leveldb::posix_exists_counter.Get());
-        dumper.DumpData("local_open", leveldb::posix_open_counter.Get());
-        dumper.DumpData("local_close", leveldb::posix_close_counter.Get());
-        dumper.DumpData("local_delete", leveldb::posix_delete_counter.Get());
-        dumper.DumpData("local_tell", leveldb::posix_tell_counter.Get());
-        dumper.DumpData("local_seek", leveldb::posix_seek_counter.Get());
-        dumper.DumpData("local_other", leveldb::posix_other_counter.Get());
+        dumper.DumpData("local_read", posix_read_count);
+        dumper.DumpData("local_write", posix_write_count);
+        dumper.DumpData("local_sync", posix_sync_count);
+        dumper.DumpData("local_list", posix_list_count);
+        dumper.DumpData("local_info", posix_info_count);
+        dumper.DumpData("local_exists", posix_exists_count);
+        dumper.DumpData("local_open", posix_open_count);
+        dumper.DumpData("local_close", posix_close_count);
+        dumper.DumpData("local_delete", posix_delete_count);
+        dumper.DumpData("local_tell", posix_tell_count);
+        dumper.DumpData("local_seek", posix_seek_count);
+        dumper.DumpData("local_other", posix_other_count);
     }
 
-    LOG(INFO) << "[Local] read " << leveldb::posix_read_counter.Clear() << " "
-        << "write " << leveldb::posix_write_counter.Clear() << " "
-        << "sync " << leveldb::posix_sync_counter.Clear() << " "
-        << "list " << leveldb::posix_list_counter.Clear() << " "
-        << "info " << leveldb::posix_info_counter.Clear() << " "
-        << "exists " << leveldb::posix_exists_counter.Clear() << " "
-        << "open " << leveldb::posix_open_counter.Clear() << " "
-        << "close " << leveldb::posix_close_counter.Clear() << " "
-        << "delete " << leveldb::posix_delete_counter.Clear() << " "
-        << "tell " << leveldb::posix_tell_counter.Clear() << " "
-        << "seek " << leveldb::posix_seek_counter.Clear() << " "
-        << "other " << leveldb::posix_other_counter.Clear();
+    LOG(INFO) << "[Local] read " << posix_read_count << " "
+        << "write " << posix_write_count << " "
+        << "sync " << posix_sync_count << " "
+        << "list " << posix_list_count << " "
+        << "info " << posix_info_count << " "
+        << "exists " << posix_exists_count << " "
+        << "open " << posix_open_count << " "
+        << "close " << posix_close_count << " "
+        << "delete " << posix_delete_count << " "
+        << "tell " << posix_tell_count << " "
+        << "seek " << posix_seek_count << " "
+        << "other " << posix_other_count;
 }
 
 } // namespace tabletnode
diff --git a/src/tabletnode/tabletnode_sysinfo.h b/src/tabletnode/tabletnode_sysinfo.h
index 453f2df95..c20a2b519 100644
--- a/src/tabletnode/tabletnode_sysinfo.h
+++ b/src/tabletnode/tabletnode_sysinfo.h
@@ -50,15 +50,9 @@ class TabletNodeSysInfo {
 private:
     TabletNodeInfo info_;
     TabletMetaList tablet_list_;
-    int64_t mem_check_ts_;
-    int64_t net_check_ts_;
-    int64_t io_check_ts_;
-    int64_t net_tx_total_;
-    int64_t net_rx_total_;
-    int64_t cpu_check_ts_;
-
-    int64_t tablet_check_ts_;
+
     mutable Mutex mutex_;
+    int64_t last_check_ts_;
 };
 } // namespace tabletnode
 } // namespace tera
diff --git a/src/tabletnode/tabletnode_zk_adapter.cc b/src/tabletnode/tabletnode_zk_adapter.cc
old mode 100644
new mode 100755
index 6c9ab06e0..d3e3d7322
--- a/src/tabletnode/tabletnode_zk_adapter.cc
+++ b/src/tabletnode/tabletnode_zk_adapter.cc
@@ -422,6 +422,9 @@ void InsTabletNodeZkAdapter::OnKickMarkCreated() {
 }
 
 void InsTabletNodeZkAdapter::OnLockChange(std::string session_id, bool deleted) {
+    LOG(INFO) << "[OnLockChange] session_id = " << session_id 
+              << " deleted = " << deleted
+              << " now_session_id = " << ins_sdk_->GetSessionID();
     if (deleted || session_id != ins_sdk_->GetSessionID()) {
         LOG(ERROR) << "I lost my lock , so quit";
         _Exit(EXIT_FAILURE);
diff --git a/src/tabletnode/test/tabletnode_impl_test.cc b/src/tabletnode/test/tabletnode_impl_test.cc
index 808250b02..efc1d61b7 100644
--- a/src/tabletnode/test/tabletnode_impl_test.cc
+++ b/src/tabletnode/test/tabletnode_impl_test.cc
@@ -16,7 +16,7 @@
 #include "proto/proto_helper.h"
 #include "io/mock_tablet_io.h"
 
-DECLARE_bool(tera_zk_enabled);
+DECLARE_string(tera_coord_type);
 DECLARE_int32(tera_tabletnode_retry_period);
 DECLARE_string(tera_leveldb_env_type);
 
@@ -40,7 +40,7 @@ class TabletNodeImplTest : public ::testing::Test {
           m_ret_io_split(false),
           m_start_key("start_key"), m_end_key("end_key"),
           m_schema(DefaultTableSchema()) {
-        FLAGS_tera_zk_enabled = false;
+        FLAGS_tera_coord_type = "fake_zk";
 
         m_tablet_meta.set_table_name("name");
         m_tablet_meta.set_path("path");
diff --git a/src/tabletnode/test/tabletnode_sysinfo_test.cc b/src/tabletnode/test/tabletnode_sysinfo_test.cc
index 4f4c06724..e15c83a7c 100644
--- a/src/tabletnode/test/tabletnode_sysinfo_test.cc
+++ b/src/tabletnode/test/tabletnode_sysinfo_test.cc
@@ -5,7 +5,7 @@
 #define private public
 
 #include "tabletnode_sysinfo.h"
-#include "utils/timer.h"
+#include "common/timer.h"
 #include "gtest/gtest.h"
 
 namespace tera {
diff --git a/src/tera_c.cc b/src/tera_c.cc
index fd3fb2994..cd10eb1ba 100644
--- a/src/tera_c.cc
+++ b/src/tera_c.cc
@@ -39,7 +39,7 @@ static bool SaveError(char** errptr, const ErrorCode& s) {
     }
     if (errptr == NULL) {
         fprintf(stderr, "%s tera error: %s.\n",
-                common::timer::get_curtime_str().c_str(), s.GetReason().c_str());
+                tera::get_curtime_str().c_str(), s.GetReason().c_str());
         return true;
     }
 
@@ -164,7 +164,7 @@ bool tera_table_put_kv(tera_table_t* table, const char* key, uint64_t keylen,
     delete mutation;
     if (SaveError(errptr, err)) {
         fprintf(stderr, "%s tera error: %s.\n",
-                common::timer::get_curtime_str().c_str(), err.GetReason().c_str());
+                tera::get_curtime_str().c_str(), err.GetReason().c_str());
         return false;
     }
     return true;
@@ -197,7 +197,7 @@ bool tera_table_delete(tera_table_t* table, const char* row_key, uint64_t keylen
     delete mutation;
     if (SaveError(NULL, err)) {
         fprintf(stderr, "%s tera delete error: %s.\n",
-                common::timer::get_curtime_str().c_str(), err.GetReason().c_str());
+                tera::get_curtime_str().c_str(), err.GetReason().c_str());
         return false;
     }
     return true;
diff --git a/src/tera_flags.cc b/src/tera_flags.cc
old mode 100644
new mode 100755
index 70dba8404..b1364506d
--- a/src/tera_flags.cc
+++ b/src/tera_flags.cc
@@ -19,8 +19,10 @@ DEFINE_int32(tera_heartbeat_retry_times, 5, "the max retry times when fail to se
 
 DEFINE_string(tera_working_dir, "./", "the base dir for system data");
 
-DEFINE_bool(tera_zk_enabled, true, "enable zk adapter to collaborate with other master instances");
-DEFINE_bool(tera_mock_zk_enabled, false, "enable mock zk adapter to collaborate with other master instances");
+DEFINE_string(tera_coord_type, "", "the coordinator service type for tera cluster [zk,ins,mock_zk,mock_ins,fake_zk]");
+
+DEFINE_bool(tera_zk_enabled, true, "[obsolete, replaced by --tera_coord_type=zk] enable zk adapter to coord");
+DEFINE_bool(tera_mock_zk_enabled, false, "[obsolete, replaced by --tera_coord_type=mock_zk] enable mock zk adapter to coord");
 DEFINE_string(tera_zk_addr_list, "localhost:2180", "zookeeper server list");
 DEFINE_string(tera_zk_root_path, "/tera", "zookeeper root path");
 DEFINE_string(tera_fake_zk_path_prefix, "../fakezk", "fake zk path prefix in onebox tera");
@@ -31,6 +33,12 @@ DEFINE_string(tera_zk_lib_log_path, "../log/zk.log", "zookeeper library log outp
 DEFINE_string(tera_log_prefix, "", "prefix of log file (INFO, WARNING)");
 DEFINE_string(tera_local_addr, "", "local host's ip address");
 DEFINE_bool(tera_online_schema_update_enabled, false, "enable online-schema-update");
+DEFINE_bool(tera_info_log_clean_enable, true, "enable log cleaner task, enable as default");
+DEFINE_int64(tera_info_log_clean_period_second, 2592000, "time period (in second) for log cleaner task, 30 days as default");
+DEFINE_int64(tera_info_log_expire_second, 2592000, "expire time (in second) of log file, 30 days as default");
+DEFINE_bool(tera_metric_http_server_enable, true, "enable metric http server, enable as default");
+DEFINE_int32(tera_metric_http_server_listen_port, 20221, "listen port for metric http server");
+DEFINE_int64(tera_hardware_collect_period_second, 5, "hardware metrics checking period (in second)");
 
 /////////  io  /////////
 
@@ -100,10 +108,14 @@ DEFINE_int32(tera_master_impl_retry_times, 5, "the max retry times when master i
 DEFINE_string(tera_master_meta_table_name, "meta_table", "the meta table name");
 DEFINE_string(tera_master_meta_table_path, "meta", "the path of meta table");
 
-DEFINE_double(tera_master_workload_split_threshold, 3.5, "if workload(wwl) > 3.5, halve the splitsize");
+DEFINE_double(tera_master_workload_merge_threshold, 1.0, "if workload(wwl) < 1.0, enable merge on this tablet");
+DEFINE_double(tera_master_workload_split_threshold, 9.9, "if workload(wwl) > 9.9, trigger split by workload");
 DEFINE_int64(tera_master_split_tablet_size, 512, "the size (in MB) of tablet to trigger split");
+DEFINE_int64(tera_master_min_split_size, 64, "the min size (in MB) of tablet to trigger split");
+DEFINE_double(tera_master_min_split_ratio, 0.25, "min ratio of split size of tablet schema to trigger split");
+DEFINE_int64(tera_master_split_history_time_interval, 600000, "minimal split time interval(ms)");
 DEFINE_int64(tera_master_merge_tablet_size, 0, "the size (in MB) of tablet to trigger merge");
-DEFINE_string(tera_master_gc_strategy, "incremental", "gc strategy, [default, incremental, trackable]");
+DEFINE_string(tera_master_gc_strategy, "trackable", "gc strategy, [default, trackable]");
 
 DEFINE_int32(tera_master_max_split_concurrency, 1, "the max concurrency of tabletnode for split tablet");
 DEFINE_int32(tera_master_max_load_concurrency, 5, "the max concurrency of tabletnode for load tablet");
@@ -118,10 +130,11 @@ DEFINE_bool(tera_master_move_tablet_enabled, true, "enable master to auto move t
 DEFINE_bool(tera_master_meta_isolate_enabled, false, "enable master to reserve a tabletnode for meta");
 DEFINE_bool(tera_master_load_balance_table_grained, true, "whether the load balance policy only consider the specified table");
 DEFINE_double(tera_master_load_balance_size_ratio_trigger, 1.2, "ratio of heaviest node size to lightest to trigger load balance");
-DEFINE_int32(tera_master_load_balance_ts_load_threshold, 5000, "threshold of one tabletnode in QPS load-balance decision");
+DEFINE_int32(tera_master_load_balance_ts_load_threshold, 1000000000, "threshold of one tabletnode in QPS load-balance decision");
+DEFINE_int64(tera_master_load_balance_ts_size_threshold, 0, "threshold of one tabletnode in Size load-balance decision");
 DEFINE_int32(tera_master_load_balance_scan_weight, 300, "scan weight in load-balance decision");
 
-DEFINE_double(tera_safemode_tablet_locality_ratio, 0.3, "the tablet locality ratio threshold of safemode");
+DEFINE_double(tera_safemode_tablet_locality_ratio, 0.9, "the tablet locality ratio threshold of safemode");
 DEFINE_bool(tera_master_kick_tabletnode_enabled, true, "enable master to kick tabletnode");
 DEFINE_int32(tera_master_kick_tabletnode_query_fail_times, 10, "the number of query fail to kick tabletnode");
 DEFINE_int32(tera_master_control_tabletnode_retry_period, 60000, "the retry period (in ms) for master control tabletnode");
@@ -147,27 +160,31 @@ DEFINE_int64(tera_master_stat_table_interval, 60, "interval of system status dum
 DEFINE_int64(tera_master_stat_table_splitsize, 100, "default split size of stat table");
 
 DEFINE_int32(tera_master_gc_period, 60000, "the period (in ms) for master gc");
+DEFINE_bool(tera_master_gc_trash_enabled, true, "enable master gc trash");
+DEFINE_int64(tera_master_gc_trash_expire_time_s, 86400, "time (in second) for gc file kept in trash");
+DEFINE_int64(tera_master_gc_trash_clean_period_s, 3600, "period (in second) for clean gc trash");
 DEFINE_int64(tera_master_ins_session_timeout, 10000000, "ins session timeout(us), default 10sec");
 
 DEFINE_bool(tera_master_availability_check_enabled, true, "whether execute availability check");    // reload config safety
 DEFINE_bool(tera_master_availability_show_details_enabled, false, "whether show details of not-ready tablets"); // reload config safety
 DEFINE_int64(tera_master_not_available_threshold, 0, "the threshold (in s) of not available");     // reload config safety
 DEFINE_int64(tera_master_availability_check_period, 60, "the period (in s) of availability check"); // reload config safety
-DEFINE_int64(tera_master_availability_warning_threshold, 30, "30s, the threshold (in s) of warning availability"); // reload config safety
-DEFINE_int64(tera_master_availability_error_threshold, 300, "5 minutes, the threshold (in s) of error availability");        // reload config safety
-DEFINE_int64(tera_master_availability_fatal_threshold, 1800, "30 minutes, the threshold (in s) of fatal availability");        // reload config safety
+DEFINE_int64(tera_master_availability_warning_threshold, 60, "1 minute, the threshold (in s) of warning availability"); // reload config safety
+DEFINE_int64(tera_master_availability_error_threshold, 600, "10 minutes, the threshold (in s) of error availability");        // reload config safety
+DEFINE_int64(tera_master_availability_fatal_threshold, 3600, "1 hour, the threshold (in s) of fatal availability");        // reload config safety
+DEFINE_bool(tera_master_update_split_meta, true, "[split] update child tablets meta from master");
 
 ///////// tablet node  /////////
 
 DEFINE_string(tera_tabletnode_port, "20000", "the tablet node port of tera system");
-DEFINE_int32(tera_tabletnode_ctrl_thread_num, 10, "control thread number of tablet node (query/load/unload/split)");
+DEFINE_int32(tera_tabletnode_ctrl_thread_num, 20, "control thread number of tablet node (query/load/unload/split)");
 DEFINE_int32(tera_tabletnode_write_thread_num, 10, "write thread number of tablet node");
 DEFINE_int32(tera_tabletnode_read_thread_num, 40, "read thread number of tablet node");
-DEFINE_int32(tera_tabletnode_scan_thread_num, 5, "scan thread number of tablet node");
+DEFINE_int32(tera_tabletnode_scan_thread_num, 30, "scan thread number of tablet node");
 DEFINE_int32(tera_tabletnode_manual_compact_thread_num, 2, "the manual compact thread number of tablet node server");
 DEFINE_int32(tera_tabletnode_impl_thread_min_num, 1, "the min thread number for tablet node impl operations");
 DEFINE_int32(tera_tabletnode_impl_thread_max_num, 10, "the max thread number for tablet node impl operations");
-DEFINE_int32(tera_tabletnode_compact_thread_num, 10, "the max thread number for leveldb compaction");
+DEFINE_int32(tera_tabletnode_compact_thread_num, 30, "the max thread number for leveldb compaction");
 
 DEFINE_int32(tera_tabletnode_scanner_cache_size, 5, "default tablet scanner manager cache no more than 100 stream");
 DEFINE_int32(tera_tabletnode_connect_retry_times, 5, "the max retry times when connect to tablet node");
@@ -180,16 +197,20 @@ DEFINE_int32(tera_tabletnode_scan_pack_max_size, 10240, "the max size(KB) of the
 
 DEFINE_int32(tera_asyncwriter_pending_limit, 10000, "the max pending data size (KB) in async writer");
 DEFINE_bool(tera_enable_level0_limit, true, "enable level0 limit");
-DEFINE_int32(tera_tablet_level0_file_limit, 20000, "the max level0 file num before write busy");
+DEFINE_int32(tera_tablet_level0_file_limit, 500, "the max level0 file num before write busy");
 DEFINE_int32(tera_tablet_ttl_percentage, 99, "percentage of ttl tag in sst file begin to trigger compaction");
 DEFINE_int32(tera_tablet_del_percentage, 20, "percentage of del tag in sst file begin to trigger compaction");
-DEFINE_int32(tera_asyncwriter_sync_interval, 100, "the interval (in ms) to sync write buffer to disk");
+DEFINE_int32(tera_asyncwriter_sync_interval, 10, "the interval (in ms) to sync write buffer to disk");
 DEFINE_int32(tera_asyncwriter_sync_size_threshold, 1024, "force sync per X KB");
 DEFINE_int32(tera_asyncwriter_batch_size, 1024, "write batch to leveldb per X KB");
 DEFINE_int32(tera_request_pending_limit, 100000, "the max read/write request pending");
 DEFINE_int32(tera_scan_request_pending_limit, 1000, "the max scan request pending");
 DEFINE_int32(tera_garbage_collect_period, 1800, "garbage collect period in s");
 DEFINE_int32(tera_garbage_collect_debug_log, 0, "garbage collect debug log");
+DEFINE_bool(tera_leveldb_ignore_corruption_in_open, false, "ignore fs error when open db");
+DEFINE_int32(tera_leveldb_slow_down_level0_score_limit, 100, "control level 0 score compute, score / 2 or sqrt(score / 2)");
+DEFINE_int32(tera_leveldb_max_background_compactions, 8, "multi-thread compaction number");
+DEFINE_int32(tera_tablet_max_sub_parallel_compaction, 10, "max sub compaction in parallel");
 
 DEFINE_int32(tera_tabletnode_write_meta_rpc_timeout, 60000, "the timeout period (in ms) for tabletnode write meta");
 DEFINE_int32(tera_tabletnode_retry_period, 100, "the retry interval period (in ms) when operate tablet");
@@ -219,6 +240,7 @@ DEFINE_int32(tera_tabletnode_tcm_cache_release_period, 180, "the period (in sec)
 DEFINE_int64(tera_tabletnode_tcm_cache_size, 838860800, "TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES");
 DEFINE_bool(tera_tabletnode_dump_running_info, true, "dump tabletnode running info");
 DEFINE_string(tera_tabletnode_running_info_dump_file, "../monitor/ts.info.data", "file path for dump running info");
+DEFINE_int64(tera_tabletnode_sysinfo_check_interval, 9223372036854775806, "sysinfo check db health interval in us, default int64_max - 1");
 
 ///////// SDK  /////////
 DEFINE_string(tera_sdk_impl_type, "tera", "the activated type of SDK impl");
@@ -248,9 +270,9 @@ DEFINE_int32(tera_sdk_timeout_precision, 100, "precision of sdk read/write timeo
 DEFINE_int32(tera_sdk_delay_send_internal, 2, "the sdk resend the request internal time(s)");
 DEFINE_int32(tera_sdk_scan_buffer_limit, 2048000, "the pack size limit for scan operation");
 DEFINE_bool(tera_sdk_write_sync, false, "sync flag for write");
-DEFINE_int32(tera_sdk_batch_size, 100, "batch_size");
-DEFINE_int32(tera_sdk_write_send_interval, 100, "write batch send interval time");
-DEFINE_int32(tera_sdk_read_send_interval, 10, "read batch send interval time");
+DEFINE_int32(tera_sdk_batch_size, 250, "batch_size");
+DEFINE_int32(tera_sdk_write_send_interval, 10, "write batch send interval time");
+DEFINE_int32(tera_sdk_read_send_interval, 5, "read batch send interval time");
 DEFINE_int64(tera_sdk_max_mutation_pending_num, INT64_MAX, "default number of pending mutations in async put op");
 DEFINE_int64(tera_sdk_max_reader_pending_num, INT64_MAX, "default number of pending readers in async get op");
 DEFINE_bool(tera_sdk_async_blocking_enabled, true, "enable blocking when async writing and reading");
@@ -264,24 +286,120 @@ DEFINE_int32(tera_sdk_cookie_update_interval, 600, "the interval of cookie updat
 
 DEFINE_bool(tera_sdk_perf_counter_enabled, true, "enable performance counter log");
 DEFINE_int64(tera_sdk_perf_counter_log_interval, 60, "the interval period (in sec) of performance counter log dumping");
+DEFINE_bool(tera_sdk_perf_collect_enabled, false, "enable collect perf counter for metrics");
+DEFINE_int32(tera_sdk_perf_collect_interval, 10000, "the interval of collect perf counter(ms)");
 
 DEFINE_bool(tera_sdk_batch_scan_enabled, true, "enable batch scan");
 DEFINE_int64(tera_sdk_scan_buffer_size, 65536, "default buffer limit for scan");
 DEFINE_int64(tera_sdk_scan_number_limit, 1000000000, "default number limit for scan");
 DEFINE_int32(tera_sdk_max_batch_scan_req, 30, "the max number of concurrent scan req");
-DEFINE_int32(tera_sdk_batch_scan_max_retry, 60, "the max retry times for session scan");
 DEFINE_int64(tera_sdk_scan_timeout, 30000, "scan timeout");
+DEFINE_int32(tera_sdk_batch_scan_max_retry, 60, "the max retry times for session scan");
 DEFINE_int64(batch_scan_delay_retry_in_us, 1000000, "timewait in us before retry batch scan");
+DEFINE_int32(tera_sdk_sync_scan_max_retry, 10, "the max retry times for sync scan");
+DEFINE_int64(sync_scan_delay_retry_in_ms, 1000, "timewait in ms before retry sync scan");
 
 DEFINE_string(tera_ins_addr_list, "", "the ins cluster addr. e.g. abc.com:1234,abb.com:1234");
 DEFINE_string(tera_ins_root_path, "", "root path on ins. e.g /ps/sandbox");
-DEFINE_bool(tera_ins_enabled, false, "option to open ins naming");
-DEFINE_bool(tera_mock_ins_enabled, false, "option to open mock ins naming");
+DEFINE_bool(tera_ins_enabled, false, "[obsolete, replaced by --tera_coord_type=ins] option to open ins naming");
+DEFINE_bool(tera_mock_ins_enabled, false, "[obsolete, replaced by --tera_coord_type=mock_ins] option to open mock ins naming");
 DEFINE_int64(tera_ins_session_timeout, 600000000, "ins session timeout(us), default 10min");
+DEFINE_int64(tera_sdk_ins_session_timeout, 10000000, "ins session timeout(us), default 10s");
 
 DEFINE_int64(tera_sdk_status_timeout, 600, "(s) check tablet/tabletnode status timeout");
+DEFINE_uint64(tera_sdk_read_max_qualifiers, 18446744073709551615U, "read qualifier limit of each cf, default value is the max of uint64");
 
 /////////  http /////////
 DEFINE_string(tera_http_port, "8657", "the http proxy port of tera");
 DEFINE_int32(tera_http_request_thread_num, 30, "the http proxy thread num for handle client request");
 DEFINE_int32(tera_http_ctrl_thread_num, 10, "the http proxy thread num for it self");
+
+/////////  timeoracle /////////
+DEFINE_string(tera_timeoracle_port, "30000", "the timeoracle port of tera");
+DEFINE_int32(tera_timeoracle_max_lease_second, 30, "timeoracle work this seconds for a lease");
+DEFINE_int32(tera_timeoracle_refresh_lease_second, 10, "timeoracle refresh lease before this seconds");
+
+// only used by timeoracle
+DEFINE_bool(tera_timeoracle_mock_enabled, false, "used local filesystem replace zk and ins.");
+DEFINE_string(tera_timeoracle_mock_root_path, "/tmp/", "the root path of local filesystem.");
+DEFINE_int32(tera_timeoracle_work_thread_num, 16, "timeoracle sofarpc server work_thread_number");
+DEFINE_int32(tera_timeoracle_io_service_pool_size, 4, "timeoracle sofarpc server io_service_pool_size");
+
+/////////  global transaction  ////////
+DEFINE_bool(tera_sdk_client_for_gtxn, false, "build thread_pool for global transaction");
+DEFINE_bool(tera_sdk_tso_client_enabled, false, "get timestamp from timeoracle, default from local timestamp");
+DEFINE_int32(tera_gtxn_thread_max_num, 20, "the max thread number for global transaction operations");
+DEFINE_int32(tera_gtxn_timeout_ms, 600000, "global transaction timeout limit (ms) default 10 minutes");
+DEFINE_int32(tera_gtxn_get_waited_times_limit, 10, "global txn wait other locked times limit");
+DEFINE_int32(tera_gtxn_all_puts_size_limit, 10000, "global txn all puts data size limit");
+
+//////// observer ///////
+DEFINE_int32(observer_proc_thread_num, 3, "");
+DEFINE_int64(observer_max_pending_task, 10000, "");
+DEFINE_int32(observer_scanner_thread_num, 20, "");
+DEFINE_int32(observer_read_thread_num, 20, "observer read thread num");
+DEFINE_int32(observer_ack_conflict_timeout, 3600, "timeout for ack column conflict check");
+DEFINE_int32(observer_rowlock_client_thread_num, 20, "");
+
+//////// rowlock server ////////
+DEFINE_bool(rowlock_rpc_limit_enabled, false, "enable the rpc traffic limit in sdk");
+DEFINE_int32(rowlock_rpc_limit_max_inflow, 10, "the max bandwidth (in MB/s) for sdk rpc traffic limitation on input flow");
+DEFINE_int32(rowlock_rpc_limit_max_outflow, 10, "the max bandwidth (in MB/s) for sdk rpc traffic limitation on output flow");
+DEFINE_int32(rowlock_rpc_max_pending_buffer_size, 200, "max pending buffer size (in MB) for sdk rpc");
+DEFINE_int32(rowlock_rpc_work_thread_num, 2, "thread num of sdk rpc client");
+
+DEFINE_string(rowlock_server_ip, "0.0.0.0", "rowlock server ip");
+DEFINE_string(rowlock_server_port, "22222", "rowlock server port");
+DEFINE_string(rowlock_zk_root_path, "/rowlock", "");
+DEFINE_int32(rowlock_zk_timeout, 10000, "zk timeout");
+DEFINE_string(rowlock_ins_root_path, "/rowlock", "ins rowlock root path");
+DEFINE_int32(rowlock_server_node_num, 1, "number of rowlock servers in cluster");
+
+DEFINE_int32(rowlock_db_ttl, 600000, "timeout for an unlocked lock, 10min");
+DEFINE_int32(rowlock_timing_wheel_patch_num, 600, "the number of timing wheel, every patch_num step the oldest data will be cleared");
+DEFINE_int32(rowlock_db_sharding_number, 1024, "sharding number, enhance concurrency");
+DEFINE_string(rowlock_fake_root_path, "../fakezk/rowlock", "one box fake zk root path");
+DEFINE_int32(rowlock_thread_max_num, 20, "the max thread number of rowlock server");
+DEFINE_int32(rowlock_client_max_fail_times, 5, "client max failure time");
+
+DEFINE_bool(rowlock_proxy_async_enable, false, "sync | async");
+DEFINE_string(rowlock_proxy_port, "22223", "rowlock proxy port");
+/////////  load balancer  ////////
+DEFINE_string(tera_lb_server_addr, "0.0.0.0", "default load balancer rpc server addr");
+DEFINE_string(tera_lb_server_port, "31000", "default load balancer rpc server port");
+DEFINE_int32(tera_lb_server_thread_num, 2, "default load balancer rpc server thread pool num");
+DEFINE_int32(tera_lb_impl_thread_num, 1, "default load balancer impl thread pool num");
+DEFINE_int32(tera_lb_load_balance_period_s, 300, "default load balance period(s)");
+DEFINE_int32(tera_lb_max_compute_steps, 1000000, "default max compute steps for one balance procedure");
+DEFINE_int32(tera_lb_max_compute_steps_per_tablet, 1000, "default max compute steps per tablet for one balance procedure");
+DEFINE_int32(tera_lb_max_compute_time_ms, 30000, "default max compute time(ms) for one balance procedure");
+DEFINE_double(tera_lb_min_cost_need_balance, 0.1, "min cost needed for balance");
+DEFINE_double(tera_lb_move_count_cost_weight, 10, "move cost weight");
+DEFINE_int32(tera_lb_tablet_max_move_num, 10, "default tablet max move num for one balance procedure");
+DEFINE_double(tera_lb_tablet_max_move_percent, 0.001, "default tablet max move percent for one balance procedure");
+DEFINE_double(tera_lb_move_frequency_cost_weight, 10, "move frequency cost weight");
+DEFINE_int32(tera_lb_tablet_move_too_frequently_threshold_s, 600, "if move a tablet in this threshold time(s) again, it's been moved too frequently");
+DEFINE_double(tera_lb_abnormal_node_cost_weight, 10, "abnormal node cost weight");
+DEFINE_double(tera_lb_abnormal_node_ratio, 0.5, "abnormal node ratio");
+DEFINE_double(tera_lb_read_pending_node_cost_weight, 10, "read pending node cost weight");
+DEFINE_double(tera_lb_write_pending_node_cost_weight, 10, "write pending node cost weight");
+DEFINE_double(tera_lb_scan_pending_node_cost_weight, 10, "scan pending node cost weight");
+DEFINE_double(tera_lb_tablet_count_cost_weight, 0, "tablet count cost weight");
+DEFINE_double(tera_lb_size_cost_weight, 100, "size cost weight");
+DEFINE_double(tera_lb_read_load_cost_weight, 0, "read load cost weight");
+DEFINE_double(tera_lb_write_load_cost_weight, 0, "write load cost weight");
+DEFINE_double(tera_lb_scan_load_cost_weight, 0, "scan load cost weight");
+DEFINE_bool(tera_lb_debug_mode_enabled, false, "debug mode");
+
+DEFINE_int32(rowlock_io_service_pool_size, 4, "rowlock server sofarpc server io_service_pool_size");
+
+DEFINE_bool(mock_rowlock_enable, false, "test case switch");
+DEFINE_int64(tera_metric_hold_max_time, 300000, "interval of prometheus collectors push a value to hold_queue in ms");
+
+////////// PROFILER ///////////
+DEFINE_bool(cpu_profiler_enabled, false, "enable cpu profiler");
+DEFINE_bool(heap_profiler_enabled, false, "enable heap profiler");
+DEFINE_int32(cpu_profiler_dump_interval, 120, "cpu profiler dump interval");
+DEFINE_int32(heap_profiler_dump_interval, 120, "heap profiler dump interval");
+DEFINE_int64(heap_profile_allocation_interval, 1073741824, "Env variable for heap profiler's allocation interval");
+DEFINE_int64(heap_profile_inuse_interval, 1073741824, "Env variable for heap profiler's inuse interval");
diff --git a/src/tera_main.cc b/src/tera_main.cc
index 2331436b9..aa86c952f 100644
--- a/src/tera_main.cc
+++ b/src/tera_main.cc
@@ -8,12 +8,20 @@
 #include <glog/logging.h>
 
 #include "common/base/scoped_ptr.h"
+#include "common/log/log_cleaner.h"
+#include "common/heap_profiler.h"
+#include "common/cpu_profiler.h"
 #include "tera_entry.h"
 #include "utils/utils_cmd.h"
 #include "version.h"
 
+DECLARE_bool(cpu_profiler_enabled);
+DECLARE_bool(heap_profiler_enabled);
+DECLARE_int32(cpu_profiler_dump_interval);
+DECLARE_int32(heap_profiler_dump_interval);
 DECLARE_string(tera_log_prefix);
 DECLARE_string(tera_local_addr);
+DECLARE_bool(tera_info_log_clean_enable);
 
 extern std::string GetTeraEntryName();
 extern tera::TeraEntry* GetTeraEntry();
@@ -27,11 +35,25 @@ static void SignalIntHandler(int sig) {
 int main(int argc, char** argv) {
     ::google::ParseCommandLineFlags(&argc, &argv, true);
     ::google::InitGoogleLogging(argv[0]);
-    if (!FLAGS_tera_log_prefix.empty()) {
-        tera::utils::SetupLog(FLAGS_tera_log_prefix);
-    } else {
-        tera::utils::SetupLog(GetTeraEntryName());
+
+
+    if (FLAGS_tera_log_prefix.empty()) {
+        FLAGS_tera_log_prefix = GetTeraEntryName();
+        if (FLAGS_tera_log_prefix.empty()) {
+            FLAGS_tera_log_prefix = "tera";
+        }
     }
+    tera::utils::SetupLog(FLAGS_tera_log_prefix);
+
+    tera::CpuProfiler cpu_profiler;
+    cpu_profiler.SetEnable(FLAGS_cpu_profiler_enabled)
+                .SetInterval(FLAGS_cpu_profiler_dump_interval)
+                .SetProfilerFile("Cpu");
+
+    tera::HeapProfiler heap_profiler;
+    heap_profiler.SetEnable(FLAGS_heap_profiler_enabled)
+                 .SetInterval(FLAGS_heap_profiler_dump_interval)
+                 .SetProfilerFile("Heap");
 
     if (argc > 1) {
         std::string ext_cmd = argv[1];
@@ -52,6 +74,14 @@ int main(int argc, char** argv) {
     if (!entry->Start()) {
         return -1;
     }
+    
+    // start log cleaner
+    if (FLAGS_tera_info_log_clean_enable) {
+        common::LogCleaner::StartCleaner();
+        LOG(INFO) << "start log cleaner";
+    } else {
+        LOG(INFO) << "log cleaner is disable";
+    }
 
     while (!g_quit) {
         if (!entry->Run()) {
@@ -63,6 +93,8 @@ int main(int argc, char** argv) {
         LOG(INFO) << "received interrupt signal from user, will stop";
     }
 
+    common::LogCleaner::StopCleaner();
+
     if (!entry->Shutdown()) {
         return -1;
     }
diff --git a/src/tera_test_main.cc b/src/tera_test_main.cc
index f7fb788c7..915c172d9 100644
--- a/src/tera_test_main.cc
+++ b/src/tera_test_main.cc
@@ -34,7 +34,6 @@ DEFINE_int64(pending_num, 100000, "");
 DECLARE_string(flagfile);
 
 using namespace tera;
-using namespace common::timer;
 
 void Usage(const std::string& prg_name) {
     std::cout << "DESCRIPTION \n\
@@ -43,13 +42,13 @@ void Usage(const std::string& prg_name) {
        version \n";
 }
 
-static common::Counter w_pending;
-static common::Counter w_succ;
-static common::Counter w_total;
-static common::Counter r_pending;
-static common::Counter r_succ;
-static common::Counter r_total;
-static common::Counter launch_time;
+static Counter w_pending;
+static Counter w_succ;
+static Counter w_total;
+static Counter r_pending;
+static Counter r_succ;
+static Counter r_total;
+static Counter launch_time;
 
 void PrintStat() {
     LOG(INFO) << "Write total " << w_total.Get()
@@ -298,7 +297,7 @@ int32_t SharedTableImplTest(int32_t argc, char** argv, ErrorCode* err) {
         thread_pool.AddTask(task);
     }
     while (thread_pool.PendingNum() > 0) {
-        std::cerr << common::timer::get_time_str(time(NULL)) << " "
+        std::cerr << get_time_str(time(NULL)) << " "
             << "waiting for test finish, pending " << thread_pool.PendingNum()
             << " tasks ..." << std::endl;
         sleep(1);
diff --git a/src/teracli_main.cc b/src/teracli_main.cc
index 49c29dd6d..31c9dd55a 100644
--- a/src/teracli_main.cc
+++ b/src/teracli_main.cc
@@ -2,7 +2,6 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -10,13 +9,14 @@
 #include <readline/readline.h>
 
 #include <fstream>
+#include <fcntl.h>
 #include <iomanip>
 #include <iostream>
 #include <limits>
 #include <map>
 #include <memory>
 #include <sstream>
-
+#include <errno.h>
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 
@@ -26,6 +26,10 @@
 #include "common/console/progress_bar.h"
 #include "common/file/file_path.h"
 #include "io/coding.h"
+#include "io/utils_leveldb.h"
+#include "leveldb/dfs.h"
+#include "util/nfs.h"
+#include "util/hdfs.h"
 #include "proto/kv_helper.h"
 #include "proto/proto_helper.h"
 #include "proto/tabletnode.pb.h"
@@ -36,6 +40,7 @@
 #include "sdk/sdk_zk.h"
 #include "sdk/table_impl.h"
 #include "tera.h"
+#include "types.h"
 #include "utils/crypt.h"
 #include "utils/string_util.h"
 #include "utils/tprinter.h"
@@ -50,6 +55,15 @@ DECLARE_string(tera_zk_root_path);
 DECLARE_bool(tera_sdk_batch_scan_enabled);
 DECLARE_int64(tera_sdk_status_timeout);
 
+DECLARE_string(tera_leveldb_env_type);
+DECLARE_string(tera_leveldb_env_dfs_type);
+DECLARE_string(tera_leveldb_env_nfs_mountpoint);
+DECLARE_string(tera_leveldb_env_nfs_conf_path);
+DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list);
+DECLARE_string(tera_dfs_so_path);
+DECLARE_string(tera_dfs_conf);
+DECLARE_uint64(tera_sdk_read_max_qualifiers);
+
 DEFINE_int32(tera_client_batch_put_num, 1000, "num of each batch in batch put mode");
 DEFINE_int32(tera_client_scan_package_size, 1024, "the package size (in KB) of each scan request");
 
@@ -59,6 +73,7 @@ DEFINE_string(rollback_name, "", "rollback operation's name");
 
 DEFINE_int32(lg, -1, "locality group number.");
 DEFINE_int32(concurrency, 1, "concurrency for compact table.");
+DEFINE_int32(compact_timeout, 120000, "tablet compact timeout(ms), default 2min");
 DEFINE_int64(timestamp, -1, "timestamp.");
 DEFINE_string(tablets_file, "", "tablet set file");
 
@@ -71,6 +86,15 @@ DEFINE_bool(rowkey_count, false, "is print rowkey count when scan");
 DEFINE_bool(stdout_is_tty, true, "is stdout connected to a tty");
 DEFINE_bool(reorder_tablets, false, "reorder tablets by ts list");
 
+// dfs related FLAGS
+DEFINE_bool(asowner, false, "become owner and execute the command");
+DEFINE_bool(e, false, "test dfs file exist or not");
+DEFINE_bool(z, false, "test dfs file is zero or not");
+DEFINE_bool(d, false, "test dfs file is directory or not");
+DEFINE_bool(override, false, "dfs put file override the existing one");
+DEFINE_bool(attribute, false, "dfs list file detail attribute");
+DEFINE_bool(recursive, false, "dfs remove file recursively");
+
 volatile int32_t g_start_time = 0;
 volatile int32_t g_end_time = 0;
 volatile int32_t g_used_time = 0;
@@ -88,16 +112,25 @@ using namespace tera;
 typedef std::shared_ptr<Table> TablePtr;
 typedef std::shared_ptr<TableImpl> TableImplPtr;
 typedef std::map<std::string, int32_t(*)(Client*, int32_t, std::string*, ErrorCode*)> CommandTable;
-
+// FileSystem command table
+typedef std::map<std::string, int32_t(*)(int32_t, std::string*, ErrorCode*)> FSCommandTable;
+//typedef std::map<std::string, std::function<int32_t(int32_t, std::string*, ErrorCode*)> > FSCommandTable;
 /// global variables of single-row-txn used in interactive mode
 tera::Transaction* g_row_txn = NULL;
 Table* g_row_txn_table = NULL;
 
+leveldb::Dfs* g_dfs = NULL;
+
 static CommandTable& GetCommandTable(){
     static CommandTable command_table;
     return command_table;
 }
 
+static FSCommandTable& GetFSCommandTable() {
+    static FSCommandTable fs_command_table;
+    return fs_command_table;
+}
+
 const char* builtin_cmd_list[] = {
     "create",
     "create   <schema> [<delimiter_file>]                              \n\
@@ -224,6 +257,13 @@ const char* builtin_cmd_list[] = {
          commit                                                           \n\
          (only support single row transaction)",
 
+    "cas",
+    "cas <tablename> <rowkey> <columnfamily:qualifier> <old_value> <new_value>                     \n\
+         Compare and set a value atomically. (The txn value of table schema must be 'on')          \n\
+         This command will compare the value at rowkey:columnfamily:qualifier with <old_value>:    \n\
+         -> equal    : put <new_value> to this location.                                           \n\
+         -> not equal: do nothing.",
+
     "user",
     "user <operation> <params>                                            \n\
           create          <username> <password>                           \n\
@@ -236,8 +276,14 @@ const char* builtin_cmd_list[] = {
     "tablet",
     "tablet <operation> <params>                                          \n\
             move    <tablet_path> <target_addr>                           \n\
+            movex   <tablet_path> <target_addr> <lg_list>                 \n\
+                    * only for force move tablet ignore error             \n\
             reload  <tablet_path>                                         \n\
                     force to unload and load on the same ts               \n\
+            reloadx <tablet_path> <lg_list>                               \n\
+                    force to unload and load on the same ts               \n\
+                    * only for force reload tablet ignore error           \n\
+                    lg_list : lg1:lg2:lg3                                 \n\
             compact <tablet_path>                                         \n\
             split   <tablet_path>                                         \n\
             merge   <tablet_path>                                         \n\
@@ -290,9 +336,27 @@ const char* builtin_cmd_list[] = {
     "help [cmd]                                                           \n\
           show manual for a or all cmd(s)",
 
+    "dfs",
+    "dfs [cmd]        args                                                \n\
+         mkdir        $NFS_PATH                                           \n\
+         touchz       $NFS_PATH                                           \n\
+         test         [-e|-z|-d]     $NFS_PATH                            \n\
+         get          $NFS_PATH      $LOCAL_PATH                          \n\
+         put          [--override]   $LOCAL_PATH    $NFS_PATH             \n\
+         ls           [--attribute]  $NFS_PATH                            \n\
+         lsr          [--attribute]  $NFS_PATH                            \n\
+         dus          $NFS_PATH                                           \n\
+         rm           [--recursive]  $NFS_PATH                            \n\
+         stat         $NFS_PATH                                           \n\
+         rename       $NFS_PATH_SRC  $NFS_PATH_DEST                       \n\
+         unlockdir    $NFS_PATH                                           \n\
+         checksum     $NFS_PATH      $OFFSET        $LENGTH               \n\
+         forcerelease $NFS_PATH",
+
     "version",
     "version                                                              \n\
              show version info",
+
 };
 
 static void PrintCmdHelpInfo(const char* msg) {
@@ -662,21 +726,21 @@ int32_t PutOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
         value = argv[5];
     }
 
-    RowMutation* mutation = table->NewRowMutation(rowkey);
+    std::unique_ptr<RowMutation> mutation(table->NewRowMutation(rowkey));
     if (FLAGS_timestamp == -1) {
         mutation->Put(columnfamily, qualifier, value);
     } else {
         mutation->Put(columnfamily, qualifier, FLAGS_timestamp, value);
     }
     if (g_row_txn != NULL) {
-        g_row_txn->ApplyMutation(mutation);
+        g_row_txn->ApplyMutation(mutation.get());
     } else {
-        table->ApplyMutation(mutation);
+        table->ApplyMutation(mutation.get());
     }
     if (mutation->GetError().GetType() != tera::ErrorCode::kOK) {
         std::cout << mutation->GetError().ToString() << std::endl;
+        return -1;
     }
-    delete mutation;
     return 0;
 }
 
@@ -912,7 +976,7 @@ int32_t GetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
     std::string columnfamily = "";
     std::string qualifier = "";
     std::string value;
-    RowReader* reader = table->NewRowReader(rowkey);
+    std::unique_ptr<RowReader> reader(table->NewRowReader(rowkey));
     if (argc == 4) {
         // use table as kv or get row
     } else if (argc == 5) {
@@ -924,10 +988,11 @@ int32_t GetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
             reader->AddColumnFamily(columnfamily);
         }
     }
+    reader->SetMaxQualifiers(FLAGS_tera_sdk_read_max_qualifiers);
     if (g_row_txn != NULL) {
-        g_row_txn->Get(reader);
+        g_row_txn->Get(reader.get());
     } else {
-        table->Get(reader);
+        table->Get(reader.get());
     }
     while (!reader->Done()) {
         std::cout << PrintableFormatter(reader->RowName()) << ":"
@@ -939,8 +1004,8 @@ int32_t GetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
     if (reader->GetError().GetType() != tera::ErrorCode::kOK
         && reader->GetError().GetType() != tera::ErrorCode::kNotFound) {
         std::cout << reader->GetError().ToString() << std::endl;
+        return -1;
     }
-    delete reader;
     return 0;
 }
 
@@ -1052,6 +1117,7 @@ int32_t ScanRange(TablePtr& table, ScanDescriptor& desc, ErrorCode* err) {
     desc.SetBufferSize(FLAGS_tera_client_scan_package_size << 10);
     desc.SetAsync(FLAGS_tera_sdk_batch_scan_enabled);
     desc.SetSnapshot(FLAGS_snapshot);
+    desc.SetMaxQualifiers(FLAGS_tera_sdk_read_max_qualifiers);
 
     ResultStream* result_stream;
     if ((result_stream = table->Scan(desc, err)) == NULL) {
@@ -1161,7 +1227,7 @@ std::string BytesNumberToString(const uint64_t size) {
 
 std::string DateNumberToString(int64_t ts) {
     if (FLAGS_stdout_is_tty) {
-        return common::timer::get_time_str(ts);
+        return get_time_str(ts);
     }
     return NumberToString(ts);
 }
@@ -1172,6 +1238,10 @@ static std::string GetTabletStatusString(const TabletMetaList& tablet_list, int6
         // new tera master
         int64_t delta = now - tablet_list.timestamp(i);
         TabletStatus status = tablet_list.meta(i).status();
+        TabletStatus db_status = tablet_list.counter(i).db_status();
+        if (db_status == kTabletCorruption) {
+            return StatusCodeToString(db_status);
+        }
         if ((status == kTableReady) && (delta > FLAGS_tera_sdk_status_timeout * 1000000)) {
             return "kUnknown";
         } else {
@@ -1187,7 +1257,7 @@ int32_t ShowTabletList(const TabletMetaList& tablet_list, bool is_server_addr, b
     TPrinter printer;
     int cols;
     std::vector<string> row;
-    int64_t now = common::timer::get_micros();
+    int64_t now = get_micros();
     if (is_x) {
         if (is_server_addr) {
             cols = 14;
@@ -1492,7 +1562,7 @@ int32_t ShowSingleTable(Client* client, const string& table_name,
     if (FLAGS_stdout_is_tty) {
         std::cout << std::endl;
         std::cout << "create time: "
-            << common::timer::get_time_str(table_meta.create_time()) << std::endl;
+            << get_time_str(table_meta.create_time()) << std::endl;
         std::cout << std::endl;
     }
     ShowTabletList(tablet_list, true, is_x);
@@ -1514,7 +1584,7 @@ int32_t ShowSingleTabletNodeInfo(Client* client, const string& addr,
     std::cout << "  address:  " << info.addr() << std::endl;
     std::cout << "  status:   " << info.status_m() << std::endl;
     std::cout << "  update time:   "
-        << common::timer::get_time_str(info.timestamp() / 1000000) << "\n\n";
+        << get_time_str(info.timestamp() / 1000000) << "\n\n";
 
     int cols = 4;
     TPrinter printer(cols, "workload", "tablets", "load", "split");
@@ -1582,7 +1652,7 @@ int32_t ShowTabletNodesInfo(Client* client, bool is_x, ErrorCode* err) {
         return -1;
     }
 
-    int64_t now = common::timer::get_micros();
+    int64_t now = get_micros();
     int cols;
     TPrinter printer;
     if (is_x) {
@@ -2256,7 +2326,7 @@ int32_t CompactTablet(TabletInfo& tablet, int lg) {
     request.set_tablet_name(tablet.table_name);
     request.mutable_key_range()->set_key_start(tablet.start_key);
     request.mutable_key_range()->set_key_end(tablet.end_key);
-    tabletnode::TabletNodeClient tabletnode_client(tablet.server_addr, 60000);
+    tabletnode::TabletNodeClient tabletnode_client(tablet.server_addr, FLAGS_compact_timeout);
 
     std::string path;
     if (lg >= 0) {
@@ -2292,6 +2362,77 @@ int32_t CompactTablet(TabletInfo& tablet, int lg) {
     return 0;
 }
 
+// Rewrites the tablet boundary keys in place into the two probe keys used before compaction:
+//  - *start_key gets one byte appended: '\x1' for Readable raw keys, '\x0' otherwise;
+//  - *end_key is replaced by a key meant to sort just below the original end_key
+//    (drop a trailing '\x1' for Readable keys, else decrement the last byte and pad with 0xFF).
+// Always returns true.
+static bool ComputeCompactInsertKeys(RawKey rawkey, std::string* start_key, std::string* end_key) {
+    const bool readable = (rawkey == Readable);
+    start_key->push_back(readable ? '\x1' : '\x0');
+
+    // Strip trailing '\x0' bytes. For a Readable key whose last non-zero byte is '\x1', dropping
+    // that byte already gives the wanted key (e.g. "abcde\x1" -> "abcde"), so return at once.
+    while (!end_key->empty()) {
+        const unsigned char tail = end_key->at(end_key->size() - 1);
+        const bool strip_zero = (tail == '\x0');
+        const bool strip_one = (readable && tail == '\x1');
+        if (!strip_zero && !strip_one) {
+            break;
+        }
+        end_key->pop_back();
+        if (strip_one) {
+            return true;
+        }
+    }
+
+    // A remaining last byte is neither '\x0' nor (for Readable keys) '\x1': decrement it, then
+    // resize the key to kRowkeySize - 1 bytes, filling any added bytes with 0xFF.
+    if (!end_key->empty()) {
+        char& last_byte = (*end_key)[end_key->size() - 1];
+        last_byte = char(last_byte - 1);
+    }
+    end_key->resize(kRowkeySize - 1, char(255));
+    return true;
+}
+
+// For each tablet, reads the two boundary probe keys from ComputeCompactInsertKeys(); every key
+// whose read reports kNotFound gets a DeleteRow mutation applied. Write failures are only logged.
+void CompactPreprocess(TableImplPtr table, const std::vector<TabletInfo>& tablet_infos) {
+    // Single vector for all tablets; a second declaration inside the loop would shadow it.
+    std::vector<RowReader*> readers;
+    for (std::size_t i = 0; i < tablet_infos.size(); ++i) {
+        std::string start_key(tablet_infos[i].start_key);
+        std::string end_key(tablet_infos[i].end_key);
+        ComputeCompactInsertKeys(table->GetTableSchema().raw_key(), &start_key, &end_key);
+        readers.push_back(table->NewRowReader(start_key));
+        readers.push_back(table->NewRowReader(end_key));
+    }
+    if (readers.empty()) {
+        return;
+    }
+    table->Get(readers);
+    std::vector<RowMutation*> mutations;
+    for (std::size_t i = 0; i < readers.size(); ++i) {
+        if (readers[i]->GetError().GetType() == tera::ErrorCode::kNotFound) {
+            RowMutation* mutation = table->NewRowMutation(readers[i]->RowKey());
+            mutation->DeleteRow();
+            mutations.push_back(mutation);
+        }
+        delete readers[i];
+    }
+    if (!mutations.empty()) {
+        table->ApplyMutation(mutations);
+    }
+    for (std::size_t i = 0; i < mutations.size(); ++i) {
+        if (mutations[i]->GetError().GetType() != tera::ErrorCode::kOK) {
+            LOG(WARNING) <<"write key " << DebugString(mutations[i]->RowKey())
+                << " failed, error: " << mutations[i]->GetError().ToString();
+        }
+        delete mutations[i];
+    }
+}
+
 int32_t CompactTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
     if (argc != 4) {
         PrintCmdHelpInfo(argv[1]);
@@ -2337,6 +2478,18 @@ int32_t CompactTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCo
             << ", total tablets: " << tablet_list.size();
         return -4;
     }
+    std::string command = argv[1];
+    if (command == "compactx")
+    {
+        tera::ClientImpl* client_impl = static_cast<tera::ClientImpl*>(client);
+        TableImplPtr table_impl(client_impl->OpenTableInternal(table, err));
+        if (table_impl == NULL) {
+            LOG(ERROR) << "fail to open table: " << table;
+            return -5;
+        }
+        std::vector<TabletInfo> tablet_infos(1, *tablet_it);
+        CompactPreprocess(table_impl, tablet_infos);
+    }
 
     return CompactTablet(*tablet_it, lg);
 }
@@ -2409,32 +2562,34 @@ int32_t ScanTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode*
 }
 
 int32_t TabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
-    if ((argc != 4) && (argc != 5)) {
+    if ((argc != 4) && (argc != 5) && (argc != 6)) {
         PrintCmdHelpInfo(argv[1]);
         return -1;
     }
 
     std::string op = argv[2];
+    std::string tablet_id = argv[3];
+    std::string server_addr;
 
-    if (op == "compact") {
+    std::vector<std::string> arg_list;
+    arg_list.push_back(op);
+    arg_list.push_back(tablet_id);
+    if (op == "compact" || op == "compactx") {
         return CompactTabletOp(client, argc, argv, err);
     } else if (op == "scan" || op == "scanallv") {
         return ScanTabletOp(client, argc, argv, err);
-    } else if (op != "move" && op != "split" && op != "merge" && op != "reload") {
+    } else if (argc == 4 && (op == "reload" || op == "merge" || op == "split")) {
+        // nothing to do
+    } else if (argc == 5 && (op == "reloadx" || op == "move" || op == "split")) {
+        // reloadx->lg_list  move->server_addr  split->split_key
+        arg_list.push_back(argv[4]);
+    } else if (argc == 6 && op == "movex") {
+        arg_list.push_back(argv[4]); // server_addr
+        arg_list.push_back(argv[5]); // lg_list
+    } else {
         PrintCmdHelpInfo(argv[1]);
         return -1;
     }
-
-    std::string tablet_id = argv[3];
-    std::string server_addr;
-    if (argc == 5) {
-        server_addr = argv[4];
-    }
-
-    std::vector<std::string> arg_list;
-    arg_list.push_back(op);
-    arg_list.push_back(tablet_id);
-    arg_list.push_back(server_addr);
     if (!client->CmdCtrl("tablet", arg_list, NULL, NULL, err)) {
         LOG(ERROR) << "fail to " << op << " tablet " << tablet_id;
         return -1;
@@ -2543,6 +2698,19 @@ int32_t CompactOp(Client* client, int32_t argc, std::string* argv, ErrorCode* er
     }
     ReorderTabletList(&tablet_list);
 
+    std::string command = argv[1];
+    if (command == "compactx")
+    {
+        tera::ClientImpl* client_impl = static_cast<tera::ClientImpl*>(client);
+        TableImplPtr table_impl(client_impl->OpenTableInternal(tablename, err));
+        if (table_impl == NULL) {
+            LOG(ERROR) << "fail to open table: " << tablename;
+            return -5;
+        }
+        std::cout << "begin compact preprocess tablet: " << tablename << std::endl;
+        CompactPreprocess(table_impl, tablet_list);
+    }
+
     int conc = FLAGS_concurrency;
     if (conc <= 0 || conc > 1000) {
         LOG(ERROR) << "compact concurrency illegal: " << conc;
@@ -2556,7 +2724,7 @@ int32_t CompactOp(Client* client, int32_t argc, std::string* argv, ErrorCode* er
         thread_pool.AddTask(task);
     }
     while (thread_pool.PendingNum() > 0) {
-        std::cerr << common::timer::get_time_str(time(NULL)) << " "
+        std::cerr << get_time_str(time(NULL)) << " "
             << thread_pool.PendingNum()
             << " tablets waiting for compact ..." << std::endl;
         sleep(5);
@@ -3189,6 +3357,65 @@ int TxnOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
     }
 }
 
+// Handler for `cas <tablename> <rowkey> <cf:qualifier> <old_value> <new_value>`.
+// Within one single-row transaction it reads the cell, compares it with <old_value> and, if they
+// match, puts <new_value> and commits. Returns 0 on success; -1 on bad argument count, open or
+// txn-start failure, missing cell, value mismatch, or a mutation/commit error.
+int32_t CasOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
+    if (argc != 7) {
+        LOG(ERROR) << "args number error: " << argc << ", need 7";
+        PrintCmdHelpInfo(argv[1]);
+        return -1;
+    }
+    const std::string& table_name = argv[2];
+    const std::string& row = argv[3];
+    const std::string& expected = argv[5];
+    const std::string& replacement = argv[6];
+
+    TablePtr table(client->OpenTable(table_name, err));
+    if (!table) {
+        LOG(ERROR) << "fail to open table";
+        return -1;
+    }
+
+    std::string cf;
+    std::string qu;
+    ParseCfQualifier(argv[4], &cf, &qu);
+    std::unique_ptr<tera::Transaction> txn(table->StartRowTransaction(row));
+    if (!txn) {
+        LOG(ERROR) << "fail to start row txn";
+        return -1;
+    }
+    // Read the current cell through the transaction and compare it with the expected value.
+    std::unique_ptr<tera::RowReader> reader(table->NewRowReader(row));
+    reader->AddColumn(cf, qu);
+    txn->Get(reader.get());
+    if (reader->Done()) {
+        std::cout << "cas failed: NotFound" << std::endl;
+        return -1;
+    }
+    if (reader->Value() != expected) {
+        std::cout << "cas failed: NotEqual" << std::endl;
+        return -1;
+    }
+    // Values match: stage the new value in the same transaction, then commit.
+    std::unique_ptr<tera::RowMutation> mutation(table->NewRowMutation(row));
+    mutation->Put(cf, qu, replacement);
+    txn->ApplyMutation(mutation.get());
+    if (mutation->GetError().GetType() != tera::ErrorCode::kOK) {
+        std::cout << "cas failed: " << tera::strerr(mutation->GetError()) << std::endl;
+        return -1;
+    }
+
+    auto commit_status = txn->Commit();
+    if (commit_status.GetType() != tera::ErrorCode::kOK) {
+        std::cout << "cas failed: " << tera::strerr(commit_status) << std::endl;
+        return -1;
+    }
+    std::cout << "cas success" << std::endl;
+    return 0;
+}
+
 int32_t HelpOp(Client*, int32_t argc, std::string* argv, ErrorCode*) {
     if (argc == 2) {
         PrintAllCmd();
@@ -3217,6 +3444,469 @@ bool ParseCommand(int argc, char** arg_list, std::vector<std::string>* parsed_ar
     return true;
 }
 
+
+// Creates the global DFS client g_dfs on first use, picking the backend from
+// --tera_leveldb_env_dfs_type. Returns 0 if g_dfs is set (already or now), -1 otherwise.
+int32_t InitDfsClient() {
+    if (g_dfs != NULL) {
+        return 0;
+    }
+    if (FLAGS_tera_leveldb_env_dfs_type == "nfs") {
+        if (access(FLAGS_tera_leveldb_env_nfs_conf_path.c_str(), R_OK) != 0) {
+            // ERROR, not FATAL: glog aborts on FATAL, so the error return was never reached.
+            LOG(ERROR) << "init nfs system: no configure file found";
+            return -1;
+        }
+        LOG(INFO) << "init nfs system: use configure file " << FLAGS_tera_leveldb_env_nfs_conf_path;
+        leveldb::Nfs::Init(FLAGS_tera_leveldb_env_nfs_mountpoint, FLAGS_tera_leveldb_env_nfs_conf_path);
+        g_dfs = leveldb::Nfs::GetInstance();
+    } else if (FLAGS_tera_leveldb_env_dfs_type == "hdfs2") {
+        LOG(INFO) << "hdfs2 system support currently, please use hadoop-client";
+        g_dfs = new leveldb::Hdfs2(FLAGS_tera_leveldb_env_hdfs2_nameservice_list);
+    } else if (FLAGS_tera_leveldb_env_dfs_type == "hdfs") {
+        g_dfs = new leveldb::Hdfs();
+    } else {
+        LOG(INFO) << "init dfs system: " << FLAGS_tera_dfs_so_path << "(" << FLAGS_tera_dfs_conf << ")";
+        g_dfs = leveldb::Dfs::NewDfs(FLAGS_tera_dfs_so_path, FLAGS_tera_dfs_conf);
+    }
+    return (g_dfs != NULL) ? 0 : -1;  // callers dereference g_dfs right after a 0 return
+}
+
+// `dfs <cmd> args...`: initializes the DFS client, then runs the handler registered for argv[2].
+int32_t FileSystemOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) {
+    if (argc < 4) {
+        PrintCmdHelpInfo(argv[1]);
+        return -1;
+    }
+    if (0 != InitDfsClient()) {
+        LOG(ERROR) << "InitDfsClient failed";  // FATAL would abort before `return -1` runs
+        return -1;
+    }
+    FSCommandTable::iterator handler = GetFSCommandTable().find(argv[2]);  // looked up once
+    if (handler == GetFSCommandTable().end()) {
+        std::cerr << "unsupported dfs command: " << argv[2] << std::endl;
+        return -1;
+    }
+    return (handler->second)(argc, argv, err);
+}
+
+// Prints one `ls -l` style line for `pathname`: type/permission flags, inode (hex), size,
+// mtime and the path, with a trailing '/' for directories. `arg` is unused. Always returns 0.
+int DfsPrintAttr(const char* pathname, struct stat* st, void* arg = NULL) {
+    char time_str[64] = {0};  // stays "" if the timestamp cannot be formatted
+    const struct tm* mtime = localtime(&st->st_mtime);
+    if (mtime != NULL) {  // localtime() returns NULL on failure; strftime() must not see that
+        strftime(time_str, sizeof(time_str), "%b %d %H:%M %Y", mtime);
+    }
+    printf("%c%c%c%c%c%c%c%c%c%c %16lx %16ld %s %s",
+            (S_IFDIR & st->st_mode) ? 'd' : '-',
+            (S_IRUSR & st->st_mode) ? 'r' : '-',
+            (S_IWUSR & st->st_mode) ? 'w' : '-',
+            (S_IXUSR & st->st_mode) ? 'x' : '-',
+            (S_IRGRP & st->st_mode) ? 'r' : '-',
+            (S_IWGRP & st->st_mode) ? 'w' : '-',
+            (S_IXGRP & st->st_mode) ? 'x' : '-',
+            (S_IROTH & st->st_mode) ? 'r' : '-',
+            (S_IWOTH & st->st_mode) ? 'w' : '-',
+            (S_IXOTH & st->st_mode) ? 'x' : '-',
+            static_cast<unsigned long>(st->st_ino),  // casts keep the arguments matching %lx/%ld
+            static_cast<long>(st->st_size), time_str, pathname);
+    printf("%s\n", (S_IFDIR & st->st_mode) ? "/" : "");
+    return 0;
+}
+
+// Normalizes a DFS path for display:
+//  - every run of consecutive '/' collapses into a single '/';
+//  - one trailing '/' is removed, so "/" itself becomes "" (DfsPrintPath re-adds the '/'
+//    for directories).
+// An empty input yields an empty result.
+static std::string FormatPath(const std::string pathname) {
+    std::string result;
+    result.reserve(pathname.length());
+    for (std::string::size_type i = 0; i < pathname.length(); ++i) {
+        const char c = pathname[i];
+        // Skip a '/' that directly follows one already copied.
+        if (c == '/' && !result.empty() && result[result.length() - 1] == '/') {
+            continue;
+        }
+        result.push_back(c);
+    }
+    // empty() guard: at(length() - 1) on an empty string throws std::out_of_range.
+    if (!result.empty() && result[result.length() - 1] == '/') {
+        result.erase(result.length() - 1);
+    }
+    return result;
+}
+
+// Walk callback used by `dfs ls`: prints the path normalized by FormatPath(), followed by '/'
+// when `st` carries the directory bit, then a newline. `arg` is unused. Always returns 0.
+int32_t DfsPrintPath(const char* pathname, struct stat* st, void* arg = NULL) {
+    const std::string shown = FormatPath(pathname);
+    const char* dir_mark = (S_IFDIR & st->st_mode) ? "/" : "";
+    printf("%s%s\n", shown.c_str(), dir_mark);
+    return 0;
+}
+
+// Walk callback: adds st_size of each non-directory entry to the uint64_t that `arg` points to.
+int32_t DfsSizeSum(const char* pathname, struct stat* st, void* arg) {
+    const bool is_dir = (S_IFDIR & st->st_mode) != 0;
+    uint64_t* total = reinterpret_cast<uint64_t*>(arg);
+    *total += is_dir ? 0 : st->st_size;
+    return 0;
+}
+
+// Locks the parent directory of `path` via g_dfs->LockDirectory() and returns that call's
+// result; an empty path or one without a parent component prints a message and returns -1.
+int32_t DfsTryLockParentPath(const std::string path) {
+    std::string parent_path = path;
+    if (!parent_path.empty() && parent_path[parent_path.length() - 1] == '/') {
+        parent_path.erase(parent_path.length() - 1);
+    }
+    const std::string::size_type pos = parent_path.rfind('/');
+    if (pos == std::string::npos) {
+        fprintf(stderr, "invalid path: %s\n", path.c_str());
+        return -1;
+    }
+    // A top-level entry ("/name") has "/" as parent; substr(0, 0) would wrongly give "".
+    parent_path = (pos == 0) ? std::string("/") : parent_path.substr(0, pos);
+    return g_dfs->LockDirectory(parent_path);
+}
+
+// Walk callback used by `dfs rm`: deletes `pathname` through g_dfs, using DeleteDirectory() when
+// `st` carries the directory bit and Delete() otherwise. A non-zero result from either call is
+// reported with perror(); the call's result is returned unchanged.
+int32_t DfsRmPath(const char* pathname, struct stat* st, void*) {
+    const bool is_dir = (S_IFDIR & st->st_mode) != 0;
+    int rc;
+    if (is_dir) {
+        rc = g_dfs->DeleteDirectory(pathname);
+    } else {
+        rc = g_dfs->Delete(pathname);
+    }
+    if (rc != 0) {
+        perror(is_dir ? "RmDir fail" : "unlink fail");
+    }
+    return rc;
+}
+
+typedef int(*WalkFunc)(const char*, struct stat*, void* arg);
+// Calls `func` on each entry under `dir_name`; with `is_recursive`, sub-directories are walked
+// first. Returns -1 if listing/locking fails, else 0; results of `func`/nested walks are ignored.
+int32_t DfsDirWalk(const char* dir_name, WalkFunc func, bool is_recursive, void* arg = NULL) {
+    struct stat st;
+    memset(&st, 0, sizeof(struct stat));
+    // not a directory, end of recursive call
+    if (0 == g_dfs->Stat(dir_name, &st) && !(S_IFDIR & st.st_mode)) {
+        return 0;
+    }
+    std::vector<std::string> sub_paths;
+    if (0 != g_dfs->ListDirectory(dir_name, &sub_paths)) {
+        return -1;
+    }
+    if (func == DfsRmPath && FLAGS_asowner && 0 != g_dfs->LockDirectory(dir_name)) {
+        fprintf(stderr, "Lock Directory %s failed\n", dir_name);
+        return -1;
+    }
+    for (std::size_t i = 0; i < sub_paths.size(); ++i) {
+        // std::string instead of a fixed 4096-byte buffer: long paths are no longer truncated.
+        const std::string fullpath = std::string(dir_name) + "/" + sub_paths[i];
+        memset(&st, 0, sizeof(struct stat));
+        if (g_dfs->Stat(fullpath.c_str(), &st) < 0) {
+            perror("Stat failed");
+            continue;
+        }
+        if (is_recursive && (S_IFDIR & st.st_mode)) {
+            DfsDirWalk(fullpath.c_str(), func, true, arg);
+        }
+        func(fullpath.c_str(), &st, arg);
+    }
+    return 0;
+}
+
+
+// `dfs get <dfs_path> <local_path>`: copies a DFS file to a local file ("-" means stdout; an
+// existing local directory receives a file named after the source). Returns 0 or an error code.
+int32_t DfsGetOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    if (argc != 5) {
+        fprintf(stderr, "Invalid arguments\n");
+        return -1;
+    }
+    std::string local_file_path = argv[4];
+    const bool to_stdout = (local_file_path == "-");
+    int local_fd = STDOUT_FILENO;  // "-" used to leave this at 0, i.e. stdin
+    if (!to_stdout) {
+        struct stat st;
+        if (stat(local_file_path.c_str(), &st) == 0 && (S_IFDIR & st.st_mode)) {
+            char* tmp_src_path = strdup(argv[3].c_str());  // basename() may modify its argument
+            local_file_path.append("/").append(basename(tmp_src_path));
+            free(tmp_src_path);
+        }
+        local_fd = open(local_file_path.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0644);
+        if (local_fd < 0) {
+            fprintf(stderr, "local file open fail, path=%s, errno=%d\n", local_file_path.c_str(), errno);
+            return errno;
+        }
+    }
+    int ret = 0;
+    char buf[128 * 1024];
+    leveldb::DfsFile* file = g_dfs->OpenFile(argv[3], leveldb::RDONLY);
+    if (NULL == file) {
+        ret = (errno != 0) ? errno : -1;
+        fprintf(stderr, "open dfs file fail, path=%s, errno=%d\n", argv[3].c_str(), errno);
+    } else {
+        for (ssize_t ret_size = 0; ret == 0 && (ret_size = file->Read(buf, sizeof(buf))) > 0; ) {
+            for (ssize_t done = 0, n = 0; ret == 0 && done < ret_size; done += n) {
+                n = write(local_fd, buf + done, ret_size - done);  // may write only part
+                if (n < 0) {
+                    ret = (errno != 0) ? errno : -1;  // was dead code placed after `break`
+                    fprintf(stderr, "write local file fail, path=%s, errno=%d\n", local_file_path.c_str(), ret);
+                }
+            }
+        }
+        file->CloseFile();
+    }
+    if (!to_stdout) {
+        close(local_fd);  // never close stdout; also covers the DFS-open failure path
+    }
+    return ret;
+}
+// "dfs put" placeholder: prints a notice on stderr and reports failure.
+int32_t DfsPutOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    fputs("not implemented", stderr);
+    return -1;
+}
+
+// Lists DFS path argv[3]: prints the entry itself and, for a directory, every entry visited by DfsDirWalk.
+int32_t DfsLsOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    // FLAGS_attribute picks the printer (DfsPrintAttr vs DfsPrintPath).
+    const std::string& target = argv[3];
+    struct stat target_stat;
+    if (0 != g_dfs->Stat(target.c_str(), &target_stat)) {
+        return 0; // a failed Stat prints nothing and is not reported as an error
+    }
+    // Directories get a walk after their own line; FLAGS_recursive decides whether it descends.
+    const bool walk_children = (S_IFDIR & target_stat.st_mode) != 0;
+    int status = 0;
+    if (FLAGS_attribute) {
+        // Attribute listing: one DfsPrintAttr call per entry.
+        DfsPrintAttr(target.c_str(), &target_stat);
+        if (walk_children) {
+            status = DfsDirWalk(target.c_str(), DfsPrintAttr, FLAGS_recursive);
+        }
+    } else {
+        // Plain listing: one DfsPrintPath call per entry.
+        DfsPrintPath(target.c_str(), &target_stat);
+        if (walk_children) {
+            status = DfsDirWalk(target.c_str(), DfsPrintPath, FLAGS_recursive);
+        }
+    }
+    return status;
+}
+int32_t DfsLsrOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    // Recursive "ls": force FLAGS_recursive on for a single DfsLsOp call, then restore the flag.
+    bool old_recursive_flag = FLAGS_recursive;
+    FLAGS_recursive = true;
+    int32_t ret = DfsLsOp(argc, argv, err);
+    FLAGS_recursive = old_recursive_flag;
+    return ret; // was `return errno`: unrelated to the listing and possibly stale from an earlier call
+}
+// Prints "<path>:\t<bytes>" for argv[3]; the byte count is accumulated through DfsSizeSum.
+int32_t DfsDusOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    const std::string& target = argv[3];
+    struct stat target_stat;
+    if (0 != g_dfs->Stat(target, &target_stat)) {
+        perror("Stat failed");
+        return errno;
+    }
+    uint64_t total_bytes = 0;
+    if (!(S_IFDIR & target_stat.st_mode)) {
+        DfsSizeSum(target.c_str(), &target_stat, &total_bytes);
+    } else {
+        DfsDirWalk(target.c_str(), DfsSizeSum, true, &total_bytes);
+    }
+    fprintf(stdout, "%s:\t%lu\n", target.c_str(), total_bytes);
+    return 0;
+}
+// Creates DFS file argv[3] (and its parent directory) when absent; EISDIR/EEXIST when the path already exists.
+int32_t DfsTouchzOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    const std::string& path = argv[3];
+    struct stat st;
+    std::string::size_type pos = path.rfind("/");
+    if (pos == std::string::npos || pos == path.length() - 1) { // need "<dir>/<name>" with a non-empty name
+        fprintf(stderr, "invalid filepath: %s", path.c_str());
+        return -1;
+    }
+    int ret = g_dfs->Stat(path, &st);
+    if (0 != ret) {
+        if (errno != ENOENT) { // only "does not exist" lets the create path continue
+            perror("Stat failed");
+            return errno;
+        }
+        std::string parent_path = path.substr(0, pos);
+        ret = g_dfs->CreateDirectory(parent_path);
+        if (0 != ret) {
+            perror("create parent path failed");
+            return errno;
+        }
+        if (FLAGS_asowner) {
+            DfsTryLockParentPath(path); // best effort: the lock result is not checked here
+        }
+        leveldb::DfsFile* file = g_dfs->OpenFile(path, leveldb::WRONLY);
+        if (NULL == file) {
+            perror("create or open file fail");
+            return errno;
+        }
+        file->CloseFile(); // the handle used to be dropped while still open
+    } else {
+        if (S_IFDIR & st.st_mode) {
+            fprintf(stderr, "Touchz fail: %s not Regular file", path.c_str());
+            ret = EISDIR;
+        } else {
+            fprintf(stdout, "%s already exists", path.c_str());
+            ret = EEXIST;
+        }
+    }
+    return ret;
+}
+// Creates DFS directory argv[3]; returns 0, -1 when the parent lock is refused, else errno.
+int32_t DfsMkdirOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    const std::string& dir_path = argv[3];
+    // Owner mode: nothing is created unless the parent path lock is obtained.
+    if (FLAGS_asowner && 0 != DfsTryLockParentPath(dir_path)) {
+        fprintf(stderr, "Try lock parent path failed");
+        return -1;
+    }
+    // Success is the short path.
+    if (0 == g_dfs->CreateDirectory(dir_path)) {
+        return 0;
+    }
+    // Failure: print the diagnostic, then hand errno back as the status.
+    fprintf(stderr, "Create Path: %s failed, errno=%d\n", dir_path.c_str(), errno);
+    return errno;
+}
+// Removes DFS path argv[3]: a file, or a directory (with FLAGS_recursive its contents are removed first).
+int32_t DfsRmOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    const std::string& path = argv[3];
+    struct stat st;
+    if (0 != g_dfs->Stat(path.c_str(), &st)) {
+        perror("Stat fail: ");
+        return -1;
+    }
+    int ret = 0;
+    if (FLAGS_asowner) {
+        DfsTryLockParentPath(path); // best effort: the lock result is not checked here
+    }
+    if (st.st_mode & S_IFDIR) {
+        if (FLAGS_recursive) {
+            // Empty the directory first; DfsDirWalk visits children before it calls DfsRmPath on them.
+            DfsDirWalk(path.c_str(), DfsRmPath, true, NULL);
+        }
+        ret = g_dfs->DeleteDirectory(path);
+    } else {
+        ret = g_dfs->Delete(path);
+    }
+    if (0 != ret) {
+        ret = (errno != 0) ? errno : ret; // prefer errno, but never turn a failure into 0
+        perror("delete failed: ");
+    }
+    // Report this call's outcome; the old bare `return errno` leaked stale values even on success.
+    return ret;
+}
+// "dfs test" placeholder: prints a notice on stderr and reports failure.
+int32_t DfsTestOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    fputs("not implemented\n", stderr);
+    return -1;
+}
+// Prints the stat fields of DFS path argv[3] on stdout; returns errno if Stat fails, 0 otherwise.
+int32_t DfsStatOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    const std::string& target = argv[3];
+    struct stat info;
+    if (0 != g_dfs->Stat(target, &info)) {
+        return errno;
+    }
+    // Whatever matches neither the regular-file bit nor the directory bit is labelled a symlink.
+    const char* type_name = "Symlink";
+    if (S_IFREG & info.st_mode) {
+        type_name = "Regular";
+    } else if (S_IFDIR & info.st_mode) {
+        type_name = "Directory";
+    }
+    // One labelled line per field; each timestamp is shown raw and via ctime().
+    fprintf(stdout, "File:\t%s\n", target.c_str());
+    fprintf(stdout, "Inode:\t0x%lx\n", info.st_ino);
+    fprintf(stdout, "Type:\t%s\n", type_name);
+    fprintf(stdout, "Size:\t%lu\n", info.st_size);
+    fprintf(stdout, "Mode:\t%o\n", info.st_mode & 0777);
+    fprintf(stdout, "Link:\t%lu\n", info.st_nlink);
+    fprintf(stdout, "Atime:\t%lu\t%s", info.st_atime, ctime(&info.st_atime));
+    fprintf(stdout, "Mtime:\t%lu\t%s", info.st_mtime, ctime(&info.st_mtime));
+    fprintf(stdout, "Ctime:\t%lu\t%s", info.st_ctime, ctime(&info.st_ctime));
+
+    return 0;
+}
+// Renames DFS path argv[3] to argv[4]; returns 0, -1 on bad usage or a refused lock, else errno.
+int32_t DfsRenameOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    if (argc != 5) {
+        fprintf(stderr, "invalid arguments\n");
+        return -1;
+    }
+    std::string& from_path = argv[3];
+    std::string& to_path = argv[4];
+    // Owner mode: the destination's parent path has to be locked before renaming.
+    if (FLAGS_asowner && 0 != DfsTryLockParentPath(to_path)) {
+        fprintf(stderr, "Lock ParentPath failed");
+        return -1;
+    }
+
+    // Success is the short path.
+    if (0 == g_dfs->Rename(from_path, to_path)) {
+        return 0;
+    }
+    // Failure: describe errno on stderr, then return it.
+    perror("Rename fail");
+    return errno;
+}
+// Calls g_dfs->ClearDirOwner on argv[3] and returns its result unchanged.
+int32_t DfsUnlockDirOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    const std::string& owned_dir = argv[3];
+    return g_dfs->ClearDirOwner(owned_dir);
+}
+// "dfs checksum" placeholder: prints a notice on stderr and reports failure.
+int32_t DfsChecksumOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    fputs("Not Implemented", stderr);
+    return -1;
+}
+// "dfs lchecksum" placeholder: prints a notice on stderr and reports failure.
+int32_t DfsLChecksumOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    fputs("Not Implemented", stderr);
+    return -1;
+}
+// "dfs forcerelease" placeholder: prints a notice on stderr and reports failure.
+int32_t DfsForceReleaseOp(int32_t argc, std::string* argv, ErrorCode* err) {
+    fputs("Not Implemented", stderr);
+    return -1;
+}
+// Registers every "dfs" sub-command name with its handler in the table from GetFSCommandTable().
+static void InitializeFileSystemCommandTable() {
+    FSCommandTable& ops = GetFSCommandTable();
+    ops["get"] = DfsGetOp;
+    ops["ls"] = DfsLsOp;
+    ops["lsr"] = DfsLsrOp;
+    ops["dus"] = DfsDusOp;
+    ops["touchz"] = DfsTouchzOp;
+    ops["mkdir"] = DfsMkdirOp;
+    ops["rm"] = DfsRmOp;
+    ops["stat"] = DfsStatOp;
+    ops["rename"] = DfsRenameOp;
+    ops["unlockdir"] = DfsUnlockDirOp;
+    // The handlers below only print a "not implemented" notice and return -1.
+    ops["put"] = DfsPutOp;
+    ops["test"] = DfsTestOp;
+    ops["checksum"] = DfsChecksumOp;
+    ops["lchecksum"] = DfsLChecksumOp;
+    ops["forcerelease"] = DfsForceReleaseOp;
+}
+
 static void InitializeCommandTable(){
     CommandTable& command_table = GetCommandTable();
     command_table["create"] = CreateOp;
@@ -3257,6 +3947,7 @@ static void InitializeCommandTable(){
     command_table["rename"] = RenameOp;
     command_table["meta"] = MetaOp;
     command_table["compact"] = CompactOp;
+    command_table["compactx"] = CompactOp;
     command_table["findmaster"] = FindMasterOp;
     command_table["findts"] = FindTsOp;
     command_table["findtablet"] = FindTabletOp;
@@ -3270,6 +3961,9 @@ static void InitializeCommandTable(){
     command_table["rangex"] = RangeOp;
     command_table["txn"] = TxnOp;
     command_table["help"] = HelpOp;
+    command_table["cas"] = CasOp;
+    command_table["dfs"] = FileSystemOp;
+    InitializeFileSystemCommandTable();
 }
 
 int ExecuteCommand(Client* client, int argc, char** arg_list) {
diff --git a/src/terautil.cc b/src/terautil.cc
new file mode 100644
index 000000000..e4f5727d0
--- /dev/null
+++ b/src/terautil.cc
@@ -0,0 +1,732 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <map>
+#include <memory>
+#include <sstream>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "ins_sdk.h"
+
+#include "common/base/string_ext.h"
+#include "common/base/string_number.h"
+#include "common/console/progress_bar.h"
+#include "common/file/file_path.h"
+#include "io/coding.h"
+#include "proto/kv_helper.h"
+#include "proto/proto_helper.h"
+#include "proto/tabletnode.pb.h"
+#include "proto/tabletnode_client.h"
+#include "sdk/client_impl.h"
+#include "sdk/cookie.h"
+#include "sdk/sdk_utils.h"
+#include "sdk/sdk_zk.h"
+#include "sdk/table_impl.h"
+#include "tera.h"
+#include "types.h"
+#include "utils/config_utils.h"
+#include "utils/crypt.h"
+#include "utils/schema_utils.h"
+#include "utils/string_util.h"
+#include "utils/tprinter.h"
+#include "utils/utils_cmd.h"
+#include "version.h"
+
+DECLARE_string(flagfile);
+DECLARE_string(log_dir);
+DECLARE_string(tera_master_meta_table_name);
+// Command-line flags read by the dump helpers defined in this file.
+DEFINE_string(dump_tera_src_conf, "../conf/src_tera.flag", "src cluster for tera");
+DEFINE_string(dump_tera_dest_conf, "../conf/dest_tera.flag", "dest cluster for tera");
+DEFINE_string(dump_tera_src_root_path, "/xxx_", "src tera root path");
+DEFINE_string(dump_tera_dest_root_path, "/xxx_", "dest tera root path");
+DEFINE_string(ins_cluster_addr, "terautil_ins", "terautil dump ins cluster conf");
+DEFINE_string(ins_cluster_root_path, "/terautil/dump/xxxx", "dump meta ins");
+DEFINE_string(dump_tera_src_meta_addr, "", "src addr for meta_table");
+DEFINE_string(dump_tera_dest_meta_addr, "", "dest addr for meta_table");
+DEFINE_int64(dump_manual_split_interval, 1000, "manual split interval in ms");
+DEFINE_bool(dump_enable_manual_split, false, "manual split may take a long time, so disable it");
+
+using namespace tera;
+// Flat table of (command name, usage text) pairs: even slots hold names, odd slots the matching help text.
+const char* terautil_builtin_cmds[] = {
+    "dump",
+    "dump <operation>                                                  \n\
+            prepare_safe                                                    \n\
+            prepare                                                    \n\
+            run                                                        \n\
+            show                                                       \n\
+            check",
+
+    "help",
+    "help [cmd]                                                           \n\
+          show manual for a or all cmd(s)",
+
+    "version",
+    "version                                                              \n\
+             show version info",
+};
+// Prints the usage text of the builtin command whose name equals msg (at most 32 chars compared).
+static void ShowCmdHelpInfo(const char* msg) {
+    const int total = sizeof(terautil_builtin_cmds) / sizeof(terautil_builtin_cmds[0]);
+    // Names sit at the even slots; a NULL msg or an unknown name prints nothing.
+    for (int name_idx = 0; msg != NULL && name_idx < total; name_idx += 2) {
+        const char* name = terautil_builtin_cmds[name_idx];
+        if (strncmp(msg, name, 32) != 0) {
+            continue;
+        }
+        std::cout << terautil_builtin_cmds[name_idx + 1] << std::endl;
+        break;
+    }
+}
+// Prints the names of all builtin commands, two per row in left-aligned 20-character columns,
+// followed by a hint on how to get per-command help.
+static void ShowAllCmd() {
+    std::cout << "there is cmd list:" << std::endl;
+    const int total = sizeof(terautil_builtin_cmds)/sizeof(char*);
+    // Command number k lives at slot 2*k; the slot after it is its usage text.
+    for (int cmd_no = 0; cmd_no * 2 < total; ++cmd_no) {
+        const char* name = terautil_builtin_cmds[cmd_no * 2];
+        std::cout << std::setiosflags(std::ios::left) << std::setw(20) << name;
+        // Close the row after every second name.
+        if (cmd_no % 2 == 1) {
+            std::cout << std::endl;
+        }
+    }
+    std::cout << std::endl << "help [cmd] for details." << std::endl;
+}
+// "help" command: argc == 2 lists all commands, argc == 3 shows the help of argv[2],
+// anything else shows the usage of "help" itself. Always returns 0.
+int32_t HelpOp(int32_t argc, char** argv) {
+    switch (argc) {
+    case 2:  ShowAllCmd(); break;
+    case 3:  ShowCmdHelpInfo(argv[2]); break;
+    default: ShowCmdHelpInfo("help"); break;
+    }
+    return 0;
+}
+
+// Publishes the dump work list to the ins cluster: one key per user table and one per tablet range.
+// Returns 0 on success and -1 as soon as any Put fails.
+int DumpRange(const std::string& ins_cluster_addr,
+              const std::string& ins_cluster_root_path,
+              const tera::TableMetaList& table_list,
+              const tera::TabletMetaList& tablet_list) {
+    galaxy::ins::sdk::SDKError ins_err;
+    galaxy::ins::sdk::InsSDK ins_sdk(ins_cluster_addr);
+    const std::string table_prefix = ins_cluster_root_path + "/table/";
+    const std::string tablet_prefix = ins_cluster_root_path + "/tablet/";
+
+    // Pass 1: "<root>/table/<name>" -> <name> for every table except the meta table.
+    for (int32_t idx = 0; idx < table_list.meta_size(); ++idx) {
+        const std::string& name = table_list.meta(idx).table_name();
+        if (name == FLAGS_tera_master_meta_table_name) {
+            continue;
+        }
+        const std::string key = table_prefix + name;
+        if (!ins_sdk.Put(key, name, &ins_err)) {
+            LOG(WARNING) << "ins put: " << key << ", error " << ins_err;
+            return -1;
+        }
+    }
+
+    // Pass 2: "<root>/tablet/<table>/<start key>" -> "0" + <end key> for every tablet that does not
+    // belong to the meta table.
+    for (int32_t idx = 0; idx < tablet_list.meta_size(); ++idx) {
+        const tera::TabletMeta& tablet = tablet_list.meta(idx);
+        if (tablet.table_name() == FLAGS_tera_master_meta_table_name) {
+            continue;
+        }
+        const std::string key = tablet_prefix + tablet.table_name() + "/" + tablet.key_range().key_start();
+        const std::string val = "0" + tablet.key_range().key_end();
+        if (!ins_sdk.Put(key, val, &ins_err)) {
+            LOG(WARNING) << "ins put: " << key << ", error " << ins_err;
+            return -1;
+        }
+    }
+
+    return 0;
+}
+// Scans the source meta table batch by batch, writes the records to the destination meta tabletnode and collects the parsed metas; 0 on success, -1 on failure.
+int ScanAndDumpMeta(const std::string& src_meta_tablet_addr,
+                    const std::string& dest_meta_tablet_addr,
+                    tera::TableMetaList* table_list,
+                    tera::TabletMetaList* tablet_list) {
+    uint64_t seq_id = 0;
+    tera::ScanTabletRequest request;
+    tera::ScanTabletResponse response;
+    tera::WriteTabletRequest write_request;
+    tera::WriteTabletResponse write_response;
+    uint64_t request_size = 0;  // sum of ByteSize() of the rows queued in write_request
+    write_request.set_sequence_id(seq_id++);
+    write_request.set_tablet_name(FLAGS_tera_master_meta_table_name);
+    write_request.set_is_sync(true);
+    write_request.set_is_instant(true);
+
+    request.set_sequence_id(seq_id++);
+    request.set_table_name(FLAGS_tera_master_meta_table_name);
+    request.set_start("");  // empty start and end keys for the first scan request
+    request.set_end("");
+    tera::tabletnode::TabletNodeClient src_meta_node_client(src_meta_tablet_addr);
+    bool success = true;
+    while ((success = src_meta_node_client.ScanTablet(&request, &response))) {  // a failed RPC ends the loop with success == false
+        if (response.status() != tera::kTabletNodeOk) {
+            LOG(WARNING) << "dump: fail to load meta table: "
+                << StatusCodeToString(response.status());
+            return -1;
+        }
+        int32_t record_size = response.results().key_values_size();
+        LOG(INFO) << "scan meta table: " << record_size << " records";
+
+        bool need_dump = false;  // set once the queued rows reach kMaxRpcSize
+        std::string last_record_key;  // the next scan resumes right after this key
+        for (int32_t i = 0; i < record_size; i++) {
+            const tera::KeyValuePair& record = response.results().key_values(i);
+            last_record_key = record.key();
+            char first_key_char = record.key()[0];  // the first key byte selects the record kind below
+
+            TableMeta table_meta;
+            TabletMeta tablet_meta;
+            if (first_key_char == '~') {  // user record: only logged, not copied
+                LOG(INFO) << "(user: " << record.key().substr(1) << ")";
+            } else if (first_key_char == '@') {  // table record
+                //ParseMetaTableKeyValue(record.key(), record.value(), table_list->add_meta());
+                table_meta.Clear();
+                ParseMetaTableKeyValue(record.key(), record.value(), &table_meta);
+
+                std::string key, val;
+                //table_meta.set_status(kTableDisable);
+                table_meta.mutable_schema()->set_merge_size(0); // never merge during dump
+                table_meta.mutable_schema()->set_split_size(10000000); // never split during dump
+                MakeMetaTableKeyValue(table_meta, &key, &val);  // only val is used; the row keeps record.key()
+
+                RowMutationSequence* mu_seq = write_request.add_row_list();
+                mu_seq->set_row_key(record.key());
+                Mutation* mutation = mu_seq->add_mutation_sequence();
+                mutation->set_type(tera::kPut);
+                mutation->set_value(val);
+                request_size += mu_seq->ByteSize();
+                if (request_size >= kMaxRpcSize) { // write req too large, dump into new tera cluster
+                    need_dump = true;
+                }
+
+                TableMeta* table_meta2 = table_list->add_meta();  // hand the adjusted meta back to the caller
+                table_meta2->CopyFrom(table_meta);
+            } else if (first_key_char > '@') {  // tablet record
+                //ParseMetaTableKeyValue(record.key(), record.value(), tablet_list->add_meta());
+                tablet_meta.Clear();
+                ParseMetaTableKeyValue(record.key(), record.value(), &tablet_meta);
+
+                std::string key, val;
+                tablet_meta.clear_parent_tablets();  // parent tablet info is dropped before the record is written
+                //tablet_meta.set_status(kTabletDisable);
+                MakeMetaTableKeyValue(tablet_meta, &key, &val);  // only val is used; the row keeps record.key()
+
+                RowMutationSequence* mu_seq = write_request.add_row_list();
+                mu_seq->set_row_key(record.key());
+                Mutation* mutation = mu_seq->add_mutation_sequence();
+                mutation->set_type(tera::kPut);
+                mutation->set_value(val);
+                request_size += mu_seq->ByteSize();
+                if (request_size >= kMaxRpcSize) { // write req too large, dump into new tera cluster
+                    need_dump = true;
+                }
+
+                TabletMeta* tablet_meta2 = tablet_list->add_meta();  // hand the adjusted meta back to the caller
+                tablet_meta2->CopyFrom(tablet_meta);
+            } else {
+                LOG(WARNING) << "dump: invalid meta record: " << record.key();
+            }
+        }
+
+        // Flush the queued rows when the batch grew too large or when the scan returned no more records.
+        if ((need_dump || record_size <= 0) &&
+            write_request.row_list_size() > 0) {
+            tabletnode::TabletNodeClient dest_meta_node_client(dest_meta_tablet_addr);
+            if (!dest_meta_node_client.WriteTablet(&write_request, &write_response)) {
+                LOG(WARNING) << "dump: fail to dump meta tablet: "
+                    << StatusCodeToString(kRPCError);
+                return -1;
+            }
+            tera::StatusCode status = write_response.status();
+            if (status == tera::kTabletNodeOk && write_response.row_status_list_size() > 0) {
+                status = write_response.row_status_list(0);  // NOTE(review): only the first row status is inspected
+            }
+            if (status != kTabletNodeOk) {
+                LOG(WARNING) << "dump: fail to dump meta tablet: "
+                    << StatusCodeToString(status);
+                return -1;
+            }
+            write_request.clear_row_list();
+            write_response.Clear();
+            request_size = 0;
+        }
+        if (record_size <= 0) {  // an empty batch marks the end of the meta table
+            response.Clear();
+            LOG(INFO) << "dump: scan meta table success";
+            break;
+        }
+
+        std::string next_record_key = tera::NextKey(last_record_key);  // resume after the last record seen
+        request.set_start(next_record_key);
+        request.set_end("");
+        request.set_sequence_id(seq_id++);
+        response.Clear();
+    }
+    return success? 0: -1;
+}
+// "dump prepare": copies the source meta table into the destination cluster, then publishes the
+// resulting table/tablet list to the ins cluster. Returns the first negative status, or the
+// result of DumpRange.
+int DumpPrepareOp() {
+    // Meta tabletnode addresses of both clusters come straight from the flags; the *_conf and
+    // *_root_path dump flags are not consulted by this step.
+    const std::string src_meta_addr = FLAGS_dump_tera_src_meta_addr;
+    const std::string dest_meta_addr = FLAGS_dump_tera_dest_meta_addr;
+
+    // Step 1: scan the source meta table and replay it into the destination meta tablet.
+    tera::TableMetaList table_list;
+    tera::TabletMetaList tablet_list;
+    const int scan_status = ScanAndDumpMeta(src_meta_addr, dest_meta_addr, &table_list, &tablet_list);
+    if (scan_status < 0) {
+        return scan_status;
+    }
+
+    // Step 2: create the per-table and per-tablet-range keys in the ins cluster (nexus).
+    const std::string ins_cluster_addr = FLAGS_ins_cluster_addr;
+    const std::string ins_cluster_root_path = FLAGS_ins_cluster_root_path;
+    return DumpRange(ins_cluster_addr, ins_cluster_root_path, table_list, tablet_list);
+}
+
+
+// Finds the next tablet range not yet marked done, locks it and returns it through the out-params; 0 when a range was locked, -1 otherwise.
+int GetAndLockDumpRange(const std::string& ins_cluster_root_path,
+                        std::string* table_name,
+                        std::string* start_key,
+                        std::string* end_key,
+                        galaxy::ins::sdk::InsSDK* ins_sdk) {
+    int res = -1;
+    galaxy::ins::sdk::SDKError ins_err;
+    //std::string table_path = ins_cluster_root_path + "/table";
+    std::string tablet_path = ins_cluster_root_path + "/tablet";
+    std::string lock_path = ins_cluster_root_path + "/lock";
+
+    std::string start = tablet_path + "/";
+    std::string end = tablet_path + "/";
+    if (table_name->size()) {  // resume from the range the caller handled last
+        start.append(*table_name);
+        start.append("/");
+        start.append(*start_key);
+        if (*start_key == "") {
+            start.append(1, '\0');  // step past the entry whose start key is empty
+        }
+    }
+    end.append(1, '\xff');  // highest byte value; the old '\255' is an OCTAL escape (0xAD) and cut the scan short
+    galaxy::ins::sdk::ScanResult* result = ins_sdk->Scan(start, end);
+    while (!result->Done()) {
+        if (result->Error() != galaxy::ins::sdk::kOK) {
+            LOG(INFO) << "scan fail: start " << start << ", end " << end << ", err " << result->Error();
+            res = -1;
+            break;
+        }
+        std::string key = result->Key();
+        std::string val = result->Value();
+        std::string has_done = val.substr(0, 1);  // first value byte is the done marker, the rest is the end key
+        if (has_done == "1") { // someone has copy it
+            result->Next();
+            continue;
+        }
+
+        //std::string key = tablet_path + "/" + meta.table_name() + "/" + meta.key_range().key_start();
+        std::string str = key.substr(tablet_path.length() + 1);  // "<table>/<start key>"
+        std::size_t pos = str.find('/');
+        *table_name = str.substr(0, pos);
+        *start_key = str.substr(pos + 1);
+        *end_key = val.substr(1);
+
+        std::string lock_key = lock_path + "/" + *table_name + "/" + *start_key + "/";
+        if (!ins_sdk->TryLock(lock_key, &ins_err)) {  // lock not obtained: move on to the next range
+            LOG(INFO) << "ins: TryLock fail: " << lock_key << ", err " << ins_err;
+            result->Next();
+            continue;
+        }
+
+        // Re-read the marker under the lock: the scanned value may be out of date by now.
+        std::string val1;
+        if (ins_sdk->Get(key, &val1, &ins_err)) {
+            has_done = val1.substr(0, 1);
+        } else {
+            LOG(INFO) << "ins: get fail: " << key << ", err " << ins_err;
+        }
+        if (has_done == "1") { // someone has copy it
+            if (!ins_sdk->UnLock(lock_key, &ins_err)) {
+                LOG(INFO) << "ins: unlock fail: " << lock_key << ", err " << ins_err;
+            }
+            result->Next();
+            continue;
+        }
+
+        res = 0;
+        break; // begin to scan
+    }
+    delete result;
+    return res;
+}
+// Marks the tablet range (table_name, start_key) as copied and releases its lock in the ins
+// cluster. Both steps are best effort: failures are logged and the function still returns 0.
+int ReleaseAndUnlockDumpRange(const std::string& ins_cluster_root_path,
+                              const std::string& table_name,
+                              const std::string& start_key,
+                              const std::string& end_key,
+                              galaxy::ins::sdk::InsSDK* ins_sdk) {
+    galaxy::ins::sdk::SDKError ins_err;
+    const std::string range_suffix = "/" + table_name + "/" + start_key;
+
+    // Rewrite the range entry with a leading "1" in front of the end key.
+    const std::string done_key = ins_cluster_root_path + "/tablet" + range_suffix;
+    const std::string done_val = "1" + end_key;
+    if(!ins_sdk->Put(done_key, done_val, &ins_err)) {
+        LOG(WARNING) << "ins put: " << done_key << ", error " << ins_err;
+    }
+
+    // Release the lock key of this range; this happens even when the Put above failed.
+    const std::string lock_key = ins_cluster_root_path + "/lock" + range_suffix + "/";
+    if (!ins_sdk->UnLock(lock_key, &ins_err)) {
+        LOG(WARNING) << "ins unlock fail: " << lock_key << ", error " << ins_err;
+    }
+
+    // Failures above were only logged.
+    return 0;
+}
+// State shared between ScanAndDumpData and the ScanAndDumpCallBack invocations of the mutations it issues.
+struct ScanDumpContext {
+    Counter counter;      // incremented per mutation issued, decremented by each callback
+    volatile bool fail;   // set by the first callback that sees a failed mutation
+    std::string reason;   // error text recorded together with that first failure
+};
+// Completion callback of every mutation issued by ScanAndDumpData: records the first failure in
+// the shared ScanDumpContext, frees the mutation and decrements the context's counter.
+void ScanAndDumpCallBack(RowMutation* mu) {
+    ScanDumpContext* dump_ctx = (ScanDumpContext*)mu->GetContext();
+    // Only the first failed mutation gets to store its reason.
+    const bool mutation_failed = mu->GetError().GetType() != tera::ErrorCode::kOK;
+    if (mutation_failed && !dump_ctx->fail) {
+        dump_ctx->fail = true;
+        dump_ctx->reason = mu->GetError().ToString();
+    }
+    delete mu;
+    // Decrement last, after the mutation has been freed.
+    dump_ctx->counter.Dec();
+}
+// Scans [start_key, end_key) of src with all versions and re-applies every cell to dest; 0 on success, -1 on failure.
+int ScanAndDumpData(Table* src, Table* dest,
+                    const std::string& table_name,
+                    const std::string& start_key,
+                    const std::string& end_key) {
+    ErrorCode err;
+    ScanDescriptor desc(start_key);
+    desc.SetEnd(end_key);
+    desc.SetMaxVersions(std::numeric_limits<int>::max());
+    ResultStream* stream = src->Scan(desc, &err);
+    if (stream == NULL) {
+        LOG(INFO) << "scan dump fail(new scan): " << table_name << ", start " << start_key
+            << ", end " << end_key;
+        return -1;
+    }
+
+    // The counter starts at 1 so it stays positive while mutations are still being issued;
+    // that extra count is dropped once the scan loop is over.
+    ScanDumpContext* dump_ctx = new ScanDumpContext;
+    dump_ctx->counter.Set(1);
+    dump_ctx->fail = false;
+    for (; !stream->Done(&err); stream->Next()) {
+        RowMutation* mu = dest->NewRowMutation(stream->RowName());
+        mu->Put(stream->Family(), stream->Qualifier(),
+                stream->Value(), stream->Timestamp());
+        dump_ctx->counter.Inc();
+        mu->SetContext(dump_ctx);
+        mu->SetCallBack(ScanAndDumpCallBack);
+        dest->ApplyMutation(mu);
+    }
+    delete stream;
+    dump_ctx->counter.Dec();
+
+    // Wait until every callback has decremented the counter.
+    while (dump_ctx->counter.Get() > 0) {
+        sleep(3);
+    }
+    int status = 0;
+    if (dump_ctx->fail) {
+        LOG(INFO) << "scan dump fail: " << table_name << ", start " << start_key
+            << ", end " << end_key << ", reason " << dump_ctx->reason;
+        status = -1;
+    }
+    delete dump_ctx;
+    if (err.GetType() != tera::ErrorCode::kOK) {
+        LOG(INFO) << "scan dump fail: " << table_name << ", start " << start_key
+            << ", end " << end_key << ", reason " << err.GetReason();
+        status = -1;
+    }
+    return status;
+}
+// "dump run": repeatedly locks the next uncopied tablet range and copies it from src to dest; returns -1 if any range failed.
+int DumpRunOp() {
+    int res = 0;
+    std::string ins_cluster_addr = FLAGS_ins_cluster_addr;
+    std::string ins_cluster_root_path = FLAGS_ins_cluster_root_path;
+    std::string tera_src_conf = FLAGS_dump_tera_src_conf;
+    std::string tera_dest_conf = FLAGS_dump_tera_dest_conf;
+
+    // open one client per cluster
+    ErrorCode err;
+    Client* src_client = Client::NewClient(tera_src_conf, &err);
+    if (src_client == NULL) {
+        LOG(INFO) << "open src client fail: " << tera_src_conf << ", err " << err.ToString();
+        return -1;
+    }
+    Client* dest_client = Client::NewClient(tera_dest_conf, &err);
+    if (dest_client == NULL) {
+        delete src_client;
+        LOG(INFO) << "open dest client fail: " << tera_dest_conf << ", err " << err.ToString();
+        return -1;
+    }
+    Table* src_table = NULL;   // table handles are cached and reopened only when the table changes
+    Table* dest_table = NULL;
+    galaxy::ins::sdk::InsSDK ins_sdk(ins_cluster_addr);
+    std::string table_name, start_key, end_key, last_table_name;
+    while (GetAndLockDumpRange(ins_cluster_root_path, &table_name, &start_key, &end_key, &ins_sdk) == 0) {
+        if (last_table_name != table_name) { // table change
+            delete src_table;
+            delete dest_table;
+            dest_table = NULL; // must not dangle if the src open below fails and we continue
+            src_table = src_client->OpenTable(table_name, &err);
+            if (src_table == NULL) {
+                LOG(INFO) << "open src table fail: " << table_name << ", err " << err.ToString();
+                continue; // NOTE(review): the range lock is still held and the same range is requested again - confirm this cannot spin
+            }
+            dest_table = dest_client->OpenTable(table_name, &err);
+            if (dest_table == NULL) {
+                delete src_table;
+                src_table = NULL;
+                LOG(INFO) << "open dest table fail: " << table_name << ", err " << err.ToString();
+                continue;
+            }
+        }
+        last_table_name = table_name;
+        if (ScanAndDumpData(src_table, dest_table, table_name, start_key, end_key) < 0) {
+            res = -1; // sticky: a later successful range must not hide this failure
+            LOG(INFO) << "scan dump data fail: " << table_name << ", start " << start_key
+                << ", end " << end_key;
+        } else {
+            ReleaseAndUnlockDumpRange(ins_cluster_root_path, table_name, start_key, end_key, &ins_sdk);
+        }
+        start_key = end_key; // continue the search after the range just handled
+    }
+    delete src_table; // the last pair of table handles used to leak
+    delete dest_table;
+    delete src_client;
+    delete dest_client;
+    return res;
+}
+// Appends to delimiters the non-empty start key of every tablet in tablet_list that belongs to table_name.
+void GetTableKeyRange(const std::string& table_name,
+                     const TabletMetaList& tablet_list,
+                     std::vector<std::string>* delimiters) {
+    const int32_t tablet_count = tablet_list.meta_size();
+    for (int32_t idx = 0; idx < tablet_count; ++idx) {
+        const std::string& first_key = tablet_list.meta(idx).key_range().key_start();
+        if (!first_key.empty() && table_name == tablet_list.meta(idx).table_name()) {
+            delimiters->push_back(first_key);
+        }
+    }
+}
+// Creates table_name from schema with the given delimiters; 0 on success, -1 when CreateTable fails.
+int ManualCreateTable(tera::ClientImpl* client,
+                      const std::string& table_name,
+                      const TableSchema& schema,
+                      const std::vector<std::string>& delimiters) {
+    ErrorCode create_err;
+    TableDescriptor descriptor;
+    descriptor.SetTableName(table_name);
+    TableSchemaToDesc(schema, &descriptor);
+    descriptor.SetSplitSize(10000000); // split size 10000000 and merge size 0 override the schema's values
+    descriptor.SetMergeSize(0);
+    if (client->CreateTable(descriptor, delimiters, &create_err)) {
+        return 0;
+    }
+    LOG(INFO) << "manual create error: " << table_name << ", err: " << create_err.ToString();
+    return -1;
+}
+// Sends one "table split <table_name> <delimiter>" command per delimiter, pausing between commands; always returns 0.
+int ManualSplitTable(tera::ClientImpl* client,
+                     const std::string& table_name,
+                     const std::vector<std::string>& delimiters) {
+    ErrorCode err;
+    std::vector<std::string> arg_list;
+    arg_list.push_back("split");
+    arg_list.push_back(table_name);
+    for (uint32_t i = 0; i < delimiters.size(); i++) {
+        arg_list.push_back(delimiters[i]); // the split key is the last argument of this command only
+        if (!client->CmdCtrl("table", arg_list, NULL, NULL, &err)) { // failures are logged and skipped
+            LOG(INFO) << "manual split table fail(ignore old master):  " << table_name
+                      << ", delimiters_size: " << delimiters.size() << ", err: " << err.ToString();
+        }
+        sleep(FLAGS_dump_manual_split_interval / 1000); // flag is in ms: whole seconds here ...
+        usleep((FLAGS_dump_manual_split_interval % 1000) * 1000); // ... remainder converted to microseconds
+        arg_list.pop_back();
+    }
+    return 0;
+}
+
+// True when raw_key, kv_only and name are equal and neither IsSchemaCfDiff nor IsSchemaLgDiff reports a difference.
+bool SchemaCompare(const TableSchema& src, const TableSchema& dest) {
+    if (src.raw_key() != dest.raw_key() || src.kv_only() != dest.kv_only()) { return false; }
+    if (src.name() != dest.name()) { return false; }
+    const bool cf_differs = IsSchemaCfDiff(src, dest);
+    return !cf_differs && !IsSchemaLgDiff(src, dest);
+}
+
+int GetOrSetTabletLocationSafe(Client* src_client,
+                               Client* dest_client,
+                               TableMetaList* table_list,
+                               TabletMetaList* tablet_list) {
+    // Align dest tables with src; append the kept src tables/tablets to the outputs. Returns 0, or -1 on failure.
+    ErrorCode err;
+    TableMetaList src_table_list;
+    TabletMetaList src_tablet_list;
+    tera::ClientImpl* src_client_impl = static_cast<tera::ClientImpl*>(src_client);  // NOTE(review): assumes the Client really is a ClientImpl
+    if (!src_client_impl->ShowTablesInfo(&src_table_list, &src_tablet_list, false, &err)) {
+        LOG(INFO) << "tera_master show src cluster fail: " << err.ToString();
+        return -1;
+    }
+    // Same listing for the destination cluster.
+    TableMetaList dest_table_list;
+    TabletMetaList dest_tablet_list;
+    tera::ClientImpl* dest_client_impl = static_cast<tera::ClientImpl*>(dest_client);
+    if (!dest_client_impl->ShowTablesInfo(&dest_table_list, &dest_tablet_list, false, &err)) {
+        LOG(INFO) << "tera_master show dest cluster fail: " << err.ToString();
+        return -1;
+    }
+
+    // Index both clusters' schemas by table name.
+    std::map<std::string, TableSchema> src_table_set;
+    for (int32_t i = 0; i < src_table_list.meta_size(); i++) {
+        const tera::TableMeta& meta = src_table_list.meta(i);
+        TableSchema& schema = src_table_set[meta.table_name()];
+        schema.CopyFrom(meta.schema());
+    }
+    std::map<std::string, TableSchema> dest_table_set;
+    for (int32_t i = 0; i < dest_table_list.meta_size(); i++) {
+        const tera::TableMeta& meta = dest_table_list.meta(i);
+        TableSchema& schema = dest_table_set[meta.table_name()];
+        schema.CopyFrom(meta.schema());
+    }
+
+    // Per src table: create it on dest if missing, optionally split it if the schemas match, otherwise drop it.
+    for (int32_t i = 0; i < src_table_list.meta_size(); i++) {
+        const tera::TableMeta& meta = src_table_list.meta(i);
+        if (meta.table_name() == FLAGS_tera_master_meta_table_name) {  // the master's meta table is never reported
+            continue;
+        }
+        std::vector<std::string> delimiters;
+        GetTableKeyRange(meta.table_name(), src_tablet_list, &delimiters);
+        if (dest_table_set.find(meta.table_name()) == dest_table_set.end()) {
+            if (ManualCreateTable(dest_client_impl, meta.table_name(), meta.schema(), delimiters) < 0) {
+                return -1;
+            }
+        } else if (SchemaCompare(dest_table_set[meta.table_name()], meta.schema())) {  // exists on dest with an equal schema
+            if (FLAGS_dump_enable_manual_split &&
+                ManualSplitTable(dest_client_impl, meta.table_name(), delimiters) < 0) {
+                return -1;
+            }
+        } else {
+            LOG(INFO) << "table schema not match: " << meta.table_name() << ", src schema: " << meta.schema().ShortDebugString()
+                << ", dest schema: " << dest_table_set[meta.table_name()].ShortDebugString();
+            src_table_set.erase(meta.table_name());  // so that its tablets are filtered out below
+            continue;
+        }
+        tera::TableMeta* meta2 = table_list->add_meta();  // table accepted: report it to the caller
+        meta2->CopyFrom(meta);
+    }
+
+    // Keep tablets whose table is still in src_table_set. NOTE(review): the meta table is skipped above but not erased, so its tablets (if listed) pass — confirm.
+    for (int32_t i = 0; i < src_tablet_list.meta_size(); i++) {
+        const tera::TabletMeta& meta = src_tablet_list.meta(i);
+        if (src_table_set.find(meta.table_name()) == src_table_set.end()) {
+            continue;
+        }
+        tera::TabletMeta* meta2 = tablet_list->add_meta();
+        meta2->CopyFrom(meta);
+    }
+    return 0;
+}
+
+// Handler for "dump prepare_safe": opens both clusters, aligns dest tables with src, then passes the result to DumpRange.
+int DumpPrepareSafeOp() {
+    std::string ins_addr = FLAGS_ins_cluster_addr;
+    std::string ins_root = FLAGS_ins_cluster_root_path;
+    std::string src_conf = FLAGS_dump_tera_src_conf;
+    std::string dest_conf = FLAGS_dump_tera_dest_conf;
+
+    ErrorCode status;
+    std::unique_ptr<Client> src(Client::NewClient(src_conf, &status));
+    if (!src) {
+        LOG(INFO) << "open src client fail: " << src_conf << ", err " << status.ToString();
+        return -1;
+    }
+    std::unique_ptr<Client> dest(Client::NewClient(dest_conf, &status));
+    if (!dest) {
+        src.reset();
+        LOG(INFO) << "open dest client fail: " << dest_conf << ", err " << status.ToString();
+        return -1;
+    }
+
+    // dump src cluster range into ins
+    TableMetaList tables;
+    TabletMetaList tablets;
+    const int prepared = GetOrSetTabletLocationSafe(src.get(), dest.get(), &tables, &tablets);
+    if (prepared < 0) {
+        return -1;
+    }
+    return DumpRange(ins_addr, ins_root, tables, tablets);
+}
+
+// Entry point: loads the flag file, then dispatches "version" or "dump <prepare|prepare_safe|run>".
+// Any other command line prints help and returns -1.
+int main(int argc, char* argv[]) {
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    if (FLAGS_flagfile.empty()) {
+        FLAGS_flagfile = "../conf/tera.flag";
+        if (access(FLAGS_flagfile.c_str(), R_OK) != 0) {
+            FLAGS_flagfile = "./tera.flag";
+        }
+        utils::LoadFlagFile(FLAGS_flagfile);
+    }
+
+    const std::string command = (argc > 1) ? argv[1] : "";
+    const std::string action = (argc > 2) ? argv[2] : "";
+    if (command == "version") {
+        PrintSystemVersion();
+        return 0;
+    }
+    if (command == "dump") {
+        if (action == "prepare") { return DumpPrepareOp(); }
+        if (action == "prepare_safe") { return DumpPrepareSafeOp(); }
+        if (action == "run") { return DumpRunOp(); }
+        // TODO: "dump show" / "dump check" (DumpShowOp, DumpCheckOp) are not wired up yet.
+    }
+
+    HelpOp(argc, argv);
+    return -1;
+}
+
diff --git a/src/timeoracle/bench/timeoracle_bench.cc b/src/timeoracle/bench/timeoracle_bench.cc
new file mode 100644
index 000000000..4140005bc
--- /dev/null
+++ b/src/timeoracle/bench/timeoracle_bench.cc
@@ -0,0 +1,48 @@
+#include <iostream>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "common/mutex.h"
+#include "common/timer.h"
+#include "common/thread_pool.h"
+#include "common/this_thread.h"
+#include "sdk/sdk_zk.h"
+
+#include "sdk/timeoracle_client_impl.h"
+#include <thread>
+
+DEFINE_int64(client_thread_num, 10, "");
+
+using namespace tera;
+using namespace tera::timeoracle;
+
+std::shared_ptr<common::ThreadPool> g_thread_pool;
+
+
+// Benchmark loop: requests one timestamp per call forever; on failure prints a message and calls ThisThread::Sleep(200).
+void worker() {
+    tera::sdk::ClusterFinder* finder = sdk::NewTimeoracleClusterFinder();
+    tera::timeoracle::TimeoracleClientImpl client(g_thread_pool.get(), finder);
+    for (;;) {
+        if (client.GetTimestamp(1) > 0) {
+            continue;
+        }
+        std::cout << "rpc failed" << std::endl;
+        ThisThread::Sleep(200);
+    }
+}
+
+// Starts FLAGS_client_thread_num worker threads on top of one shared thread pool, then joins them.
+int main(int argc, char** argv) {
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    g_thread_pool.reset(new common::ThreadPool(FLAGS_client_thread_num + 1));
+
+    std::vector<std::thread> workers;
+    for (int64_t remaining = FLAGS_client_thread_num; remaining > 0; --remaining) {
+        workers.emplace_back(&worker);
+    }
+    for (std::thread& running : workers) {
+        running.join();
+    }
+
+    return 0;
+}
diff --git a/src/timeoracle/remote_timeoracle.h b/src/timeoracle/remote_timeoracle.h
new file mode 100644
index 000000000..588bd0547
--- /dev/null
+++ b/src/timeoracle/remote_timeoracle.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_TIMEORACLE_REMOTE_TIMEORACLE_H
+#define TERA_TIMEORACLE_REMOTE_TIMEORACLE_H
+
+#include <sofa/pbrpc/pbrpc.h>
+#include "common/thread_pool.h"
+#include "proto/timeoracle_rpc.pb.h"
+#include "timeoracle/timeoracle.h"
+
+namespace tera {
+namespace timeoracle {
+
+// Scope guard for an RPC completion closure: calls done->Run() on destruction unless release() was called first.
+class ClosureGuard {
+public:
+    ClosureGuard(::google::protobuf::Closure* done) : done_(done) {}
+    ~ClosureGuard() {
+        if (done_) {
+            done_->Run();
+        }
+    }
+    // Hands the closure back to the caller and disarms the guard (the destructor then does nothing).
+    ::google::protobuf::Closure* release() {
+        ::google::protobuf::Closure* done = done_;
+        done_ = nullptr;
+        return done;
+    }
+
+private:
+    // Non-copyable in both forms: a copied guard would run the same closure a second time.
+    ClosureGuard(const ClosureGuard&) = delete;
+    ClosureGuard& operator=(const ClosureGuard&) = delete;
+    ::google::protobuf::Closure* done_;
+};
+
+// RPC service that answers GetTimestamp requests from an embedded Timeoracle.
+class RemoteTimeoracle : public TimeoracleServer {
+public:
+    RemoteTimeoracle(int64_t start_timestamp) : timeoracle_(start_timestamp) {}
+
+    // Reserves request->count() timestamps. Replies kTimeoracleOk with the first timestamp and the count, or
+    // kTimeoracleBusy when the oracle returns 0. |done| is run when the method exits, via ClosureGuard.
+    virtual void GetTimestamp(::google::protobuf::RpcController* controller,
+                              const ::tera::GetTimestampRequest* request,
+                              ::tera::GetTimestampResponse* response,
+                              ::google::protobuf::Closure* done) {
+        ClosureGuard run_done_on_exit(done);
+        const int64_t wanted = request->count();
+        const int64_t first = timeoracle_.GetTimestamp(wanted);
+        if (first == 0) {
+            response->set_status(kTimeoracleBusy);
+            return;
+        }
+        response->set_start_timestamp(first);
+        response->set_count(wanted);
+        response->set_status(kTimeoracleOk);
+    }
+
+    Timeoracle* GetTimeoracle() {
+        return &timeoracle_;
+    }
+
+private:
+    Timeoracle timeoracle_;
+};
+
+} // namespace timeoracle
+} // namespace tera
+
+#endif // TERA_TIMEORACLE_REMOTE_TIMEORACLE_H
diff --git a/src/timeoracle/test/timeoracle_test.cc b/src/timeoracle/test/timeoracle_test.cc
new file mode 100644
index 000000000..e7b6f4472
--- /dev/null
+++ b/src/timeoracle/test/timeoracle_test.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <signal.h>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "timeoracle/timeoracle.h"
+#include "utils/utils_cmd.h"
+
+DECLARE_string(log_dir);
+DECLARE_string(tera_coord_type);
+DECLARE_string(tera_leveldb_env_type);
+DECLARE_string(tera_fake_zk_path_prefix);
+
+namespace tera {
+namespace timeoracle {
+
+// Empty gtest fixture; it only provides the fixture name used by the TEST_F cases.
+class TimeoracleTest : public ::testing::Test {
+};
+
+TEST_F(TimeoracleTest, UniqueTimestampMsTest) {  // consecutive calls must return strictly increasing values
+    int64_t previous = Timeoracle::UniqueTimestampMs();
+    for (int round = 0; round < 10000; ++round) {
+        const int64_t current = Timeoracle::UniqueTimestampMs();
+        EXPECT_LT(previous, current);
+        previous = current;
+    }
+}
+
+TEST_F(TimeoracleTest, TimeoracleFunc) {
+    Timeoracle to(1024LL);
+    // The limit starts at 0, so the request fails (returns 0) but still advances start to 1034.
+    auto tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 0);
+    // Any limit above the current one is accepted, even one below the start timestamp.
+    tmp = to.UpdateLimitTimestamp(10LL);
+    EXPECT_EQ(tmp, 10);
+    // Still beyond the limit: fails again, start moves on to 1044.
+    tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 0);
+    // Raise the limit past the start timestamp.
+    tmp = to.UpdateLimitTimestamp(2000LL);
+    EXPECT_EQ(tmp, 2000);
+    // Requests now succeed and return the first timestamp of each reserved range.
+    tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 1044);
+
+    tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 1054);
+
+    EXPECT_EQ(to.GetStartTimestamp(), 1064);
+    // The clock-derived timestamp is far larger than 1064, so start is moved up to it...
+    tmp = to.UpdateStartTimestamp();
+
+    EXPECT_GT(tmp, 1064);
+    // ...which puts start beyond the limit of 2000 again, so allocation fails.
+    auto new_ts = to.GetTimestamp(10LL);
+    EXPECT_EQ(new_ts, 0);
+}
+
+} // namespace timeoracle
+} // namespace tera
+
+int main(int argc, char** argv) {  // test runner: parses flags, sets up logging, runs all gtest cases
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    ::google::InitGoogleLogging(argv[0]);
+    FLAGS_tera_coord_type = "fake_zk";  // assigned after flag parsing, so these override the command line
+    FLAGS_tera_leveldb_env_type = "local";
+    // NOTE(review): "timeorcale_test" looks like a typo of "timeoracle_test"; unchanged because it is a runtime string.
+    tera::utils::SetupLog("timeorcale_test");
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
+
diff --git a/src/timeoracle/timeoracle.cc b/src/timeoracle/timeoracle.cc
new file mode 100644
index 000000000..9d755445b
--- /dev/null
+++ b/src/timeoracle/timeoracle.cc
@@ -0,0 +1,13 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "timeoracle/timeoracle.h"
+
+namespace tera {
+namespace timeoracle {
+
+std::atomic<int64_t>    Timeoracle::s_last_timestamp_ms;  // last value handed out by UniqueTimestampMs(); static storage, so it starts at 0
+
+} // namespace timeoracle
+} // namespace tera
diff --git a/src/timeoracle/timeoracle.h b/src/timeoracle/timeoracle.h
new file mode 100644
index 000000000..eb690de56
--- /dev/null
+++ b/src/timeoracle/timeoracle.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_TIMEORACLE_TIMEORACLE_H_
+#define TERA_TIMEORACLE_TIMEORACLE_H_
+
+#include <atomic>
+#include <iostream>
+#include <time.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+namespace tera {
+namespace timeoracle {
+
+constexpr int64_t kTimestampPerMilliSecond = 10000ULL;  // timestamp units per millisecond
+constexpr int64_t kTimestampPerSecond = kTimestampPerMilliSecond * 1000ULL;  // timestamp units per second
+constexpr int64_t kBaseTimestampMilliSecond = 1483200000000ULL; // Unix-epoch ms of 2017-01-01 00:00 UTC+8 (2016-12-31 16:00 UTC)
+
+inline int64_t clock_realtime_ms() {  // CLOCK_REALTIME in milliseconds, relative to kBaseTimestampMilliSecond
+    struct timespec tp;
+    ::clock_gettime(CLOCK_REALTIME, &tp);  // return value is not checked
+    return tp.tv_sec * 1000ULL + tp.tv_nsec / 1000000ULL - kBaseTimestampMilliSecond;  // computed in unsigned 64-bit, then converted to int64_t
+}
+
+// Timestamp allocator: hands out values starting at start_timestamp_ that stay strictly below limit_timestamp_.
+class Timeoracle {
+public:
+    Timeoracle(int64_t start_timestamp)
+        : start_timestamp_(start_timestamp), limit_timestamp_(0) {}
+
+    // Reserves |num| consecutive timestamps and returns the first one; num == 0 only peeks at the next
+    // timestamp. Returns 0 on failure: |num| is negative, or the range would reach limit_timestamp_.
+    // A request rejected by the limit check has still advanced start_timestamp_ by |num|.
+    int64_t GetTimestamp(int64_t num) {
+        if (num < 0) {
+            // A negative count would move start_timestamp_ backwards and hand out used values again.
+            return 0;
+        }
+        int64_t start_timestamp = start_timestamp_.fetch_add(num);
+        if ((start_timestamp + num) >= limit_timestamp_) {
+            return 0;
+        }
+        return start_timestamp;
+    }
+
+    // Raises the limit to |limit_timestamp| and returns it; logs and returns 0 if it is not above the current limit.
+    int64_t UpdateLimitTimestamp(int64_t limit_timestamp) {
+        if (limit_timestamp > limit_timestamp_) {
+            limit_timestamp_ = limit_timestamp;
+        } else {
+            LOG(ERROR) << "update limit timestamp failed, limit_timestamp_=" << limit_timestamp_
+                << ",update to " << limit_timestamp;
+            return 0;
+        }
+        return limit_timestamp;
+    }
+
+    // Re-bases start_timestamp_: up to CurrentTimestamp() if it lags behind it, or down to the limit if it
+    // has run past the limit; otherwise leaves it unchanged. Returns the resulting start timestamp.
+    int64_t UpdateStartTimestamp() {
+        const int64_t cur_timestamp = CurrentTimestamp();
+        int64_t start_timestamp = 0;
+        while (1) {
+            start_timestamp = start_timestamp_;
+            if (start_timestamp < cur_timestamp) {
+                if (start_timestamp_.compare_exchange_strong(start_timestamp, cur_timestamp)) {
+                    return cur_timestamp;
+                }
+                continue;  // start_timestamp_ changed since it was read; re-read and retry
+            }
+
+            int64_t limit_timestamp = limit_timestamp_;
+            if (start_timestamp > limit_timestamp) {
+                if (start_timestamp_.compare_exchange_strong(start_timestamp, limit_timestamp)) {
+                    LOG(WARNING) << "adjust start timestamp to limit timestamp " << limit_timestamp;
+                    return limit_timestamp;
+                }
+                continue;
+            }
+            break;
+        }
+
+        LOG(INFO) << "ignore to adjust start timestamp, current timestamp is " << cur_timestamp;
+        return start_timestamp;
+    }
+
+    int64_t GetStartTimestamp() const { return start_timestamp_; }
+    int64_t GetLimitTimestamp() const { return limit_timestamp_; }
+
+private:
+    std::atomic<int64_t>  start_timestamp_;
+    std::atomic<int64_t>  limit_timestamp_;
+
+public:
+    // Returns a millisecond value greater than any value returned before: clock_realtime_ms() when it is
+    // ahead of the last value handed out, otherwise the last value + 1.
+    static int64_t UniqueTimestampMs() {
+        while (true) {
+            int64_t ts = clock_realtime_ms();
+            int64_t last_timestamp_ms = s_last_timestamp_ms;
+            if (ts <= last_timestamp_ms) {
+                return s_last_timestamp_ms.fetch_add(1) + 1;
+            }
+            if (s_last_timestamp_ms.compare_exchange_strong(last_timestamp_ms, ts)) {
+                return ts;
+            }
+        }
+    }
+
+    // UniqueTimestampMs() scaled to timestamp units (kTimestampPerMilliSecond units per millisecond).
+    static int64_t CurrentTimestamp() {
+        return UniqueTimestampMs() * kTimestampPerMilliSecond;
+    }
+
+private:
+    static std::atomic<int64_t>    s_last_timestamp_ms;
+};
+
+} // namespace timeoracle
+} // namespace tera
+
+#endif // TERA_TIMEORACLE_TIMEORACLE_H_
diff --git a/src/timeoracle/timeoracle_entry.cc b/src/timeoracle/timeoracle_entry.cc
new file mode 100644
index 000000000..8bff587ad
--- /dev/null
+++ b/src/timeoracle/timeoracle_entry.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "timeoracle/timeoracle_entry.h"
+
+#include <iostream>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "common/net/ip_address.h"
+#include "common/this_thread.h"
+#include "utils/utils_cmd.h"
+
+#include "timeoracle/remote_timeoracle.h"
+#include "timeoracle/timeoracle_zk_adapter.h"
+
+DECLARE_string(tera_local_addr);
+DECLARE_string(tera_timeoracle_port);
+DECLARE_int32(tera_timeoracle_refresh_lease_second);
+DECLARE_int32(tera_timeoracle_max_lease_second);
+DECLARE_bool(tera_timeoracle_mock_enabled);
+DECLARE_int32(tera_timeoracle_work_thread_num);
+DECLARE_int32(tera_timeoracle_io_service_pool_size);
+DECLARE_string(tera_coord_type);
+
+namespace tera {
+namespace timeoracle {
+
+// Builds the sofa-pbrpc server from the timeoracle flags and sets local_addr_ to "<host>:<port>".
+TimeoracleEntry::TimeoracleEntry()
+    : remote_timeoracle_(nullptr),
+      startup_timestamp_(0),
+      need_quit_(false) {
+    sofa::pbrpc::RpcServerOptions options;
+    options.work_thread_num = FLAGS_tera_timeoracle_work_thread_num;
+    options.io_service_pool_size = FLAGS_tera_timeoracle_io_service_pool_size;
+    options.no_delay = false;                   // use Nagle's Algorithm
+    options.write_buffer_base_block_factor = 0; // 64Bytes per malloc
+    options.read_buffer_base_block_factor = 7;  // 8kBytes per malloc
+    sofa_pbrpc_server_.reset(new sofa::pbrpc::RpcServer(options));
+
+    // Use the configured address when there is one, otherwise this machine's host name.
+    const std::string host =
+        FLAGS_tera_local_addr.empty() ? utils::GetLocalHostName() : FLAGS_tera_local_addr;
+    local_addr_ = host + ":" + FLAGS_tera_timeoracle_port;
+}
+
+// Initializes the coordination adapter, settles the startup timestamp, then starts the RPC server.
+// Returns false if the adapter or the server fails to start.
+bool TimeoracleEntry::Start() {
+    if (!InitZKAdaptor()) {
+        return false;
+    }
+
+    // Start from the later of the timestamp loaded by InitZKAdaptor() and the current timestamp.
+    int64_t current_timestamp = Timeoracle::CurrentTimestamp();
+    if (startup_timestamp_ < current_timestamp) {
+        startup_timestamp_ = current_timestamp;
+    } else {
+        // The loaded timestamp is ahead of the local clock; keep it so the start never moves backwards.
+        LOG(WARNING) << "startup timestamp bigger than current timestamp, "
+                     << "startup timestamp is " << startup_timestamp_
+                     << ", current timestamp is " << current_timestamp;
+    }
+
+    LOG(INFO) << "set startup timestamp to " << startup_timestamp_;
+
+    return StartServer();
+}
+
+TimeoracleEntry::~TimeoracleEntry() {
+    need_quit_ = true;  // LeaseThread polls this flag and leaves its loop
+    if (lease_thread_.joinable()) {  // the thread only exists once StartServer() has run
+        lease_thread_.join();
+    }
+}
+
+bool TimeoracleEntry::InitZKAdaptor() {  // picks the coordination adapter from flags, then initializes it
+    if (FLAGS_tera_timeoracle_mock_enabled) {  // mock mode is checked first, before tera_coord_type
+        LOG(INFO) << "mock mode" ;
+        zk_adapter_.reset(new TimeoracleMockAdapter(local_addr_));
+    } else if (FLAGS_tera_coord_type == "zk") {
+        LOG(INFO) << "zk mode" ;
+        zk_adapter_.reset(new TimeoracleZkAdapter(local_addr_));
+    } else if (FLAGS_tera_coord_type == "ins") {
+        LOG(INFO) << "ins mode" ;
+        zk_adapter_.reset(new TimeoracleInsAdapter(local_addr_));
+    } else {
+        LOG(FATAL) << "invalid configure for coord service, please check "
+                   << "--tera_timeoracle_mock_enabled=true or "
+                   << "--tera_coord_type=zk|ins";
+        assert(0);
+    }
+    // Init() receives startup_timestamp_ as its output argument (the adapters' last_timestamp).
+    return zk_adapter_->Init(&startup_timestamp_);
+}
+
+// Starts the lease thread, waits for the first lease, then starts the RPC service; false on shutdown or start failure.
+bool TimeoracleEntry::StartServer() {
+    IpAddress timeoracle_addr("0.0.0.0", FLAGS_tera_timeoracle_port);
+    LOG(INFO) << "Start timeoracle RPC server at: " << timeoracle_addr.ToString();
+
+    remote_timeoracle_ = new RemoteTimeoracle(startup_timestamp_);
+    lease_thread_ = std::thread(&TimeoracleEntry::LeaseThread, this);
+    auto timeoracle = remote_timeoracle_->GetTimeoracle();
+    // Wait until LeaseThread has raised the limit above the startup timestamp. The former condition
+    // (startup < limit) was inverted: it skipped the wait at limit 0 and spun forever once a lease landed.
+    while (timeoracle->GetLimitTimestamp() <= startup_timestamp_) {
+        if (need_quit_) {
+            return false;
+        }
+        ThisThread::Sleep(100);
+    }
+
+    sofa_pbrpc_server_->RegisterService(remote_timeoracle_);
+    if (!sofa_pbrpc_server_->Start(timeoracle_addr.ToString())) {
+        LOG(ERROR) << "start timeoracle RPC server error";
+        return false;
+    }
+
+    LOG(INFO) << "finish start timeoracle RPC server";
+    return true;
+}
+
+bool TimeoracleEntry::Run() {  // one periodic step; returns false once shutdown has been requested
+    if (need_quit_) {
+        return false;
+    }
+    // Re-base the allocator's start timestamp against the current clock and the lease limit.
+    int64_t start_timestamp = remote_timeoracle_->GetTimeoracle()->UpdateStartTimestamp();
+
+    VLOG(100) << "adjust start timestamp finished, start timestmap is " << start_timestamp;
+
+    ThisThread::Sleep(1000);  // presumably milliseconds — confirm against ThisThread::Sleep
+    return true;
+}
+
+void TimeoracleEntry::ShutdownServer() {
+    need_quit_ = true;  // makes Run() return false and lets LeaseThread leave its loop
+    sofa_pbrpc_server_->Stop();
+}
+
+void TimeoracleEntry::LeaseThread() {
+    auto timeoracle = remote_timeoracle_->GetTimeoracle();
+    // Lease loop: whenever start comes within the refresh window of the limit, persist and apply a higher limit.
+    while (!need_quit_) {
+        int64_t start_timestamp = timeoracle->GetStartTimestamp();
+        int64_t limit_timestamp = timeoracle->GetLimitTimestamp();
+        int64_t refresh_lease_timestamp =
+            FLAGS_tera_timeoracle_refresh_lease_second * kTimestampPerSecond;
+
+        if (start_timestamp + refresh_lease_timestamp >= limit_timestamp) {
+            // need to acquire a new lease
+            if (limit_timestamp < start_timestamp) {
+                limit_timestamp = start_timestamp;
+            }
+            // New limit = max(start, limit) + the maximum lease length.
+            int64_t next_limit_timestamp =
+                limit_timestamp + FLAGS_tera_timeoracle_max_lease_second * kTimestampPerSecond;
+            // Persist first; the in-memory limit is raised only after the adapter accepted the value.
+            if (!zk_adapter_->UpdateTimestamp(next_limit_timestamp)) {
+                need_quit_ = true;
+                return;
+            }
+            // Timestamps below the persisted limit may now be handed out.
+            timeoracle->UpdateLimitTimestamp(next_limit_timestamp);
+        }
+
+        ThisThread::Sleep(1000);
+    }
+}
+
+} // namespace timeoracle
+} // namespace tera
diff --git a/src/timeoracle/timeoracle_entry.h b/src/timeoracle/timeoracle_entry.h
new file mode 100644
index 000000000..356ae452a
--- /dev/null
+++ b/src/timeoracle/timeoracle_entry.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_TIMEORACLE_TIMEORACLE_ENTRY_H_
+#define TERA_TIMEORACLE_TIMEORACLE_ENTRY_H_
+
+#include <sofa/pbrpc/pbrpc.h>
+
+#include "tera_entry.h"
+#include <thread>
+#include <atomic>
+#include <memory>
+
+namespace tera {
+namespace timeoracle {
+
+class RemoteTimeoracle;
+class TimeoracleZkAdapterBase;
+
+class TimeoracleEntry : public TeraEntry {  // TeraEntry implementation that hosts the timeoracle RPC service
+public:
+    TimeoracleEntry();
+    ~TimeoracleEntry();
+    // TeraEntry overrides: Start() brings up coordination and the RPC server, Run() performs one periodic
+    // step, ShutdownServer() requests shutdown and stops the RPC server.
+    virtual bool Start() override;
+    virtual bool Run() override;
+    virtual void ShutdownServer() override;
+
+private:
+    bool InitZKAdaptor();
+    bool StartServer();
+    void LeaseThread();
+
+private:
+    std::string                                 local_addr_;  // "<host>:<port>" of this server
+    RemoteTimeoracle*                           remote_timeoracle_;  // raw pointer; never deleted by this class
+    std::unique_ptr<sofa::pbrpc::RpcServer>     sofa_pbrpc_server_;
+    int64_t                                     startup_timestamp_;  // first timestamp this instance may hand out
+    std::unique_ptr<TimeoracleZkAdapterBase>    zk_adapter_;  // coordination adapter (zk, ins or mock)
+    std::thread                                 lease_thread_;  // runs LeaseThread()
+    std::atomic<bool>                           need_quit_;  // shutdown flag polled by the loops
+};
+
+} // namespace timeoracle
+} // namespace tera
+
+#endif // TERA_TIMEORACLE_TIMEORACLE_ENTRY_H_
diff --git a/src/timeoracle/timeoracle_zk_adapter.cc b/src/timeoracle/timeoracle_zk_adapter.cc
new file mode 100644
index 000000000..58dd4a554
--- /dev/null
+++ b/src/timeoracle/timeoracle_zk_adapter.cc
@@ -0,0 +1,477 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <unistd.h>
+#include <sys/file.h>
+#include "timeoracle/timeoracle_zk_adapter.h"
+#include "common/file/file_path.h"
+#include "common/this_thread.h"
+#include "types.h"
+#include "zk/zk_util.h"
+#include "ins_sdk.h"
+
+DECLARE_string(tera_zk_addr_list);
+DECLARE_string(tera_zk_root_path);
+DECLARE_string(tera_fake_zk_path_prefix);
+DECLARE_int32(tera_zk_timeout);
+DECLARE_int64(tera_zk_retry_period);
+DECLARE_int32(tera_zk_retry_max_times);
+
+DECLARE_string(tera_ins_addr_list);
+DECLARE_string(tera_ins_root_path);
+DECLARE_int64(tera_master_ins_session_timeout);
+DECLARE_string(tera_timeoracle_mock_root_path);
+
+namespace tera {
+namespace timeoracle {
+
+void TimeoracleZkAdapterBase::OnNodeValueChanged(const std::string& path, const std::string& value) {
+    // Notification only: the new value is not used.
+    LOG(INFO) << "zk OnNodeValueChanged, path=" << path;
+}
+
+// Child-list notifications are logged; the name and data lists are not used.
+void TimeoracleZkAdapterBase::OnChildrenChanged(const std::string& path,
+        const std::vector<std::string>& name_list, const std::vector<std::string>& data_list) {
+    LOG(INFO) << "zk OnChildrenChanged, path=" << path;
+}
+
+void TimeoracleZkAdapterBase::OnNodeCreated(const std::string& path) {
+    LOG(INFO) << "zk OnNodeCreated, path=" << path;  // log only, no further action
+}
+
+void TimeoracleZkAdapterBase::OnNodeDeleted(const std::string& path) {  // a watched node disappeared: treated as fatal
+    LOG(INFO) << "zk OnNodeDeleted, path=" << path;
+    Finalize();
+    _Exit(EXIT_FAILURE);  // terminates immediately; atexit handlers are not run
+}
+
+// A failed watch is treated as fatal: log, finalize the adapter, then terminate the process with _Exit.
+void TimeoracleZkAdapterBase::OnWatchFailed(const std::string& path, int watch_type, int err) {
+    LOG(INFO) << "zk OnWatchFailed, path=" << path;
+    Finalize();
+    _Exit(EXIT_FAILURE);
+}
+
+void TimeoracleZkAdapterBase::OnSessionTimeout() {  // session loss is fatal for this process
+    LOG(ERROR) << "zk session timeout!";
+    _Exit(EXIT_FAILURE);  // unlike the node/watch callbacks, Finalize() is not called first
+}
+
+TimeoracleZkAdapter::~TimeoracleZkAdapter() {  // intentionally empty: nothing is released here
+}
+
+// Connects to zk, takes the timeoracle lock, reads the stored timestamp into |last_timestamp| and
+// creates this server's node. Returns false as soon as one step fails.
+bool TimeoracleZkAdapter::Init(int64_t* last_timestamp) {
+    if (!InitZk()) {
+        return false;
+    }
+    if (!LockTimeoracleLock()) {
+        return false;
+    }
+    if (!ReadTimestamp(last_timestamp)) {
+        return false;
+    }
+
+    LOG(INFO) << "read timestamp sucess,get start_timestamp=" << *last_timestamp;
+    return CreateTimeoracleNode();
+}
+
+// Creates the ephemeral node kTimeoracleNodePath holding server_addr_, retrying up to FLAGS_tera_zk_retry_max_times.
+bool TimeoracleZkAdapter::CreateTimeoracleNode() {
+    LOG(INFO) << "try create timeoracle nod,path=" << kTimeoracleNodePath;
+    int zk_errno = zk::ZE_OK;
+    for (int32_t retries = 0; !CreateEphemeralNode(kTimeoracleNodePath, server_addr_, &zk_errno);
+         zk_errno = zk::ZE_OK) {
+        if (retries >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to create timeoracle node";
+            return false;
+        }
+        ++retries;
+        LOG(ERROR) << "retry create timeoracle node in " << FLAGS_tera_zk_retry_period << " ms, retry=" << retries;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    LOG(INFO) << "create timeoracle node success";
+    return true;
+}
+
+// Initializes the underlying ZooKeeperAdapter from the zk flags, retrying up to FLAGS_tera_zk_retry_max_times.
+bool TimeoracleZkAdapter::InitZk() {
+    LOG(INFO) << "try to init zk,zk_addr_list=" << FLAGS_tera_zk_addr_list
+        << ",zk_root_path=" << FLAGS_tera_zk_root_path;
+    int zk_errno = zk::ZE_OK;
+    int32_t retries = 0;
+    while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list, FLAGS_tera_zk_root_path,
+                                   FLAGS_tera_zk_timeout, server_addr_, &zk_errno)) {
+        if (retries >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to init zk: " << zk::ZkErrnoToString(zk_errno);
+            return false;
+        }
+        ++retries;
+        LOG(ERROR) << "init zk fail: " << zk::ZkErrnoToString(zk_errno)
+            << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: "
+            << retries;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;
+    }
+    LOG(INFO) << "init zk success";
+    return true;
+}
+
+// Acquires the lock at kTimeoracleLockPath via SyncLock, retrying up to FLAGS_tera_zk_retry_max_times on failure.
+bool TimeoracleZkAdapter::LockTimeoracleLock() {
+    LOG(INFO) << "try to lock timeoracle lock,path=" << kTimeoracleLockPath;
+    int zk_errno = zk::ZE_OK;
+    for (int32_t retries = 0; !SyncLock(kTimeoracleLockPath, &zk_errno, -1); zk_errno = zk::ZE_OK) {
+        if (retries >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to acquire timeoracle lock";
+            return false;
+        }
+        ++retries;
+        LOG(ERROR) << "retry lock timeoracle lock in "
+            << FLAGS_tera_zk_retry_period << " ms, retry=" << retries;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+    }
+    LOG(INFO) << "acquire timeoracle lock success";
+    return true;
+}
+
+bool TimeoracleZkAdapter::ReadTimestamp(int64_t* timestamp) {
+    LOG(INFO) << "try to read timestamp, path=" << kTimeoracleTimestampPath;
+    // Reads the stored timestamp into |*timestamp|; a missing node counts as 0. Returns false on read or parse failure.
+    std::string timestamp_str;
+    int32_t retry_count = 0;
+    int zk_errno = zk::ZE_OK;
+    while (!ReadNode(kTimeoracleTimestampPath, &timestamp_str, &zk_errno)
+        && zk_errno != zk::ZE_NOT_EXIST) {  // "node does not exist" is not retried; it is handled below
+        if (retry_count++ >= FLAGS_tera_zk_retry_max_times) {
+            LOG(ERROR) << "fail to read timestamp node";
+            return false;
+        }
+        LOG(ERROR) << "retry read timestamp node in "
+            << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count;
+        ThisThread::Sleep(FLAGS_tera_zk_retry_period);
+        zk_errno = zk::ZE_OK;
+    }
+    if (zk_errno == zk::ZE_NOT_EXIST) {  // no timestamp node yet: report 0
+        *timestamp = 0;
+        return true;
+    }
+    // Parse as decimal; NOTE(review): strtoull's unsigned result is stored in an int64_t, and "" parses as 0.
+    char * pEnd = nullptr;
+    *timestamp = ::strtoull(timestamp_str.c_str(), &pEnd, 10);
+    if (*pEnd != '\0') {  // trailing non-digit characters: reject the value
+        // TODO (chenzongjia)
+        LOG(WARNING) << "read invalid timestamp value=" << timestamp_str;
+        return false;
+    }
+
+    LOG(INFO) << "read timestamp value=" << timestamp_str;
+
+    return true;
+}
+
+// Stores |timestamp| as decimal text at kTimeoracleTimestampPath, creating the node when it is missing.
+// Makes a single attempt for each step (no retries).
+// Returns true if the node was written or created, false on any other failure.
+bool TimeoracleZkAdapter::UpdateTimestamp(int64_t timestamp) {
+    char timestamp_str[64];
+    // int64_t is signed and not necessarily `long`: format it as long long rather than with the
+    // mismatched "%lu" used before.
+    snprintf(timestamp_str, sizeof(timestamp_str), "%lld", static_cast<long long>(timestamp));
+    LOG(INFO) << "try to update timestamp to " << timestamp;
+
+    // Any failure other than ZE_NOT_EXIST is reported to the caller without retrying.
+    int zk_errno = zk::ZE_OK;
+    if (!WriteNode(kTimeoracleTimestampPath, timestamp_str, &zk_errno)
+        && zk_errno != zk::ZE_NOT_EXIST) {
+        return false;
+    }
+
+    if (zk_errno == zk::ZE_OK) {
+        LOG(INFO) << "update zk path=" << kTimeoracleTimestampPath << " to "
+            << timestamp_str << " success.";
+        return true;
+    }
+
+    // zk_errno is not ZE_OK here (normally ZE_NOT_EXIST from WriteNode): create the node with
+    // this value instead.
+    LOG(INFO) << "timestamp node not exist, try create timestamp node";
+    zk_errno = zk::ZE_OK;
+    if (!CreatePersistentNode(kTimeoracleTimestampPath, timestamp_str, &zk_errno)) {
+        return false;
+    }
+
+    LOG(INFO) << "create timestamp node success";
+    return true;
+}
+
+// Releases the timeoracle lock if the ins client exists. NOTE(review): ins_sdk_ itself is not deleted here.
+TimeoracleInsAdapter::~TimeoracleInsAdapter() {
+    if (!ins_sdk_) { return; }
+    std::string lock_path = FLAGS_tera_ins_root_path + kTimeoracleLockPath;
+    galaxy::ins::sdk::SDKError err;
+    ins_sdk_->UnLock(lock_path, &err);
+}
+
+// Connects and locks via ins, reads the stored timestamp into |last_timestamp|, then publishes this server's address.
+bool TimeoracleInsAdapter::Init(int64_t* last_timestamp) {
+    if (!InitInsAndLock()) {
+        return false;
+    }
+    if (!ReadTimestamp(last_timestamp)) {
+        return false;
+    }
+
+    LOG(INFO) << "read timestamp sucess,get start_timestamp=" << *last_timestamp;
+    return CreateTimeoracleNode();
+}
+
+// Publishes server_addr_ under the timeoracle node key in ins; true on success.
+bool TimeoracleInsAdapter::CreateTimeoracleNode() {
+    const std::string node_key = FLAGS_tera_ins_root_path + kTimeoracleNodePath;
+    LOG(INFO) << "try write timeoracle nod,path=" << node_key;
+
+    galaxy::ins::sdk::SDKError put_err;
+    const bool put_ok = ins_sdk_->Put(node_key, server_addr_, &put_err);
+    if (put_ok) {
+        LOG(INFO) << "update timeoracle node success";
+        return true;
+    }
+
+    LOG(ERROR) << "update timestamp node, path=" << node_key << ",failed "
+        << ins_sdk_->ErrorToString(put_err);
+    return false;
+}
+
+// Session-timeout callback for ins: |context| is the adapter registered with it.
+static void InsOnSessionTimeout(void * context) {
+    static_cast<TimeoracleInsAdapter*>(context)->OnSessionTimeout();
+}
+
+// Watch callback for the lock key: forwards the new value/deleted flag to the adapter.
+static void InsOnLockChange(const galaxy::ins::sdk::WatchParam& param,
+                            galaxy::ins::sdk::SDKError error) {
+    static_cast<TimeoracleInsAdapter*>(param.context)->OnLockChange(param.value, param.deleted);
+}
+
+// Creates the ins SDK client, takes the timeoracle lock and starts watching it.
+// Holds mutex_ for the whole call. Returns false if Lock or Watch fails.
+bool TimeoracleInsAdapter::InitInsAndLock() {
+    MutexLock guard(&mutex_);
+    LOG(INFO) << "try to init ins,ins_addr_list=" << FLAGS_tera_ins_addr_list
+        << ",ins_root_path=" << FLAGS_tera_ins_root_path;
+
+    ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list);
+    ins_sdk_->SetTimeoutTime(FLAGS_tera_master_ins_session_timeout);
+    ins_sdk_->RegisterSessionTimeout(InsOnSessionTimeout, this);
+
+    const std::string lock_key = FLAGS_tera_ins_root_path + kTimeoracleLockPath;
+    galaxy::ins::sdk::SDKError sdk_err;
+
+    const bool locked = ins_sdk_->Lock(lock_key, &sdk_err);
+    if (!locked) {
+        LOG(ERROR) << "try to lock timeoracle lock,path=" << kTimeoracleLockPath << " failed,"
+                << ins_sdk_->ErrorToString(sdk_err);
+        return false;
+    }
+    LOG(INFO) << "try to lock timeoracle lock,path=" << kTimeoracleLockPath << " success";
+
+    const bool watching = ins_sdk_->Watch(lock_key, InsOnLockChange, this, &sdk_err);
+    if (!watching) {
+        LOG(ERROR) << "try to watch timeoracle lock,path=" << kTimeoracleLockPath << " failed,"
+                << ins_sdk_->ErrorToString(sdk_err);
+        return false;
+    }
+    LOG(INFO) << "try to watch timeoracle lock,path=" << kTimeoracleLockPath << " success";
+    return true;
+}
+
+// Loads the persisted timestamp from ins into |timestamp|.
+// A missing key (kNoSuchKey) counts as success with *timestamp = 0. Returns false
+// when Get fails for another reason or the value has trailing unparsed characters.
+bool TimeoracleInsAdapter::ReadTimestamp(int64_t* timestamp) {
+    const std::string key = FLAGS_tera_ins_root_path + kTimeoracleTimestampPath;
+    LOG(INFO) << "try to read timestamp, path=" << key;
+
+    std::string stored;
+    galaxy::ins::sdk::SDKError get_err;
+    if (!ins_sdk_->Get(key, &stored, &get_err)) {
+        if (get_err != galaxy::ins::sdk::SDKError::kNoSuchKey) {
+            LOG(ERROR) << "try to read timestamp, path=" << key << ",failed "
+                << ins_sdk_->ErrorToString(get_err);
+            return false;
+        }
+        *timestamp = 0;
+        return true;
+    }
+
+    char* parse_end = nullptr;
+    *timestamp = ::strtoull(stored.c_str(), &parse_end, 10);
+    if (*parse_end == '\0') {
+        LOG(INFO) << "read timestamp value=" << stored;
+        return true;
+    }
+
+    // TODO (chenzongjia)
+    LOG(WARNING) << "read invalid timestamp value=" << stored;
+    return false;
+}
+
+// Stores |timestamp| as decimal text under the ins timestamp key; true on success.
+bool TimeoracleInsAdapter::UpdateTimestamp(int64_t timestamp) {
+    char buf[64];
+    // int64_t is signed, so print it as long long; "%lu" did not match the type.
+    snprintf(buf, sizeof(buf), "%lld", static_cast<long long>(timestamp));
+    LOG(INFO) << "try to update timestamp to " << timestamp;
+
+    std::string timestamp_str(buf);
+    galaxy::ins::sdk::SDKError err;
+    std::string put_path = FLAGS_tera_ins_root_path + kTimeoracleTimestampPath;
+    if (!ins_sdk_->Put(put_path, timestamp_str, &err)) {
+        LOG(ERROR) << "update timestamp, path=" << put_path << ",failed "
+            << ins_sdk_->ErrorToString(err);
+        return false;
+    }
+    return true;
+}
+
+// Exits the process when the lock key was deleted or its value is not our session id.
+void TimeoracleInsAdapter::OnLockChange(std::string session_id, bool deleted) {
+    if (!deleted && session_id == ins_sdk_->GetSessionID()) { return; }
+    LOG(ERROR) << "timeoracle lock losted";
+    exit(1);
+}
+
+// Owns a file descriptor and closes it on destruction; negative means "none".
+class FdGuard {
+public:
+    FdGuard() : fd_(-1) {}
+    explicit FdGuard(int fd) : fd_(fd) {}
+    ~FdGuard() { CloseIfOpen(); }
+
+    // Implicit conversion so the guard can be passed where an int fd is expected.
+    operator int() const { return fd_; }
+
+    // Closes the currently held descriptor (if any) and takes ownership of |fd|.
+    void reset(int fd) {
+        CloseIfOpen();
+        fd_ = fd;
+    }
+
+    // Gives up ownership without closing; returns the descriptor that was held.
+    int relese() {
+        const int held = fd_;
+        fd_ = -1;
+        return held;
+    }
+
+private:
+    FdGuard(const FdGuard&) = delete;
+    void operator=(const FdGuard&) = delete;
+
+    void CloseIfOpen() {
+        if (fd_ >= 0) {
+            ::close(fd_);
+        }
+    }
+    int fd_;
+};
+
+// not thread safe
+// Takes an exclusive flock on the mock lock file, loads the last timestamp from
+// the mock timestamp file into |last_timestamp| (0 when that file is empty) and
+// writes server_addr_ to the mock node file. All three files live under
+// FLAGS_tera_timeoracle_mock_root_path. Returns false on any I/O or parse error.
+bool TimeoracleMockAdapter::Init(int64_t* last_timestamp) {
+    std::string lock_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleLockPath;
+    // Function-local static: the lock file is opened on the first call only and
+    // stays open (keeping the flock taken below) until the process exits.
+    static FdGuard lock_fd(::open(lock_path.c_str(), O_CREAT | O_RDWR, 0666));
+    if (lock_fd < 0) {
+        return false;
+    }
+
+    LOG(INFO) << "TimeoracleMockAdapter try to get lock for file=" << lock_path;
+    if (::flock(lock_fd, LOCK_EX) < 0) {
+        LOG(WARNING) << "lock file failed for path=" << lock_path;
+        return false;
+    }
+    LOG(INFO) << "TimeoracleMockAdapter got the lock for file=" << lock_path;
+
+    std::string get_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleTimestampPath;
+    FdGuard tmp_fd(::open(get_path.c_str(), O_CREAT | O_RDWR, 0666));
+    if (tmp_fd < 0) {
+        LOG(WARNING) << "open file failed for file=" << get_path;
+        return false;
+    }
+
+    char buf[64];
+    // Read at most sizeof(buf) - 1 bytes so buf[len] below is always in bounds.
+    ssize_t len = pread(tmp_fd, buf, sizeof(buf) - 1, 0);
+    if (len < 0) {
+        LOG(WARNING) << "read file failed for file=" << get_path;
+        return false;
+    }
+
+    if (len == 0) {
+        // Empty timestamp file: start from 0 and still publish the address
+        // below, as TimeoracleInsAdapter::Init does for a missing key.
+        *last_timestamp = 0;
+    } else {
+        buf[len] = '\0';
+        char * pEnd = nullptr;
+        *last_timestamp = ::strtoull(buf, &pEnd, 10);
+        if (*pEnd != '\0') {
+            // TODO (chenzongjia)
+            LOG(WARNING) << "read invalid timestamp value=" << buf;
+            return false;
+        }
+        LOG(INFO) << "read timestamp value=" << *last_timestamp;
+    }
+
+    std::string put_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleNodePath;
+    // O_TRUNC: a shorter address must not keep the tail of a previous, longer one.
+    tmp_fd.reset(::open(put_path.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0666));
+    if (tmp_fd < 0) {
+        LOG(WARNING) << "open file failed for file=" << put_path;
+        return false;
+    }
+
+    if (::pwrite(tmp_fd, server_addr_.data(), server_addr_.size(), 0)
+            != (ssize_t)server_addr_.size()) {
+        LOG(WARNING) << "write file failed for file=" << put_path;
+        return false;
+    }
+
+    return true;
+}
+
+// not thread safe
+// Overwrites the start of the mock timestamp file with |new_timestamp| in decimal.
+bool TimeoracleMockAdapter::UpdateTimestamp(int64_t new_timestamp) {
+    std::string put_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleTimestampPath;
+    FdGuard tmp_fd(::open(put_path.c_str(), O_CREAT | O_RDWR, 0666));
+    if (tmp_fd < 0) {
+        LOG(WARNING) << "open file failed for file=" << put_path;
+        return false;
+    }
+
+    char buf[64];
+    // int64_t is signed, so print it as long long; "%lu" did not match the type.
+    snprintf(buf, sizeof(buf), "%lld", static_cast<long long>(new_timestamp));
+    std::string timestamp_str(buf);
+    LOG(INFO) << "try to update timestamp to " << put_path;
+    // No truncate: fewer digits than the previous value would leave stale bytes.
+    if (::pwrite(tmp_fd, timestamp_str.data(), timestamp_str.size(), 0)
+            != (ssize_t)timestamp_str.size()) {
+        LOG(WARNING) << "write file failed for file=" << put_path;
+        return false;
+    }
+    return true;
+}
+
+} // namespace timeoracle
+} // namespace tera
diff --git a/src/timeoracle/timeoracle_zk_adapter.h b/src/timeoracle/timeoracle_zk_adapter.h
new file mode 100644
index 000000000..b0f6a970c
--- /dev/null
+++ b/src/timeoracle/timeoracle_zk_adapter.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef  TERA_TIMEORACLE_TIMEORACLE_ZK_ADAPTER_H
+#define  TERA_TIMEORACLE_TIMEORACLE_ZK_ADAPTER_H
+
+#include <string>
+#include <vector>
+#include "zk/zk_adapter.h"
+
+// forward declare
+namespace galaxy{
+namespace ins{
+namespace sdk {
+    class InsSDK;
+}
+}
+}
+
+namespace tera {
+namespace timeoracle {
+
+// Interface shared by the timeoracle coordination backends declared below.
+class TimeoracleZkAdapterBase : public zk::ZooKeeperAdapter {
+public:
+    virtual ~TimeoracleZkAdapterBase() {}
+
+    // Prepares the backend and stores the last persisted timestamp in
+    // |last_timestamp|. Not thread safe.
+    virtual bool Init(int64_t* last_timestamp) = 0;
+
+    // Persists |new_timestamp|. Not thread safe.
+    virtual bool UpdateTimestamp(int64_t new_timestamp) = 0;
+
+    // ZooKeeperAdapter event hooks (defined out of line).
+    void OnChildrenChanged(const std::string& path,
+                           const std::vector<std::string>& name_list,
+                           const std::vector<std::string>& data_list) override;
+    void OnNodeValueChanged(const std::string& path,
+                            const std::string& value) override;
+    void OnNodeCreated(const std::string& path) override;
+    void OnNodeDeleted(const std::string& path) override;
+    void OnWatchFailed(const std::string& path, int watch_type,
+                       int err) override;
+
+    // final: subclasses cannot override session-timeout handling.
+    virtual void OnSessionTimeout() final;
+};
+
+// Timeoracle adapter that stores the timestamp in ZooKeeper nodes.
+class TimeoracleZkAdapter : public TimeoracleZkAdapterBase {
+public:
+    TimeoracleZkAdapter(const std::string& server_addr) : server_addr_(server_addr) {}
+
+    virtual ~TimeoracleZkAdapter();
+
+    // See TimeoracleZkAdapterBase; neither call is thread safe.
+    bool Init(int64_t* last_timestamp) override;
+    bool UpdateTimestamp(int64_t new_timestamp) override;
+
+private:
+    // Private helpers; defined in the .cc file.
+    bool InitZk();
+    bool LockTimeoracleLock();
+    bool ReadTimestamp(int64_t* timestamp);
+    bool CreateTimeoracleNode();
+
+private:
+    // Server address given to the constructor.
+    std::string server_addr_;
+};
+
+// Timeoracle adapter that uses galaxy ins (InsSDK) for the lock and timestamp.
+class TimeoracleInsAdapter : public TimeoracleZkAdapterBase {
+public:
+    TimeoracleInsAdapter(const std::string & server_addr) : server_addr_(server_addr) {}
+
+    virtual ~TimeoracleInsAdapter();
+
+    // See TimeoracleZkAdapterBase; neither call is thread safe.
+    bool Init(int64_t* last_timestamp) override;
+    bool UpdateTimestamp(int64_t new_timestamp) override;
+
+    // Called from the ins watch callback with the lock key's value and
+    // deleted flag.
+    void OnLockChange(std::string session_id, bool deleted);
+
+private:
+    bool InitInsAndLock();
+    bool ReadTimestamp(int64_t* timestamp);
+    bool CreateTimeoracleNode();
+
+    mutable Mutex mutex_;                       // taken in InitInsAndLock()
+    std::string server_addr_;                   // value written by CreateTimeoracleNode()
+    galaxy::ins::sdk::InsSDK* ins_sdk_{NULL};   // created in InitInsAndLock()
+};
+
+
+/*
+ * This is not ZooKeeper!
+ * It is only used on onebox for trying out tera briefly.
+ * It is implemented on top of the local file system.
+ * Watching is not supported.
+ */
+class TimeoracleMockAdapter : public TimeoracleZkAdapterBase {
+public:
+    TimeoracleMockAdapter(const std::string& server_addr) : server_addr_(server_addr) {}
+
+    // Locks the mock lock file and loads the stored timestamp. Not thread safe.
+    bool Init(int64_t* last_timestamp) override;
+
+    // Writes |new_timestamp| to the mock timestamp file. Not thread safe.
+    bool UpdateTimestamp(int64_t new_timestamp) override;
+
+private:
+    // Address passed to the constructor; Init() writes it to the mock node file.
+    std::string server_addr_;
+};
+
+} // namespace timeoracle
+} // namespace tera
+
+#endif // TERA_TIMEORACLE_TIMEORACLE_ZK_ADAPTER_H
diff --git a/src/timeoracle_main.cc b/src/timeoracle_main.cc
new file mode 100644
index 000000000..3c7f713be
--- /dev/null
+++ b/src/timeoracle_main.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <iostream>
+#include <signal.h>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "common/base/scoped_ptr.h"
+#include "tera_entry.h"
+#include "utils/utils_cmd.h"
+#include "version.h"
+#include "timeoracle/timeoracle_entry.h"
+
+DECLARE_string(tera_log_prefix);
+
+// Set by the signal handler; polled by the main loop.
+volatile sig_atomic_t g_quit = 0;
+
+// Handler for SIGINT/SIGTERM: only records that shutdown was requested.
+static void SignalIntHandler(int /*sig*/) { g_quit = 1; }
+
+// Timeoracle server entry point: parses flags, sets up logging, then drives
+// TimeoracleEntry through Start(), repeated Run() and Shutdown(). The loop ends
+// when Run() fails or SIGINT/SIGTERM sets g_quit. Passing "version" as the first
+// remaining argument prints the version and exits with 0.
+int main(int argc, char* argv[]) {
+    ::google::SetUsageMessage("./timeoracle --flagfile=xxx.flag");
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    ::google::InitGoogleLogging(argv[0]);
+
+    // Use the default log name "timeoracle" when no prefix flag is set.
+    if (FLAGS_tera_log_prefix.empty()) {
+        tera::utils::SetupLog("timeoracle");
+    } else {
+        tera::utils::SetupLog(FLAGS_tera_log_prefix);
+    }
+
+    const bool want_version = argc > 1 && std::string(argv[1]) == "version";
+    if (want_version) {
+        PrintSystemVersion();
+        return 0;
+    }
+
+    signal(SIGINT, SignalIntHandler);
+    signal(SIGTERM, SignalIntHandler);
+
+    scoped_ptr<tera::timeoracle::TimeoracleEntry> entry(new tera::timeoracle::TimeoracleEntry());
+    if (!entry->Start()) {
+        return -1;
+    }
+
+    while (!g_quit) {
+        const bool run_ok = entry->Run();
+        if (!run_ok) {
+            LOG(ERROR) << "Server run error ,and then exit now ";
+            break;
+        }
+    }
+    if (g_quit) {
+        LOG(INFO) << "received interrupt signal from user, will stop";
+    }
+
+    return entry->Shutdown() ? 0 : -1;
+}
+
+/* vim: set ts=4 sw=4 sts=4 tw=100 */
diff --git a/src/types.h b/src/types.h
index bfad100da..1f50f0f8f 100644
--- a/src/types.h
+++ b/src/types.h
@@ -27,6 +27,10 @@ const std::string kTsListPath = "/ts";
 const std::string kKickPath = "/kick";
 const std::string kRootTabletNodePath = "/root_table";
 const std::string kSafeModeNodePath = "/safemode";
+const std::string kTimeoracleNodePath = "/timeoracle";
+const std::string kTimeoracleLockPath = "/timeoracle-lock";
+const std::string kTimeoracleTimestampPath = "/timeoracle-timestamp";
+const std::string kClientsNodePath = "/clients";
 const std::string kSms = "[SMS] ";
 const std::string kMail = "[MAIL] ";
 const int64_t kLatestTs = INT64_MAX;
@@ -36,6 +40,16 @@ const uint64_t kRowkeySize = (64 << 10);       // 64KB
 const uint64_t kQualifierSize = (64 << 10);    // 64KB
 const uint64_t kValueSize = (32 << 20);        // 32MB
 
+// observer
+const std::string kRowlockNodeIdListPath = "/id_lock";
+const std::string kRowlockNodeHostListPath = "/host_lock";
+const std::string kRowlockNodeNumPath = "/node_num";
+const std::string kRowlockProxyPath = "/proxy";
+const uint64_t kObserverWaitTime = 1000000;
+
+// global transaction
+const char* const kNotifyColumnFamily = "_N_";
+
 } // namespace tera
 
 #endif // TERA_TYPES_H_
diff --git a/src/utils/atomic.h b/src/utils/atomic.h
deleted file mode 100644
index 69434be09..000000000
--- a/src/utils/atomic.h
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef  TERA_UTILS_ATOMIC_H_
-#define  TERA_UTILS_ATOMIC_H_
-
-namespace tera {
-
-static inline int atomic_add(volatile int *mem, int add)
-{
-    asm volatile(
-            "lock xadd %0, (%1);"
-            : "=a"(add)
-            : "r"(mem), "a"(add)
-            : "memory"
-    );
-    return add;
-}
-
-static inline int64_t atomic_add64(volatile int64_t* mem, int64_t add)
-{
-    asm volatile(
-            "lock xaddq %0, (%1)"
-            : "=a" (add)
-            : "r" (mem), "a" (add)
-            : "memory"
-    );
-    return add;
-}
-
-static inline void atomic_inc(volatile int *mem)
-{
-    asm volatile(
-            "lock incl %0;"
-            : "=m"(*mem)
-            : "m"(*mem)
-    );
-}
-static inline void atomic_inc64(volatile int64_t *mem)
-{
-    asm volatile(
-            "lock incq %0;"
-            : "=m"(*mem)
-            : "m"(*mem)
-    );
-}
-
-static inline void atomic_dec(volatile int *mem)
-{
-    asm volatile(
-            "lock decl %0;"
-            : "=m"(*mem)
-            : "m"(*mem)
-    );
-}
-
-static inline void atomic_dec64(volatile int64_t *mem)
-{
-    asm volatile(
-            "lock decq %0;"
-            : "=m"(*mem)
-            : "m"(*mem)
-    );
-}
-
-static inline int atomic_swap(volatile void *lockword, int value)
-{
-    asm volatile(
-            "lock xchg %0, (%1);"
-            : "=a"(value)
-            : "r"(lockword), "a"(value)
-            : "memory"
-    );
-    return value;
-}
-
-static inline int64_t atomic_swap64(volatile void *lockword, int64_t value)
-{
-    asm volatile(
-            "lock xchg %0, (%1);"
-            : "=a"(value)
-            : "r"(lockword), "a"(value)
-            : "memory"
-    );
-    return value;
-}
-
-static inline int atomic_comp_swap(volatile void *mem, int xchg, int cmp)
-{
-    asm volatile(
-            "lock cmpxchg %1, (%2)"
-            :"=a"(cmp)
-            :"d"(xchg), "r"(mem), "a"(cmp)
-    );
-    return cmp;
-}
-
-static inline int64_t atomic_comp_swap64(volatile void *mem, int64_t xchg, int64_t cmp)
-{
-    asm volatile(
-            "lock cmpxchg %1, (%2)"
-            :"=a"(cmp)
-            :"d"(xchg), "r"(mem), "a"(cmp)
-    );
-    return cmp;
-}
-
-}
-#endif  // TERA_UTILS_ATOMIC_H_
diff --git a/src/utils/counter.h b/src/utils/counter.h
deleted file mode 100644
index 3f4da00a9..000000000
--- a/src/utils/counter.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef  TERA_UTILS_COUNTER_H_
-#define  TERA_UTILS_COUNTER_H_
-
-#include <stdio.h>
-
-#include "atomic.h"
-#include "timer.h"
-
-namespace tera {
-
-class Counter {
-public:
-    Counter() : val_(0) {}
-    int64_t Add(int64_t v) {
-        return atomic_add64(&val_, v) + v;
-    }
-    int64_t Sub(int64_t v) {
-        return atomic_add64(&val_, -v) - v;
-    }
-    int64_t Inc() {
-        return atomic_add64(&val_, 1) + 1;
-    }
-    int64_t Dec() {
-        return atomic_add64(&val_, -1) - 1;
-    }
-    int64_t Get() {
-        return val_;
-    }
-    int64_t Set(int64_t v) {
-        return atomic_swap64(&val_, v);
-    }
-    int64_t Clear() {
-        return atomic_swap64(&val_, 0);
-    }
-
-private:
-    volatile int64_t val_;
-};
-
-class AutoCounter {
-public:
-    AutoCounter(Counter* counter, const char* msg1, const char* msg2 = NULL)
-        : counter_(counter),
-          msg1_(msg1),
-          msg2_(msg2) {
-        start_ = get_micros();
-        counter_->Inc();
-    }
-    ~AutoCounter() {
-        int64_t end = get_micros();
-        if (end - start_ > 5000000) {
-            int64_t t = (end - start_) / 1000000;
-            if (!msg2_) {
-                fprintf(stderr, "%s [AutoCounter] %s hang for %ld s\n",
-                    get_curtime_str().data(), msg1_, t);
-            } else {
-                fprintf(stderr, "%s [AutoCounter] %s %s hang for %ld s\n",
-                    get_curtime_str().data(), msg1_, msg2_, t);
-            }
-        }
-        counter_->Dec();
-    }
-
-private:
-    Counter* counter_;
-    int64_t start_;
-    const char* msg1_;
-    const char* msg2_;
-};
-}
-
-#endif  // TERA_UTILS_COUNTER_H_
diff --git a/src/utils/timer.h b/src/utils/timer.h
deleted file mode 100644
index 62428c754..000000000
--- a/src/utils/timer.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#ifndef  TERA_UTILS_TIMER_H_
-#define  TERA_UTILS_TIMER_H_
-
-#include <sys/time.h>
-#include <string>
-
-namespace tera {
-
-static inline std::string get_curtime_str() {
-    struct tm tt;
-    char buf[20];
-    time_t t = time(NULL);
-    strftime(buf, 20, "%Y%m%d-%H:%M:%S", localtime_r(&t, &tt));
-    return std::string(buf, 17);
-}
-
-static inline std::string get_curtime_str_plain() {
-    struct tm tt;
-    char buf[20];
-    time_t t = time(NULL);
-    strftime(buf, 20, "%Y%m%d%H%M%S", localtime_r(&t, &tt));
-    return std::string(buf);
-}
-
-static inline int64_t get_micros() {
-    struct timespec ts;
-    clock_gettime(CLOCK_REALTIME, &ts);
-    return static_cast<int64_t>(ts.tv_sec) * 1000000 + static_cast<int64_t>(ts.tv_nsec) / 1000;
-}
-
-static inline int64_t get_millis() {
-    return get_micros() / 1000;
-}
-
-static inline int64_t get_unique_micros(int64_t ref) {
-    int64_t now;
-    do {
-        now = get_micros();
-    } while (now == ref);
-    return now;
-}
-
-static inline int64_t GetTimeStampInUs() {
-    return get_micros();
-}
-
-static inline int64_t GetTimeStampInMs() {
-    return get_millis();
-}
-
-}  // namespace tera
-
-#endif  // TERA_UTILS_TIMER_H_
diff --git a/src/zk/zk_adapter.cc b/src/zk/zk_adapter.cc
old mode 100644
new mode 100755
index 1b83d6f87..9fd1aa1ac
--- a/src/zk/zk_adapter.cc
+++ b/src/zk/zk_adapter.cc
@@ -51,7 +51,8 @@ bool ZooKeeperAdapter::Init(const std::string& server_list,
                             const std::string& root_path,
                             uint32_t session_timeout,
                             const std::string& id,
-                            int* zk_errno) {
+                            int* zk_errno,
+                            int wait_timeout) {
     MutexLock mutex(&state_mutex_);
 
     if (NULL != handle_) {
@@ -79,7 +80,12 @@ bool ZooKeeperAdapter::Init(const std::string& server_list,
     }
 
     while (state_ == ZS_DISCONN || state_ == ZS_CONNECTING) {
-        state_cond_.Wait();
+        if (wait_timeout > 0) {
+            state_cond_.TimeWait(wait_timeout);
+            break;
+        } else {
+            state_cond_.Wait();
+        }
     }
 
     int code = ZE_OK;
@@ -427,7 +433,7 @@ bool ZooKeeperAdapter::ListAndWatchChildren(const std::string& path,
     }
 }
 
-bool ZooKeeperAdapter::CheckExist(const std::string&path, bool* is_exist,
+bool ZooKeeperAdapter::CheckExist(const std::string& path, bool* is_exist,
                                   int* zk_errno) {
     MutexLock mutex(&state_mutex_);
     if (!ZooKeeperUtil::IsValidPath(path)) {
diff --git a/src/zk/zk_adapter.h b/src/zk/zk_adapter.h
index 56cf8e2b3..010efed75 100644
--- a/src/zk/zk_adapter.h
+++ b/src/zk/zk_adapter.h
@@ -9,7 +9,7 @@
 
 #include <map>
 #include <string>
-#include <zookeeper/zookeeper.h>
+#include <zookeeper.h>
 
 #include "common/mutex.h"
 #include "common/thread_pool.h"
@@ -17,6 +17,7 @@
 #include "zk/zk_lock.h"
 #include "zk/zk_util.h"
 
+
 namespace tera {
 namespace zk {
 
@@ -39,7 +40,8 @@ class ZooKeeperAdapter {
     virtual ~ZooKeeperAdapter();
 
     bool Init(const std::string& server_list, const std::string& root_path,
-              uint32_t session_timeout, const std::string& id, int* zk_errno);
+              uint32_t session_timeout, const std::string& id, int* zk_errno,
+              int wait_timeout = -1); // default wait until zk server ready
     void Finalize();
     bool GetSessionId(int64_t* session_id, int* zk_errno);
 
diff --git a/src/zk/zk_util.cc b/src/zk/zk_util.cc
index 446ef6108..579a59f0d 100644
--- a/src/zk/zk_util.cc
+++ b/src/zk/zk_util.cc
@@ -9,7 +9,7 @@
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
-#include <zookeeper/zookeeper.h>
+#include <zookeeper.h>
 
 #include "common/file/file_path.h"
 #include "common/file/file_stream.h"