diff --git a/Makefile b/Makefile index c6eb1d15d..6c7f4a51b 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ ifndef CC CC = gcc endif -INCPATH += -I./src -I./include -I./src/leveldb/include -I./src/leveldb \ +INCPATH += -I./src -I./include -I./src/leveldb/include -I./src/leveldb -I./src/sdk \ -I./src/sdk/java/native-src $(DEPS_INCPATH) CFLAGS += $(OPT) $(INCPATH) -fPIC -fvisibility=hidden # hide internal symbol of tera CXXFLAGS += -std=gnu++11 $(CFLAGS) @@ -28,28 +28,42 @@ PROTO_OUT_H := $(PROTO_FILES:.proto=.pb.h) MASTER_SRC := $(wildcard src/master/*.cc) TABLETNODE_SRC := $(wildcard src/tabletnode/*.cc) IO_SRC := $(wildcard src/io/*.cc) -SDK_SRC := $(wildcard src/sdk/*.cc) +SDK_SRC := $(wildcard src/sdk/*.cc) $(wildcard src/sdk/test/global_txn_testutils.cc) \ + src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc HTTP_SRC := $(wildcard src/sdk/http/*.cc) PROTO_SRC := $(filter-out %.pb.cc, $(wildcard src/proto/*.cc)) $(PROTO_OUT_CC) JNI_TERA_SRC := $(wildcard src/sdk/java/native-src/*.cc) VERSION_SRC := src/version.cc OTHER_SRC := $(wildcard src/zk/*.cc) $(wildcard src/utils/*.cc) $(VERSION_SRC) \ - src/tera_flags.cc + src/tera_flags.cc src/sdk/test/global_txn_testutils.cc COMMON_SRC := $(wildcard src/common/base/*.cc) $(wildcard src/common/net/*.cc) \ $(wildcard src/common/file/*.cc) $(wildcard src/common/file/recordio/*.cc) \ - $(wildcard src/common/console/*.cc) + $(wildcard src/common/console/*.cc) $(wildcard src/common/log/*.cc) \ + $(wildcard src/common/metric/*.cc) SERVER_WRAPPER_SRC := src/tera_main_wrapper.cc SERVER_SRC := src/tera_main.cc src/tera_entry.cc CLIENT_SRC := src/teracli_main.cc +TERAUTIL_SRC := src/terautil.cc +GTXN_TEST_SRC := src/sdk/test/global_txn_test_tool.cc TEST_CLIENT_SRC := src/tera_test_main.cc TERA_C_SRC := src/tera_c.cc MONITOR_SRC := src/monitor/teramo_main.cc MARK_SRC := src/benchmark/mark.cc src/benchmark/mark_main.cc +COMMON_TEST_SRC := $(wildcard 
src/common/test/*.cc) TEST_SRC := src/utils/test/prop_tree_test.cc src/utils/test/tprinter_test.cc \ src/io/test/tablet_io_test.cc src/io/test/tablet_scanner_test.cc \ src/io/test/load_test.cc src/master/test/master_test.cc \ src/master/test/master_impl_test.cc src/master/test/trackable_gc_test.cc \ - src/common/test/thread_pool_test.cc + src/observer/test/rowlock_test.cc src/observer/test/scanner_test.cc \ + src/observer/test/observer_test.cc \ + $(wildcard src/sdk/test/*_test.cc) $(COMMON_TEST_SRC) + +TIMEORACLE_SRC := $(wildcard src/timeoracle/*.cc) src/tera_entry.cc +TIMEORACLE_BENCH_SRC := src/timeoracle/bench/timeoracle_bench.cc +ROWLOCK_SRC := $(wildcard src/observer/rowlocknode/*.cc) src/sdk/rowlock_client.cc +ROWLOCK_PROXY_SRC := $(wildcard src/observer/rowlockproxy/*.cc) +OBSERVER_SRC := src/observer/executor/scanner_impl.cc src/observer/executor/random_key_selector.cc +OBSERVER_DEMO_SRC := $(wildcard src/observer/observer_demo.cc) TEST_OUTPUT := test_output UNITTEST_OUTPUT := $(TEST_OUTPUT)/unittest @@ -65,39 +79,53 @@ COMMON_OBJ := $(COMMON_SRC:.cc=.o) SERVER_WRAPPER_OBJ := $(SERVER_WRAPPER_SRC:.cc=.o) SERVER_OBJ := $(SERVER_SRC:.cc=.o) CLIENT_OBJ := $(CLIENT_SRC:.cc=.o) +TERAUTIL_OBJ := $(TERAUTIL_SRC:.cc=.o) +GTXN_TEST_OBJ := $(GTXN_TEST_SRC:.cc=.o) TEST_CLIENT_OBJ := $(TEST_CLIENT_SRC:.cc=.o) TERA_C_OBJ := $(TERA_C_SRC:.cc=.o) MONITOR_OBJ := $(MONITOR_SRC:.cc=.o) MARK_OBJ := $(MARK_SRC:.cc=.o) HTTP_OBJ := $(HTTP_SRC:.cc=.o) +COMMON_TEST_OBJ := $(COMMON_TEST_SRC:.cc=.o) TEST_OBJ := $(TEST_SRC:.cc=.o) +TIMEORACLE_OBJ := $(TIMEORACLE_SRC:.cc=.o) +TIMEORACLE_BENCH_OBJ := $(TIMEORACLE_BENCH_SRC:.cc=.o) +ROWLOCK_OBJ := $(ROWLOCK_SRC:.cc=.o) +ROWLOCK_PROXY_OBJ := $(ROWLOCK_PROXY_SRC:.cc=.o) +OBSERVER_OBJ := $(OBSERVER_SRC:.cc=.o) +OBSERVER_DEMO_OBJ := $(OBSERVER_DEMO_SRC:.cc=.o) ALL_OBJ := $(MASTER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) $(PROTO_OBJ) \ - $(JNI_TERA_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(SERVER_OBJ) $(CLIENT_OBJ) \ + $(JNI_TERA_OBJ) 
$(OTHER_OBJ) $(COMMON_OBJ) $(SERVER_OBJ) $(CLIENT_OBJ) $(TERAUTIL_OBJ) \ $(TEST_CLIENT_OBJ) $(TERA_C_OBJ) $(MONITOR_OBJ) $(MARK_OBJ) \ - $(SERVER_WRAPPER_OBJ) + $(SERVER_WRAPPER_OBJ) $(TIMEORACLE_OBJ) $(ROWLOCK_OBJ) $(ROWLOCK_PROXY_OBJ) $(OBSERVER_OBJ) $(OBSERVER_DEMO_OBJ) LEVELDB_LIB := src/leveldb/libleveldb.a LEVELDB_UTIL := src/leveldb/util/histogram.o src/leveldb/port/port_posix.o -PROGRAM = tera_main tera_master tabletserver teracli teramo tera_test +PROGRAM = tera_main tera_master tabletserver teracli terautil teramo tera_test timeoracle timeoracle_bench rowlock observer_demo rowlock_proxy +TEST_PROGRAM=gtxn_test_tool + LIBRARY = libtera.a SOLIBRARY = libtera.so TERA_C_SO = libtera_c.so JNILIBRARY = libjni_tera.so +OBSERVER_LIBRARY = libobserver.a BENCHMARK = tera_bench tera_mark TESTS = prop_tree_test tprinter_test string_util_test tablet_io_test \ - tablet_scanner_test fragment_test progress_bar_test master_test load_test \ - thread_pool_test + tablet_scanner_test fragment_test progress_bar_test master_test load_test observer_test \ + common_test sdk_test .PHONY: all clean cleanall test -all: $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK) +all: $(PROGRAM) $(TEST_PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK) $(OBSERVER_LIBRARY) mkdir -p build/include build/lib build/bin build/log build/benchmark cp $(PROGRAM) build/bin - cp $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) build/lib + cp $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(OBSERVER_LIBRARY) build/lib cp src/leveldb/tera_bench . 
cp -r benchmark/*.sh benchmark/ycsb4tera/ $(BENCHMARK) build/benchmark cp -r include build/ cp -r conf build + mkdir -p test/tools + cp $(TEST_PROGRAM) test/tools echo 'Done' test: $(TESTS) @@ -115,11 +143,12 @@ check: test clean: rm -rf $(ALL_OBJ) $(TEST_OBJ) $(PROTO_OUT_CC) $(PROTO_OUT_H) $(TEST_OUTPUT) $(MAKE) clean -C src/leveldb - rm -rf $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK) $(TESTS) terahttp + rm -rf $(PROGRAM) $(TEST_PROGRAM) $(LIBRARY) $(OBSERVER_LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(BENCHMARK) $(TESTS) terahttp cleanall: $(MAKE) clean rm -rf build + rm -rf test/tools tera_main: src/tera_main_wrapper.o src/version.o src/tera_flags.o $(CXX) -o $@ $^ $(LDFLAGS) @@ -135,6 +164,13 @@ tabletserver: $(SERVER_OBJ) $(TABLETNODE_OBJ) $(IO_OBJ) $(SDK_OBJ) \ libtera.a: $(SDK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_UTIL) $(AR) -rs $@ $^ +observer_demo : $(OBSERVER_DEMO_OBJ) $(OBSERVER_LIBRARY) $(LIBRARY) + $(CXX) -o $@ $^ $(LDFLAGS) + +libobserver.a: $(OBSERVER_OBJ) $(SDK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_UTIL) \ + $(IO_OBJ) $(SDK_OBJ) + $(AR) -rs $@ $^ + libtera.so: $(SDK_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_UTIL) $(CXX) -o $@ $^ $(SO_LDFLAGS) @@ -144,6 +180,12 @@ libtera_c.so: $(TERA_C_OBJ) $(LIBRARY) teracli: $(CLIENT_OBJ) $(LIBRARY) $(CXX) -o $@ $^ $(LDFLAGS) +terautil: $(TERAUTIL_OBJ) $(LIBRARY) + $(CXX) -o $@ $^ $(LDFLAGS) + +gtxn_test_tool: $(GTXN_TEST_OBJ) $(LIBRARY) + $(CXX) -o $@ $^ $(LDFLAGS) + teramo: $(MONITOR_OBJ) $(LIBRARY) $(CXX) -o $@ $^ $(LDFLAGS) @@ -153,6 +195,18 @@ tera_mark: $(MARK_OBJ) $(LIBRARY) $(LEVELDB_LIB) tera_test: $(TEST_CLIENT_OBJ) $(LIBRARY) $(CXX) -o $@ $(TEST_CLIENT_OBJ) $(LIBRARY) $(LDFLAGS) +timeoracle: $(TIMEORACLE_OBJ) $(PROTO_OBJ) $(COMMON_OBJ) $(OTHER_OBJ) + $(CXX) -o $@ $^ $(LDFLAGS) + +timeoracle_bench : $(TIMEORACLE_BENCH_OBJ) $(LIBRARY) + $(CXX) -o $@ $^ $(LDFLAGS) + +rowlock : $(SERVER_OBJ) $(ROWLOCK_OBJ) $(PROTO_OBJ) 
$(OTHER_OBJ) $(COMMON_OBJ) + $(CXX) -o $@ $^ $(LDFLAGS) + +rowlock_proxy : $(SERVER_OBJ) $(ROWLOCK_PROXY_OBJ) $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(OBSERVER_LIBRARY) + $(CXX) -o $@ $^ $(LDFLAGS) + terahttp: $(HTTP_OBJ) $(PROTO_OBJ) $(LIBRARY) $(CXX) -o $@ $^ $(LDFLAGS) @@ -165,7 +219,7 @@ src/leveldb/libleveldb.a: FORCE tera_bench: # unit test -thread_pool_test: src/common/test/thread_pool_test.o $(LIBRARY) +common_test: $(COMMON_TEST_OBJ) $(LIBRARY) $(CXX) -o $@ $^ $(LDFLAGS) prop_tree_test: src/utils/test/prop_tree_test.o $(LIBRARY) @@ -200,6 +254,15 @@ master_test: src/master/test/master_test.o src/master/test/master_impl_test.o \ $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) $(CXX) -o $@ $^ $(LDFLAGS) +sdk_test: src/sdk/test/global_txn_internal_test.o src/sdk/test/global_txn_test.o \ + src/sdk/test/filter_utils_test.o src/sdk/test/scan_impl_test.o \ + src/sdk/test/sdk_timeout_manager_test.o src/sdk/test/sdk_test.o $(SDK_OBJ) \ + $(PROTO_OBJ) $(OTHER_OBJ) $(COMMON_OBJ) $(LEVELDB_LIB) + $(CXX) -o $@ $^ $(LDFLAGS) + +observer_test: src/observer/test/rowlock_test.o src/observer/test/scanner_test.o src/observer/test/observer_test.o src/observer/observer_demo/demo_observer.o $(PROTO_OBJ) $(COMMON_OBJ) $(OTHER_OBJ) $(OBSERVER_OBJ) $(LIBRARY) + $(CXX) -o $@ $^ $(LDFLAGS) + $(ALL_OBJ): %.o: %.cc $(PROTO_OUT_H) $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -222,8 +285,8 @@ proto: $(PROTO_OUT_CC) $(PROTO_OUT_H) # install output into system directories .PHONY: install -install: $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) +install: $(PROGRAM) $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) mkdir -p $(INSTALL_PREFIX)/bin $(INSTALL_PREFIX)/include $(INSTALL_PREFIX)/lib cp -rf $(PROGRAM) $(INSTALL_PREFIX)/bin cp -rf include/* $(INSTALL_PREFIX)/include - cp -rf $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(INSTALL_PREFIX)/lib + cp -rf $(LIBRARY) $(SOLIBRARY) $(TERA_C_SO) $(JNILIBRARY) $(INSTALL_PREFIX)/lib diff --git a/benchmark/run_test.sh 
b/benchmark/run_test.sh index 8f33ce5e6..b1e9e7c34 100755 --- a/benchmark/run_test.sh +++ b/benchmark/run_test.sh @@ -1,8 +1,8 @@ #!/bin/bash -if [[ $# != 7 || $6 -lt 0 || $6 -gt 100 ]]; then - echo "$0 DIST[zipfian, uniform, latest] ROW_NUM OP_NUM VALUE_SIZE COLUMN_NUM UPDATE_PROPORTION[0~100] TABLE_NAME" - exit 0 +if [[ $# != 9 || $6 -lt 0 || $6 -gt 100 ]]; then + echo "$0 DIST[zipfian, uniform, latest] ROW_NUM OP_NUM VALUE_SIZE COLUMN_NUM UPDATE_PROPORTION[0~100] OP_SPEED THREAD_NUM TABLE_NAME" + exit 1 fi DIST=$1 @@ -11,11 +11,12 @@ OP_NUM=$3 VALUE_SIZE=$4 COLUMN_NUM=$5 UPDATE_PROPORTION=$6 -TABLE_NAME=$7 +OP_SPEED=$7 +THREAD_NUM=$8 +TABLE_NAME=$9 -UPDATE_PROPORTION=`printf "%02d" $6` -READ_PROPORTION=`expr 100 - $UPDATE_PROPORTION` -READ_PROPORTION=`printf "%02d" $READ_PROPORTION` +UPDATE_PROPORTION=`echo $6 | awk '{printf("%.2f",$1/100)}'` +READ_PROPORTION=`echo $6 | awk '{printf("%.2f",(100-$1)/100)}'` echo "$UPDATE_PROPORTION" echo "$READ_PROPORTION" @@ -30,8 +31,12 @@ bin/ycsb run tera -p workload=com.yahoo.ycsb.workloads.CoreWorkload \ -p operationcount=$OP_NUM \ -p fieldlength=$VALUE_SIZE \ -p fieldcount=$COLUMN_NUM \ - -p updateproportion=0.$UPDATE_PROPORTION \ - -p readproportion=0.$READ_PROPORTION \ + -p updateproportion=$UPDATE_PROPORTION \ + -p readproportion=$READ_PROPORTION \ + -p target=$OP_SPEED \ + -p thread=$THREAD_NUM \ -p exportfile=ycsb.out \ | ./tera_mark --mode=m --tablename=$TABLE_NAME --type=async --verify=false +exit $? 
+ diff --git a/benchmark/ycsb4tera.md b/benchmark/ycsb4tera.md index f32ae9e28..d348434ce 100644 --- a/benchmark/ycsb4tera.md +++ b/benchmark/ycsb4tera.md @@ -30,6 +30,13 @@ 更新(写入)占所有操作的比例 updateproportion: what proportion of operations should be updates (default: 0.05) + + 每秒总共操作的次数 + target: target ops/sec all threads (default: unthrottled) + + 客户端线程数 + thread: number of client threads (default: 1) + ``` 以下参数对于tera的测试意义不大,使用默认值即可: diff --git a/build.conf.template b/build.conf.template index 1fd914ec6..170383dde 100755 --- a/build.conf.template +++ b/build.conf.template @@ -18,6 +18,7 @@ LIBUNWIND_VERSION=0.99 GPERFTOOLS_VERSION=2.5 INS_VERSION=0.17 NOSE_VERSION=1.3.7 +MONGOOSE_VERSION=6.8 if [ $MIRROR == "china" ]; then BOOST_URL=http://mirrors.tuna.tsinghua.edu.cn/macports/distfiles/boost/boost_${BOOST_VERSION}.tar.bz2 @@ -32,6 +33,7 @@ if [ $MIRROR == "china" ]; then GPERFTOOLS_URL=https://github.com/00k/gperftools/raw/master/gperftools-${GPERFTOOLS_VERSION}.tar.gz INS_URL=https://github.com/baidu/ins/archive/${INS_VERSION}.tar.gz NOSE_URL=http://mirrors.163.com/gentoo/distfiles/nose-${NOSE_VERSION}.tar.gz + MONGOOSE_URL=https://github.com/cesanta/mongoose/archive/${MONGOOSE_VERSION}.tar.gz elif [ $MIRROR == "origin" ]; then BOOST_URL=http://downloads.sourceforge.net/project/boost/boost/1.58.0/boost_${BOOST_VERSION}.tar.bz2 PROTOBUF_URL=https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-${PROTOBUF_VERSION}.tar.bz2 @@ -45,19 +47,7 @@ elif [ $MIRROR == "origin" ]; then GPERFTOOLS_URL=https://github.com/gperftools/gperftools/releases/download/gperftools-${GPERFTOOLS_VERSION}/gperftools-${GPERFTOOLS_VERSION}.tar.gz INS_URL=https://github.com/baidu/ins/archive/${INS_VERSION}.tar.gz NOSE_URL=https://pypi.python.org/packages/58/a5/0dc93c3ec33f4e281849523a5a913fa1eea9a3068acfa754d44d88107a44/nose-${NOSE_VERSION}.tar.gz -elif [ $MIRROR == "baidu" ]; then - 
BOOST_URL=http://gitlab.baidu.com/baidups/third/raw/master/boost_${BOOST_VERSION}.tar.bz2 - PROTOBUF_URL=http://gitlab.baidu.com/baidups/third/raw/master/protobuf-${PROTOBUF_VERSION}.tar.bz2 - SNAPPY_URL=http://gitlab.baidu.com/baidups/third/raw/master/snappy-${SNAPPY_VERSION}.tar.gz - SOFA_PBRPC_URL=http://gitlab.baidu.com/baidups/third/raw/master/sofa-pbrpc-${SOFA_PBRPC_VERSION}.tar.gz - ZOOKEEPER_URL=http://gitlab.baidu.com/baidups/third/raw/master/zookeeper-${ZOOKEEPER_VERSION}.tar.gz - GFLAGS_URL=http://gitlab.baidu.com/baidups/third/raw/master/gflags-${GFLAGS_VERSION}.tar.gz - GLOG_URL=http://gitlab.baidu.com/baidups/third/raw/master/glog-${GLOG_VERSION}.tar.gz - GTEST_URL=http://gitlab.baidu.com/baidups/third/raw/master/googletest-release-${GTEST_VERSION}.tar.gz - LIBUNWIND_URL=http://gitlab.baidu.com/baidups/third/raw/master/libunwind-${LIBUNWIND_VERSION}.tar.gz - GPERFTOOLS_URL=http://gitlab.baidu.com/baidups/third/raw/master/gperftools-${GPERFTOOLS_VERSION}.tar.gz - INS_URL=http://gitlab.baidu.com/baidups/third/raw/master/ins-${INS_VERSION}.tar.gz - NOSE_URL=http://gitlab.baidu.com/baidups/third/raw/master/nose-${NOSE_VERSION}.tar.gz + MONGOOSE_URL=https://github.com/cesanta/mongoose/archive/${MONGOOSE_VERSION}.tar.gz else return 1 fi diff --git a/build.sh b/build.sh index 1e1156aa9..f565149ef 100755 --- a/build.sh +++ b/build.sh @@ -218,7 +218,7 @@ elif [ ! -f "${FLAG_DIR}/ins_${INS_VERSION}" ] \ sed -i "s|^PROTOBUF_PATH ?=.*|PROTOBUF_PATH ?=${DEPS_PREFIX}|" Makefile sed -i "s|^PBRPC_PATH ?=.*|PBRPC_PATH ?=${DEPS_PREFIX}|" Makefile sed -i "s|^GTEST_PATH ?=.*|GTEST_PATH ?=${DEPS_PREFIX}|" Makefile - #BOOST_PATH=${DEPS_PREFIX}/boost_${BOOST_VERSION} make install_sdk + # BOOST_PATH=${DEPS_PREFIX}/boost_${BOOST_VERSION} make install_sdk make -j4 install_sdk cd - touch "${FLAG_DIR}/ins_${INS_VERSION}" @@ -239,6 +239,23 @@ elif [ ! 
-f "${FLAG_DIR}/nose_${NOSE_VERSION}" ] \ touch "${FLAG_DIR}/nose_${NOSE_VERSION}" fi +# mongoose +if [ ${MONGOOSE_VERSION} == "DISABLE" ]; then + echo "Disable mongoose." +elif [ ! -f "${FLAG_DIR}/mongoose_${MONGOOSE_VERSION}" ] \ + || [ ! -f "${DEPS_PREFIX}/include/mongoose.h" ] \ + || [ ! -f "${DEPS_PREFIX}/lib/libmongoose.a" ]; then + wget --no-check-certificate -O mongoose-${MONGOOSE_VERSION}.tar.gz ${MONGOOSE_URL} + tar zxf mongoose-${MONGOOSE_VERSION}.tar.gz --recursive-unlink + cd mongoose-${MONGOOSE_VERSION} + cp -af mongoose.h ${DEPS_PREFIX}/include + gcc -c mongoose.c -o mongoose.o -g2 -pipe -Wall -Werror -fPIC + ar -rv libmongoose.a mongoose.o + cp -af libmongoose.a ${DEPS_PREFIX}/lib + cd - + touch "${FLAG_DIR}/mongoose_${MONGOOSE_VERSION}" +fi + cd ${WORK_DIR} ######################################## diff --git a/build_version.sh b/build_version.sh index 8cac725a6..2534fcb85 100755 --- a/build_version.sh +++ b/build_version.sh @@ -56,7 +56,7 @@ GIT_INFO_FILE=git_info.tmp VERSION_CPP_FILE=src/version.cc # generate template file -git log | head -n 6 | sed 's/$/&\\n\\/g' > $GIT_INFO_FILE +git log | head -n 6 | sed 's/"/\\"/g' | sed 's/$/&\\n\\/g' > $GIT_INFO_FILE gen_info_template_header > $TEMPLATE_HEADER_FILE gen_info_template_foot > $TEMPLATE_FOOT_FILE gen_info_print_template >> $TEMPLATE_FOOT_FILE diff --git a/depends.mk.template b/depends.mk.template index 191cd8162..f0dbea180 100644 --- a/depends.mk.template +++ b/depends.mk.template @@ -14,17 +14,19 @@ GLOG_PREFIX=./thirdparty GTEST_PREFIX=./thirdparty GPERFTOOLS_PREFIX=./thirdparty INS_PREFIX=./thirdparty +MONGOOSE_PREFIX=./thirdparty BOOST_INCDIR=./thirdparty/boost_1_57_0 SOFA_PBRPC_INCDIR = $(SOFA_PBRPC_PREFIX)/include PROTOBUF_INCDIR = $(PROTOBUF_PREFIX)/include SNAPPY_INCDIR = $(SNAPPY_PREFIX)/include -ZOOKEEPER_INCDIR = $(ZOOKEEPER_PREFIX)/include +ZOOKEEPER_INCDIR = $(ZOOKEEPER_PREFIX)/include/zookeeper GFLAGS_INCDIR = $(GFLAGS_PREFIX)/include GLOG_INCDIR = $(GLOG_PREFIX)/include 
GTEST_INCDIR = $(GTEST_PREFIX)/include GPERFTOOLS_INCDIR = $(GPERFTOOLS_PREFIX)/include INS_INCDIR = $(INS_PREFIX)/include +MONGOOSE_INCDIR = $(MONGOOSE_PREFIX)/include SOFA_PBRPC_LIBDIR = $(SOFA_PBRPC_PREFIX)/lib PROTOBUF_LIBDIR = $(PROTOBUF_PREFIX)/lib @@ -35,6 +37,7 @@ GLOG_LIBDIR = $(GLOG_PREFIX)/lib GTEST_LIBDIR = $(GTEST_PREFIX)/lib GPERFTOOLS_LIBDIR = $(GPERFTOOLS_PREFIX)/lib INS_LIBDIR = $(INS_PREFIX)/lib +MONGOOSE_LIBDIR = $(MONGOOSE_PREFIX)/lib PROTOC = $(PROTOBUF_PREFIX)/bin/protoc @@ -45,13 +48,13 @@ PROTOC = $(PROTOBUF_PREFIX)/bin/protoc DEPS_INCPATH = -I$(SOFA_PBRPC_INCDIR) -I$(PROTOBUF_INCDIR) \ -I$(SNAPPY_INCDIR) -I$(ZOOKEEPER_INCDIR) \ -I$(GFLAGS_INCDIR) -I$(GLOG_INCDIR) -I$(GTEST_INCDIR) \ - -I$(GPERFTOOLS_INCDIR) -I$(BOOST_INCDIR) -I$(INS_INCDIR) + -I$(GPERFTOOLS_INCDIR) -I$(BOOST_INCDIR) -I$(INS_INCDIR) -I$(MONGOOSE_INCDIR) DEPS_LDPATH = -L$(SOFA_PBRPC_LIBDIR) -L$(PROTOBUF_LIBDIR) \ -L$(SNAPPY_LIBDIR) -L$(ZOOKEEPER_LIBDIR) \ -L$(GFLAGS_LIBDIR) -L$(GLOG_LIBDIR) -L$(GTEST_LIBDIR) \ - -L$(GPERFTOOLS_LIBDIR) -L$(INS_LIBDIR) + -L$(GPERFTOOLS_LIBDIR) -L$(INS_LIBDIR) -L$(MONGOOSE_LIBDIR) SO_DEPS_LDFLAGS = -lins_sdk -lsofa-pbrpc -lprotobuf -lsnappy -lzookeeper_mt \ - -lgtest_main -lgtest -lglog -lgflags + -lgtest_main -lgtest -lglog -lgflags -lmongoose DEPS_LDFLAGS = $(SO_DEPS_LDFLAGS) -ltcmalloc_minimal -lunwind ################################################################ diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 000000000..eb4ed0d0a --- /dev/null +++ b/doc/README.md @@ -0,0 +1,160 @@ + +# Tera SDK及工具说明 + +## 目录 +### 1. 
[主要数据结构](#main-data-structure) + +* tera::[client](./sdk_reference/client.md) +* tera::[table](./sdk_reference/table.md) +* tera::[mutation](./sdk_reference/mutation.md) +* tera::[reader](./sdk_reference/reader.md) +* tera::[table_descriptor](./sdk_reference/table_descriptor.md) +* tera::[transaction](./sdk_reference/transaction.md) +* tera::[scan](./sdk_reference/scan.md) +* tera::[utils](./sdk_reference/utils.md) + +### 2. [主要工具](#main-tools) +* [teracli](./tools/teracli.md) +* [terautil](./tools/terautil.md) +* [tera_bench & tera_mark](./tools/benchmark.md) +* [YCSB](./tools/ycsb.md) + + + +### 1. 主要数据结构 +#### (1) tera::client 访问tera服务主结构,所有对tera的访问或操作全部由此发起。 +一个集群对应一个client即可,如需访问多个client,需要创建多个 +##### 主要功能包括: +* 表格操作:建、删、加载、卸载、打开、关闭、更新表结构、获取表格信息、快照等 +* 用户管理:建、删、修改密码、组管理等 +* 集群信息获取:获取全部表格列表、状态等 + +#### (2) tera::table 表格主结构,对表格的所有增删查改操作由此发起。 +由tera::Client::OpenTable产生,tera::Client::CloseTable关闭,不可析构。 + +#### (3) tera::error_code 错误码,很多操作会返回,注意检查。 + +#### (4) tera::mutation + +#### (5) tera::scan 扫描操作,并获取返回数据。 + +#### (6) tera::reader 读取操作,并获取返回数据。 + +#### (7) tera::table_descriptor 表格描述符主体 + +#### (8) tera::transaction 单行事务 + +#### (9) tera::scan 扫描 + +#### (10) tera::utils 编码解码 + + +### 2. 主要工具 +#### (1) teracli 操作tera的工具 +* 实际上封装了对数据的操作等,可用来进行表格创建、schema更新等管理、控制操作。 +* 查看有哪些命令可用 :./teracli help; +* 查看某个命令的help:./teracli help [cmd],例如./teracli help tablet + +#### (2) terautil 集群间数据迁移的dump工具 + +* 具体用法./terautil dump help +* 建表主要用法:./terautil --flagfile=../conf/terautil.flag dump prepare_safe +* 扫表run起来主要用法:./terautil --flagfile=../conf/terautil.flag dump run +* flag配置 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
flag名称 | flag默认值或格式 | flag介绍
--- | --- | ---
dump_tera_src_conf | ../conf/src_tera.flag(格式) | tera的源集群
dump_tera_dest_conf | ../conf/dest_tera.flag(格式) | tera的目的集群
dump_tera_src_root_path | /xxx_(路径格式) | tera的源路径
dump_tera_dest_root_path | /xxx_(路径格式) | tera的目的路径
ins_cluster_addr | terautil_ins(格式) | 锁服务器的地址
ins_cluster_root_path | /terautil/dump/xxxx(格式) | 锁服务器路径
dump_tera_src_meta_addr | “” | 源meta表的地址
dump_tera_dest_meta_addr | “” | 目的meta表的地址
dump_manual_split_interval | 1000 | 手动分裂时间间隔,单位为ms
dump_enable_manual_split | false | 是否允许手动分裂
+ + +#### (3) tera_mark 读写数据 +* 支持异步读写scan +``` +#示例: +./tera_mark --mode=w --tablename=test --type=async --verify=false --entry_limit=1000 +``` +* 参数列表 + +参数名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明 +--- | --- | --- | --- | --- | --- +table | 表名 | - | - | "" | +mode | 模式 | "w"/"r"/"s"/"m" | - | "w" | - +type | 类型 | "sync"/"async" | - | "async" | - +pend_size | 最大pending大小 | - | - | 100 | - +pend_count | 最大pending数 | - | - | 100000 | - +start_key | scan的开始key | - | - | "" | - +end_key | scan的结束key | - | - | "" | - +cf_list | scan的列簇 | - | - | "" | - +print | scan的结果是否需要打印 | true/false | - | false | - +buf_size | scan的buffer_size | >0 | - | 65536 | - +verify | md5 verify(writer&read) | true/false | - | true | - +max_outflow | max_outflow | - | - | -1 | - +max_rate | max_rate | - | - | -1 | - +scan_streaming | enable streaming scan | true/false | - | false | - +batch_count | batch_count(sync) | - | - | 1 | - +entry_limit | writing/reading speed limit | - | - | 0 | - + +#### (4) tera_bench 造数据的工具 +``` +./tera_bench --compression_ratio=1 --key_seed=1 --value_seed=20 --value_size=1000 --num=200000 +--benchmarks=random --key_size=24 --key_step=1 +``` + +#### (5) YCSB 业界通用NoSQL测试的基准测试工具 + +* 全称Yahoo! 
Cloud Serving Benchmark,Yahoo公司开发的专门用于NoSQL测试的基准测试工具 +* YCSB支持各种不同的数据分布方式,如Uniform(等概论随机选择记录)、Zipfian(随机选择记录,存在热记录)、Latest(近期写入的记录为热记录) + diff --git a/doc/cn/README.md b/doc/cn/README.md index d18e4cf74..12eeb4d98 100644 --- a/doc/cn/README.md +++ b/doc/cn/README.md @@ -1,5 +1,5 @@ -# Tera文档专区 +# Tera文档专区 ## 简介 [系统设计](../tera_design.md) @@ -10,9 +10,15 @@ [体验单机Tera](onebox.md) -[命令行工具teracli使用方法](teracli.md) +[命令行工具teracli使用方法](../tools/teracli.md) + +[集群间数据迁移的dump工具terautil使用方法](../tools/terautil.md) -[主要api使用方法](sdk_guide.md) +[造数据的工具 & 读写数据使用方法](../tools/benchmark.md) + +[性能测试工具ycsb的使用方法](../tools/ycsb.md) + +[主要api使用方法](../sdk_reference/readme.md) [搭建tera集群](cluster_setup.md) @@ -35,3 +41,4 @@ ## 版本发布 [版本发布及管理](../release_management.md) + diff --git a/doc/global_txn.md b/doc/global_txn.md new file mode 100644 index 000000000..bb62d4c79 --- /dev/null +++ b/doc/global_txn.md @@ -0,0 +1,3 @@ +# Tera全局事务的原理及实现 + +[image-1]: ../resources/images/global_txn.png diff --git a/doc/sdk_reference/client.md b/doc/sdk_reference/client.md new file mode 100644 index 000000000..a7f6fe878 --- /dev/null +++ b/doc/sdk_reference/client.md @@ -0,0 +1,169 @@ + +# Client接口说明 + +## 主要功能 + +#### 1. 
表格管理 +##### (1) 新建client Client::NewClient +``` +1.1) static Client* NewClient(const std::string& confpath, const std::string& log_prefix, ErrorCode* err = NULL) +1.2) static Client* NewClient(const std::string& confpath, ErrorCode* err = NULL) +1.3) static Client* NewClient() +``` + +##### (2) 打开表格 Client::OpenTable +``` +Table* OpenTable(const std::string& table_name, ErrorCode* err) = 0 +``` +##### (3) 建表 Client::CreateTable +``` +1) bool CreateTable(const TableDescriptor& desc, ErrorCode* err) = 0 //新建带有具体描述符的表格 +2) bool CreateTable(const TableDescriptor& desc, const std::vector& tablet_delim, ErrorCode* err) = 0 //新建多个前缀为tablet_delim的tablets +``` + +##### (4) 更新schema Client::UpdateTableSchema + +``` +bool ClientImpl::UpdateTableSchema(const TableDescriptor& desc, ErrorCode* err) = 0 +``` +调用UpdateTable(desc, err),分两种情况: +* 更新lg属性。需要先disable表格 +* 更新cf属性。直接更新 +##### (5) 检查更新状态 Client::UpdateCheck + +``` +bool UpdateCheck(const std::string& table_name, bool* done, ErrorCode* err) = 0 +``` + +##### (6) disable表 Client::DisableTable +暂停表,表格不再提供读、写服务。某些属性的更新需要先disable表;使用drop删除表时,需要先执行disable操作,此操作不可回滚。 + +``` +bool DisableTable(const std::string& name, ErrorCode* err) = 0 +``` + +##### (7) drop表 Client::DropTable +删除处于disable状态的表格,此操作不可回滚。 + +``` +bool DropTable(const std::string& name, ErrorCode* err) = 0 +``` + +##### (8) enable表 Client::EnableTable + +将处于disable状态的表格重新enable,恢复读、写服务。 + +``` +bool EnableTable(const std::string& name, ErrorCode* err) = 0 +``` + +##### (9) 获取表的描述符 Client::GetTableDescriptor +``` +TableDescriptor* GetTableDescriptor(const std::string& table_name, ErrorCode* err) = 0 +``` + +##### (10) 列出所有的表 Client::List +``` +bool List(std::vector* table_list, ErrorCode* err) = 0;//列出所有的表 +bool List(const std::string& table_name, TableInfo* table_info, std::vector* tablet_list, ErrorCode* err) = 0;//获取指定的表 +``` +##### (11) 检查表是否存在 Client::IsTableExist +``` +bool IsTableExist(const std::string& table_name, ErrorCode* err) = 0 +``` + +##### (12) 
检查表是否为enable状态 Client::IsTableEnabled +``` +bool IsTableEnabled(const std::string& table_name, ErrorCode* err) = 0 +``` + +##### (13) 检查表是否为空 Client::IsTableEmpty +``` +bool IsTableEmpty(const std::string& table_name, ErrorCode* err) = 0 +``` + +##### (14) 发送请求给服务器 Client::CmdCtrl +``` +bool CmdCtrl(const std::string& command, const std::vector& arg_list, bool* bool_result, std::string* str_result, ErrorCode* err) = 0 +``` + +##### (15) 使用glog的用户防止冲突 Client::SetGlogIsInitialized +``` +void SetGlogIsInitialized() +``` + +##### (16) 删除表格 Client::DeleteTable +``` +bool DeleteTable(const std::string& name, ErrorCode* err) = 0 +``` + +##### (17) 更新表格 Client::UpdateTable +``` +bool UpdateTable(const TableDescriptor& desc, ErrorCode* err) = 0 +``` + +##### (18) 获得表格的位置 Client::GetTabletLocation +``` +bool GetTabletLocation(const std::string& table_name, std::vector* tablets, ErrorCode* err) = 0 +``` + +##### (19) 重命名表格 Client::Rename +``` +bool Rename(const std::string& old_table_name, const std::string& new_table_name, ErrorCode* err) = 0 +``` +#### 2. 
用户管理 + +##### (1) 创建用户 Client::CreateUser + +``` +bool ClientImpl::CreateUser(const std::string& user, + const std::string& password, ErrorCode* err) = 0 +``` +##### (2) 删除用户 Client::DeleteUser + +``` +bool ClientImpl::DeleteUser(const std::string& user, ErrorCode* err) = 0 +``` + +##### (3) 修改用户密码 Client::ChangePwd + +``` +bool ClientImpl::ChangePwd(const std::string& user, const std::string& password, ErrorCode* err) = 0 +``` + +##### (4) 显示指定用户信息 Client::ShowUser + +``` +bool ClientImpl::ShowUser(const std::string& user, std::vector& user_groups, ErrorCode* err) = 0 +``` + +##### (5) 添加用户到用户群 Client::AddUserToGroup + +``` +bool ClientImpl::AddUserToGroup(const std::string& user_name, const std::string& group_name, ErrorCode* err)= 0 +``` + +##### (6) 从用户群中删除用户 Client::DeleteUserFromGroup + +``` +bool ClientImpl::DeleteUserFromGroup(const std::string& user_name, const std::string& group_name, ErrorCode* err) = 0 +``` + + + diff --git a/doc/sdk_reference/mutation.md b/doc/sdk_reference/mutation.md index 54e444607..752891f9b 100644 --- a/doc/sdk_reference/mutation.md +++ b/doc/sdk_reference/mutation.md @@ -1,108 +1,154 @@ -# RowMutation +# RowMutation接口说明 tera sdk中通过RowMutation结构描述一次行更新操作,包含删除操作。 -一个RowMutaion中可以同时对多列进行操作,保证: - * 服务端生效时序与RowMutation的执行时序相同。比如对某列的删除+更新,服务端生效时不会乱序,导致先更新再删除的情况发生。 - * 同一个RowMutation中的操作保证同时成功或失败。 - * 操作不存在的列族会返回成功,但无法读取。 - -## 创建与析构 - -由tera::Table::NewRowMutation创建,不能由用户创建。 - -用户需要自行析构: - * 同步模式下Put返回后即可析构 - * 异步模式下需要等待回调返回,并处理完成后析构,建议在回调函数末尾进行析构 -## API - -### 更新 - -Key-value模式更新。若设定ttl,数据会在ttl时间超时后被淘汰。 +## 1. 数据结构 +``` + enum Type { + kPut, + kDeleteColumn, + kDeleteColumns, + kDeleteFamily, + kDeleteRow, + kAdd, + kPutIfAbsent, + kAppend, + kAddInt64 + }; + struct Mutation { + Type type; + std::string family; + std::string qualifier; + std::string value; + int64_t timestamp; + int32_t ttl; + }; +``` + +## 2. 
主要接口与用法 +#### 2.1 更新 + + +表格类型 | 接口功能 | 接口 | 参数 | 可省参数 | 返回值类型 | 其它说明 +--- | --- | --- | --- | --- | --- | --- +表格模式 | 修改一个列 | Put | const std::string& family, const std::string& qualifier, const int64_t value, int64_t timestamp | timestamp可省,省略时为-1 | void | Counter场景下使用,设定初始值。 +表格模式 | 修改一个列的特定版本 | Put | const std::string& family, const std::string& qualifier, const std::string& value, int64_t timestamp| timestamp可省,省略时为-1 | void | 若设定timestamp,数据会被更新至指定时间,危险,不建议使用 +表格模式 | 修改一个带TTL列的特定版本 | Put | const std::string& family, const std::string& qualifier, int64_t timestamp, const std::string& value, int32_t ttl | | void | +表格模式 | 修改一个列的特定版本 | Put | const std::string& family, const std::string& qualifier, int64_t timestamp, const std::string& value | | void | +表格模式 | 原子操作:如果不存在才能Put成功 | PutIfAbsent | const std::string& family, const std::string& qualifier, const int64_t delta | | void |若不存在,更新生效;否则更新数据不生效。delta可为负数。 +表格模式 | 原子加一个Cell | Add | const std::string& family, const std::string& qualifier, const int64_t delta | | void | Counter场景下使用,累加。若无初始值,会从0开始累加 +表格模式 | 原子加一个Cell | Append | const std::string& family, const std::string& qualifier, const std::string& value | | void | 将value追加至此列原数据末尾;若原数据不存在,则与Put等效。 +k-v模式 |修改带TTL的默认列 | Put | const std::string& value, int32_t ttl | ttl 可省,默认为-1 | void |若设定ttl,数据会在ttl时间超时后被淘汰。 + +#### 2.2 删除 +##### (1) 删除整行 RowMutation::DeleteRow +删除整行的指定范围版本。 +``` +void DeleteRow(int64_t timestamp = -1) = 0;//若设定timestamp,则删除此时间之前的所有更新。 Key-value模式下timestamp不生效。 +``` + +##### (2) 删除某列族 RowMutation::DeleteFamily +删除一个列族的所有列的指定范围版本。 ``` -void Put(const std::string& value, int32_t ttl = -1); +void DeleteFamily(const std::string& family, int64_t timestamp = -1) = 0;//若设定timestamp,则删除此时间之前的所有更新。 ``` -表格模式更新。若设定timestamp,数据会被更新至指定时间,危险,不建议使用。 + +##### (3) 删除某列所有版本 RowMutation::DeleteColumns +删除一个列的指定范围版本。 ``` -void Put(const std::string& family, const std::string& qualifier, const std::string& value, int64_t timestamp = -1); +void 
DeleteColumns(const std::string& family, const std::string& qualifier, int64_t timestamp = -1) = 0;//若设定timestamp,则删除此时间之前的所有更新。 ``` -表格模式更新。Counter场景下使用,设定初始值。 + +##### (4) 删除一个列的指定版本 RowMutation::DeleteColumn ``` -void Put(const std::string& family, const std::string& qualifier, int64_t value, int64_t timestamp = -1); +void DeleteColumn(const std::string& family, const std::string& qualifier, int64_t timestamp) = 0;//若不存在,则不生效。 ``` -表格模式更新。Counter场景下使用,累加。若无初始值,会从0开始累加。 + + +#### 2.3 错误码 +##### (1) 行更新错误码 RowMutation::ErrorCode ``` -void Add(const std::string& family, const std::string& qualifier, const int64_t delta); +const ErrorCode& GetError() = 0; //成功返回KOK ``` -表格模式更新。若不存在,更新生效;否则更新数据不生效。 +##### (2) 设置错误码 RowMutation::SetError ``` -void PutIfAbsent(const std::string& family, const std::string& qualifier, const std::string& value); +void SetError(ErrorCode::ErrorCodeType err, const std::string& reason) = 0; ``` -表格模式更新。将value追加至此列原数据末尾;若原数据不存在,则与Put等效。 +#### 2.4 异步 +若设定回调,则异步提交;否则同步提交。 +##### (1) 设置回调 RowMutation::SetCallBack + +设置异步回调, 操作会异步返回。 ``` -void Append(const std::string& family, const std::string& qualifier, const std::string& value); +void SetCallBack(Callback callback) = 0; ``` -### 删除 - -删除整行。若设定timestamp,则删除此时间之前的所有更新。 -Key-value模式下timestamp不生效。 +##### (2) 获得回调函数 RowMutation::GetCallBack ``` -void DeleteRow(int64_t timestamp = -1); +Callback GetCallBack() = 0; ``` -删除某列族。若设定timestamp,则删除此时间之前的所有更新。 + +#### 2.5 上下文设定 +##### (1) 设置上下文 RowMutation::SetContext +设置用户上下文,可在回调函数中获取。 ``` -void DeleteFamily(const std::string& family, int64_t timestamp = -1); +void SetContext(void* context) = 0; ``` -删除某列所有版本。若设定timestamp,则删除此时间之前的所有更新。 + +##### (2) 获取用户上下文 RowMutation::GetContext ``` -void DeleteColumns(const std::string& family, const std::string& qualifier, int64_t timestamp = -1); +void* GetContext() = 0; ``` -删除某列指定时间更新。若不存在,则不生效。 +#### 2.6 超时设定 +设定单个mutation的超时时间。 如没有特殊需要,不必单独设定,使用sdk的统一超时即可。 +##### (1) 设置超时时间 RowMutation::SetTimeOut + 
+设置超时时间(只影响当前操作,不影响Table::SetWriteTimeout设置的默认写超时) ``` -void DeleteColumn(const std::string& family, const std::string& qualifier, int64_t timestamp); +void SetTimeOut(int64_t timeout_ms) = 0; ``` - -### 异步 - -若设定回调,则异步提交;否则同步提交。 + +##### (2) 超时 RowMutation::TimeOut ``` -typedef void (*Callback)(RowMutation* param); -void SetCallBack(Callback callback); -Callback GetCallBack(); -bool IsAsync(); +int64_t TimeOut() = 0 ``` - -### 超时设定 - -设定单个mutation的超时时间。 -如没有特殊需要,不必要单独设定,使用sdk的统一超时即可。 + #### 2.7 其他操作 +##### (1) 获取行更新的操作数 RowMutation::MutationNum ``` -void SetTimeOut(int64_t timeout_ms); -int64_t TimeOut() = 0; +uint32_t MutationNum() = 0; ``` - -### 上下文设定 - -用于回调中获取用户自定义上下文信息。 -内存由用户自己管理。 - + +##### (2) 获取mutation总大小 RowMutation::Size ``` -void SetContext(void* context); -void* GetContext(); +uint32_t Size() = 0; ``` - -### 其它 - + +##### (3) 返回row_key RowMutation::RowKey ``` -uint32_t MutationNum(); -uint32_t Size(); -const RowMutation::Mutation& GetMutation(uint32_t index); +const std::string& RowKey() = 0; ``` - -### 预发布 - -获取所属事务 + +##### (4) 返回mutation RowMutation::GetMutation ``` -Transaction* GetTransaction(); +const RowMutation::Mutation& GetMutation(uint32_t index) = 0; ``` + diff --git a/doc/sdk_reference/reader.md b/doc/sdk_reference/reader.md index 476d945fb..876e5f8ca 100644 --- a/doc/sdk_reference/reader.md +++ b/doc/sdk_reference/reader.md @@ -1,103 +1,61 @@ -# RowReader +# Reader接口说明 tera sdk中通过RowReader结构描述一次行读取操作,并获取返回数据。 -## 创建与析构 - -由tera::Table::NewRowReader创建,不能由用户创建。 - -用户需要自行析构: - * 同步模式下Get返回后即可析构 - * 异步模式下需要等待回调返回,并处理完成后析构,建议在回调函数末尾进行析构 - -## API - -### 描述过滤条件 - -通过相关的API可以对列名、更新时间、版本数目等信息描述,从而对返回数据集合进行过滤。 - -如果不进行任何描述,默认返回此行所有数据。 - -#### AddColumnFamily - +## 1. 
主要接口与用法 +#### 1.1 描述过滤条件 +通过相关的API可以对列名、更新时间、版本数目等信息描述,从而对返回数据集合进行过滤。如果不进行任何描述,默认返回此行所有数据。 +##### (1) 可以增加多个列族 RowReader::AddColumnFamily ``` -void AddColumnFamily(const std::string& family); +void AddColumnFamily(const std::string& family) = 0;//如此“family”不存在于表格的schema中,则不进行过滤 ``` - -限定返回数据的列族为“family”。 - -可以增加多个列族。 - -如此“family”不存在于表格的schema中,则不进行过滤。 - -#### AddColumn - + +##### (2) 可以增加多个列 RowReader::AddColumn ``` -void AddColumn(const std::string& family, const std::string& qualifier); +void AddColumn(const std::string& family, const std::string& qualifier); //除限定返回数据列族为“family”外,其列名必须为“qualifier”。 ``` - -与AddColumnFamily类似,除限定返回数据列族为“family”外,其列名必须为“qualifier”。 - -此操作与AddColumnFamily共同生效,返回数据为二者并集。 - -#### SetTimeRange - + +##### (3) 设定最大版本数 RowReader::SetMaxVersions ``` -void SetTimeRange(int64_t ts_start, int64_t ts_end); +void SetMaxVersions(uint32_t max_version) = 0; //从最新版本开始计数,若实际数据版本数小于此值,全部返回。在最大版本数基础上再进行时间过滤。 ``` - -设定返回数据的更新时间范围。 - -只返回更新时间在[ts_start, ts_end]范围内的数据。 - -其中ts_start、ts_end均为Unix时间戳,单位为微秒(us)。 - -#### SetMaxVersions - + +##### (4) 设定返回数据的更新时间范围 RowReader::SetTimeRange ``` -void SetMaxVersions(uint32_t max_version); +void SetTimeRange(int64_t ts_start, int64_t ts_end) = 0;//只返回更新时间在[ts_start, ts_end]范围内的数据。其中ts_start、ts_end均为Unix时间戳,单位为微秒(us)。 ``` - -设定最大版本数。 - -从最新版本开始计数,若实际数据版本数小于此值,全部返回。 - -过滤优先级高于TimeRange,即在最大版本数基础上再进行时间过滤。 - -### 获取数据 - + +#### 1.2 获取数据 在RowReader被提交至服务端并返回后,可以从此结构中获取返回的数据。 - 支持两种获取方式: +
    +
  • 迭代器方式。依次遍历所有列、所有版本。
  • +
  • 全量输出。返回一个特定结构的std::map,可按列名等信息进行访问。
  • +
- * 迭代器方式。依次遍历所有列、所有版本。 - * 全量输出。返回一个特定结构的std::Map,可按列名等信息进行访问。 - -#### 迭代器方式 +##### (1) 访问数据前通过Done进行确认 RowReader::Done +``` +bool Done() = 0;;//若返回false,则数据已遍历完毕。 ``` -bool Done(); -void Next(); + +##### (2) 访问数据前通过Next进行确认 RowReader::Next ``` - -访问数据前通过Done()进行确认。 - -若返回false,则数据已遍历完毕。 - +void Next() = 0; +``` + +##### (3) 当数据存在时,可以通过以下接口访问此单元格的各字段值 +当通过RowReader访问key-value模式的表时,除RowKey和Value外,其它字段值无效。 ``` const std::string& RowKey(); std::string Value(); -std::string Family(); -std::string Qualifier(); -int64_t Timestamp(); +std::string Family() = 0; +std::string Qualifier() = 0; +int64_t Timestamp() = 0; ``` - -当数据存在时,可以通过这些接口访问此单元格的各字段值。 - -当通过RowReader访问key-value模式的表时,除RowKey和Value外,其它字段值无效。 - -#### 全量输出 - + +##### (4) 全量输出 +通过多级std::map的形式进行访问。 ``` typedef std::map TColumn; typedef std::map TColumnFamily; @@ -105,37 +63,50 @@ typedef std::map TRow; virtual void ToMap(TRow* rowmap); ``` -通过多级std::map的形式进行访问。 - -### 异步与上下文设定 - +#### 1.3 错误码 +##### (1) 获取错误码 RowReader::ErrorCode +``` +const ErrorCode& GetError() = 0; //成功返回KOK +``` +#### 1.4 异步 若设定回调,则异步提交;否则同步提交。 +##### (1) 设置回调 RowReader::SetCallBack ``` -typedef void (*Callback)(RowMutation* param); -void SetCallBack(Callback callback); -Callback GetCallBack(); +void SetCallBack(Callback callback) = 0; ``` -用于回调中获取用户自定义上下文信息。 -内存由用户自己管理。 - +##### (2) 设置回调 RowReader::GetCallBack ``` -void SetContext(void* context); -void* GetContext(); +void (*Callback)(RowReader* param); ``` -### 超时设定 +#### 1.5 上下文设定 +用于回调中获取用户自定义上下文信息。 内存由用户自己管理。 -设定单个reader的超时时间。 -如没有特殊需要,不必要单独设定,使用sdk的统一超时即可。 +##### (1) 设置上下文 RowReader::SetContext ``` -void SetTimeOut(int64_t timeout_ms); -int64_t TimeOut() = 0; +void SetContext(void* context) = 0; ``` - -### 预发布 - -获取所属事务 + +##### (2) 获取上下文 RowReader::GetContext +``` +void* GetContext() = 0; +``` +#### 1.6 超时设定 +设定单个reader的超时时间。如没有特殊需要,不必要单独设定,使用sdk的统一超时即可。 +##### (1) 设置超时时间 RowReader::SetTimeOut +``` +void SetTimeOut(int64_t timeout_ms) = 0; +``` + +#### 1.7 其他 +##### (1) 获取表格 
RowReader::GetTable +``` +Table* GetTable() = 0; +``` + +##### (2) 获取按列过滤的map ``` -Transaction* GetTransaction(); +typedef std::map >ReadColumnList; +const ReadColumnList& GetReadColumnList() = 0; ``` diff --git a/doc/sdk_reference/readme.md b/doc/sdk_reference/readme.md new file mode 100644 index 000000000..c57f747cf --- /dev/null +++ b/doc/sdk_reference/readme.md @@ -0,0 +1,42 @@ +# Tera SDK主要api接口说明 + + +### 主要数据结构 + +* tera::[client](../sdk_reference/client.md) +* tera::[table](../sdk_reference/table.md) +* tera::[mutation](../sdk_reference/mutation.md) +* tera::[reader](../sdk_reference/reader.md) +* tera::[table_descriptor](../sdk_reference/table_descriptor.md) +* tera::[transaction](../sdk_reference/transaction.md) +* tera::[scan](../sdk_reference/scan.md) +* tera::[utils](../sdk_reference/utils.md) + + +### 介绍 +#### (1) tera::client 访问tera服务主结构,所有对tera的访问或操作全部由此发起。 +一个集群对应一个client即可,如需访问多个client,需要创建多个 +##### 主要功能包括: +* 表格操作:建、删、加载、卸载、打开、关闭、更新表结构、获取表格信息、快照等 +* 用户管理:建、删、修改密码、组管理等 +* 集群信息获取:获取全部表格列表、状态等 + +#### (2) tera::table 表格主结构,对表格的所有增删查改操作由此发起。 +由tera::Client::OpenTable产生,tera::Client::CloseTable关闭,不可析构。 + +#### (3) tera::error_code 错误码,很多操作会返回,注意检查。 + +#### (4) tera::mutation + +#### (5) tera::scan 扫描操作,并获取返回数据。 + +#### (6) tera::reader 读取操作,并获取返回数据。 + +#### (7) tera::table_descriptor 表格描述符主体 + +#### (8) tera::transaction 单行事务 + + +#### (9) tera::scan 扫描 + +#### (10) tera::utils 编码解码 diff --git a/doc/sdk_reference/scan.md b/doc/sdk_reference/scan.md new file mode 100644 index 000000000..dadb915fd --- /dev/null +++ b/doc/sdk_reference/scan.md @@ -0,0 +1,98 @@ + +# scan接口说明 +tera中scan操作由ResultStream和ScanDescriptor两个数据结构进行描述。 +### 1. 
ResultStream + +##### (1) 检查迭代是否结束 +``` +bool Done(ErrorCode* err = NULL) = 0; //如果检查失败则返回error code。 +``` + +##### (2) 移到下一个cell + +``` +void Next() = 0; +``` + +##### (3) 获取当前cell的rowkey名字 +``` +std::string RowName() const = 0; +``` +##### (4) 获取当前cell的簇 +``` +std::string Family() const = 0; +``` + +##### (5) 获取当前cell的列 +``` +std::string Qualifier() const = 0; +``` + +##### (6) 返回时间戳 +``` +int64_t Timestamp() const = 0; +``` + +##### (7) 返回当前cell的值 +``` +std::string Value() const = 0; +int64_t ValueInt64() const = 0; +``` + +### 2. ScanDescriptor + +##### (1) 设置扫描的结束key +``` +void SetEnd(const std::string& rowkey); +``` + +##### (2) 设置扫描的目标cf + +``` +void AddColumnFamily(const std::string& cf); +``` + +##### (3) 设置扫描的目标列 +``` + void AddColumn(const std::string& cf, const std::string& qualifier); +``` +##### (4) 设置每列的maxversion +``` +void SetMaxVersions(int32_t versions); +``` + +##### (5) 设置每个扫描结果的时间范围 +``` +void SetTimeRange(int64_t ts_end, int64_t ts_start); +``` + +##### (6) 设置批量扫描模式 +``` +void SetAsync(bool async); +``` + +##### (7) 检查扫描是否为批量扫描模式 +``` +bool IsAsync() const; +``` + +##### (8) 设置扫描的超时时间 +``` +void SetPackInterval(int64_t timeout); +``` + +##### (9) 设置扫描的buffersize +``` +void SetBufferSize(int64_t buf_size);//默认为64K +``` + +##### (10) 设置每次扫描的cell数 +``` +void SetNumberLimit(int64_t number_limit); +``` + +##### (11) 获取每次扫描的cell数 +``` +int64_t GetNumberLimit(); +``` + diff --git a/doc/sdk_reference/table.md b/doc/sdk_reference/table.md new file mode 100644 index 000000000..58894a8ed --- /dev/null +++ b/doc/sdk_reference/table.md @@ -0,0 +1,100 @@ + +# Table接口说明 + +## 1. 主要数据结构 +#### 1. 表格信息 +``` +struct TableInfo { + TableDescriptor* table_desc; //表的描述符 + std::string status; //表格状态信息 +}; +``` +#### 2. 
tablet信息 +``` +struct TabletInfo { + std::string table_name; //表名 + std::string path; //路径 + std::string server_addr; //服务器地址 + std::string start_key; //起始key + std::string end_key; //结束key + int64_t data_size; //数据大小 + std::string status; //状态 +}; +``` + +## 2. 主要接口 +##### (1) 获取表名 Table::GetName +``` +const std::string GetName() = 0 +``` + +##### (2) 行mutation操作 Table::NewRowMutation +``` +RowMutation* NewRowMutation(const std::string& row_key) = 0 +``` +##### (3) 写数据 Table::Put +``` +1) void Put(RowMutation* row_mutation) = 0 +2) void Put(const std::vector& row_mutations) = 0 +3) bool Put(const std::string& row_key, const std::string& family, const std::string& qualifier, const std::string& value, ErrorCode* err) = 0 +4) bool Put(const std::string& row_key, const std::string& family, const std::string& qualifier, const int64_t value, ErrorCode* err) = 0; +5) bool PutIfAbsent(const std::string& row_key, const std::string& family, const std::string& qualifier, const std::string& value, ErrorCode* err) = 0; +``` + +##### (4) 检查写数据是否结束 Table::IsPutFinished + +``` +bool IsPutFinished() = 0 +``` + +##### (5) 添加数据 Table::Add + +``` +bool Add(const std::string& row_key, const std::string& family, const std::string& qualifier, int64_t delta, ErrorCode* err) = 0; +``` + +##### (6) 追加数据 Table::Append + +``` +bool Append(const std::string& row_key, const std::string& family, const std::string& qualifier, const std::string& value, ErrorCode* err) = 0; +``` + +##### (7) 按行读数据 Table::NewRowReader + +``` +RowReader* NewRowReader(const std::string& row_key) = 0 +``` + +##### (8) 读数据 Table::Get + +``` +1) void Get(RowReader* row_reader) = 0 +2) void Get(const std::vector& row_readers) = 0; +3) bool Get(const std::string& row_key, const std::string& family, const std::string& qualifier, std::string* value, ErrorCode* err) = 0; +4) bool Get(const std::string& row_key, const std::string& family, const std::string& qualifier, int64_t* value, ErrorCode* err) = 0; +``` + +##### (9) 
检查get是否结束 Table::IsGetFinished +``` +bool IsGetFinished() = 0; +``` + +##### (10) 扫描 Table::Scan +``` +ResultStream* Scan(const ScanDescriptor& desc, ErrorCode* err) = 0 +``` +##### (11) 按行事务处理 Table::StartRowTransaction +``` +Transaction* StartRowTransaction(const std::string& row_key) = 0 +``` + +##### (12) 提交行事务 Table::CommitRowTransaction +``` +void CommitRowTransaction(Transaction* transaction) = 0 +``` + +##### (13) 执行mutation Table::ApplyMutation +```c +void ApplyMutation(RowMutation* row_mu) = 0; +void ApplyMutation(const std::vector& row_mu_list) = 0; +``` diff --git a/doc/sdk_reference/table_descriptor.md b/doc/sdk_reference/table_descriptor.md index ccf79a1f1..cbc2e0670 100644 --- a/doc/sdk_reference/table_descriptor.md +++ b/doc/sdk_reference/table_descriptor.md @@ -1,350 +1,217 @@ -# 表格描述 -tera中的表格由TableDescriptor、LocalityGroupDescriptor、ColumnFamilyDescriptor三个数据结构进行描述,C++接口。 - -同时也支持更简单的字符串描述,参见本文最后。 - -## TableDescriptor - -表格描述符主体,LocalityGroupDescriptor、ColumnFamilyDescriptor由其管理。 - -描述表格全局属性,如key拼装方式、分片分裂合并阈值、ACL等信息。 - -### 创建与析构 - -此结构由用户自己创建并析构。 +# table_descriptor接口说明 +tera中的表格由ColumnFamilyDescriptor、LocalityGroupDescriptor、TableDescriptor三个数据结构进行描述。 +### 1. 
ColumnFamilyDescriptor +描述一个列族的属性。 +属性支持动态更新。更新状态为最终一致,过程中存在分片之前属性不一致情况,使用时需要注意。 +##### (1) TTL +设定列族内cell的TTL(time-to-live),单位秒,默认无穷大。 +当列族内某cell的更新时间超过此值后,读取时被屏蔽,并在垃圾回收时物理删除。 +``` +void SetTimeToLive(int32_t ttl) = 0; +int32_t TimeToLive() const = 0; +``` -### 使用场景 +##### (2) 最大版本数MaxVersions +设定列族内cell的最大版本数,默认为1。 +当某cell的版本数超过此限制后,会将最旧的版本进行屏蔽,并在垃圾回收时物理删除。 +此值不做最大值限制,但随着版本数大量增加,相应的随机读、扫描性能会下降,存储使用上升,用户可按实际情况调整。 +``` +void SetMaxVersions(int32_t max_versions) = 0; +int32_t MaxVersions() const = 0; +``` - * 表格创建,通过`tera::Client::CreateTable` - * 表格Schema更新,通过`tera::Client::UpdateTable` - * 获取表格属性,通过`tera::Client::GetTableDescriptor` - -### API +##### (3) 获取LG的名字 +``` +const std::string& LocalityGroup() const = 0; +``` +##### (4) 获取Id +``` +int32_t Id() const = 0; +``` -#### TableDescriptor +### 2. LocalityGroupDescriptor +描述一个locality group的属性。 +##### (1) 获取此LG名字 ``` -TableDescriptor(const std::string& name); +const std::string& Name() const; ``` -构造表格名为“name”的表格描述符。 +##### (2) 设定、获取存储介质,默认kInDisk +``` +void SetStore(StoreType type) = 0; +StoreType Store() const = 0; +enum StoreType { + kInDisk = 0, + kInFlash = 1, + kInMemory = 2, +}; +``` -其中表格名长度需要小于256字节,字符只支持{[a-z],[A-Z],[0-9],'_','-'}。 +##### (3) 设定、获取物理文件内部block大小 +``` +void SetBlockSize(int block_size) = 0;//设定、获取物理文件内部block大小,单位KB,默认值:4。 +int BlockSize() const = 0; +``` +##### (4) 设定、获取物理文件基础大小 +``` +int32_t SstSize() const = 0;//设定、获取物理文件内部block大小,单位KB,默认值:4。 +void SetSstSize(int32_t sst_size) = 0; +``` +##### (5) 获取/得到compress type +``` + void SetCompress(CompressType type) = 0; + CompressType Compress() const = 0; +``` +##### (6) 设定、获取是否使用bloom filter +设定、获取是否使用bloom filter,默认不使用。 +``` +void SetUseBloomfilter(bool use_bloomfilter) = 0; +bool UseBloomfilter() const = 0; +``` +##### (7) 内存内compact +是否使用内存内compact。 +``` +bool UseMemtableOnLeveldb() const = 0; +void SetUseMemtableOnLeveldb(bool use_mem_ldb) = 0; +``` +##### (8) 设定、获取内存compact中写缓存大小 +设定、获取内存compact中写缓存大小,单位KB。 +``` +int32_t 
MemtableLdbWriteBufferSize() const = 0; +void SetMemtableLdbWriteBufferSize(int32_t buffer_size) = 0; +``` +##### (9) 设定、获取内存compact中对应block大小 +设定、获取内存compact中对应block大小,单位KB。 +``` +int32_t MemtableLdbBlockSize() const = 0; +void SetMemtableLdbBlockSize(int32_t block_size) = 0; +``` + +### 3. TableDescriptor +表格描述符主体,LocalityGroupDescriptor、ColumnFamilyDescriptor由其管理。 +描述表格全局属性,如key拼装方式、分片分裂合并阈值、ACL等信息。 +使用场景 +
    +
  • 表格创建,通过tera::Client::CreateTable
  • +
  • 表格Schema更新,通过tera::Client::UpdateTable
  • +
  • 获取表格属性,通过tera::Client::GetTableDescriptor
  • +
-#### TableName +#### 3.1 TableDescriptor +##### (1) 获取表名 +设置、返回表格名。 ``` void SetTableName(const std::string& name); std::string TableName() const; ``` -设置、返回表格名。 - -#### LocalityGroup - +##### (2) 新增一个名为‘lg_name’的LG +其中,LocalityGroup名长度需要小于256字节,字符只支持{[a-z],[A-Z],[0-9],'_','-'} ``` LocalityGroupDescriptor* AddLocalityGroup(const std::string& lg_name); ``` -新增一个名为‘lg_name’的LG。 - -其中的LocalityGroup名长度需要小于256字节,字符只支持{[a-z],[A-Z],[0-9],'_','-'}。 - +##### (3) 删除名为‘lg_name’的LG ``` -bool RemoveLocalityGroup(const std::string& lg_name); +bool RemoveLocalityGroup(const std::string& lg_name);//如果此LG中还有列族存在,删除失败。 ``` - -删除名为‘lg_name’的LG。 - -如果此LG中还有列族存在,删除失败。 - +##### (4) 通过id/名称访问对应LG +LG在表格内部以vector形式保存,id为其对应的下标。 ``` const LocalityGroupDescriptor* LocalityGroup(int32_t id) const; const LocalityGroupDescriptor* LocalityGroup(const std::string& lg_name) const; ``` - -通过id/名称访问对应LG。 - -LG在表格内部以vector形式保存,id为其对应的下标。 - +##### (5) 获取/得到compress type ``` -int32_t LocalityGroupNum() const; + void SetCompress(CompressType type) = 0; + CompressType Compress() const = 0; ``` - -返回当前表格中LG数量。 - -#### ColumnFamily - +##### (6) 返回当前表格中LG数量 ``` -ColumnFamilyDescriptor* AddColumnFamily(const std::string& cf_name,const std::string& lg_name); +int32_t LocalityGroupNum() const; ``` + +#### 3.2 ColumnFamily -在‘lg_name’下新增一个名为‘cf_name’的列族。 - -若‘lg_name’不存在,返回NULL。 - -其中列族名长度需要小于256字节,字符只支持{[a-z],[A-Z],[0-9],'_','-'}。 - +##### (1) 在‘lg_name’下新增一个名为‘cf_name’的列族 +若‘lg_name’不存在,返回NULL。其中列族名长度需要小于256字节,字符只支持{[a-z],[A-Z],[0-9],'_','-'}。 +``` +ColumnFamilyDescriptor* AddColumnFamily(const std::string& cf_name, const std::string& lg_name = "lg0"); ``` +##### (2) 删除名为‘cf_name’的列族 +``` void RemoveColumnFamily(const std::string& cf_name); ``` - -删除名为‘cf_name’的列族。 - +##### (3) 通过id/名称访问对应列族 +列族在表格内部以vector形式保存,id为其对应的下标。 ``` const ColumnFamilyDescriptor* ColumnFamily(int32_t id) const; const ColumnFamilyDescriptor* ColumnFamily(const std::string& cf_name) const; ``` - -通过id/名称访问对应列族。 - 
-列族在表格内部以vector形式保存,id为其对应的下标。 - +##### (4) 返回当前表格中列族数量 ``` int32_t ColumnFamilyNum() const; ``` -返回当前表格中列族数量。 +#### 3.3 RawKey -#### RawKey - -``` +##### (1) 表格内部key的拼装格式 +决定了表格的存储及访问格式,推荐kBinary。 +``` +void SetRawKey(RawKeyType type); +RawKeyType RawKey() const; enum RawKeyType { kReadable = 0, - kBinary = 1, + kBinary = 1, kTTLKv = 2, kGeneralKv = 3, -}; -void SetRawKey(RawKeyType type); -RawKeyType RawKey() const; -``` - -表格内部key的拼装格式。 - -决定了表格的存储及访问格式,推荐kBinary。 - -#### SplitSize - -``` -void SetSplitSize(int64_t size); -int64_t SplitSize() const; +}; ``` - -分片分裂阈值。 - +#### 3.4 SplitSize +##### (1) 分片分裂阈值 当分片数据量(物理存储)超过此阈值时,会被一分为二,并可能被两个不同服务器加载。 - 此分裂阈值是一个基础参考值,系统会根据实际动态负载在此值基础上进行调整。 - -#### MergeSize - ``` -void SetMergeSize(int64_t size); -int64_t MergeSize() const; +void SetSplitSize(int64_t size); +int64_t SplitSize() const; ``` -分片合并阈值。 - +#### 3.5 MergeSize +##### (1) 分片合并阈值 当分片数据量(物理存储)低于此阈值时,会被合并至相临分片中。 - 此值是一个基础参考值,系统会根据实际动态负载在此值基础上进行调整。 - 需要小于分裂阈值的1/3,防止出现合并、分裂的循环出现。 -#### Write Ahead Log - -``` -void DisableWal(); -bool IsWalDisabled() const; +``` +void SetMergeSize(int64_t size); +int64_t MergeSize() const; ``` - -配置日志开关,默认打开。 - +#### 3.6 Write Ahead Log +##### (1) 配置日志开关,默认打开 当此表格数据没有强特久化需求时,可以选择关闭日志。 - 会大幅提升写性能、降低系统IO消耗。 - 当有服务器宕机时,内存中数据将丢失,谨慎关闭。 -#### Admin - -``` -void SetAdmin(const std::string& name); -std::string Admin() const; -void SetAdminGroup(const std::string& name); -std::string AdminGroup() const; -``` - -设置表格ACL信息。 - -## LocalityGroupDescriptor - -描述一个locality group的属性。 - -### 创建与析构 - -通过`TableDescriptor::AddLocalityGroup`进行创建。 - -无须用户析构。 - -### API - -#### Name - -``` -const std::string& Name() const; -``` - -获取此LG名字。 - -#### Store - -``` -enum StoreType { - kInDisk = 0, - kInFlash = 1, - kInMemory = 2, -}; -void SetStore(StoreType type); -StoreType Store() const; -``` - -设定、获取存储介质,默认kInDisk。 - -#### BlockSize、SstSize、BloomFilter - -``` -void SetBlockSize(int block_size); -int BlockSize() const; -``` - 
-设定、获取物理文件内部block大小,单位KB,默认值:4。 - -物理存储基于leveldb开发,此概念与leveldb中的block相似。 - -``` -void SetSstSize(int sst_size); -int SstSize() const; -``` - -设定、获取物理文件基础大小,单位MB,默认值:8。 - -物理存储基于leveldb开发,此概念与leveldb中的level1文件大小相同。 - -``` -void SetUseBloomfilter(bool use_bloomfilter); -bool UseBloomfilter() const; -``` - -设定、获取是否使用bloom filter,默认不使用。 - -物理存储基于leveldb开发,此概念与leveldb中的bloom filter。 - -#### 内存内compact - -``` -bool UseMemtableOnLeveldb() const; -void SetUseMemtableOnLeveldb(bool use_mem_ldb); -``` - -是否使用内存内compact。 - -``` -int32_t MemtableLdbWriteBufferSize() const; -void SetMemtableLdbWriteBufferSize(int32_t buffer_size); -``` - -设定、获取内存compact中写缓存大小,单位KB。 - -``` -int32_t MemtableLdbBlockSize() const; -void SetMemtableLdbBlockSize(int32_t block_size); +``` +void DisableWal(); +bool IsWalDisabled() const; ``` - -设定、获取内存compact中对应block大小,单位KB。 - -## ColumnFamilyDescriptor - -描述一个列族的属性。 - -属性支持动态更新。更新状态为最终一致,过程中存在分片之前属性不一致情况,使用时需要注意。 - -### 创建与析构 - -通过`TableDescriptor::AddColumnFamily`进行创建。 - -无须用户析构。 - -### API - -#### TTL +#### 3.7 事务 +##### (1) 事务处理 ``` -void SetTimeToLive(int32_t ttl); -int32_t TimeToLive() const; +void EnableTxn(); +bool IsTxnEnabled() const; ``` - -设定列族内cell的TTL(time-to-live),单位秒,默认无穷大。 - -当列族内某cell的更新时间超过此值后,读取时被屏蔽,并在垃圾回收时物理删除。 - -#### MaxVersion +#### 3.8 Admin +##### (1) 设置表格的admin ``` -void SetMaxVersions(int32_t max_versions); -int32_t MaxVersions() const; +void SetAdmin(const std::string& name); +std::string Admin() const; +void SetAdminGroup(const std::string& name); +std::string AdminGroup() const; ``` - -设定列族内cell的最大版本数,默认为1。 - -当某cell的版本数超过此限制后,会将最旧的版本进行屏蔽,并在垃圾回收时物理删除。 - -此值不做最大值限制,但随着版本数大量增加,相应的随机读、扫描性能会下降,存储使用上升,用户可按实际情况调整。 - -## 字符串描述 - -描述表格的字符串是一个支持描述节点属性的树结构,语法详见[PropTree](https://github.com/BaiduPS/tera/blob/master/doc/prop_tree.md) - -### 描述表格存储 - -表格结构中包含表名、locality groups定义、column families定义,一个典型的表格定义如下(可写入文件): - - # tablet分裂阈值为4096M,合并阈值为512M - # 三个lg,分别配置为flash、flash、磁盘存储 - table_hello { - lg_index { - update_flag - }, - 
lg_props { - level, - weight - }, - lg_raw { - data - } - } - -如果无需配置LG,指定表名和所需列名即可(所有的属性可配): - - table_hello {cf0, cf1, cf2} - -### 描述key-value存储 - -只需指定表名即可,若需要指定存储介质等属性,可选择性添加: - - kv_hello # 简单key-value - kv_hello # 配置若干属性 - -### 属性及含义 - -span | 属性名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明 ---- | --- | --- | --- | --- | --- | --- -table | splitsize | 某个tablet增大到此阈值时分裂为2个子tablets| >=0,等于0时关闭split | MB | 512 | -table | mergesize | 某个tablet减小到此阈值时和相邻的1个tablet合并 | >=0,等于0时关闭merge | MB | 0 | splitsize至少要为mergesize的5倍 -lg | storage | 存储类型 | "disk" / "flash" / "memory" | - | "disk" | -lg | blocksize | LevelDB中block的大小 | >0 | KB | 4 | -lg | use_memtable_on_leveldb | 是否启用内存compact | "true" / "false" | - | false | -lg | sst_size | 第一层sst文件大小 | >0 | MB | 8 | -cf | maxversions | 保存的最大版本数 | >0 | - | 1 | -cf | ttl | 数据有效时间 | >=0,等于0时此数据永远有效 | second | 0 | diff --git a/doc/sdk_reference/transaction.md b/doc/sdk_reference/transaction.md new file mode 100644 index 000000000..7a9ba1ae1 --- /dev/null +++ b/doc/sdk_reference/transaction.md @@ -0,0 +1,60 @@ + +# 单行事务transaction接口说明 + +## 主要功能 + + +##### (1) 提交一个修改操作 Transaction::ApplyMutation +``` +void ApplyMutation(RowMutation* row_mu) = 0 +``` + +##### (2) 读取操作 Transaction::Get +``` +ErrorCode Get(RowReader* row_reader) = 0 +``` +##### (3) 回调函数原型 Transaction::Callback +``` +typedef void (*Callback)(Transaction* transaction) +``` + +##### (4) 设置提交回调, 提交操作会异步返回 Transaction::SetCommitCallback + +``` +void SetCommitCallback(Callback callback) = 0; +``` + +##### (5) 获取提交回调 Transaction::GetCommit + +``` +Callback GetCommitCallback() = 0; +``` + +##### (6) 设置用户上下文,可在回调函数中获取 Transaction::SetContext + +``` +void SetContext(void* context) = 0; +``` + +##### (7) 获取用户上下文 Transaction::GetContext + +``` +void* GetContext() = 0 +``` + +##### (8) 获得结果错误码 Transaction::GetError + +``` +const ErrorCode& GetError() = 0; // 异步模式下,通过GetError()获取提交结果 +``` + +##### (9) 同步模式下,获得提交的结果 Transaction::Commit +``` +ErrorCode Commit() = 0 // 
同步模式下,Commit()的返回值代表了提交操作的结果(成功 或者 失败及其原因) +``` + +##### (10) 获取事务开始时间戳 Transaction::GetStartTimestamp +``` +int64_t GetStartTimestamp() = 0 //仅在全局事务场景下有效 +``` + diff --git a/doc/sdk_reference/utils.md b/doc/sdk_reference/utils.md new file mode 100644 index 000000000..0ad5ba27a --- /dev/null +++ b/doc/sdk_reference/utils.md @@ -0,0 +1,14 @@ + +# utils接口说明 +tera中utils操作主要用来编码和解码counter cell +##### (1) 编码 +``` +static std::string EncodeCounter(int64_t counter); +``` + +##### (2) 解码 + +``` +static bool DecodeCounter(const std::string& buf, int64_t* counter); +``` + diff --git a/doc/tools/benchmark.md b/doc/tools/benchmark.md new file mode 100644 index 000000000..5f8ce2941 --- /dev/null +++ b/doc/tools/benchmark.md @@ -0,0 +1,38 @@ + +## 1. tera_bench +造数据的工具 +### (1) 用法 +``` +./tera_bench --compression_ratio=1 --key_seed=1 --value_seed=20 --value_size=1000 --num=200000 --benchmarks=random --key_size=24 --key_step=1 +``` + +## 2. tera_mark +读写数据,支持异步读写scan + +### (1) 用法 +``` +#示例: +./tera_mark --mode=w --tablename=test --type=async --verify=false --entry_limit=1000 +``` + +### (2) 参数列表 + +参数名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明 +--- | --- | --- | --- | --- | --- +table | 表名 | - | - | "" | +mode | 模式 | "w"/"r"/"s"/"m" | - | "w" | - +type | 类型 | "sync"/"async" | - | "async" | - +pend_size | 最大pending大小 | - | - | 100 | - +pend_count | 最大pending数 | - | - | 100000 | - +start_key | scan的开始key | - | - | "" | - +end_key | scan的结束key | - | - | "" | - +cf_list | scan的列簇 | - | - | "" | - +print | scan的结果是否需要打印 | true/false | - | false | - +buf_size | scan的buffer_size | >0 | - | 65536 | - +verify | md5 verify(writer&read) | true/false | - | true | - +max_outflow | max_outflow | - | - | -1 | - +max_rate | max_rate | - | - | -1 | - +scan_streaming | enable streaming scan | true/false | - | false | - +batch_count | batch_count(sync) | - | - | 1 | - +entry_limit | writing/reading speed limit | - | - | 0 | - + diff --git a/doc/tools/readme.md b/doc/tools/readme.md new file mode 100644 index 
000000000..401fad9e9 --- /dev/null +++ b/doc/tools/readme.md @@ -0,0 +1,10 @@ + +# Tera 主要工具说明 + +## 主要工具 +* 操作tera的工具: [teracli](../tools/teracli.md) +* 集群间数据迁移的dump工具: [terautil](../tools/terautil.md) +* 造数据 & 读写数据的工具: [tera_bench & tera_mark](../tools/benchmark.md) +* 业界通用NoSQL测试的基准测试工具: [YCSB](../tools/ycsb.md) + + diff --git a/doc/tools/teracli.md b/doc/tools/teracli.md new file mode 100644 index 000000000..1ca78c460 --- /dev/null +++ b/doc/tools/teracli.md @@ -0,0 +1,448 @@ + +# teracli使用说明 +./teracli help即可看到相关的命令和使用方法 + +### 1. create 创建表格 +#### 1.1 基本命令 + +```c +./teracli create [] +./teracli createbyfile [] +``` +说明: +* table-schema是一个描述表格结构的字符串。 +* 表名规范:首字符为字母(大小写均可), +* 有效字符包括大小写的英文字母(a-zA-Z)、数字(0-9)、下划线(_)、连字符(-)、点(.)。 1 <= 有效长度 <= +* 512 +* Tera支持在建立表格时预分配若干tablet,tablet分隔的key写在tablet-delimiter-file中,按“\n”分隔。 +* 如果表格schema比较复杂,可以将其写入文件中,通过createbyfile命令进行创建。 + +#### 1.2 创建table模式存储 +表格结构中包含表名、locality groups定义、column families定义,一个典型的表格定义如下(可写入文件) +```c +# tablet分裂阈值为4096M,合并阈值为512M +# 三个lg,分别配置为flash、flash、磁盘存储 +table_hello { + lg_index { + update_flag + }, + lg_props { + level, + weight + }, + lg_raw { + data + } +} +``` +如果只希望简单的使用tera,对性能没有很高要求,那么schema只需指定表名和所需列名即可(如需要,所有的属性也是可配的): +```c +table_hello {cf0, cf1, cf2} +``` + +#### 1.3 创建key-value表 +tera支持高性能的key-value存储,其schema只需指定表名即可,若需要指定存储介质等属性,可选择性添加: +```c + # 表名为key-value,默认storage为disk, splitsize为512M, mergesize为0 +./teracli create kv_hello + # 配置若干属性 +./teracli create "kv_hello " +``` +#### 1.4 表格各级属性 + +span | 属性名 | 意义 | 有效取值 | 单位 | 默认值 | 其它说明 +--- | --- | --- | --- | --- | --- | --- +table | splitsize | 某个tablet增大到此阈值时分裂为2个子tablets| >=0,等于0时关闭split | MB | 512 | +table | mergesize | 某个tablet减小到此阈值时和相邻的1个tablet合并 | >=0,等于0时关闭merge | MB | 0 | +splitsize至少要为mergesize的3倍,建议为mergesize的10倍,避免merge后又分裂 +lg | storage | 存储类型 | "disk" / "flash" / "memory" | - | "disk" | +lg | blocksize | LevelDB中block的大小 | >0 | KB | 4 | +lg | use_memtable_on_leveldb | 是否启用内存compact | "true" / "false" | - | false | 
+lg | sst_size | 第一层sst文件大小 | >0 | MB | 8 | +cf | maxversions | 保存的最大版本数 | >0 | - | 1 | +cf | ttl | 数据有效时间 | >=0,等于0时此数据永远有效 | second | 0 | +和minversions冲突时以minversions为准 + + +### 2 update 更新表格schema +更新时使用schema语法和建表时的语法基本一致, +不同主要在于更新时只需指定要更新的属性,不需要改动的属性无需列出。 +#### 2.1 基本语法 +```c +./teracli update +``` +#### 2.2 分类 +主要分为两大类更新: +* 更新table模式schema +* 更新kv模式schema + +#### 2.3 更新table模式schema + +支持表格、cf属性热更新 +##### 2.3.1 更新table的属性(不更新lg、cf属性) +```c +./teracli update "table_hello" //更新mergesize +./teracli update "table_hello" //更新mergesize和splitsize +``` +##### 2.3.2 更新lg属性时,***需要disable表格*** +```c +./teracli disable table_hello +./teracli update "table_hello{lg0}" +./teracli update "table_hello{lg0}" //也可以同时修改table属性 +``` +##### 2.3.3 更新cf属性 +```c +./teracli update "table_hello{lg0{cf0}}" +#也可以同时修改table或者lg属性 +./teracli update "table_hello{lg0{cf0}}" +``` +##### 2.3.4 增加、删除cf + +```c +# 在lg0下增加cf1,并设置属性ttl值为123. +# op意为操作,op=add需要放在cf属性的最前面 +./teracli update "table_hello{lg0{cf1}}" + +# 从lg0中删除cf1 +./teracli update "table_hello{lg0{cf1}}" +``` + +#### 2.4 更新kv模式schema +```c +# 更新部分属性时需要disable表格,程序会在运行时给出提示 +./teracli update "kv_hello" +``` + +### 3. update-check + +### 4. enable +将处于disable状态的表格重新enable,恢复读、写服务。 +```c +./teracli enable +``` + +### 5. disable +将处于表格置于disable状态,不再提供读、写服务。 +```c +./teracli enable +``` + +### 6. drop +删除处于disable状态的表格,此操作不可回滚。 +```c +./teracli drop +``` +### 7. rename 重命名表格 +```c +#语法: +./teracli rename +``` +示例: +```c +./teracli rename tb1 tb2 +``` + +### 8. put 向表中写入一个value +向表中写入以rowkey为key,列为columnfamily:qualifier的值value.对于kv模式的表来说,无需columnfamily:qualifier. +```c +#语法: +./teracli put [] +``` +示例: +```c +./teracli put mytable rowkey cf0:qu0 value +``` + +### 9. put-ttl 新增的ttl字段表示这个value的有效时间 +```c +#语法: +./teracli put-ttl [] +``` +示例: +```c +#这个value在20秒内有效,超时就读不到了。 +./teracli put-ttl mytable rowkey cf0:qu0 value 20 +``` + +### 10. putif 原子操作,如果不存在才能put成功 + +```c +#语法: +./teracli putif [] +``` + +### 11. 
get 读取一个value +```c +#语法: +./teracli get [] +``` +示例: +```c +#这个value在20秒内有效,超时就读不到了。 +./teracli get mytable rowkey cf0:qu0 +``` + +### 12. scan 扫描一个表 +将表中key从[startkey, endkey)范围的所有数据扫描出来。 +每个value可以有多个版本(versions),scan命令默认只输出每个value的最新版本, +想要获取全部版本可以使用scanallv命令。 +```c +#语法: +./teracli scan[allv] +``` +示例: +```c +#扫描整个表 +./teracli scan mytable "" "" +``` + + +### 13. delete 删除一个value +如果只想删除某列最新的一个版本可以用delete1v命令。 +```c +#语法: +./teracli delete[1v] [] +``` + +### 14. put_counter 写入一个counter(计数器) +```c +#语法: +./teracli put_counter [] +``` +示例: +```c +#写入一个初始值为3的计数器: +./teracli put_counter mytable rowkey cf0:qu0 3 +``` +### 15. get_counter 读取一个counter +``` +#语法: +./teracli get_counter [] +``` +示例: +```c +#读取之前写入的那个counter: +./teracli get_counter mytable rowkey cf0:qu0 +``` + +### 16. add 给某个counter加上一个delta值 +``` +#语法: +./teracli add delta +``` +示例: +```c +#读取之前写入的那个counter: +./teracli get_counter mytable rowkey cf0:qu0 +``` + +### 17. putint64 写入一个int64类型counter(计数器) + +``` +#语法: +./teracli putint64 [] +``` +示例: +```c +#写入一个初始值为67的计数器: +./teracli putint64 mytable row1 cf0:qu0 67 +``` + +### 18. getint64 读取一个int64类型的counter + +``` +#语法: +./teracli getint64 [] +``` +示例: +```c +./teracli getint64 mytable row1 cf0:qu0 +``` + +### 19. addint64 对int64类型的counter执行原子加操作 +``` +#语法: +./teracli addint64 delta +``` +示例: +```c +#对之前写入的counter执行-3的操作: +# addint64操作执行完以后,该counter的值为 64 +./teracli addint64 mytable row1 cf0:qu0 -3 +``` +### 20. append 原子操作:追加内容到一个Cell +``` +#语法: +./teracli append [] +``` +示例: +```c +./teracli put mytalbe rowkey cf0:qu0 hello +./teracli append mytable rowkey cf0:qu0 world +#此时再去get会得到helloworld +./teracli get mytable rowkey cf0:qu0 +``` +### 20. batchput 批量写数据 +``` +#语法: +./teracli batchput +``` +### 21. batchget 批量读数据 +``` +#语法: +./teracli batchget +``` +### 22. show 显示表格信息 +``` +#语法: +./teracli show[x] [] +``` +示例: +```c +#查看某个table的信息: +./teracli show mytable +#查看集群内所有table的信息: +./teracli show +``` + +### 23. 
showx 显示表格详细信息 +``` +#语法: +./teracli show[x] [] +``` +示例: +```c +#查看某个table的信息: +./teracli showx mytable +``` + +### 24. showschema 显示表格schema +表格schema里含有很多属性(例如某个cf保留的最小版本数),创建表格时,没有显示指定的属性都取默认值, +这些属性在showschema时不会显示出来;想要显示全部属性,可以使用showschemax命令。 +``` +#语法: +./teracli showschema[x] +``` + + +### 25. showts 显示tabletnode的信息 +带上后缀'x'得到的信息会更详细(showtsx)。 +``` +#语法: +./teracli showts [] +``` +示例: +```c +#显示某个tabletnode的信息: +./teracli showts "example.company.com:7770" +#显示集群内所有tabletnode的信息: +./teracli showts +``` + +### 26. range 显示表的范围 +``` +#语法: +./teracli range +``` +### 27. txn 事务(仅支持单事务行操作) +``` +#语法: +./teracli txn +operation包括start和commit +./teracli txn start +./teracli txn commit +``` + +### 28. user用户管理 +``` +#语法: +./teracli user +operation包括create、changepwd、show、delete、addtogroup和deletefromgroup +user + create + changepwd + show + delete + addtogroup + deletefromgroup +``` +### 29. tablet +``` +#语法: +./teracli tablet +operation包括move、reload、compact、split、merge和scan +tablet + move + reload + force to unload and load on the same ts + compact + split + merge + scan +``` + +### 30. compact +``` +#语法: +./teracli compact +``` + +### 31. safemode +``` +#语法: +./teracli safemode [get|enter|leave] +``` + +### 32. meta +meta for master memory, meta2 for meta table. +``` +#语法: +./teracli meta[2] [backup|check|repair|show] +``` +### 33. findmaster master的位置 +``` +#语法: +./teracli findmaster +``` +### 34. reload +``` +#语法: +./teracli reload config hostname:port +``` + +### 35. kick +``` +#语法: +./teracli kick +``` + +### 36. findtablet +``` +#语法: +./teracli findtablet +./teracli findtablet +``` + +### 37. cookie +``` +#语法: +./teracli cookie +cookie + dump cookie-file -- dump contents of specified files + findkey cookie-file key -- find the info of a key +``` + +### 38. 
version版本 +``` +#语法: +./teracli version +``` + diff --git a/doc/tools/terautil.md b/doc/tools/terautil.md new file mode 100644 index 000000000..842b572eb --- /dev/null +++ b/doc/tools/terautil.md @@ -0,0 +1,78 @@ + + +# terautil + +集群间数据迁移的dump工具 +### 1. 用法 +``` +./terautil dump help +``` +#### (1)建表 +``` +./terautil --flagfile=../conf/terautil.flag dump prepare_safe +``` +#### (2) 将扫表操作run起来 +``` +./terautil --flagfile=../conf/terautil.flag dump run +``` + +### 2. flag配置 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
flag名称flag默认值或格式flag介绍
dump_tera_src_conf ../conf/src_tera.flag(格式)tera的源集群
dump_tera_dest_conf../conf/dest_tera.flag(格式)tera的目的集群
dump_tera_src_root_path/xxx_(路径格式)tera的源路径
dump_tera_dest_root_path/xxx_(路径格式)tera的目的路径
ins_cluster_addrterautil_ins(格式)锁服务器的地址
ins_cluster_root_path/terautil/dump/xxxx(格式)锁服务器路径
dump_tera_src_meta_addr“”源meta表的地址
dump_tera_dest_meta_addr“”目的meta表的地址
dump_manual_split_interval1000手动分裂时间间隔,单位为ms
dump_enable_manual_splitfalse是否允许手动分裂
+ + diff --git a/doc/tools/ycsb.md b/doc/tools/ycsb.md new file mode 100644 index 000000000..b6f922bc7 --- /dev/null +++ b/doc/tools/ycsb.md @@ -0,0 +1,294 @@ + +# YCSB工具使用说明 + +### 1. 属性 + +#### 1.1 核心YCSB属性 +所有工作量文件可以指定以下属性: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
参数名意义默认值
workload要使用的工作量类,如com.yahoo.ycsb.workloads.CoreWorkload
db要使用的数据库类。也可以在命令行中指定com.yahoo.ycsb.BasicDB
exporter要使用的测量结果的输出类com.yahoo.ycsb.measurements.exporter.TextMeasurementsExporter
exportfile用于替代stdout的输出文件路径未定义/输出到stdout
threadcountYCSB客户端的线程数。也可以在命令行中指定1
measurementtype支持的测量结果类型有直方图和时间序列直方图
+ + + + + + +#### 1.2 核心工作量包属性 +和核心工作量构造器一起使用的属性文件可以指定以下属性及值 +#####1.2.1 重要参数 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
参数名意义默认值有效取值
recordcount数据行数,装载进数据库的初始记录数0
operationcount要执行的操作总数
fieldcount每行的qualifier个数10
fieldlength100
requestdistribution随机读的数据分布uniformuniform、zipfian、latest
insertorder写入顺序,ordered是顺序写,hashed是随机写hashedordered、hashed
readallfields读取所有qualifier还是只读一个qualifiertruetrue、false
readproportion随机读占所有操作的比例0.95
updateproportion更新(写入)占所有操作的比例0.05
target每秒总共操作的次数unthrottled
thread客户端线程数1
+ +##### 1.2.2 非必需参数(对tera测试意义不大,用默认值即可) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
参数名意义默认值有效取值
insertproportion插入(写入)占所有操作的比例0
scanproportionscan占所有操作的比例,tera_mark不支持0
readmodifywriteproportionreadmodifywrite占所有操作的比例,tera不支持该操作0
maxscanlength每次scan需要读取的行数,tera不支持指定行数的scan1000
scanlengthdistributionscan的行数选择策略uniform
maxexecutiontime最大执行时间,超过此时间会强行结束测试(单位为秒)
table表名,tera_mark不支持usertable
+ +#### 1.3 测量结果属性 +每一个测量结果类型可以为如下属性形式: + + + + + + + + + + + + + + + + + + + + + + + + + +
类型参数名意义默认值有效取值
直方图histogram.buckets直方图输出的区间数1000
时间序列timeseries.granularity时间序列输出的粒度1000
+ +### 2 运行时参数 +即使工作负载类和参数文件定义了一个特定的工作负载,在运行基准测试时你还是想指定一些额外的设置。当你运行YCSB客户端时命令行提供了这些设置。这些设置包括: +* -threads :客户端的线程。默认地,YCSB客户端使用一个工作者线程,但是额外的线程可以被指定。当需要增加对数据库的装载数量时这是经常使用的。 +* -target:每秒的目标操作数。默认地,YCSB客户端将试图尽可能地执行最多操作。例如,如果每个操作平均使用了100ms,客户端每个工作者线程每秒将执行10个操作。然而,你可以限制每秒的目标操作数。比如,为了生成一条延迟-吞吐量曲线,你可以指定不同的目标吞吐量,以测试每种吞吐量下的延迟。 +* -s:状态。对于一个运行时间长的工作负载,让客户端报告状态是有用的,这可以让你知道它并没有挂掉,并且给你某些对它的执行过程的想法。通过在命令行指定“-s”,客户端将每10秒输出状态到stderr。 + + + + +### 3 用法 + +#### 3.1 相关命令 +* load: 执行加载命令 +* run: 执行工作负载 +* shell: 交互式模式 +``` +#basic参数告诉客户端使用哑BasicDB层。你也可以在你的参数文件中使用“db”属性指定它(例如,“db=com.yahoo.ycsb.BasicDB”) +./bin/ycsb shell basic +> help +Commands: +read key [field1 field2 ...] // Read a record +scan key recordcount [field1 field2 ...] // Scan starting at key +insert key name1=value1 [name2=value2 ...] // Insert a new record +update key name1=value1 [name2=value2 ...] // Update a record +delete key // Delete a record +table [tablename] // Get or [set] the name of the table +quit // Quit +``` + +#### 3.2 使用方法 +使用时,先建表,再加载数据,最后执行相关事务。 + +##### 3.2.1 建表 +ycsb的生成的row都是“user”+19位数字的格式,如 user9105318085603802964。 因此,如果需要预分表,必须以“user”+N个数字作为分隔,建议选择2个数字。 例如要预分4个tablet,分隔字符串为:user25、user50、user75 +``` +create 'usertable','f1','f2','f3' +``` + +##### 3.2.2 向tera中加载测试数据 +``` +bin/ycsb load tera -p workload=com.yahoo.ycsb.workloads.CoreWorkload \ //load参数告诉客户端执行工作负载的装载阶段。 + -p recordcount=$(ROW_NUM) \ //-p参数被用于设置参数,-P参数用于装载属性文件。 + -p fieldlength=$(QUALIFIER_NUM) \ + -p fieldcount=$(VALUE_SIZE) +``` + +##### 3.2.3 执行测试 +``` +bin/ycsb run tera -p workload=com.yahoo.ycsb.workloads.CoreWorkload \ + -p recordcount=$(ROW_NUM) \ + -p operationcount=$(ROW_NUM) \ + -p requestdistribution=$(DIST) \ + -p fieldlength=$(QUALIFIER_NUM) \ + -p fieldcount=$(VALUE_SIZE) \ + -p updateproportion=$(WRITE_PROP) \ + -p readproportion=$(READ_PROP) +``` + + diff --git a/example/onebox/conf/tera.flag b/example/onebox/conf/tera.flag index 99f62b45e..37329893d 100644 --- a/example/onebox/conf/tera.flag +++ 
b/example/onebox/conf/tera.flag @@ -7,10 +7,10 @@ --tera_leveldb_env_type=local ## 是否使用zk -# 指定使用非zk模式, 但只能本机访问tera ---tera_zk_enabled=false +# 指定使用fake_zk模式, 只能本机访问tera +--tera_coord_type=fake_zk # 指定使用zk, 可以跨服务使用, 配置相应地址和路径即可 -#--tera_zk_enabled=true +--tera_zk_enabled=false #--tera_zk_addr_list=localhost:2181 #--tera_zk_root_path=/tera --tera_master_query_tabletnode_period=1000 @@ -18,3 +18,18 @@ # sdk --tera_sdk_timeout=20000 + +# balancer +#--tera_info_log_clean_enable=false +#--logbugsecs=0 +#--v=5 +#--tera_master_load_balance_ts_load_threshold=1000000000 +#--tera_master_load_balance_ts_size_threshold=10000000000000 +#--tera_master_meta_isolate_enabled=true +#--tera_lb_load_balance_period_s=60 +#--tera_lb_tablet_max_move_num=10 +#--tera_lb_min_cost_need_balance=0.05 +#--tera_lb_move_cost_weight=10 +#--tera_lb_size_cost_weight=90 +#--tera_lb_debug_mode_enabled=false +--online_schema_update_enabled=true diff --git a/include/tera/client.h b/include/tera/client.h index 2ef68638e..80308a911 100644 --- a/include/tera/client.h +++ b/include/tera/client.h @@ -12,6 +12,7 @@ #include "error_code.h" #include "table.h" #include "table_descriptor.h" +#include "transaction.h" #pragma GCC visibility push(default) namespace tera { @@ -101,6 +102,10 @@ class Client { // Rename a table. 
virtual bool Rename(const std::string& old_table_name, const std::string& new_table_name, ErrorCode* err) = 0 ; + + /// New a global transaction + virtual Transaction* NewGlobalTransaction() = 0; + Client() {} virtual ~Client() {} diff --git a/include/tera/error_code.h b/include/tera/error_code.h index a03df0905..ad6ab2b64 100644 --- a/include/tera/error_code.h +++ b/include/tera/error_code.h @@ -26,7 +26,22 @@ class ErrorCode { kNoAuth = 7, kUnknown = 8, kNotImpl = 9, - kTxnFail = 10 + kTxnFail = 10, + + // only for global transaction error + kGTxnDataTooLarge = 101, + kGTxnNotSupport = 102, + kGTxnSchemaError = 103, + kGTxnOpAfterCommit = 104, + kGTxnPrimaryLost = 105, + kGTxnWriteConflict = 106, + kGTxnLockConflict = 107, + kGTxnOKButAckFailed = 108, + kGTxnOKButNotifyFailed = 109, + kGTxnPrewriteTimeout = 110, + kGTxnPrimaryCommitTimeout = 111, + kGTxnTimestampLost = 112 + // end of global transaction error }; public: diff --git a/include/tera/reader.h b/include/tera/reader.h index cc916c14d..08615f4d8 100644 --- a/include/tera/reader.h +++ b/include/tera/reader.h @@ -31,6 +31,12 @@ class RowReader { virtual void AddColumn(const std::string& family, const std::string& qualifier) = 0; // Set the maximum number of versions of each column. virtual void SetMaxVersions(uint32_t max_version) = 0; + + // Set the the max qualifiers of each column family when read this row + // This is useful when a column family contains too many qualifiers + // If this value is not set, the default value is std::numeric_limits::max() + virtual void SetMaxQualifiers(uint64_t max_qualifiers) = 0; + // If set, only returns cells of which update timestamp is within [ts_start, ts_end]. virtual void SetTimeRange(int64_t ts_start, int64_t ts_end) = 0; diff --git a/include/tera/scan.h b/include/tera/scan.h index 45646ec9d..c9023f9b6 100644 --- a/include/tera/scan.h +++ b/include/tera/scan.h @@ -79,6 +79,11 @@ class ScanDescriptor { // Set max version number per column. 
void SetMaxVersions(int32_t versions); + // Set the the max qualifiers of each column family when read this row + // This is useful when a column family contains too many qualifiers + // If this value is not set, the default value is std::numeric_limits::max() + void SetMaxQualifiers(uint64_t max_qualifiers); + // Set time range for the scan result, // which likes the SQL statement (SELECT * from Table WHERE timestamp in [ts_start, ts_end]). // Return the newest value first. diff --git a/include/tera/table_descriptor.h b/include/tera/table_descriptor.h index 8865d5a9d..4b464070f 100644 --- a/include/tera/table_descriptor.h +++ b/include/tera/table_descriptor.h @@ -54,6 +54,12 @@ class ColumnFamilyDescriptor { virtual int64_t DiskQuota() const = 0; virtual void SetAcl(ACL acl) = 0; virtual ACL Acl() const = 0; + virtual void EnableGlobalTransaction() = 0; + virtual void DisableGlobalTransaction() = 0; + virtual bool GlobalTransaction() const = 0; + virtual void EnableNotify() = 0; + virtual void DisableNotify() = 0; + virtual bool IsNotifyEnabled() const = 0; ColumnFamilyDescriptor() {} virtual ~ColumnFamilyDescriptor() {} diff --git a/include/tera/transaction.h b/include/tera/transaction.h index dc63a7842..81722f35b 100644 --- a/include/tera/transaction.h +++ b/include/tera/transaction.h @@ -15,9 +15,15 @@ #pragma GCC visibility push(default) namespace tera { - class RowReader; class RowMutation; +class Table; + +/// 事务隔离级别 +enum class IsolationLevel { + kReadCommitedSnapshot = 0, + kSnapshot = 1 +}; /// 事务操作接口 class Transaction { @@ -47,9 +53,36 @@ class Transaction { /// 异步模式下,通过GetError()获取提交结果 virtual ErrorCode Commit() = 0; - /// 获取事务开始时间戳,仅在多行事务场景下有效 + /// 获取事务开始时间戳 virtual int64_t GetStartTimestamp() = 0; + /// 获取事务提交时间戳 + virtual int64_t GetCommitTimestamp() = 0; + + /// 仅全局事务支持 + virtual void Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) = 0; + + /// 仅全局事务支持 + virtual void Notify(Table* t, + 
const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) = 0; + + /// 设置隔离级别 + virtual void SetIsolation(const IsolationLevel& isolation_level) = 0; + + /// 获取隔离级别 + virtual IsolationLevel Isolation() = 0; + + // Set timeout(ms). + virtual void SetTimeout(int64_t timeout_ms) = 0; + + // Get timeout(ms). + virtual int64_t Timeout() = 0; + Transaction() {} virtual ~Transaction() {} @@ -58,10 +91,6 @@ class Transaction { void operator=(const Transaction&); }; -/// cross-row, cross-table transaction -/// 跨行,跨表事务 -Transaction* NewTransaction(); - } // namespace tera #pragma GCC visibility pop diff --git a/readme-cn.md b/readme-cn.md index 7e136a26c..7edc5362f 100644 --- a/readme-cn.md +++ b/readme-cn.md @@ -1,63 +1,48 @@ [高性能、可伸缩的结构化数据库](http://github.com/baidu/tera) ==== Tera是一个高性能、可伸缩的结构化数据存储系统,被设计用来管理搜索引擎万亿量级的超链与网页信息。为实现数据的实时分析与高效访问,我们使用按行键、列名和时间戳全局排序的三维数据模型组织数据,使用多级Cache系统,充分利用新一代服务器硬件大内存、SSD盘和万兆网卡的性能优势,做到模型灵活的同时,实现了高吞吐与水平扩展。([English](README.md)) - # 特性 - * 全局有序 - * 热点自动分片 - * 数据强一致 - * 多版本,自动垃圾收集 - * 按列存储,支持内存表 - * 动态schema - * 支持表格快照 - * 高效随机读写 - +* 全局有序 +* 热点自动分片 +* 数据强一致 +* 多版本,自动垃圾收集 +* 按列存储,支持内存表 +* 动态schema +* 支持表格快照 +* 高效随机读写 # 数据模型 Tera使用了Bigtable的数据模型,可以将一张表格理解为这样一种数据结构: ``` map > > ``` 其中RowKey、ColumnFamily、Qualifier和Value是字符串,Timestamp是一个64位整形。ColumnFamliy需要建表时指定,是访问控制、版本保留等策略的基本单位。 - # 系统架构 系统主要由Tabletserver、Master和ClientSDK三部分构成。其中Tabletserver是核心服务器,承载着所有的数据管理与访问;Master是系统的仲裁者,负责表格的创建、schema更新与负载均衡;ClientSDK包含供管理员使用的命令行工具teracli和给用户使用的SDK。 表格被按RowKey全局排序,并横向切分成多个Tablet,每个Tablet负责服务RowKey的一个区间,表格又被纵向切分为多个LocalityGroup,一个Tablet的多个Localitygroup在物理上单独存储,可以选择不同的存储介质,以优化访问效率。 - ![架构图](resources/images/arch.png) - # 系统依赖 - * 使用分布式文件系统([BFS](https://github.com/baidu/bfs)、HDFS等)持久化数据与元信息 - * 使用分布式协调服务([Nexus](https://github.com/baidu/ins/)或者Zookeeper)选主与协调 - * 使用[Sofa-pbrpc](https://github.com/baidu/sofa-pbrpc/)实现跨进程通信 - +* 使用分布式文件系统([BFS](https://github.com/baidu/bfs)、HDFS等)持久化数据与元信息 +* 
使用分布式协调服务([Nexus](https://github.com/baidu/ins/)或者Zookeeper)选主与协调 +* 使用[Sofa-pbrpc](https://github.com/baidu/sofa-pbrpc/)实现跨进程通信 # 系统构建 -sh ./build.sh +sh ./build.sh 参考[BUILD](BUILD-cn) - # 使用示例 - [体验单机Tera](doc/cn/onebox.md) - [通过docker体验Tera](example/docker) - -[主要api使用方法](doc/cn/sdk_guide.md) - -[客户端teracli使用方法](doc/cn/teracli.md) - +[主要api使用方法](doc/sdk_reference/README.md) +[客户端teracli使用方法](doc/tools/teracli.md) +[集群间数据迁移的dump工具terautil使用方法](doc/tools/terautil.md) +[造数据 & 读写数据的工具使用方法](doc/tools/benchmark.md) +[性能测试工具ycsb使用方法](doc/tools/ycsb.md) [其它文档](doc/cn/README.md) - #反馈与技术支持 tera_dev at baidu.com - # 成为贡献者 阅读[RoadMap](doc/cn/roadmap.md)文件或者源代码,了解我们当前的开发方向。 - 完成[5个小任务](doc/to_be_a_contributor.md),帮你一步步成为tera贡献者。 - # Become a Committer - 成为tera的committer,你需要知道的一些[规则](doc/cn/to_be_a_committer.md)。 - # 欢迎加入 如果你热爱开源,热爱分布式技术,请将简历发送至: opensearch at baidu.com diff --git a/resources/images/global_txn.png b/resources/images/global_txn.png new file mode 100644 index 000000000..0e6e8f950 Binary files /dev/null and b/resources/images/global_txn.png differ diff --git a/src/benchmark/mark.cc b/src/benchmark/mark.cc index a0081e2e4..a53d1f8b7 100644 --- a/src/benchmark/mark.cc +++ b/src/benchmark/mark.cc @@ -49,7 +49,7 @@ void sdk_write_callback(tera::RowMutation* row_mu) { adapter->WriteCallback(row_mu, req_size, req_time); } -void Adapter::Write(const std::string& row, +void Adapter::Write(int opt, const std::string& row, std::map >& column, uint64_t timestamp, std::string& value) { @@ -74,7 +74,13 @@ void Adapter::Write(const std::string& row, if (FLAGS_verify) { add_checksum(row, family, qualifier, &value); } - row_mu->Put(family, qualifier, value, (int64_t)timestamp); + if (opt == PUT) { + row_mu->Put(family, qualifier, value, (int64_t)timestamp); + } else if (opt == PIF) { + row_mu->PutIfAbsent(family, qualifier, value); + } else { + abort(); + } if (FLAGS_verify) { remove_checksum(&value); } @@ -122,6 +128,8 @@ void Adapter::WriteCallback(tera::RowMutation* row_mu, 
size_t req_size, tera::ErrorCode err = row_mu->GetError(); if (err.GetType() == tera::ErrorCode::kOK) { write_marker_.OnSuccess(req_size, latency); + } else if (err.GetType() == tera::ErrorCode::kTxnFail) { + write_marker_.OnConflict(req_size, latency); } else { /*std::cerr << "fail to write: row=[" << row << "], column=[" << family << ":" << qualifier << "], timestamp=[" diff --git a/src/benchmark/mark.h b/src/benchmark/mark.h index c510de42c..ec5099eb5 100644 --- a/src/benchmark/mark.h +++ b/src/benchmark/mark.h @@ -18,7 +18,7 @@ #include "common/mutex.h" #include "tera.h" -#include "utils/counter.h" +#include "common/counter.h" DECLARE_int64(pend_size); DECLARE_int64(pend_count); @@ -46,7 +46,8 @@ enum OP { PUT = 1, GET = 2, SCN = 3, - DEL = 4 + DEL = 4, + PIF = 5 }; int64_t Now(); @@ -201,8 +202,11 @@ class Statistic { last_finish_size_(0), last_success_count_(0), last_success_size_(0), + last_conflict_count_(0), + last_conflict_size_(0), finish_marker_(1000000), - success_marker_(1000000) {} + success_marker_(1000000), + conflict_marker_(1000000) {} int GetOpt() { return opt_; @@ -210,24 +214,30 @@ class Statistic { void GetStatistic(int64_t* total_count, int64_t* total_size, int64_t* finish_count, int64_t* finish_size, - int64_t* success_count, int64_t* success_size) { + int64_t* success_count, int64_t* success_size, + int64_t* conflict_count, int64_t* conflict_size) { *total_count = last_total_count_ = total_count_.Get(); *total_size = last_total_size_ = total_size_.Get(); *finish_count = last_finish_count_ = finish_count_.Get(); *finish_size = last_finish_size_ = finish_size_.Get(); *success_count = last_success_count_ = success_count_.Get(); *success_size = last_success_size_ = success_size_.Get(); + *conflict_count = last_conflict_count_ = conflict_count_.Get(); + *conflict_size = last_conflict_size_ = conflict_size_.Get(); } void GetLastStatistic(int64_t* total_count, int64_t* total_size, int64_t* finish_count, int64_t* finish_size, - int64_t* 
success_count, int64_t* success_size) { + int64_t* success_count, int64_t* success_size, + int64_t* conflict_count, int64_t* conflict_size) { *total_count = last_total_count_; *total_size = last_total_size_; *finish_count = last_finish_count_; *finish_size = last_finish_size_; *success_count = last_success_count_; *success_size = last_success_size_; + *conflict_count = last_conflict_count_; + *conflict_size = last_conflict_size_; } Marker* GetFinishMarker() { @@ -238,6 +248,10 @@ class Statistic { return &success_marker_; } + Marker* GetConflictMarker() { + return &conflict_marker_; + } + void OnReceive(size_t size) { last_send_time_ = Now(); last_send_size_ = size; @@ -257,6 +271,12 @@ class Statistic { success_marker_.AddLatency(latency); } + void OnConflict(size_t size, uint32_t latency) { + conflict_count_.Inc(); + conflict_size_.Add(size); + conflict_marker_.AddLatency(latency); + } + void CheckPending() { int64_t max_pend_count = FLAGS_pend_count; int64_t max_pend_size = FLAGS_pend_size << 20; @@ -297,6 +317,8 @@ class Statistic { tera::Counter finish_size_; tera::Counter success_count_; tera::Counter success_size_; + tera::Counter conflict_count_; + tera::Counter conflict_size_; size_t last_send_size_; int64_t last_send_time_; @@ -307,9 +329,12 @@ class Statistic { int64_t last_finish_size_; int64_t last_success_count_; int64_t last_success_size_; + int64_t last_conflict_count_; + int64_t last_conflict_size_; Marker finish_marker_; Marker success_marker_; + Marker conflict_marker_; }; class Adapter { @@ -317,7 +342,7 @@ class Adapter { Adapter(tera::Table* table); ~Adapter(); - void Write(const std::string& row, + void Write(int opt, const std::string& row, std::map >& column, uint64_t timestamp, std::string& value); diff --git a/src/benchmark/mark_main.cc b/src/benchmark/mark_main.cc index 36ae66c4b..dd57af93a 100644 --- a/src/benchmark/mark_main.cc +++ b/src/benchmark/mark_main.cc @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ 
-61,6 +62,8 @@ bool parse_row(const char* buffer, ssize_t size, *op = GET; } else if (strncmp(buffer, "PUT", 3) == 0) { *op = PUT; + } else if (strncmp(buffer, "PIF", 3) == 0) { + *op = PIF; } else { return false; } @@ -76,13 +79,14 @@ bool parse_row(const char* buffer, ssize_t size, delim = end; } row->assign(buffer, delim - buffer); - if ((delim == end && mode != WRITE && (mode != MIX || *op != PUT)) || - (delim == end && mode == DELETE)) { + if ((delim == end && mode != WRITE && + (mode != MIX || (*op != PUT && *op != PIF))) + ||(delim == end && mode == DELETE)) { return true; } // parse value - if (mode == WRITE || (mode == MIX && *op == PUT)) { + if (mode == WRITE || (mode == MIX && (*op == PUT || *op == PIF))) { if (delim == end) { return false; } @@ -170,7 +174,7 @@ bool parse_row(const char* buffer, ssize_t size, } if (comma == end) { return true; - } else if (mode == WRITE || (mode == MIX && *op == PUT)) { + } else if (mode == WRITE || (mode == MIX && (*op == PUT || *op == PIF))) { return false; } @@ -217,10 +221,11 @@ bool get_next_row(int* op, std::string* row, void print_header() { std::cout << "HH:MM:SS OPT\t"; if (mode != SCAN && type == ASYNC) { - std::cout << "SENT [speed/total]\t\t"; + std::cout << "SENT [total/speed]\t\t"; } - std::cout << "FINISH [speed/total]\t\t"; - std::cout << "SUCCESS [speed/total]\t\t"; + std::cout << "FINISH [total/speed]\t\t"; + std::cout << "SUCCESS [total/speed]\t\t"; + std::cout << "CONFLICT [total/speed]\t\t"; if (mode != SCAN && type == ASYNC) { std::cout << "PENDING [count]"; } @@ -271,24 +276,28 @@ void print_size_and_count(int64_t size, int64_t count) { } void print_statistic(Statistic* statistic) { - int64_t old_total_count, old_finish_count, old_success_count; - int64_t old_total_size, old_finish_size, old_success_size; + int64_t old_total_count, old_finish_count, old_success_count, old_conflict_count; + int64_t old_total_size, old_finish_size, old_success_size, old_conflict_size; 
statistic->GetLastStatistic(&old_total_count, &old_total_size, &old_finish_count, &old_finish_size, - &old_success_count, &old_success_size); + &old_success_count, &old_success_size, + &old_conflict_count, &old_conflict_size); - int64_t new_total_count, new_finish_count, new_success_count; - int64_t new_total_size, new_finish_size, new_success_size; + int64_t new_total_count, new_finish_count, new_success_count, new_conflict_count; + int64_t new_total_size, new_finish_size, new_success_size, new_conflict_size; statistic->GetStatistic(&new_total_count, &new_total_size, &new_finish_count, &new_finish_size, - &new_success_count, &new_success_size); + &new_success_count, &new_success_size, + &new_conflict_count, &new_conflict_size); int64_t total_count = new_total_count - old_total_count; int64_t finish_count = new_finish_count - old_finish_count; int64_t success_count = new_success_count - old_success_count; + int64_t conflict_count = new_conflict_count - old_conflict_count; int64_t total_size = new_total_size - old_total_size; int64_t finish_size = new_finish_size - old_finish_size; int64_t success_size = new_success_size - old_success_size; + int64_t conflict_size = new_conflict_size - old_conflict_size; int64_t total_pending_count = new_total_count - new_finish_count; // scan @@ -317,6 +326,11 @@ void print_statistic(Statistic* statistic) { std::cout << "/"; print_size_and_count(success_size, success_count); std::cout << "\t\t"; + + print_size_and_count(new_conflict_size, new_conflict_count); + std::cout << "/"; + print_size_and_count(conflict_size, conflict_count); + std::cout << "\t\t"; if (mode != SCAN && type == ASYNC) { std::cout << total_pending_count; @@ -341,6 +355,11 @@ void print_marker(Statistic* statistic) { std::cout << " [SUCCESS]" << std::endl; Marker* success_marker = statistic->GetSuccessMarker(); print_marker(success_marker); + if (statistic->GetOpt() == PUT) { + std::cout << " [CONFLICT]" << std::endl; + Marker* conflict_marker = 
statistic->GetConflictMarker(); + print_marker(conflict_marker); + } } void* print_proc(void* param) { @@ -416,11 +435,12 @@ void* print_proc(void* param) { } void print_summary(Statistic* marker, double duration) { - int64_t total_count, finish_count, success_count; - int64_t total_size, finish_size, success_size; + int64_t total_count, finish_count, success_count, conflict_count; + int64_t total_size, finish_size, success_size, conflict_size; marker->GetStatistic(&total_count, &total_size, &finish_count, &finish_size, - &success_count, &success_size); + &success_count, &success_size, + &conflict_count, &conflict_size); print_opt(marker); std::streamsize precision = std::cout.precision(); @@ -432,7 +452,10 @@ void print_summary(Statistic* marker, double duration) { << (double)finish_size / 1048576 / duration << " MB/s\n" << " succ: " << success_size << " bytes " << success_count << " records " - << (double)success_size / 1048576 / duration << " MB/s" + << (double)success_size / 1048576 / duration << " MB/s\n" + << " conflict: " << conflict_size << " bytes " + << conflict_count << " records " + << (double)conflict_size / 1048576 / duration << " MB/s" << std::endl; std::cout.precision(precision); std::cout.flags(flag); @@ -616,10 +639,11 @@ int main(int argc, char** argv) { switch (opt) { case PUT: + case PIF: if (type == SYNC && mode == MIX && last_opt == GET) { adapter->CommitSyncRead(); } - adapter->Write(row, column, largest_ts, value); + adapter->Write(opt, row, column, largest_ts, value); break; case GET: if (type == SYNC && mode == MIX && last_opt == PUT) { diff --git a/src/benchmark/tpcc/data_generator.cc b/src/benchmark/tpcc/data_generator.cc new file mode 100644 index 000000000..8fd76cbe6 --- /dev/null +++ b/src/benchmark/tpcc/data_generator.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include "benchmark/tpcc/data_generator.h" +#include "benchmark/tpcc/tpccdb.h" +#include "common/thread_pool.h" +#include "common/timer.h" + +DECLARE_int32(warehouses_count); +DECLARE_int32(tpcc_thread_pool_size); +DECLARE_int32(generate_data_wait_times); + +namespace tera { +namespace tpcc { + +DataGenerator::DataGenerator(RandomGenerator* rand_gen, TpccDb* db) + : event_(), + rand_gen_(rand_gen), + db_(db), + now_datatime_(get_curtime_str()), + thread_pool_(FLAGS_tpcc_thread_pool_size) { + for (int i = 0; i < kTpccTableCnt; ++i) { + states_.push_back(std::make_pair(Counter(), Counter())); + } +} + +void DataGenerator::PrintJoinTimeoutInfo(int need_cnt, int table_enum_num) { + if (need_cnt > states_[table_enum_num].first.Get() + states_[table_enum_num].second.Get()) { + LOG(ERROR) << "table:" << kTpccTables[table_enum_num] + << "[need/succ/fail]:[" + << need_cnt << "/" + << states_[table_enum_num].first.Get() << "/" + << states_[table_enum_num].first.Get() << "]"; + } +} + +void DataGenerator::Join() { + event_.Trigger(); + if (!event_.TimeWait(FLAGS_generate_data_wait_times)) { + int stock_cnt = FLAGS_warehouses_count * kItemCount; + int districts_cnt = FLAGS_warehouses_count * kDistrictCountPerWarehouse; + int customers_cnt = districts_cnt * kCustomerCountPerDistrict; + PrintJoinTimeoutInfo(kItemCount, kItemTable); + PrintJoinTimeoutInfo(stock_cnt, kStockTable); + PrintJoinTimeoutInfo(FLAGS_warehouses_count, kWarehouseTable); + PrintJoinTimeoutInfo(districts_cnt, kDistrictTable); + PrintJoinTimeoutInfo(customers_cnt, kCustomerTable); + PrintJoinTimeoutInfo(customers_cnt, kCustomerLastIndex); + PrintJoinTimeoutInfo(customers_cnt, kHistoryTable); + } +} + +void DataGenerator::GenStocks(int32_t warehouse_id) { + IdSet original_ids = PickUniqueIdSet(rand_gen_, kItemCount / 10, 1, kItemCount); + event_.AddEventSources(kItemCount); + for (int id = 1; id <= kItemCount; ++id) { + bool is_original = original_ids.find(id) != 
original_ids.end(); + PushToInsertQueue(std::bind(&DataGenerator::GenStock, this, id, warehouse_id, is_original)); + } +} + +void DataGenerator::GenStock(int32_t id, int32_t warehouse_id, bool is_original) { + Stock s(id, warehouse_id, is_original, rand_gen_); + VLOG(12) << s.ToString(); + db_->InsertStock(s) ? states_[kStockTable].first.Inc() : states_[kStockTable].second.Inc(); + event_.Complete(); +} + +void DataGenerator::GenCustomers(int32_t district_id, int32_t warehouse_id) { + IdSet bad_credit_ids = PickUniqueIdSet(rand_gen_, + kCustomerCountPerDistrict / 10, 1, kCustomerCountPerDistrict); + event_.AddEventSources(kCustomerCountPerDistrict); + for (int c_id = 1; c_id <= kCustomerCountPerDistrict; ++c_id) { + bool is_bad_credit = bad_credit_ids.find(c_id) != bad_credit_ids.end(); + Customer c(c_id, district_id, warehouse_id, now_datatime_, is_bad_credit, rand_gen_); + VLOG(12) << c.ToString(); + db_->InsertCustomer(c) ? states_[kCustomerTable].first.Inc() : states_[kCustomerTable].second.Inc(); + } + event_.Complete(kCustomerCountPerDistrict); +} + +void DataGenerator::GenHistorys(int32_t district_id, int32_t warehouse_id) { + event_.AddEventSources(kCustomerCountPerDistrict); + for (int h_id = 1; h_id <= kCustomerCountPerDistrict; ++h_id) { + History h(h_id, district_id, warehouse_id, now_datatime_, rand_gen_); + VLOG(12) << h.ToString(); + db_->InsertHistory(h) ? states_[kHistoryTable].first.Inc() : states_[kHistoryTable].second.Inc(); + } + event_.Complete(kCustomerCountPerDistrict); +} + +void DataGenerator::GenOrderLines(int cnt, int32_t order_id, int32_t district_id, + int32_t warehouse_id, bool new_order) { + event_.AddEventSources(cnt); + for (int i = 1; i <= cnt; ++i) { + OrderLine ol(order_id, district_id, warehouse_id, i, new_order, now_datatime_, rand_gen_); + VLOG(12) << ol.ToString(); + db_->InsertOrderLine(ol) ? 
states_[kOrderLineTable].first.Inc() : states_[kOrderLineTable].second.Inc(); + } + event_.Complete(cnt); +} + +void DataGenerator::GenOrders(int32_t d_id, int32_t w_id) { + std::vector disorder_ids = rand_gen_->MakeDisOrderList(1, kCustomerCountPerDistrict); + event_.AddEventSources(kCustomerCountPerDistrict); + for (int o_id = 1; o_id <= kCustomerCountPerDistrict; ++o_id) { + bool new_order = (kCustomerCountPerDistrict - kInitNewOrderCountPerDistrict) < o_id; + int32_t c_id = disorder_ids[o_id]; + Order o(o_id, c_id, d_id, w_id, new_order, now_datatime_, rand_gen_); + // insert order line and new order first + // this use sync interface + GenOrderLines(o.o_ol_cnt, o_id, d_id, w_id, new_order); + if (new_order) { + event_.AddEventSources(1); + NewOrder no(o_id, d_id, w_id); + VLOG(12) << no.ToString(); + db_->InsertNewOrder(no) ? states_[kNewOrderTable].first.Inc() : states_[kNewOrderTable].second.Inc(); + event_.Complete(1); + } + // wait orderline and neworder insert done + VLOG(12) << o.ToString(); + db_->InsertOrder(o) ? states_[kOrderTable].first.Inc() : states_[kOrderTable].second.Inc(); + } + event_.Complete(kCustomerCountPerDistrict); +} + +void DataGenerator::GenDistricts(int32_t warehouse_id) { + event_.AddEventSources(kDistrictCountPerWarehouse); + for (int d_id = 1; d_id <= kDistrictCountPerWarehouse; ++d_id) { + District d(d_id, warehouse_id, rand_gen_); + VLOG(12) << d.ToString(); + db_->InsertDistrict(d) ? states_[kDistrictTable].first.Inc() : states_[kDistrictTable].second.Inc(); + GenCustomers(d_id, warehouse_id); + GenHistorys(d_id, warehouse_id); + + GenOrders(d_id, warehouse_id); + } + event_.Complete(kDistrictCountPerWarehouse); +} + +void DataGenerator::GenWarehouses() { + event_.AddEventSources(FLAGS_warehouses_count); + for (int32_t w_id = 1; w_id <= FLAGS_warehouses_count; ++w_id) { + GenStocks(w_id); + Warehouse w(w_id, rand_gen_); + VLOG(12) << w.ToString(); + db_->InsertWarehouse(w) ? 
states_[kWarehouseTable].first.Inc() : states_[kWarehouseTable].second.Inc(); + + GenDistricts(w_id); + } + event_.Complete(FLAGS_warehouses_count); +} + +void DataGenerator::GenItems() { + IdSet original_ids = PickUniqueIdSet(rand_gen_, kItemCount / 10, 1, kItemCount); + event_.AddEventSources(kItemCount); + for (int i_id = 1; i_id <= kItemCount; ++i_id) { + bool is_original = original_ids.find(i_id) != original_ids.end(); + PushToInsertQueue(std::bind(&DataGenerator::GenItem, this, i_id, is_original)); + } +} + +void DataGenerator::GenItem(int32_t item_id, bool is_original) { + Item item(item_id, is_original, rand_gen_); + VLOG(12) << item.ToString(); + db_->InsertItem(item) ? states_[kItemTable].first.Inc() : states_[kItemTable].second.Inc(); + event_.Complete(); +} + +void DataGenerator::PushToInsertQueue(const ThreadPool::Task& task) { + while(thread_pool_.PendingNum() > FLAGS_tpcc_thread_pool_size / 2) { + usleep(100); + } + thread_pool_.AddTask(task); + VLOG(12) << "thread_pool pending num = " << thread_pool_.PendingNum(); +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/data_generator.h b/src/benchmark/tpcc/data_generator.h new file mode 100644 index 000000000..f5593b64c --- /dev/null +++ b/src/benchmark/tpcc/data_generator.h @@ -0,0 +1,61 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#ifndef TERA_BENCHMARK_TPCC_DATA_GENERATOR_H +#define TERA_BENCHMARK_TPCC_DATA_GENERATOR_H + +#include +#include + +#include "benchmark/tpcc/random_generator.h" +#include "benchmark/tpcc/tpccdb.h" +#include "common/counter.h" +#include "common/event.h" +#include "common/thread_pool.h" + +namespace tera { +namespace tpcc { + + +class DataGenerator { +public: + DataGenerator(RandomGenerator* random_gen, TpccDb* db); + ~DataGenerator(){} + void GenWarehouses(); + void GenItems(); + void Join(); + +private: + void PrintJoinTimeoutInfo(int need_cnt, int table_enum_num); + + // for generate data + void GenStocks(int32_t warehouse_id); + void GenCustomers(int32_t district_id, int32_t warehouse_id); + void GenHistorys(int32_t district_id, int32_t warehouse_id); + void GenOrderLines(int cnt, int32_t order_id, int32_t district_id, + int32_t warehouse_id, bool new_order); + void GenOrders(int32_t district_id, int32_t warehouse_id); + void GenDistricts(int32_t warehouse_id); + + void GenItem(int32_t item_id, bool is_original); + void GenStock(int32_t id, int32_t warehouse_id, bool is_original); + + // for async insert + void PushToInsertQueue(const ThreadPool::Task& task); +private: + typedef std::vector> InsertStates; + CompletedEvent event_; + RandomGenerator* rand_gen_; + TpccDb* db_; + InsertStates states_; + std::string now_datatime_; + common::ThreadPool thread_pool_; +}; + +} // namespace tpcc +} // namespace tera + +#endif /* TERA_BENCHMARK_TPCC_DATA_GENERATOR_H */ diff --git a/src/benchmark/tpcc/driver.cc b/src/benchmark/tpcc/driver.cc new file mode 100644 index 000000000..aed2e6235 --- /dev/null +++ b/src/benchmark/tpcc/driver.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/driver.h"
+#include "benchmark/tpcc/tpccdb.h"
+#include "common/thread_pool.h"
+#include "common/timer.h"
+
+DECLARE_int32(driver_wait_times);
+DECLARE_int32(warehouses_count);
+DECLARE_int32(tpcc_run_gtxn_thread_pool_size);
+DECLARE_int64(transactions_count);
+
+namespace tera {
+namespace tpcc {
+
+// Stores the random source and the database handle, records the construction
+// time string and sizes the thread pool from
+// FLAGS_tpcc_run_gtxn_thread_pool_size.
+Driver::Driver(RandomGenerator* rand_gen, TpccDb* db)
+    : event_(),
+      rand_gen_(rand_gen),
+      db_(db),
+      now_datatime_(get_curtime_str()),
+      thread_pool_(FLAGS_tpcc_run_gtxn_thread_pool_size) {
+}
+
+// Logs "[need/succ/fail]" for the table at index table_enum_num when need_cnt
+// is smaller than the sum of the two counters kept for that table.
+// NOTE(review): the comparison direction is kept as written; confirm that
+// "need < succ + fail" (rather than ">") is the intended timeout condition.
+void Driver::PrintJoinTimeoutInfo(int need_cnt, int table_enum_num) {
+    if (need_cnt < states_[table_enum_num].first.Get() + states_[table_enum_num].second.Get()) {
+        // The third field is the failure counter (.second); it used to repeat
+        // the success counter (.first).
+        LOG(ERROR) << "table:" << kTpccTables[table_enum_num]
+                   << "[need/succ/fail]:["
+                   << need_cnt << "/"
+                   << states_[table_enum_num].first.Get() << "/"
+                   << states_[table_enum_num].second.Get() << "]";
+    }
+}
+
+// Runs FLAGS_transactions_count transactions one after another.
+void Driver::RunTransactions() {
+    for (int64_t i = 0; i < FLAGS_transactions_count; ++i) {
+        RunOneTransaction();
+    }
+}
+
+// Triggers event_ and then waits on it with FLAGS_driver_wait_times as the
+// limit. A timed-out wait is currently ignored.
+void Driver::Join() {
+    event_.Trigger();
+    if (!event_.TimeWait(FLAGS_driver_wait_times)) {
+        // TODO
+    }
+}
+
+// Draws a number in [1, 100] and dispatches to the first transaction type
+// whose kTpccTransactionRatios entry is not below it; everything above the
+// last checked entry is a new_order.
+void Driver::RunOneTransaction() {
+    int rand_num = rand_gen_->GetRandom(1, 100);
+    if (rand_num <= kTpccTransactionRatios[0]) {        // %4 do stock_level
+        RunStockLevelTxn();
+    } else if (rand_num <= kTpccTransactionRatios[1]) { // %4 do order_status
+        RunOrderStatusTxn();
+    } else if (rand_num <= kTpccTransactionRatios[2]) { // %4 do delivery
+        RunDeliveryTxn();
+    } else if (rand_num <= kTpccTransactionRatios[3]) { // %43 do payment
+        RunPaymentTxn();
+    } else {                                            // %45 do new_order
+        RunNewOrderTxn();
+    }
+}
+
+// Issues one stock_level transaction for a random warehouse/district with a
+// random threshold. The result object is discarded.
+void Driver::RunStockLevelTxn() {
+    int32_t threshold = rand_gen_->GetRandom(kMinStockLevelThreshold, kMaxStockLevelThreshold);
+    StockLevelResult ret;
+    db_->StockLevelTxn(FindWareHouse(), FindDistrict(), threshold, &ret);
+}
+
+// Issues one order_status transaction, selecting the customer by last name
+// or by id. The result object is discarded.
+void Driver::RunOrderStatusTxn() {
+    int x = rand_gen_->GetRandom(1, 100);
+    OrderStatusResult ret;
+    if (x <= 60) {
+        // 60% order_status by lastname
+        std::string last_name = GenLastName(rand_gen_, kCustomerCountPerDistrict);
+        db_->OrderStatusTxn(true, FindWareHouse(), FindDistrict(),
+                            -1, last_name, &ret);
+    } else {
+        // 40% order_status by customer_id
+        db_->OrderStatusTxn(false, FindWareHouse(), FindDistrict(),
+                            FindCustomerId(), "", &ret);
+    }
+}
+
+// Issues one delivery transaction for a random warehouse and carrier, stamped
+// with the current time string. The result object is discarded.
+void Driver::RunDeliveryTxn() {
+    int32_t carrier_id = rand_gen_->GetRandom(kMinCarrierId, kMaxCarrierId);
+    DeliveryResult ret;
+    db_->DeliveryTxn(FindWareHouse(), carrier_id, get_curtime_str(), &ret);
+}
+
+// Issues one payment transaction. The paying customer belongs either to the
+// home warehouse/district or to a different ("remote") warehouse, and is
+// selected by last name or by id. The result object is discarded.
+void Driver::RunPaymentTxn() {
+    int32_t warehouse_id = FindWareHouse();
+    int32_t district_id = FindDistrict();
+
+    float h_amount = rand_gen_->MakeFloat(kRuntimeMinAmount, kRuntimeMaxAmount,
+                                          kRuntimeAmountDigits);
+
+    int32_t customer_warehouse_id = -1;
+    int32_t customer_district_id = -1;
+
+    int x = rand_gen_->GetRandom(1, 100);
+
+    // set customer c_w_id and c_d_id
+    // "||", not "&&": with a single warehouse there is no remote warehouse to
+    // pick (GetRandom(1, 1, warehouse_id) would fall back to the range [1, 0]
+    // and could return 0), and with several warehouses 85% of the payments
+    // must stay local.
+    if (FLAGS_warehouses_count == 1 || x <= 85) {
+        // 85% payment through local warehouse (or only one warehouse)
+        customer_warehouse_id = warehouse_id;
+        customer_district_id = district_id;
+    } else {
+        // 15% payment through remote warehouse
+        customer_warehouse_id =
+            rand_gen_->GetRandom(1, FLAGS_warehouses_count, warehouse_id);
+        customer_district_id = FindDistrict();
+    }
+
+    x = rand_gen_->GetRandom(1, 100);
+    PaymentResult ret;
+    if (x <= 60) {
+        // 60% payment by lastname
+        std::string last_name = GenLastName(rand_gen_, kCustomerCountPerDistrict);
+        db_->PaymentTxn(true, warehouse_id, district_id,
+                        customer_warehouse_id, customer_district_id, -1,
+                        last_name, h_amount, &ret);
+    } else {
+        // 40% payment by customer_id
+        db_->PaymentTxn(false, warehouse_id, district_id,
+                        customer_warehouse_id, customer_district_id, FindCustomerId(),
+                        "", h_amount, &ret);
+    }
+}
+
+// Builds a random NewOrderInfo (order-line count, supplying warehouse, item
+// and quantity per line) and issues one new_order transaction with it. The
+// result object is discarded.
+void Driver::RunNewOrderTxn() {
+    int32_t warehouse_id = FindWareHouse();
+
+    // init NewOrderInfo
+    NewOrderInfo info;
+    // 1% of new_order transactions will be failed
+    info.need_failed = rand_gen_->GetRandom(1, 100) == 1;
+    info.o_ol_cnt = rand_gen_->GetRandom(kMinOrderLineCnt, kMaxOrderLineCnt);
+
+    info.ol_supply_w_ids.reserve(info.o_ol_cnt);
+    info.ol_i_ids.reserve(info.o_ol_cnt);
+    info.ol_quantities.reserve(info.o_ol_cnt);
+    info.o_all_local = 1;
+    for (int32_t i = 0; i < info.o_ol_cnt; ++i) {
+        // 1% of orderlines will be remote order
+        // The random number is drawn for every line, even with one warehouse.
+        bool remote = rand_gen_->GetRandom(1, 100) == 1;
+        if (FLAGS_warehouses_count > 1 && remote) {
+            info.ol_supply_w_ids.emplace_back(
+                rand_gen_->GetRandom(1, FLAGS_warehouses_count, warehouse_id));
+            info.o_all_local = 0;
+        } else {
+            info.ol_supply_w_ids.emplace_back(warehouse_id);
+        }
+        info.ol_i_ids.emplace_back(FindItemId());
+        info.ol_quantities.emplace_back(
+            rand_gen_->GetRandom(1, kMaxOrderLineQuantity));
+    }
+
+    NewOrderResult ret;
+    db_->NewOrderTxn(warehouse_id, FindDistrict(), FindCustomerId(), info, &ret);
+}
+
+// Adds task to the thread pool, first polling every 100 microseconds until
+// the number of pending tasks is at most half of the configured pool size.
+void Driver::PushToInsertQueue(const ThreadPool::Task& task) {
+    while (thread_pool_.PendingNum() > FLAGS_tpcc_run_gtxn_thread_pool_size / 2) {
+        usleep(100);
+    }
+    thread_pool_.AddTask(task);
+    VLOG(12) << "thread_pool pending num = " << thread_pool_.PendingNum();
+}
+
+// Random warehouse id in [1, FLAGS_warehouses_count].
+int32_t Driver::FindWareHouse() {
+    return rand_gen_->GetRandom(1, FLAGS_warehouses_count);
+}
+
+// Random district id in [1, kDistrictCountPerWarehouse].
+int32_t Driver::FindDistrict() {
+    return rand_gen_->GetRandom(1, kDistrictCountPerWarehouse);
+}
+
+// Non-uniform random customer id: NURand(1023, 1, kCustomerCountPerDistrict).
+int32_t Driver::FindCustomerId() {
+    return rand_gen_->NURand(1023, 1, kCustomerCountPerDistrict);
+}
+
+// Non-uniform random item id: NURand(8191, 1, kItemCount).
+int32_t Driver::FindItemId() {
+    return rand_gen_->NURand(8191, 1, kItemCount);
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/driver.h b/src/benchmark/tpcc/driver.h
new file mode 100644
index 000000000..56bf5a66f
--- /dev/null
+++ b/src/benchmark/tpcc/driver.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc.
All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_DRIVER_H
+#define TERA_BENCHMARK_TPCC_DRIVER_H
+
+// NOTE(review): the targets of the next two #include lines are missing from
+// this copy of the patch (angle-bracket text appears to have been stripped);
+// restore them from the original commit.
+#include
+#include
+
+#include "benchmark/tpcc/random_generator.h"
+#include "benchmark/tpcc/tpccdb.h"
+#include "common/counter.h"
+#include "common/event.h"
+#include "common/thread_pool.h"
+
+namespace tera {
+namespace tpcc {
+
+// Generates the TPC-C run-phase workload: picks a transaction type at random,
+// fills in random parameters and issues it against a TpccDb.
+class Driver {
+public:
+    // Stores both pointers; the destructor below does not delete them.
+    Driver(RandomGenerator* random_gen, TpccDb* db);
+    ~Driver(){}
+    // Runs FLAGS_transactions_count transactions sequentially.
+    void RunTransactions();
+    // Triggers the completion event and waits on it with a time limit.
+    void Join();
+
+private:
+    // Logs need/succ/fail counts for the table with the given enum index.
+    void PrintJoinTimeoutInfo(int need_cnt, int table_enum_num);
+
+    // for run transaction
+    void RunOneTransaction();
+    //
+    void RunStockLevelTxn();
+
+    void RunOrderStatusTxn();
+
+    void RunDeliveryTxn();
+
+    void RunPaymentTxn();
+
+    void RunNewOrderTxn();
+
+    // for async run txn
+    void PushToInsertQueue(const ThreadPool::Task& task);
+
+    // Random pickers for the ids used as transaction parameters.
+    int32_t FindWareHouse();
+
+    int32_t FindDistrict();
+
+    int32_t FindCustomerId();
+
+    int32_t FindItemId();
+private:
+    // NOTE(review): the template arguments of this typedef are missing from
+    // this copy of the patch. driver.cc uses states_[i].first.Get() and
+    // states_[i].second.Get(), so presumably a vector of counter pairs --
+    // confirm against the original commit.
+    typedef std::vector> TxnStates;
+    CompletedEvent event_;
+    RandomGenerator* rand_gen_;
+    TpccDb* db_;
+    TxnStates states_;
+    std::string now_datatime_;
+    common::ThreadPool thread_pool_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_DRIVER_H */
diff --git a/src/benchmark/tpcc/mock_tpccdb.cc b/src/benchmark/tpcc/mock_tpccdb.cc
new file mode 100644
index 000000000..ee8cce0d0
--- /dev/null
+++ b/src/benchmark/tpcc/mock_tpccdb.cc
@@ -0,0 +1,18 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/mock_tpccdb.h"
+
+// NOTE(review): the targets of the next two #include lines are missing from
+// this copy of the patch; restore them from the original commit.
+#include
+#include
+
+namespace tera {
+namespace tpcc {
+
+MockTpccDb::MockTpccDb() : flag_(true) {}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/mock_tpccdb.h b/src/benchmark/tpcc/mock_tpccdb.h
new file mode 100644
index 000000000..0f29f0320
--- /dev/null
+++ b/src/benchmark/tpcc/mock_tpccdb.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_MOCK_TPCCDB_H
+#define TERA_BENCHMARK_TPCC_MOCK_TPCCDB_H
+
+#include "benchmark/tpcc/tpccdb.h"
+
+namespace tera {
+namespace tpcc {
+
+class TpccDb;
+class TxnResult;
+
+// TpccDb stand-in that touches no storage: table management reports success,
+// every Insert* returns flag_ (set to true by the constructor) and every
+// transaction is a no-op that leaves its result object untouched.
+class MockTpccDb : public TpccDb {
+public:
+    MockTpccDb();
+    virtual ~MockTpccDb() {}
+
+    virtual bool CreateTables() { return true; }
+    virtual bool CleanTables() { return true; }
+
+    // init db
+    virtual bool InsertItem(const Item& i) {
+        return flag_;
+    }
+
+    virtual bool InsertWarehouse(const Warehouse& w) {
+        return flag_;
+    }
+
+    virtual bool InsertDistrict(const District& d) {
+        return flag_;
+    }
+
+    virtual bool InsertCustomer(const Customer& c) {
+        return flag_;
+    }
+
+    virtual bool InsertHistory(const History& h) {
+        return flag_;
+    }
+
+    virtual bool InsertStock(const Stock& s) {
+        return flag_;
+    }
+
+    virtual bool InsertOrder(const Order& o) {
+        return flag_;
+    }
+
+    virtual bool InsertOrderLine(const OrderLine& ol) {
+        return flag_;
+    }
+
+    virtual bool InsertNewOrder(const NewOrder& no) {
+        return flag_;
+    }
+
+    virtual void StockLevelTxn(int32_t warehouse_id, int32_t district_id,
+                               int32_t threshold,
+                               StockLevelResult* ret) {}
+
+    virtual void DeliveryTxn(int32_t warehouse_id,
+                             int32_t carrier_id,
+                             const std::string& delivery_datetime,
+                             DeliveryResult* ret) {}
+
+    virtual void OrderStatusTxn(bool by_last_name,
+                                int32_t warehouse_id, int32_t district_id,
+                                int32_t c_customer_id,
+                                const std::string& last_name,
+                                OrderStatusResult* ret) {}
+
+    virtual void PaymentTxn(bool by_last_name,
+                            int32_t warehouse_id, int32_t district_id,
+                            int32_t c_warehouse_id, int32_t c_district_id,
+                            int32_t c_customer_id,
+                            const std::string& last_name,
+                            int32_t h_amount,
+                            PaymentResult* ret) {}
+
+    virtual void NewOrderTxn(int32_t warehouse_id,
+                             int32_t district_id,
+                             int32_t customer_id, const NewOrderInfo& info,
+                             NewOrderResult* ret) {}
+
+private:
+    // Value returned by every Insert* method.
+    bool flag_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_MOCK_TPCCDB_H */
diff --git a/src/benchmark/tpcc/random_generator.cc b/src/benchmark/tpcc/random_generator.cc
new file mode 100644
index 000000000..9308ec6e9
--- /dev/null
+++ b/src/benchmark/tpcc/random_generator.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/random_generator.h"
+
+// NOTE(review): the target of the next #include is missing from this copy of
+// the patch; restore it from the original commit.
+#include
+
+namespace tera {
+namespace tpcc {
+
+// Zeroes the NURand constants and seeds the private random state.
+RandomGenerator::RandomGenerator():c_({0,0,0}) {
+    InitRandomState();
+}
+
+// Seeds rand_state_ (backed by rand_state_buf_) from the current time.
+// NOTE(review): the template argument of the static_cast below is missing
+// from this copy of the patch; restore it from the original commit.
+void RandomGenerator::InitRandomState() {
+    memset(&rand_state_, 0, sizeof(rand_state_));
+    int ret = initstate_r(static_cast(time(NULL)),
+                          rand_state_buf_,
+                          sizeof(rand_state_buf_),
+                          &rand_state_);
+    assert(ret == 0);
+}
+
+// Returns a copy of the current NURand constants.
+NURandConstant RandomGenerator::GetRandomConstant() const {
+    return c_;
+}
+
+// Draws fresh NURand constants with no constraint between them.
+void RandomGenerator::SetRandomConstant() {
+    c_.c_last = GetRandom(0, 255);
+    c_.c_id = GetRandom(0, 1023);
+    c_.ol_i_id = GetRandom(0, 8191);
+}
+
+// True when |run_last - load_last| lies in [65, 119] and is neither 96 nor 112.
+inline bool VarfiyConstantAvailableForRun(int run_last, int load_last) {
+    int delta = run_last - load_last;
+    delta = delta > 0 ? delta : -1 * delta;
+    return 65 <= delta && delta <= 119 && delta != 96 && delta != 112;
+}
+
+// Draws fresh NURand constants, redrawing c_last until it passes
+// VarfiyConstantAvailableForRun() against the c_last of constant_for_load.
+void RandomGenerator::SetRandomConstant(const NURandConstant& constant_for_load) {
+    c_.c_last = GetRandom(0, 255);
+    c_.c_id = GetRandom(0, 1023);
+    c_.ol_i_id = GetRandom(0, 8191);
+    while (!VarfiyConstantAvailableForRun(c_.c_last, constant_for_load.c_last)) {
+        c_.c_last = GetRandom(0, 255);
+    }
+}
+
+// Returns a random int in the closed range spanned by lower and upper; the
+// two bounds may be passed in either order.
+int RandomGenerator::GetRandom(int lower, int upper) {
+    int ret = 0;
+    int err = random_r(&rand_state_, &ret);
+    assert(err == 0);
+    return lower <= upper ? (ret % (upper - lower + 1) + lower) : (ret % (lower - upper + 1) + upper);
+}
+
+// Like GetRandom(lower, upper) but never returns exclude when exclude lies
+// inside [lower, upper]: a value is drawn from a range one shorter and every
+// value at or above exclude is shifted up by one.
+int RandomGenerator::GetRandom(int lower, int upper, int exclude) {
+    if (exclude > upper || exclude < lower) {
+        return GetRandom(lower, upper);
+    } else {
+        int rand = GetRandom(lower, upper - 1);
+        if (rand >= exclude) {
+            ++rand;
+        }
+        return rand;
+    }
+}
+
+// Random string of lowercase letters with a length in [lower_len, upper_len].
+std::string RandomGenerator::MakeAString(int lower_len, int upper_len) {
+    int len = GetRandom(lower_len, upper_len);
+    std::string ret;
+    for (int i = 0; i < len; ++i) {
+        ret += (char)('a' + GetRandom(0, 25));
+    }
+    return ret;
+}
+
+// Random string of decimal digits with a length in [lower_len, upper_len].
+std::string RandomGenerator::MakeNString(int lower_len, int upper_len) {
+    int len = GetRandom(lower_len, upper_len);
+    std::string ret;
+    for (int i = 0; i < len; ++i) {
+        ret += (char)('0' + GetRandom(0, 9));
+    }
+    return ret;
+}
+
+// Random float between lower and upper on a grid of 10^-digits: both bounds
+// are scaled by 10^digits and rounded, an integer is drawn between them and
+// scaled back.
+float RandomGenerator::MakeFloat(float lower, float upper, int digits) {
+    float num = 1.0;
+    for (int i = 0; i < digits; ++i) {
+        num *= 10;
+    }
+    return GetRandom(int(lower * num + 0.5), int(upper * num + 0.5)) / num;
+}
+
+// Returns the values lower..upper in random order: each value is written to a
+// randomly probed slot that is still free (marked -1).
+// NOTE(review): the element type of std::vector is missing from this copy of
+// the patch (twice below); restore it from the original commit.
+std::vector RandomGenerator::MakeDisOrderList(int lower, int upper) {
+    std::vector ret(upper - lower + 1, -1);
+    for (int i = 0; i < upper - lower + 1; ++i) {
+        int rand_pos = GetRandom(0, upper - lower);
+        while (true) {
+            if (ret[rand_pos] == -1) {
+                ret[rand_pos] = lower + i;
+                break;
+            }
+            rand_pos = GetRandom(0, upper - lower);
+        }
+    }
+    return ret;
+}
+
+// Non-uniform random value in [x, y]:
+//   (((random(0, A) | random(x, y)) + C) % (y - x + 1)) + x
+// where C is the stored constant that belongs to A. A must be 255, 1023 or
+// 8191; any other value logs an error and aborts the process.
+int RandomGenerator::NURand(int A, int x, int y) {
+    int C = 0;
+    switch(A) {
+        case 255:
+            C = c_.c_last;
+            break;
+        case 1023:
+            C = c_.c_id;
+            break;
+        case 8191:
+            C = c_.ol_i_id;
+            break;
+        default:
+            LOG(ERROR) << "NURand: A = " << A << " not available";
+            abort();
+    }
+    return (((GetRandom(0, A) | GetRandom(x, y)) + C) % (y - x + 1)) + x;
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/random_generator.h b/src/benchmark/tpcc/random_generator.h
new file mode 100644
index 000000000..c39070294
--- /dev/null
+++ b/src/benchmark/tpcc/random_generator.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_RANDOM_GENERATOR_H
+#define TERA_BENCHMARK_TPCC_RANDOM_GENERATOR_H
+
+// NOTE(review): the targets of the next three #include lines are missing from
+// this copy of the patch; restore them from the original commit.
+#include
+#include
+#include
+
+#include "benchmark/tpcc/tpcc_types.h"
+
+namespace tera {
+namespace tpcc {
+
+// The three constants added inside NURand(), one per supported value of A
+// (255, 1023 and 8191 respectively).
+struct NURandConstant {
+    int c_last;
+    int c_id;
+    int ol_i_id;
+};
+
+// Random source for the TPC-C generators, built on random_r() with a state
+// buffer owned by the object.
+class RandomGenerator {
+public:
+    RandomGenerator();
+    virtual ~RandomGenerator(){}
+
+    NURandConstant GetRandomConstant() const;
+    void SetRandomConstant();
+    void SetRandomConstant(const NURandConstant& constant_for_load);
+
+    // make a string A len=rand[lower_len, upper_len] A[x] = set(a..z)
+    std::string MakeAString(int lower_len, int upper_len);
+
+    // make a string N len=rand[lower_len, upper_len] N[x] = set(0..9)
+    std::string MakeNString(int lower_len, int upper_len);
+
+    float MakeFloat(float lower, float upper, int digits);
+
+    // NOTE(review): the element type of std::vector is missing from this copy
+    // of the patch; restore it from the original commit.
+    std::vector MakeDisOrderList(int lower, int upper);
+
+    int NURand(int A, int lower, int upper);
+
+    // get rand int from [lower, upper]
+    int GetRandom(int lower, int upper);
+
+    // get rand int from [lower, upper] that is never equal to exclude
+    int GetRandom(int lower, int upper, int exclude);
+private:
+    void InitRandomState();
+private:
+    // for system call random_r and initstate_r
+    char rand_state_buf_[kRandomStateSize];
+    struct random_data rand_state_;
+
+    // for NURand, need a constant
+    NURandConstant c_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_RANDOM_GENERATOR_H */
diff --git a/src/benchmark/tpcc/tera_tpccdb.cc b/src/benchmark/tpcc/tera_tpccdb.cc
new file mode 100644
index 000000000..f35f4ed2a
--- /dev/null
+++ b/src/benchmark/tpcc/tera_tpccdb.cc
@@ -0,0 +1,538 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+// NOTE(review): the targets of the next two #include lines are missing from
+// this copy of the patch; restore them from the original commit.
+#include
+#include
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+DECLARE_string(tera_client_flagfile);
+DECLARE_string(tera_table_schema_dir);
+
+namespace tera {
+namespace tpcc {
+
+// Creates the Tera client from FLAGS_tera_client_flagfile; the process exits
+// immediately when the client cannot be created.
+TeraTpccDb::TeraTpccDb() : client_(NULL) {
+    ErrorCode error_code;
+    client_ = Client::NewClient(FLAGS_tera_client_flagfile, "tera_tpcc", &error_code);
+    if (client_ == NULL) {
+        LOG(ERROR) << "new client failed. err:" << error_code.ToString();
+        _Exit(EXIT_FAILURE);
+    }
+}
+
+TeraTpccDb::~TeraTpccDb() {
+    delete client_;
+}
+
+// For every name in kTpccTables: parses the schema file
+// FLAGS_tera_table_schema_dir + name, creates the table, opens it and stores
+// the handle in table_map_. Returns false at the first failure.
+bool TeraTpccDb::CreateTables() {
+    ErrorCode err;
+    for (auto table : kTpccTables) {
+        std::string schema_file = FLAGS_tera_table_schema_dir + table;
+        // Stack object: released on every exit path without explicit deletes.
+        TableDescriptor desc;
+        if (!ParseTableSchemaFile(schema_file, &desc, &err)) {
+            LOG(ERROR) << "load schema failed, schema_file:" << schema_file << "err:" << err.ToString();
+            return false;
+        }
+        if (!(client_->CreateTable(desc, &err) && err.GetType() == ErrorCode::kOK)) {
+            LOG(ERROR) << "create table " << table << " failed";
+            return false;
+        }
+        LOG(INFO) << "create table " << table << " ok";
+        Table* table_ptr = client_->OpenTable(table, &err);
+        if (table_ptr == NULL) {
+            LOG(ERROR) << "open table " << table << " failed";
+            return false;
+        }
+        table_map_[table] = table_ptr;
+        LOG(INFO) << "open table " << table << " ok";
+    }
+    return true;
+}
+
+// Disables every table in kTpccTables, polls once a second (at most 20 times)
+// until all of its tablets report a disabled/offline status, then deletes the
+// table. Always returns true; failures are only logged.
+bool TeraTpccDb::CleanTables() {
+    ErrorCode err;
+    for (auto table : kTpccTables) {
+        if (!client_->DisableTable(table, &err)) {
+            // NOTE(review): source text is missing inside the next line in this
+            // copy of the patch (between `" err: " <` and `(client_);`). The
+            // lost span must hold the end of this log statement and the
+            // declarations of table_meta, tablet_list and client_impl used
+            // below; restore it from the original commit.
+            LOG(ERROR) << "fail to disable table : " << table << " err: " <(client_);
+            if (!client_impl->ShowTablesInfo(table, &table_meta, &tablet_list, &err)) {
+                LOG(ERROR) << "table not exist: " << table;
+                continue;
+            }
+            uint64_t tablet_num = tablet_list.meta_size();
+            VLOG(11) << tablet_num;
+            int wait_times = 0;
+            while (true) {
+                if (!client_impl->ShowTablesInfo(table, &table_meta, &tablet_list, &err)) {
+                    LOG(ERROR) << "table not exist: " << table;
+                    break;
+                }
+                uint64_t tablet_cnt = 0;
+                for (int32_t i = 0; i < tablet_list.meta_size(); ++i) {
+                    const TabletMeta& tablet = tablet_list.meta(i);
+                    if (tablet.status() == kTabletDisable || tablet.status() == kTableOffLine) {
+                        tablet_cnt++;
+                    }
+                }
+                if (tablet_cnt == tablet_num) {
+                    break;
+                }
+                if (wait_times < 20) {
+                    sleep(1);
+                    // The counter was never advanced, so the 20-try limit
+                    // could not fire and this loop could spin forever.
+                    ++wait_times;
+                } else {
+                    LOG(ERROR) << "disable table : " << table << " failed, try " << wait_times << " time(s)";
+                    break;
+                }
+            }
+        }
+        if (!client_->DeleteTable(table, &err)) {
+            LOG(ERROR) << "drop table: " << table << " failed. " << err.ToString();
+        } else {
+            LOG(INFO) << "drop table: "<< table << " done.";
+        }
+    }
+    return true;
+}
+
+// init db
+//
+// Every Insert* below writes one record inside its own global transaction and
+// returns false when a required table was not opened by CreateTables() or
+// when the transaction reports an error after Commit().
+
+// Writes one row of t_item keyed by i.PrimaryKey().
+bool TeraTpccDb::InsertItem(const Item& i) {
+    std::string tablename = "t_item";
+    if (table_map_.find(tablename) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* mu = table->NewRowMutation(i.PrimaryKey());
+    mu->Put("cf0", "i_id", std::to_string(i.i_id));
+    mu->Put("cf0", "i_im_id", std::to_string(i.i_im_id));
+    mu->Put("cf0", "i_price", std::to_string(i.i_price));
+    mu->Put("cf0", "i_name", i.i_name);
+    mu->Put("cf0", "i_data", i.i_data);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one row of t_warehouse keyed by w.PrimaryKey().
+bool TeraTpccDb::InsertWarehouse(const Warehouse& w) {
+    std::string tablename = "t_warehouse";
+    if (table_map_.find(tablename) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* mu = table->NewRowMutation(w.PrimaryKey());
+    mu->Put("cf0", "w_id", std::to_string(w.w_id));
+    mu->Put("cf0", "w_tax", std::to_string(w.w_tax));
+    mu->Put("cf0", "w_ytd", std::to_string(w.w_ytd));
+    mu->Put("cf0", "w_name", w.w_name);
+    mu->Put("cf0", "w_street_1", w.w_street_1);
+    mu->Put("cf0", "w_street_2", w.w_street_2);
+    mu->Put("cf0", "w_city", w.w_city);
+    mu->Put("cf0", "w_state", w.w_state);
+    mu->Put("cf0", "w_zip", w.w_zip);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one row of t_district keyed by d.PrimaryKey().
+bool TeraTpccDb::InsertDistrict(const District& d) {
+    std::string tablename = "t_district";
+    if (table_map_.find(tablename) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* mu = table->NewRowMutation(d.PrimaryKey());
+    mu->Put("cf0", "d_id", std::to_string(d.d_id));
+    mu->Put("cf0", "d_w_id", std::to_string(d.d_w_id));
+    mu->Put("cf0", "d_tax", std::to_string(d.d_tax));
+    mu->Put("cf0", "d_ytd", std::to_string(d.d_ytd));
+    mu->Put("cf0", "d_next_o_id", std::to_string(d.d_next_o_id));
+    mu->Put("cf0", "d_name", d.d_name);
+    mu->Put("cf0", "d_street_1", d.d_street_1);
+    mu->Put("cf0", "d_street_2", d.d_street_2);
+    mu->Put("cf0", "d_city", d.d_city);
+    mu->Put("cf0", "d_state", d.d_state);
+    mu->Put("cf0", "d_zip", d.d_zip);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one row of t_customer keyed by c.PrimaryKey() and, in the same
+// transaction, one row of t_customer_last_index keyed by
+// "<c_w_id>_<c_d_id>_<c_last>_<c_id>".
+bool TeraTpccDb::InsertCustomer(const Customer& c) {
+    std::string tablename = "t_customer";
+    std::string c_last_index_name = "t_customer_last_index";
+    if (table_map_.find(tablename) == table_map_.end()
+        || table_map_.find(c_last_index_name) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    // The index row belongs in the last-name index table. It used to be
+    // written into t_customer itself, leaving the index that GetCustomer()
+    // scans empty.
+    Table* t_index = table_map_[c_last_index_name];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    std::string key = std::to_string(c.c_w_id) + "_" + std::to_string(c.c_d_id)
+                      + "_" + c.c_last + "_" + std::to_string(c.c_id);
+    RowMutation* index_mu = t_index->NewRowMutation(key);
+    index_mu->Put("cf0", "c_id", std::to_string(c.c_id));
+    index_mu->Put("cf0", "c_d_id", std::to_string(c.c_d_id));
+    index_mu->Put("cf0", "c_w_id", std::to_string(c.c_w_id));
+    index_mu->Put("cf0", "c_last", c.c_last);
+    gtxn->ApplyMutation(index_mu);
+    delete index_mu;
+
+    RowMutation* mu = table->NewRowMutation(c.PrimaryKey());
+    mu->Put("cf0", "c_id", std::to_string(c.c_id));
+    mu->Put("cf0", "c_d_id", std::to_string(c.c_d_id));
+    mu->Put("cf0", "c_w_id", std::to_string(c.c_w_id));
+    mu->Put("cf0", "c_credit_lim", std::to_string(c.c_credit_lim));
+    mu->Put("cf0", "c_discount", std::to_string(c.c_discount));
+    mu->Put("cf0", "c_balance", std::to_string(c.c_balance));
+    mu->Put("cf0", "c_ytd_payment", std::to_string(c.c_ytd_payment));
+    mu->Put("cf0", "c_payment_cnt", std::to_string(c.c_payment_cnt));
+    mu->Put("cf0", "c_delivery_cnt", std::to_string(c.c_delivery_cnt));
+    mu->Put("cf0", "c_first", c.c_first);
+    mu->Put("cf0", "c_middle", c.c_middle);
+    mu->Put("cf0", "c_last", c.c_last);
+    mu->Put("cf0", "c_street_1", c.c_street_1);
+    mu->Put("cf0", "c_street_2", c.c_street_2);
+    mu->Put("cf0", "c_city", c.c_city);
+    mu->Put("cf0", "c_state", c.c_state);
+    mu->Put("cf0", "c_zip", c.c_zip);
+    mu->Put("cf0", "c_phone", c.c_phone);
+    mu->Put("cf0", "c_since", c.c_since);
+    mu->Put("cf0", "c_credit", c.c_credit);
+    mu->Put("cf0", "c_data", c.c_data);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Reads the running counter stored in row "count" of t_history_index,
+// increments it, writes it back and stores h in t_history under the new
+// counter value, all in one transaction.
+// NOTE(review): if Transaction::Get() reports a missing "count" row through
+// the transaction error, GetValues() fails and the very first insert returns
+// false; confirm how the counter row is expected to be seeded.
+bool TeraTpccDb::InsertHistory(const History& h) {
+    std::string tablename = "t_history";
+    std::string history_index_name = "t_history_index";
+
+    if (table_map_.find(tablename) == table_map_.end() ||
+        table_map_.find(history_index_name) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Table* t_history_index = table_map_[history_index_name];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+
+    RowReader* hindex_reader = t_history_index->NewRowReader("count");
+    RetTuples hindex_ret;
+    int cnt = 0;
+    TxnResult ret;
+    if (hindex_reader->GetError().GetType() == ErrorCode::kNotFound) {
+        // GetValues() is skipped on this path, so the reader is released here
+        // (it used to leak).
+        delete hindex_reader;
+    } else {
+        // GetValues() deletes hindex_reader on every path, so the reader must
+        // not be dereferenced afterwards (the old code read its error state
+        // after the delete).
+        if (!GetValues(&ret, gtxn, hindex_reader,
+                       {"count"},
+                       &hindex_ret,
+                       "@insert_history|hindex_reader|count")) {
+            delete gtxn; // used to leak on this early return
+            return false;
+        }
+        // No stored counter yet: start from zero.
+        if (hindex_ret.count("count") != 0) {
+            cnt = std::stoi(hindex_ret["count"]);
+        }
+    }
+
+    RowMutation* hindex_mu = t_history_index->NewRowMutation("count");
+    hindex_mu->Put("cf0", "count", std::to_string(++cnt));
+    gtxn->ApplyMutation(hindex_mu);
+    delete hindex_mu;
+
+    RowMutation* mu = table->NewRowMutation(std::to_string(cnt));
+    mu->Put("cf0", "h_c_id", std::to_string(h.h_c_id));
+    mu->Put("cf0", "h_c_d_id", std::to_string(h.h_c_d_id));
+    mu->Put("cf0", "h_c_w_id", std::to_string(h.h_c_w_id));
+    mu->Put("cf0", "h_d_id", std::to_string(h.h_d_id));
+    mu->Put("cf0", "h_w_id", std::to_string(h.h_w_id));
+    mu->Put("cf0", "h_amount", std::to_string(h.h_amount));
+    mu->Put("cf0", "h_date", h.h_date);
+    mu->Put("cf0", "h_data", h.h_data);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one row of t_stock keyed by s.PrimaryKey(); the entries of s.s_dist
+// go to the columns s_dist_1, s_dist_2, ... in iteration order.
+bool TeraTpccDb::InsertStock(const Stock& s) {
+    std::string tablename = "t_stock";
+    if (table_map_.find(tablename) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* mu = table->NewRowMutation(s.PrimaryKey());
+
+    mu->Put("cf0", "s_i_id", std::to_string(s.s_i_id));
+    mu->Put("cf0", "s_w_id", std::to_string(s.s_w_id));
+    mu->Put("cf0", "s_quantity", std::to_string(s.s_quantity));
+    mu->Put("cf0", "s_ytd", std::to_string(s.s_ytd));
+    mu->Put("cf0", "s_order_cnt", std::to_string(s.s_order_cnt));
+    mu->Put("cf0", "s_remote_cnt", std::to_string(s.s_remote_cnt));
+    int i = 0;
+    for (auto dist : s.s_dist) {
+        mu->Put("cf0", "s_dist_" + std::to_string(++i), dist);
+    }
+    mu->Put("cf0", "s_data", s.s_data);
+
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one row of t_order keyed by o.PrimaryKey() and, in the same
+// transaction, one row of t_order_index keyed by "<o.ForeignKey()>_<o_id>".
+bool TeraTpccDb::InsertOrder(const Order& o) {
+    std::string tablename = "t_order";
+    std::string indexname = "t_order_index";
+    if (table_map_.find(tablename) == table_map_.end() ||
+        table_map_.find(indexname) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Table* index = table_map_[indexname];
+
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* mu = table->NewRowMutation(o.PrimaryKey());
+    std::string index_key = o.ForeignKey() + "_" + std::to_string(o.o_id);
+    RowMutation* index_mu = index->NewRowMutation(index_key);
+    index_mu->Put("cf0", "o_id", std::to_string(o.o_id));
+    index_mu->Put("cf0", "o_c_id", std::to_string(o.o_c_id));
+    index_mu->Put("cf0", "o_d_id", std::to_string(o.o_d_id));
+    index_mu->Put("cf0", "o_w_id", std::to_string(o.o_w_id));
+    mu->Put("cf0", "o_id", std::to_string(o.o_id));
+    mu->Put("cf0", "o_c_id", std::to_string(o.o_c_id));
+    mu->Put("cf0", "o_d_id", std::to_string(o.o_d_id));
+    mu->Put("cf0", "o_w_id", std::to_string(o.o_w_id));
+    mu->Put("cf0", "o_carrier_id", std::to_string(o.o_carrier_id));
+    mu->Put("cf0", "o_ol_cnt", std::to_string(o.o_ol_cnt));
+    mu->Put("cf0", "o_all_local", std::to_string(o.o_all_local));
+    mu->Put("cf0", "o_entry_d", o.o_entry_d);
+    gtxn->ApplyMutation(mu);
+    gtxn->ApplyMutation(index_mu);
+    delete mu;
+    delete index_mu;
+    gtxn->Commit();
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one row of t_orderline keyed by ol.PrimaryKey().
+bool TeraTpccDb::InsertOrderLine(const OrderLine& ol) {
+    std::string tablename = "t_orderline";
+    if (table_map_.find(tablename) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* mu = table->NewRowMutation(ol.PrimaryKey());
+    mu->Put("cf0", "ol_o_id", std::to_string(ol.ol_o_id));
+    mu->Put("cf0", "ol_d_id", std::to_string(ol.ol_d_id));
+    mu->Put("cf0", "ol_w_id", std::to_string(ol.ol_w_id));
+    mu->Put("cf0", "ol_number", std::to_string(ol.ol_number));
+    mu->Put("cf0", "ol_i_id", std::to_string(ol.ol_i_id));
+    mu->Put("cf0", "ol_supply_w_id", std::to_string(ol.ol_supply_w_id));
+    mu->Put("cf0", "ol_quantity", std::to_string(ol.ol_quantity));
+    mu->Put("cf0", "ol_amount", std::to_string(ol.ol_amount));
+    mu->Put("cf0", "ol_delivery_d", ol.ol_delivery_d);
+    mu->Put("cf0", "ol_dist_info", ol.ol_dist_info);
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Writes one row of t_neworder keyed by no.PrimaryKey().
+bool TeraTpccDb::InsertNewOrder(const NewOrder& no) {
+    std::string tablename = "t_neworder";
+    if (table_map_.find(tablename) == table_map_.end()) {
+        return false;
+    }
+    Table* table = table_map_[tablename];
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    RowMutation* mu = table->NewRowMutation(no.PrimaryKey());
+    mu->Put("cf0", "no_o_id", std::to_string(no.no_o_id));
+    mu->Put("cf0", "no_d_id", std::to_string(no.no_d_id));
+    mu->Put("cf0", "no_w_id", std::to_string(no.no_w_id));
+    gtxn->ApplyMutation(mu);
+    gtxn->Commit();
+    delete mu;
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        LOG(ERROR) << "insert table:" << tablename << " failed. err:"
+                   << gtxn->GetError().ToString();
+        delete gtxn;
+        return false;
+    }
+    delete gtxn;
+    return true;
+}
+
+// Stores state in ret together with the transaction's error reason; a
+// non-empty msg is appended to the reason as " msg:<msg>".
+void TeraTpccDb::SetTxnResult(TxnResult* ret, Transaction* gtxn, bool state,
+                              const std::string& msg) {
+    ret->SetState(state);
+    if (msg != "") {
+        ret->SetReason(gtxn->GetError().GetReason() + " msg:" + msg);
+    } else {
+        ret->SetReason(gtxn->GetError().GetReason());
+    }
+}
+
+// Reads the listed "cf0" qualifiers of reader's row through gtxn and copies
+// the first stored value of each qualifier that is present into ret_tuples.
+// On a transaction error ret is marked failed with if_error_msg and false is
+// returned. The reader is deleted on every path, so callers must not use it
+// after this call.
+// NOTE(review): the element types of std::initializer_list and std::vector
+// are missing from this copy of the patch; restore them from the original
+// commit.
+bool TeraTpccDb::GetValues(TxnResult* ret, Transaction* gtxn, RowReader* reader,
+                           std::initializer_list qu_names_initlist,
+                           RetTuples* ret_tuples,
+                           const std::string& if_error_msg) {
+    std::vector qu_names(qu_names_initlist);
+    for (auto& qu_name : qu_names) {
+        reader->AddColumn("cf0", qu_name);
+    }
+    gtxn->Get(reader);
+    if (gtxn->GetError().GetType() != ErrorCode::kOK) {
+        SetTxnResult(ret, gtxn, false, if_error_msg);
+        delete reader;
+        return false;
+    } else {
+        RowReader::TRow row;
+        reader->ToMap(&row);
+        for (auto qu_name : qu_names) {
+            if (row["cf0"].find(qu_name) != row["cf0"].end()) {
+                // Only the first entry stored under the qualifier is used.
+                for (auto k : row["cf0"][qu_name]) {
+                    ret_tuples->insert({{qu_name, k.second}});
+                    break;
+                }
+            }
+        }
+        delete reader;
+    }
+    return true;
+}
+
+// Loads one customer row into customer_ret and its row key into customer_key.
+// By id the key is "<warehouse_id>_<district_id>_<customer_id>". By last name
+// the last-name index is scanned for that warehouse/district/name and the
+// middle match is used (the lower middle for an even number of matches).
+// Returns false, with ret marked failed, when a read fails or when no
+// customer carries the requested last name.
+bool TeraTpccDb::GetCustomer(TxnResult* ret, Transaction* gtxn, bool by_last_name,
+                             const std::string& last_name, int32_t customer_id,
+                             int32_t warehouse_id, int32_t district_id,
+                             std::string* customer_key, RetTuples* customer_ret) {
+    // open table
+    Table* t_customer_last_index = table_map_[kTpccTables[kCustomerLastIndex]];
+    Table* t_customer = table_map_[kTpccTables[kCustomerTable]];
+    *customer_key = std::to_string(warehouse_id) + "_" + std::to_string(district_id) + "_";
+
+    if (by_last_name) {
+        ErrorCode error_code;
+        std::string start_key = *customer_key + last_name + "_";
+        ScanDescriptor scan_desc(start_key);
+        scan_desc.SetEnd(start_key + "~");
+        scan_desc.AddColumnFamily("cf0");
+        ResultStream* scanner = t_customer_last_index->Scan(scan_desc, &error_code);
+        // NOTE(review): the element type of std::vector is missing from this
+        // copy of the patch; restore it from the original commit.
+        std::vector keys;
+        for (scanner->LookUp(start_key); !scanner->Done(); scanner->Next()) {
+            std::string row_key = scanner->RowName();
+            if (row_key.find(start_key) == std::string::npos) {
+                break;
+            }
+
+            RowReader* index_reader = t_customer_last_index->NewRowReader(row_key);
+            RetTuples index_ret;
+            if (!GetValues(ret, gtxn, index_reader,
+                           {"c_id"},
+                           &index_ret,
+                           "@get_customer|index_reader|" + row_key)) {
+                delete scanner;
+                return false;
+            }
+            keys.push_back(index_ret["c_id"]);
+        }
+        delete scanner;
+        // Without any match the position below would wrap around (0 / 2 - 1
+        // on an unsigned value) and keys.at() would throw; report a failed
+        // lookup instead.
+        if (keys.empty()) {
+            SetTxnResult(ret, gtxn, false,
+                         "@get_customer|no customer with last name|" + start_key);
+            return false;
+        }
+        size_t pos = keys.size();
+        pos = pos % 2 == 0 ? (pos / 2 - 1) : (pos / 2);
+        *customer_key += keys.at(pos);
+    } else {
+        *customer_key += std::to_string(customer_id);
+    }
+    RowReader* customer_reader = t_customer->NewRowReader(*customer_key);
+    if (!GetValues(ret, gtxn, customer_reader,
+                   {"c_id", "c_d_id", "c_w_id", "c_first", "c_middle", "c_last",
+                    "c_balance", "c_ytd_payment", "c_payment_cnt", "c_credit",
+                    "c_data", "c_street_1", "c_street_2", "c_city", "c_state",
+                    "c_zip", "c_phone", "c_since", "c_credit_lim", "c_discount"},
+                   customer_ret,
+                   "@get_customer|customer_reader" + *customer_key)) {
+        return false;
+    }
+    return true;
+}
+
+} // namespace tpcc
+} // namespace tera
diff --git a/src/benchmark/tpcc/tera_tpccdb.h b/src/benchmark/tpcc/tera_tpccdb.h
new file mode 100644
index 000000000..a300166b0
--- /dev/null
+++ b/src/benchmark/tpcc/tera_tpccdb.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_BENCHMARK_TPCC_TERA_TPCCDB_H
+#define TERA_BENCHMARK_TPCC_TERA_TPCCDB_H
+
+#include "tera.h"
+#include "benchmark/tpcc/tpccdb.h"
+
+namespace tera {
+namespace tpcc {
+
+class TpccDb;
+class TxnResult;
+
+// TpccDb implementation backed by a Tera cluster: tables are reached through
+// a Tera Client and every insert/transaction runs as a global transaction.
+class TeraTpccDb : public TpccDb {
+public:
+    TeraTpccDb();
+    virtual ~TeraTpccDb();
+
+    virtual bool CreateTables();
+    virtual bool CleanTables();
+
+    // init db
+    virtual bool InsertItem(const Item& i);
+
+    virtual bool InsertWarehouse(const Warehouse& w);
+
+    virtual bool InsertDistrict(const District& d);
+
+    virtual bool InsertCustomer(const Customer& c);
+
+    virtual bool InsertHistory(const History& h);
+
+    virtual bool InsertStock(const Stock& s);
+
+    virtual bool InsertOrder(const Order& o);
+
+    virtual bool InsertOrderLine(const OrderLine& ol);
+
+    virtual bool InsertNewOrder(const NewOrder& no);
+
+    // The five TPC-C transactions; each reports its outcome through ret.
+    virtual void StockLevelTxn(int32_t warehouse_id, int32_t district_id,
+                               int32_t threshold,
+                               StockLevelResult* ret);
+
+    virtual void DeliveryTxn(int32_t warehouse_id,
+                             int32_t carrier_id,
+                             const std::string& delivery_datetime,
+                             DeliveryResult* ret);
+
+    virtual void OrderStatusTxn(bool by_last_name,
+                                int32_t warehouse_id, int32_t district_id,
+                                int32_t c_customer_id,
+                                const std::string& last_name,
+                                OrderStatusResult* ret);
+
+    // NOTE(review): h_amount is an int32_t here while Driver::RunPaymentTxn
+    // passes a float produced by MakeFloat(); confirm the truncation is
+    // intended.
+    virtual void PaymentTxn(bool by_last_name,
+                            int32_t warehouse_id, int32_t district_id,
+                            int32_t c_warehouse_id, int32_t c_district_id,
+                            int32_t c_customer_id,
+                            const std::string& last_name,
+                            int32_t h_amount,
+                            PaymentResult* ret);
+
+    virtual void NewOrderTxn(int32_t warehouse_id,
+                             int32_t district_id,
+                             int32_t customer_id, const NewOrderInfo& info,
+                             NewOrderResult* ret);
+
+private:
+    // Stores state plus the transaction's error reason (and msg) in ret.
+    void SetTxnResult(TxnResult* ret, Transaction* gtxn, bool state = true,
+                      const std::string& msg = "");
+
+    // Reads the listed cf0 qualifiers of reader's row into ret_tuples.
+    // NOTE(review): the element type of std::initializer_list is missing from
+    // this copy of the patch; restore it from the original commit.
+    bool GetValues(TxnResult* ret, Transaction* gtxn, RowReader* reader,
+                   std::initializer_list qu_names_initlist,
+                   RetTuples* ret_tuples,
+                   const std::string& if_error_msg);
+
+    // Loads a customer row, selected by id or by last name.
+    bool GetCustomer(TxnResult* ret, Transaction* gtxn, bool by_last_name,
+                     const std::string& last_name, int32_t customer_id,
+                     int32_t warehouse_id, int32_t district_id,
+                     std::string* customer_key, RetTuples* customer_ret);
+private:
+    void SetPaymentSingleLineRet(const RetTuples& warehouse_ret,
+                                 const RetTuples& district_ret,
+                                 const RetTuples& customer_ret,
+                                 const RetTuples& other_ret,
+                                 RetTuples* payment_ret);
+private:
+    Client* client_;
+    // Table handles indexed by table name.
+    // NOTE(review): the template arguments of std::unordered_map are missing
+    // from this copy of the patch; restore them from the original commit.
+    std::unordered_map table_map_;
+};
+
+} // namespace tpcc
+} // namespace tera
+
+#endif /* TERA_BENCHMARK_TPCC_TERA_TPCCDB_H */
diff --git a/src/benchmark/tpcc/tera_txn/delivery_txn.cc b/src/benchmark/tpcc/tera_txn/delivery_txn.cc
new file mode 100644
index 000000000..d1a7a3e18
--- /dev/null
+++ b/src/benchmark/tpcc/tera_txn/delivery_txn.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#include "benchmark/tpcc/tera_tpccdb.h"
+
+#include
+#include
+
+#include "sdk/client_impl.h"
+#include "sdk/sdk_utils.h"
+
+namespace tera {
+namespace tpcc {
+
+void TeraTpccDb::DeliveryTxn(int32_t warehouse_id,
+                             int32_t carrier_id,
+                             const std::string& delivery_datetime,
+                             DeliveryResult* ret) {
+    // open table
+    Table* t_neworder = table_map_[kTpccTables[kNewOrderTable]];
+    Table* t_order = table_map_[kTpccTables[kOrderTable]];
+    Table* t_orderline = table_map_[kTpccTables[kOrderLineTable]];
+    Table* t_customer = table_map_[kTpccTables[kCustomerTable]];
+    // begin transaction
+    Transaction* gtxn = client_->NewGlobalTransaction();
+    for (int32_t district_id = 1; district_id <= kDistrictCountPerWarehouse; ++district_id) {
+        // The row in the NEW-ORDER table with matching NO_W_ID (equals W_ID)
+        // and NO_D_ID (equals D_ID) and with the lowest NO_O_ID value is selected.
+ ErrorCode error_code; + std::string start_key = std::to_string(warehouse_id) + "_" + std::to_string(district_id) + "_"; + ScanDescriptor scan_desc(start_key); + scan_desc.SetEnd(start_key + "~"); + scan_desc.AddColumnFamily("cf0"); + tera::ResultStream* scanner = t_neworder->Scan(scan_desc, &error_code); + bool not_new_order = false; + int32_t order_id = INT32_MAX; + for (scanner->LookUp(start_key); !scanner->Done(); scanner->Next()) { + std::string row_key = scanner->RowName(); + if (row_key.find(start_key) == std::string::npos) { + not_new_order = true; + break; + } + std::size_t found = row_key.find_last_of("_"); + int32_t found_order_id = std::stoi(row_key.substr(found + 1)); + if (order_id > found_order_id) { + order_id = found_order_id; + } + } + delete scanner; + // If no matching row is found, then the delivery of an order + // for this district is skipped. + if (not_new_order || order_id == INT32_MAX) { + continue; + } + + // The selected row in the NEW-ORDER table is deleted + std::string no_primary_key = start_key + std::to_string(order_id); + RowReader* no_reader = t_neworder->NewRowReader(no_primary_key); + RetTuples no_ret; + if (!GetValues(ret, gtxn, no_reader, + {"no_o_id"}, + &no_ret, + "@delivery|no_reader|" + no_primary_key)) { + return; + } + + RowMutation* no_mu = t_neworder->NewRowMutation(no_primary_key); + no_mu->DeleteColumns("cf0", "no_o_id", gtxn->GetStartTimestamp()); + no_mu->DeleteColumns("cf0", "no_d_id", gtxn->GetStartTimestamp()); + no_mu->DeleteColumns("cf0", "no_w_id", gtxn->GetStartTimestamp()); + gtxn->ApplyMutation(no_mu); + delete no_mu; + + // The row in the ORDER table with matching + // O_W_ID (equals W_ID), O_D_ID (equals D_ID), and O_ID (equals NO_O_ID) + // is selected, O_C_ID, the customer number, is retrieved, + // and O_CARRIER_ID is updated. 
+ std::string order_primary_key = no_primary_key; + RowReader* order_reader = t_order->NewRowReader(order_primary_key); + RetTuples order_ret; + if (!GetValues(ret, gtxn, order_reader, + {"o_carrier_id", "o_ol_cnt", "o_c_id"}, + &order_ret, + "@delivery|order_reader|" + order_primary_key)) { + return; + } + RowMutation* order_mu = t_order->NewRowMutation(order_primary_key); + order_mu->Put("cf0", "o_carrier_id", std::to_string(carrier_id)); + gtxn->ApplyMutation(order_mu); + delete order_mu; + + int32_t o_ol_cnt = std::stoi(order_ret["o_ol_cnt"]); + // the sum of all OL_AMOUNT. + float amount = 0.0f; + // All rows in the ORDER-LINE table with matching + // OL_W_ID (= O_W_ID), OL_D_ID (= O_D_ID), and OL_O_ID (= O_ID) are selected. + for (int32_t ol_number = 1; ol_number <= o_ol_cnt; ++ ol_number) { + std::string ol_key = order_primary_key + "_" + std::to_string(ol_number); + RowReader* ol_reader = t_orderline->NewRowReader(ol_key); + RetTuples ol_ret; + if (!GetValues(ret, gtxn, ol_reader, + {"ol_amount", "ol_delivery_d"}, + &ol_ret, + "@delivery|ol_reader|" + ol_key)) { + return; + } + amount += std::stof(ol_ret["ol_amount"]); + RowMutation* ol_mu = t_orderline->NewRowMutation(ol_key); + // All OL_DELIVERY_D, the delivery dates, + // are updated to the current system time as returned by the OS + ol_mu->Put("cf0","ol_delivery_d",delivery_datetime); + gtxn->ApplyMutation(ol_mu); + delete ol_mu; + } + + // The row in the CUSTOMER table with matching + // C_W_ID (= W_ID), C_D_ID (= D_ID), and C_ID (= O_C_ID) is selected + std::string customer_key = start_key + order_ret["o_c_id"]; + RowReader* customer_reader = t_customer->NewRowReader(customer_key); + RetTuples customer_ret; + if (!GetValues(ret, gtxn, customer_reader, + {"c_balance", "c_delivery_cnt"}, + &customer_ret, + "@delivery|customer_reader" + customer_key)) { + return; + } + // and C_BALANCE + sum(OL_AMOUNT) previously retrieved. C_DELIVERY_CNT + 1. 
+ RowMutation* customer_mu = t_customer->NewRowMutation(customer_key); + customer_mu->Put("cf0", "c_balance", + std::to_string(std::stof(customer_ret["c_balance"]) + amount)); + customer_mu->Put("cf0", "c_delivery_cnt", + std::to_string(std::stoi(customer_ret["c_delivery_cnt"]) + 1)); + gtxn->ApplyMutation(customer_mu); + delete customer_mu; + } + gtxn->Commit(); + SetTxnResult(ret, gtxn, gtxn->GetError().GetType() == ErrorCode::kOK); +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/tera_txn/new_order_txn.cc b/src/benchmark/tpcc/tera_txn/new_order_txn.cc new file mode 100644 index 000000000..df4100824 --- /dev/null +++ b/src/benchmark/tpcc/tera_txn/new_order_txn.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Author: baorenyi@baidu.com + +#include "benchmark/tpcc/tera_tpccdb.h" + +#include +#include + +#include "sdk/client_impl.h" +#include "sdk/sdk_utils.h" + +namespace tera { +namespace tpcc { + +void TeraTpccDb::NewOrderTxn(int32_t warehouse_id, + int32_t district_id, + int32_t customer_id, const NewOrderInfo& info, + NewOrderResult* ret) { + // open table + Table* t_warehouse = table_map_[kTpccTables[kWarehouseTable]]; + Table* t_district = table_map_[kTpccTables[kDistrictTable]]; + Table* t_customer = table_map_[kTpccTables[kCustomerTable]]; + Table* t_order = table_map_[kTpccTables[kOrderTable]]; + Table* t_order_index = table_map_[kTpccTables[kOrderIndex]]; + Table* t_neworder = table_map_[kTpccTables[kNewOrderTable]]; + Table* t_orderline = table_map_[kTpccTables[kOrderLineTable]]; + Table* t_item = table_map_[kTpccTables[kItemTable]]; + Table* t_stock = table_map_[kTpccTables[kStockTable]]; + // begin transaction + std::unique_ptr gtxn(client_->NewGlobalTransaction()); + std::string datetime = get_curtime_str(); + std::string warehouse_key = std::to_string(warehouse_id); + 
std::string district_key = warehouse_key + "_" + std::to_string(district_id); + std::string customer_key = district_key + "_" + std::to_string(customer_id); + + RowReader* warehouse_reader = t_warehouse->NewRowReader(warehouse_key); + RetTuples warehouse_ret; + if (!GetValues(ret, gtxn.get(), warehouse_reader, + {"w_tax"}, + &warehouse_ret, + "@new_order|warehouse_reader|" + warehouse_key)) { + return; + } + + RowReader* district_reader = t_district->NewRowReader(district_key); + RetTuples district_ret; + if (!GetValues(ret, gtxn.get(), district_reader, + {"d_next_o_id", "d_tax"}, + &district_ret, + "@new_order|district_reader|" + district_key)) { + return; + } + std::string d_next_o_id_str = std::to_string(std::stoi(district_ret["d_next_o_id"]) + 1); + + RowReader* customer_reader = t_customer->NewRowReader(customer_key); + RetTuples customer_ret; + if (!GetValues(ret, gtxn.get(), customer_reader, + {"c_discount", "c_credit", "c_last"}, + &customer_ret, + "@new_order|customer_reader|" + customer_key)) { + return; + } + + RowMutation* district_mu = t_district->NewRowMutation(district_key); + district_mu->Put("cf0", "d_next_o_id", d_next_o_id_str); + gtxn->ApplyMutation(district_mu); + delete district_mu; + + std::string order_key = district_key + "_" + d_next_o_id_str; + RowMutation* order_mu = t_order->NewRowMutation(order_key); + std::string order_index_key = customer_key + "_" + d_next_o_id_str; + RowMutation* order_index_mu = t_order_index->NewRowMutation(order_index_key); + order_index_mu->Put("cf0", "o_id", d_next_o_id_str); + order_index_mu->Put("cf0", "o_c_id", std::to_string(customer_id)); + order_index_mu->Put("cf0", "o_d_id", std::to_string(district_id)); + order_index_mu->Put("cf0", "o_w_id", warehouse_key); + order_mu->Put("cf0", "o_id", d_next_o_id_str); + order_mu->Put("cf0", "o_c_id", std::to_string(customer_id)); + order_mu->Put("cf0", "o_d_id", std::to_string(district_id)); + order_mu->Put("cf0", "o_w_id", warehouse_key); + order_mu->Put("cf0", 
"o_carrier_id", std::to_string(0)); + order_mu->Put("cf0", "o_ol_cnt", std::to_string(info.o_ol_cnt)); + order_mu->Put("cf0", "o_all_local", std::to_string(info.o_all_local)); + order_mu->Put("cf0", "o_entry_d", datetime); + gtxn->ApplyMutation(order_mu); + gtxn->ApplyMutation(order_index_mu); + delete order_mu; + delete order_index_mu; + + RowMutation* no_mu = t_neworder->NewRowMutation(order_key); + no_mu->Put("cf0", "no_o_id", d_next_o_id_str); + no_mu->Put("cf0", "no_d_id", std::to_string(district_id)); + no_mu->Put("cf0", "no_w_id", warehouse_key); + gtxn->ApplyMutation(no_mu); + delete no_mu; + + std::string ol_dist_info_key; + if (district_id == kDistrictCountPerWarehouse) { + ol_dist_info_key = "s_dist_10"; + } else { + ol_dist_info_key = "s_dist_0" + std::to_string(district_id); + } + + float ol_amount_sum = 0; + for (int32_t i = 0; i < info.o_ol_cnt; ++i) { + int32_t i_id = info.ol_i_ids[i]; + std::string item_key = std::to_string(i_id); + RowReader* item_reader = t_item->NewRowReader(item_key); + RetTuples item_ret; + if (!GetValues(ret, gtxn.get(), item_reader, + {"i_price", "i_name", "i_data"}, + &item_ret, + "@new_order|item_reader|" + item_key)) { + return; + } + + std::string ol_supply_w_id_str = std::to_string(info.ol_supply_w_ids[i]); + std::string stock_key = ol_supply_w_id_str+ "_" + item_key; + RowReader* stock_reader = t_item->NewRowReader(stock_key); + RetTuples stock_ret; + if (!GetValues(ret, gtxn.get(), stock_reader, + {"s_quantity", "s_ytd", "s_order_cnt", "s_remote_cnt", "s_data", ol_dist_info_key}, + &stock_ret, + "@new_order|stock_reader|" + stock_key)) { + return; + } + + int32_t ol_quantity = info.ol_quantities[i]; + float ol_amount = std::stof(item_ret["i_price"]) * ol_quantity; + ol_amount_sum += ol_amount; + std::string ol_number_str = std::to_string(i + 1); + std::string ol_key = order_key + "_" + ol_number_str; + RowMutation* ol_mu = t_orderline->NewRowMutation(ol_key); + ol_mu->Put("cf0", "ol_o_id", d_next_o_id_str); + 
ol_mu->Put("cf0", "ol_d_id", std::to_string(district_id)); + ol_mu->Put("cf0", "ol_w_id", warehouse_key); + ol_mu->Put("cf0", "ol_number", ol_number_str); + ol_mu->Put("cf0", "ol_i_id", item_key); + ol_mu->Put("cf0", "ol_supply_w_id", ol_supply_w_id_str); + ol_mu->Put("cf0", "ol_delivery_d", ""); + ol_mu->Put("cf0", "ol_quantity", std::to_string(ol_quantity)); + ol_mu->Put("cf0", "ol_amount", std::to_string(ol_amount)); + ol_mu->Put("cf0", "ol_dist_info", stock_ret[ol_dist_info_key]); + gtxn->ApplyMutation(ol_mu); + delete ol_mu; + // update stock + int32_t s_quantity = std::stoi(stock_ret["s_quantity"]); + if (s_quantity > ol_quantity + 10) { + s_quantity -= ol_quantity; + } else { + s_quantity = (s_quantity - ol_quantity) + 91; + } + float s_ytd = std::stof(stock_ret["s_quantity"]) + ol_quantity; + int32_t s_order_cnt = std::stoi(stock_ret["s_order_cnt"]) + 1; + int32_t s_remote_cnt = std::stoi(stock_ret["s_remote_cnt"]); + if (info.ol_supply_w_ids[i] != warehouse_id) { + ++s_remote_cnt; + } + RowMutation* stock_mu = t_stock->NewRowMutation(stock_key); + stock_mu->Put("cf0", "s_quantity", std::to_string(s_quantity)); + stock_mu->Put("cf0", "s_ytd", std::to_string(s_ytd)); + stock_mu->Put("cf0", "s_order_cnt", std::to_string(s_order_cnt)); + stock_mu->Put("cf0", "s_remote_cnt", std::to_string(s_remote_cnt)); + gtxn->ApplyMutation(stock_mu); + delete stock_mu; + + // set result + RetTuples line; + line["ol_supply_w_id"] = ol_supply_w_id_str; + line["ol_i_id"] = item_key; + line["i_name"] = item_ret["i_name"]; + line["ol_quantity"] = std::to_string(ol_quantity); + line["s_quantity"] = std::to_string(s_quantity); + line["i_price"] = item_ret["i_price"]; + line["ol_amount"] = std::to_string(ol_amount); + std::string i_data = item_ret["i_data"]; + std::string s_data = item_ret["s_data"]; + if (i_data.find("ORIGINAL") != std::string::npos && + s_data.find("ORIGINAL") != std::string::npos) { + line["brand_generic"] = "B"; + } else { + line["brand_generic"] = "G"; + } + 
ret->AddLine(line); + } + if (!info.need_failed) { + RetTuples single_line; + single_line["o_id"] = d_next_o_id_str; + single_line["o_ol_cnt"] = std::to_string(info.o_ol_cnt); + single_line["c_last"] = customer_ret["c_last"]; + single_line["c_credit"] = customer_ret["c_credit"]; + single_line["c_discount"] = customer_ret["c_discount"]; + single_line["w_tax"] = warehouse_ret["w_tax"]; + single_line["d_tax"] = district_ret["d_tax"]; + single_line["o_entry_d"] = datetime; + float c_discount = std::stof(customer_ret["c_discount"]); + float w_tax = std::stof(warehouse_ret["w_tax"]); + float d_tax = std::stof(district_ret["d_tax"]); + float total_amount = ol_amount_sum * ( 1 - c_discount) * (1 + w_tax + d_tax); + single_line["total_amount"] = std::to_string(total_amount); + ret->SetSingleLine(single_line); + gtxn->Commit(); + SetTxnResult(ret, gtxn.get()); + } else { + // set commit failed + SetTxnResult(ret, gtxn.get(), false, "@new_order|rowback simulation"); + } +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/tera_txn/order_status_txn.cc b/src/benchmark/tpcc/tera_txn/order_status_txn.cc new file mode 100644 index 000000000..a88fe7e0c --- /dev/null +++ b/src/benchmark/tpcc/tera_txn/order_status_txn.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include "benchmark/tpcc/tera_tpccdb.h" + +#include +#include + +#include "sdk/client_impl.h" +#include "sdk/sdk_utils.h" + +namespace tera { +namespace tpcc { + +void TeraTpccDb::OrderStatusTxn(bool by_last_name, + int32_t warehouse_id, int32_t district_id, + int32_t c_customer_id, + const std::string& last_name, + OrderStatusResult* ret) { + // open table + Table* t_order_index = table_map_[kTpccTables[kOrderIndex]]; + Table* t_orderline = table_map_[kTpccTables[kOrderLineTable]]; + Table* t_order = table_map_[kTpccTables[kOrderTable]]; + // begin transaction + std::unique_ptr gtxn(client_->NewGlobalTransaction()); + std::string customer_key = ""; + RetTuples customer_ret; + if (!GetCustomer(ret, gtxn.get(), by_last_name, last_name, c_customer_id, + warehouse_id, district_id, &customer_key, &customer_ret)) { + return; + } + + // find newest order from order index + ErrorCode error_code; + std::string prefix_key = std::to_string(warehouse_id) + "_" + + std::to_string(district_id) + "_"; + std::string start_key = prefix_key + customer_ret["c_id"] + "_"; + ScanDescriptor scan_desc(start_key); + scan_desc.SetEnd(start_key + "~"); + scan_desc.AddColumnFamily("cf0"); + ResultStream* scanner = t_order_index->Scan(scan_desc, &error_code); + int32_t max_order_id = -1; + for (scanner->LookUp(start_key); !scanner->Done(); scanner->Next()) { + std::string row_key = scanner->RowName(); + RowReader* index_reader = t_order_index->NewRowReader(row_key); + RetTuples index_ret; + if (!GetValues(ret, gtxn.get(), index_reader, + {"o_id"}, + &index_ret, + "@order_status|order_index_reader|" + row_key)) { + break; + } + if ( max_order_id < std::stoi(index_ret["o_id"])) { + max_order_id = std::stoi(index_ret["o_id"]); + } + } + delete scanner; + if (max_order_id == -1) { + SetTxnResult(ret, gtxn.get(), false, "not found order|" + start_key); + return; + } + std::string order_key = prefix_key + std::to_string(max_order_id); + RowReader* order_reader 
= t_order->NewRowReader(order_key); + RetTuples order_ret; + if (!GetValues(ret, gtxn.get(), order_reader, + {"o_ol_cnt", "o_id"}, + &order_ret, + "@order_status|order_reader|" + order_key)) { + return; + } + for (int32_t i = 1; i <= std::stoi(order_ret["o_ol_cnt"]); ++i) { + std::string ol_key = prefix_key + order_ret["o_id"] + "_" + std::to_string(i); + RowReader* ol_reader = t_orderline->NewRowReader(ol_key); + RetTuples ol_ret; + if (!GetValues(ret, gtxn.get(), ol_reader, + {}, // TODO + &ol_ret, + "@order_status|ol_reader|" + ol_key)) { + return; + } + } + SetTxnResult(ret, gtxn.get()); +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/tera_txn/payment_txn.cc b/src/benchmark/tpcc/tera_txn/payment_txn.cc new file mode 100644 index 000000000..c45d371bd --- /dev/null +++ b/src/benchmark/tpcc/tera_txn/payment_txn.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include "benchmark/tpcc/tera_tpccdb.h" + +#include +#include + +#include "sdk/client_impl.h" +#include "sdk/sdk_utils.h" + +namespace tera { +namespace tpcc { + +void TeraTpccDb::PaymentTxn(bool by_last_name, + int32_t warehouse_id, int32_t district_id, + int32_t customer_warehouse_id, int32_t customer_district_id, + int32_t c_customer_id, + const std::string& last_name, + int32_t h_amount, + PaymentResult* ret) { + // open table + Table* t_warehouse = table_map_[kTpccTables[kWarehouseTable]]; + Table* t_district = table_map_[kTpccTables[kDistrictTable]]; + Table* t_customer = table_map_[kTpccTables[kCustomerTable]]; + Table* t_history = table_map_[kTpccTables[kHistoryTable]]; + Table* t_history_index = table_map_[kTpccTables[kHistoryIndex]]; + + // begin transaction + Transaction* gtxn = client_->NewGlobalTransaction(); + + // read customer + std::string customer_key = ""; + RetTuples customer_ret; + if (!GetCustomer(ret, gtxn, by_last_name, last_name, c_customer_id, + customer_warehouse_id, customer_district_id, &customer_key, &customer_ret)) { + return; + } + + // read warehouse + std::string warehouse_key = std::to_string(warehouse_id); + RowReader* warehouse_reader = t_warehouse->NewRowReader(warehouse_key); + RetTuples warehouse_ret; + if (!GetValues(ret, gtxn, warehouse_reader, + {"w_ytd", "w_name", "w_street_1", "w_street_2", "w_city", "w_state", "w_zip"}, + &warehouse_ret, + "@payment|warehouse_reader|" + warehouse_key)) { + return; + } + + // update warehouse + RowMutation* warehouse_mu = t_warehouse->NewRowMutation(warehouse_key); + // add amount of this payment to the ytd balance of current warehouse. 
+ float w_ytd = std::stof(warehouse_ret["w_ytd"]) + h_amount; + warehouse_mu->Put("cf0", "w_ytd", std::to_string(w_ytd)); + gtxn->ApplyMutation(warehouse_mu); + delete warehouse_mu; + + // read district + std::string district_id_str = std::to_string(district_id); + std::string district_key = warehouse_key + "_" + district_id_str; + RowReader* district_reader = t_district->NewRowReader(district_key); + RetTuples district_ret; + if (!GetValues(ret, gtxn, district_reader, + {"d_ytd", "d_name", "d_street_1", "d_street_2", "d_city", "d_state", "d_zip"}, + &district_ret, + "@payment|district_reader|" + district_key)) { + return; + } + + // update district + RowMutation* district_mu = t_district->NewRowMutation(district_key); + // add amount of this payment to the ytd balance of current district. + float d_ytd = std::stof(district_ret["d_ytd"]) + h_amount; + district_mu->Put("cf0", "d_ytd", std::to_string(d_ytd)); + gtxn->ApplyMutation(district_mu); + delete district_mu; + + // update customer + // [Revision 5.11 - Page 34] see Clause 2.5.2.2 + // C_BALANCE is decreased by H_AMOUNT. + // C_YTD_PAYMENT is increased by H_AMOUNT. + // C_PAYMENT_CNT is incremented by 1. 
+ RowMutation* customer_mu = t_customer->NewRowMutation(customer_key); + std::string c_balance_str = std::to_string(std::stof(customer_ret["c_balance"]) - h_amount); + customer_mu->Put("cf0", "c_balance", c_balance_str); + customer_mu->Put("cf0", "c_ytd_payment", + std::to_string(std::stof(customer_ret["c_ytd_payment"]) + h_amount)); + customer_mu->Put("cf0", "c_payment_cnt", + std::to_string(std::stof(customer_ret["c_payment_cnt"]) + h_amount)); + + if (customer_ret["c_credit"] == "BC") { + std::string data_info = customer_key + "_" + district_key + "_" + std::to_string(h_amount); + customer_ret["c_data"].insert(0, data_info); + if (customer_ret["c_data"].size() > kCustomerDataUpperLen) { + customer_ret["c_data"].substr(0, kCustomerDataUpperLen); + } + customer_mu->Put("cf0", "c_data", customer_ret["c_data"]); + } + gtxn->ApplyMutation(customer_mu); + delete customer_mu; + + // read history_index (find newest history) + std::string history_data = warehouse_ret["w_name"] + " " + district_ret["d_name"]; + RowReader* hindex_reader = t_history_index->NewRowReader("count"); + RetTuples hindex_ret; + if (!GetValues(ret, gtxn, hindex_reader, + {"count"}, + &hindex_ret, + "@payment|hindex_reader|count")) { + return; + } + int cnt = std::stoi(hindex_ret["count"]); + + // update history_index + RowMutation* hindex_mu = t_history_index->NewRowMutation("count"); + hindex_mu->Put("cf0", "count", std::to_string(++cnt)); + gtxn->ApplyMutation(hindex_mu); + delete hindex_mu; + + // update history use now newest count as the primary key(row_key) of history + // default t_history don't have priamry key in tpcc + std::string history_key = std::to_string(cnt); + RowMutation* mu = t_history->NewRowMutation(history_key); + mu->Put("cf0", "h_c_id", customer_ret["c_id"]); + mu->Put("cf0", "h_c_d_id", customer_ret["c_d_id"]); + mu->Put("cf0", "h_c_w_id", customer_ret["c_w_id"]); + mu->Put("cf0", "h_d_id", district_id_str); + mu->Put("cf0", "h_w_id", warehouse_key); + mu->Put("cf0", 
"h_amount", std::to_string(h_amount)); + // The payment date (H_DATE) in generated within the SUT + // by using the current system date and time + std::string datetime = get_curtime_str(); + mu->Put("cf0", "h_date", datetime); + mu->Put("cf0", "h_data", history_data); + gtxn->ApplyMutation(mu); + delete mu; + + gtxn->Commit(); + RetTuples single_line; + RetTuples other_ret = { + {"w_id", warehouse_key}, + {"d_id", district_id_str}, + {"h_amount", std::to_string(h_amount)}, + {"h_date", datetime}, + {"c_balance", c_balance_str}, + {"c_data", customer_ret["c_data"].substr(0,200)} + }; + SetPaymentSingleLineRet(warehouse_ret, district_ret, customer_ret, other_ret, + &single_line); + + SetTxnResult(ret, gtxn); +} + +void TeraTpccDb::SetPaymentSingleLineRet(const RetTuples& warehouse_ret, + const RetTuples& district_ret, + const RetTuples& customer_ret, + const RetTuples& other_ret, + RetTuples* payment_ret) { + // The following fields are displayed: + // W_ID, D_ID, C_ID, C_D_ID, C_W_ID, + // W_STREET_1, W_STREET_2, W_CITY, W_STATE, W_ZIP, + // D_STREET_1, D_STREET_2, D_CITY, D_STATE, D_ZIP, + // C_FIRST, C_MIDDLE, C_LAST, C_STREET_1, C_STREET_2, C_CITY, C_STATE, + // C_ZIP, C_PHONE, C_SINCE, C_CREDIT, C_CREDIT_LIM, C_DISCOUNT, C_BALANCE, + // the first 200 characters of C_DATA (only if C_CREDIT = "BC"), + // H_AMOUNT, and H_DATE. 
+ payment_ret->insert(other_ret.begin(), other_ret.end()); + for (auto t : warehouse_ret) { + if (t.first != "w_ytd" && t.first != "w_name") { + payment_ret->insert(t); + } + } + for (auto t : district_ret) { + if (t.first != "d_ytd" && t.first != "w_name") { + payment_ret->insert(t); + } + } + std::unordered_set c_names = {"c_id", "c_d_id", "c_w_id", + "c_first", "c_middle", "c_last", "c_street_1", "c_street_2", "c_city", + "c_state", "c_zip", "c_phone", "c_since", "c_credit", "c_credit_lim", + "c_discount"}; + for (auto t : customer_ret) { + if (c_names.find(t.first) != c_names.end()) { + payment_ret->insert(t); + } + } +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/tera_txn/stocklevel_txn.cc b/src/benchmark/tpcc/tera_txn/stocklevel_txn.cc new file mode 100644 index 000000000..eeb7bb06d --- /dev/null +++ b/src/benchmark/tpcc/tera_txn/stocklevel_txn.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include "benchmark/tpcc/tera_tpccdb.h" + +#include +#include + +#include "sdk/client_impl.h" +#include "sdk/sdk_utils.h" + +namespace tera { +namespace tpcc { + +void TeraTpccDb::StockLevelTxn(int32_t warehouse_id, int32_t district_id, + int32_t threshold, + StockLevelResult* ret) { + // open table + Table* t_district = table_map_[kTpccTables[kDistrictTable]]; + Table* t_order = table_map_[kTpccTables[kOrderTable]]; + Table* t_orderline = table_map_[kTpccTables[kOrderLineTable]]; + Table* t_stock = table_map_[kTpccTables[kStockTable]]; + // begin transaction + std::unique_ptr gtxn(client_->NewGlobalTransaction()); + std::string district_primary_key = std::to_string(warehouse_id) + + "_" + std::to_string(district_id); + RowReader* district_reader = t_district->NewRowReader(district_primary_key); + RetTuples district_ret; + if (!GetValues(ret, gtxn.get(), district_reader, {"d_next_o_id"}, &district_ret, + "@stock_level|district_reader|" + district_primary_key)) { + return; + } + int32_t order_id = std::stoi(district_ret["d_next_o_id"]); + + int32_t cnt = 0; + for (int32_t ol_o_id = order_id - 20; ol_o_id <= order_id; ++ol_o_id) { + std::string order_primary_key = std::to_string(warehouse_id) + + "_" + std::to_string(district_id) + "_" + std::to_string(ol_o_id); + RowReader* order_reader = t_order->NewRowReader(order_primary_key); + RetTuples order_ret; + if (!GetValues(ret, gtxn.get(), order_reader, {"o_ol_cnt"}, &order_ret, + "@stock_level|order_reader|" + order_primary_key)) { + return; + } + int32_t o_ol_cnt = std::stoi(order_ret["o_ol_cnt"]); + for (int32_t ol_number = 1; ol_number <= o_ol_cnt; ++ ol_number) { + std::string ol_primary_key = order_primary_key + "_" + std::to_string(ol_number); + RowReader* ol_reader = t_orderline->NewRowReader(ol_primary_key); + RetTuples ol_ret; + ol_reader->AddColumn("cf0", "ol_i_id"); + if (!GetValues(ret, gtxn.get(), ol_reader, {"ol_i_id"}, &ol_ret, + "@stock_level|ol_reader|" + 
ol_primary_key)) { + return; + } + int32_t ol_i_id = std::stoi(ol_ret["ol_i_id"]); + std::string stock_key = std::to_string(warehouse_id) + + "_" + std::to_string(ol_i_id); + RowReader* stock_reader = t_stock->NewRowReader(stock_key); + RetTuples stock_ret; + if (!GetValues(ret, gtxn.get(), stock_reader, {"s_quantity"}, &stock_ret, + "@stock_level|stock_reader|" + stock_key)) { + return; + } + int32_t s_quantity = std::stoi(stock_ret["s_quantity"]); + if (s_quantity < threshold) { + ++cnt; + } + } + } + // only read not need commit + ret->SetLowStock(cnt); + SetTxnResult(ret, gtxn.get()); +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/test/data_generator_test.cc b/src/benchmark/tpcc/test/data_generator_test.cc new file mode 100644 index 000000000..6c5b71fe7 --- /dev/null +++ b/src/benchmark/tpcc/test/data_generator_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include + +#include "benchmark/tpcc/data_generator.h" +#include "benchmark/tpcc/mock_tpccdb.h" +#include "benchmark/tpcc/random_generator.h" +#include "benchmark/tpcc/tpccdb.h" + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_int32(warehouses_count); + +namespace tera { +namespace tpcc { + +class DataGeneratorTest : public ::testing::Test { +public: + DataGeneratorTest() { + random_gen_.SetRandomConstant(); + TpccDb* db_ = (TpccDb*)(&mdb_); + data_gen_ = new DataGenerator(&random_gen_, db_); + } + + void CleanStateCounter(int table_enum_num = -1) { + if (table_enum_num == -1) { + for (int i = 0; i < kTpccTableCnt; ++i) { + data_gen_->states_[i].first.Set(0); + data_gen_->states_[i].second.Set(0); + } + } else if (table_enum_num > -1 && table_enum_num < kTpccTableCnt) { + data_gen_->states_[table_enum_num].first.Set(0); + data_gen_->states_[table_enum_num].second.Set(0); + } + } + + ~DataGeneratorTest() { + delete data_gen_; + } +private: + RandomGenerator random_gen_; + TpccDb* db_; + MockTpccDb mdb_; + DataGenerator* data_gen_; + +}; + +TEST_F(DataGeneratorTest, GenItem) { + CleanStateCounter(); + mdb_.flag_ = true; + data_gen_->GenItem(1, false); + EXPECT_TRUE(data_gen_->states_[kItemTable].first.Get() == 1); + data_gen_->GenItem(1, false); + EXPECT_TRUE(data_gen_->states_[kItemTable].first.Get() == 2); + mdb_.flag_ = false; + data_gen_->GenItem(1, false); + EXPECT_TRUE(data_gen_->states_[kItemTable].second.Get() == 1); +} + +TEST_F(DataGeneratorTest, GenStock) { + CleanStateCounter(); + mdb_.flag_ = true; + data_gen_->GenStock(1, 2, false); + EXPECT_TRUE(data_gen_->states_[kStockTable].first.Get() == 1); + data_gen_->GenStock(1, 2, false); + EXPECT_TRUE(data_gen_->states_[kStockTable].first.Get() == 2); + mdb_.flag_ = false; + data_gen_->GenStock(1, 3, false); + EXPECT_TRUE(data_gen_->states_[kStockTable].second.Get() == 1); +} + +TEST_F(DataGeneratorTest, GenStocks) { + CleanStateCounter(); + mdb_.flag_ = 
true; + for (int i = 1; i <=FLAGS_warehouses_count; ++i) { + data_gen_->GenStocks(i); + } + data_gen_->Join(); + EXPECT_TRUE(data_gen_->states_[kStockTable].first.Get() == FLAGS_warehouses_count * kItemCount); +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/test/random_generator_test.cc b/src/benchmark/tpcc/test/random_generator_test.cc new file mode 100644 index 000000000..978521739 --- /dev/null +++ b/src/benchmark/tpcc/test/random_generator_test.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Author: baorenyi@baidu.com + +#include "benchmark/tpcc/random_generator.h" + +#include "gtest/gtest.h" + +namespace tera { +namespace tpcc { + +class RandomGenerator; + +class RandomGeneratorTest : public ::testing::Test, public RandomGenerator { +public: + RandomGeneratorTest() : RandomGenerator() { + SetRandomConstant(); + } + + ~RandomGeneratorTest() {} +}; + +TEST_F(RandomGeneratorTest, MakeFloat) { + EXPECT_EQ(MakeFloat(1.0, 1.0, 1), 1.0); + float f = MakeFloat(0, 1.0, 2); + std::cout << std::to_string(f) << std::endl; + EXPECT_TRUE(f >= 0 && f <= 1); +} + +TEST_F(RandomGeneratorTest, MakeAString) { + EXPECT_TRUE(MakeAString(0, 0) == ""); + EXPECT_TRUE((MakeAString(1, 1)).length() == 1); + std::string a_str = MakeAString(1,10); + EXPECT_TRUE(a_str.length() <= 10 && a_str.length() >= 1); + std::string a_str1 = MakeAString(26,27); + int cnt = 0; + for (int i = 0; i < a_str1.length(); ++i) { + for (int j = i + 1; j < a_str1.length(); ++j) { + if (a_str1[i] == a_str1[j]) { + ++cnt; + } + } + } + EXPECT_TRUE(cnt > 0); +} + +TEST_F(RandomGeneratorTest, MakeNString) { + EXPECT_TRUE(MakeNString(0, 0) == ""); + EXPECT_TRUE((MakeNString(1, 1)).length() == 1); + std::string n_str = MakeNString(1,10); + EXPECT_TRUE(n_str.length() <= 10 && n_str.length() >= 1); +} + +TEST_F(RandomGeneratorTest, 
MakeDisOrderList) { + std::vector dis_order_list = MakeDisOrderList(10,20); + sort(dis_order_list.begin(),dis_order_list.end()); + for (int i = 10; i <= 20; ++i) { + EXPECT_EQ(dis_order_list[i-10], i); + } +} + +TEST_F(RandomGeneratorTest, SetRandomConstant) { + SetRandomConstant(); + NURandConstant c = GetRandomConstant(); + EXPECT_TRUE(c.c_last >= 0 && c.c_last <= 255); + EXPECT_TRUE(c.c_last >= 0 && c.c_last <= 1023); + EXPECT_TRUE(c.c_last >= 0 && c.c_last <= 8191); /* NOTE(review): all three checks read c_last; the 1023 and 8191 bounds presumably belong to other NURandConstant fields -- confirm against random_generator.h */ +} + +TEST_F(RandomGeneratorTest, GetRandom) { + EXPECT_EQ(GetRandom(1, 1) , 1); + int rand_num = GetRandom(0, 1); + int rand_num1 = GetRandom(1, 0); + EXPECT_TRUE(rand_num == 0 || rand_num == 1); + EXPECT_TRUE(rand_num1 == 0 || rand_num1 == 1); /* check the swapped-bounds result (this line used to re-check rand_num); NOTE(review): assumes GetRandom(1, 0) also yields 0 or 1 -- confirm */ +} + +} // namespace tpcc +} // namespace tera diff --git a/src/benchmark/tpcc/test/tpcc_test.cc b/src/benchmark/tpcc/test/tpcc_test.cc new file mode 100644 index 000000000..04d5b4890 --- /dev/null +++ b/src/benchmark/tpcc/test/tpcc_test.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Author: baorenyi@baidu.com + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +/* Test-binary entry point. It must live at global scope: a main() nested inside a namespace is an ordinary function, not the program entry point. */ +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/benchmark/tpcc/tpcc_flags.cc b/src/benchmark/tpcc/tpcc_flags.cc new file mode 100644 index 000000000..4de8b300e --- /dev/null +++ b/src/benchmark/tpcc/tpcc_flags.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include "gflags/gflags.h" + +DEFINE_int64(transactions_count, 200, "the count of transactions"); +DEFINE_int32(warehouses_count, 2, "the count of warehouses"); +DEFINE_int32(tpcc_thread_pool_size, 20, "size of tpcc thread pool"); +DEFINE_int32(tpcc_run_gtxn_thread_pool_size, 20, "size of tpcc run global transactions thread pool"); +DEFINE_string(db_type, "tera", "test db type"); +DEFINE_string(tera_client_flagfile, "./tera.flag", "the flag file path of tera client"); +DEFINE_string(tera_table_schema_dir, "./tpcc_schemas/", "table schema directory"); +DEFINE_int32(generate_data_wait_times, 36000000, "generate data wait times, default 1h"); +DEFINE_int32(driver_wait_times, 36000000, "driver wait times, default 1h"); diff --git a/src/benchmark/tpcc/tpcc_main.cc b/src/benchmark/tpcc/tpcc_main.cc new file mode 100644 index 000000000..2e2df8e26 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_main.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include + +#include +#include + +#include "benchmark/tpcc/data_generator.h" +#include "benchmark/tpcc/driver.h" +#include "benchmark/tpcc/random_generator.h" +#include "benchmark/tpcc/tpccdb.h" +#include "benchmark/tpcc/tpcc_types.h" +#include "types.h" +#include "common/timer.h" +#include "version.h" + +DECLARE_int64(transactions_count); +DECLARE_int32(warehouses_count); +DECLARE_string(db_type); + +int main(int argc, char *argv[]) { /* tpcc benchmark entry: parse flags, create/clean tables, load data, run transactions */ + // load conf from flags + ::google::ParseCommandLineFlags(&argc, &argv, true); + + if (argc > 1 && strcmp(argv[1], "version") == 0) { + PrintSystemVersion(); + return 0; + } + if (FLAGS_warehouses_count > tera::tpcc::kMaxWarehouseId + || FLAGS_warehouses_count <= 0) { /* reject values outside [1, kMaxWarehouseId]; the former "&&" could never be true */ + LOG(ERROR) << "--warehouses_count=" << FLAGS_warehouses_count << " is not availability"; + return -1; + } + + tera::tpcc::RandomGenerator random_gen; + random_gen.SetRandomConstant(); + + tera::tpcc::TpccDb* db = tera::tpcc::TpccDb::NewTpccDb(FLAGS_db_type); + if (db == NULL) { /* NewTpccDb() returns NULL for an unsupported --db_type */ + LOG(ERROR) << "create tpcc db failed, --db_type=" << FLAGS_db_type; + return -1; + } + // do clean tables + if (argc == 2 && strcmp(argv[1], "clean") == 0) { + if(!db->CleanTables()) { + LOG(ERROR) << "clean tables failed, exit"; + _Exit(EXIT_FAILURE); + } + delete db; + return 0; + } + + if (!db->CreateTables()) { + LOG(ERROR) << "create tables failed, exit"; + _Exit(EXIT_FAILURE); + } + + tera::tpcc::DataGenerator data_gen(&random_gen, db); + int64_t beg_ts = tera::get_micros(); + data_gen.GenItems(); + data_gen.GenWarehouses(); + data_gen.Join(); + int64_t cost_t = tera::get_micros() - beg_ts; + LOG(INFO) << "Generate Tables Cost:" << cost_t << "us"; + + // init driver + tera::tpcc::NURandConstant constant = random_gen.GetRandomConstant(); + random_gen.SetRandomConstant(constant); + tera::tpcc::Driver driver(&random_gen, db); + // run test + int64_t beg_txn_ts = tera::get_micros(); + driver.RunTransactions(); + driver.Join(); + int64_t cost_txn_t = tera::get_micros() - beg_txn_ts; + LOG(INFO) << "RunTransactions Cost:" << cost_txn_t << "us"; + delete db; 
+ return 0; +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_customer b/src/benchmark/tpcc/tpcc_schemas/t_customer new file mode 100644 index 000000000..7b8c7ddfd --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_customer @@ -0,0 +1,5 @@ +t_customer { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_customer_last_index b/src/benchmark/tpcc/tpcc_schemas/t_customer_last_index new file mode 100644 index 000000000..e7990ca13 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_customer_last_index @@ -0,0 +1,5 @@ +t_customer_last_index { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_district b/src/benchmark/tpcc/tpcc_schemas/t_district new file mode 100644 index 000000000..2a6cbe3a3 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_district @@ -0,0 +1,5 @@ +t_district { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_history b/src/benchmark/tpcc/tpcc_schemas/t_history new file mode 100644 index 000000000..a21f40001 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_history @@ -0,0 +1,5 @@ +t_history { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_history_index b/src/benchmark/tpcc/tpcc_schemas/t_history_index new file mode 100644 index 000000000..205b3aa23 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_history_index @@ -0,0 +1,5 @@ +t_history_index { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_item b/src/benchmark/tpcc/tpcc_schemas/t_item new file mode 100644 index 000000000..02bf1ff5a --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_item @@ -0,0 +1,5 @@ +t_item { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_neworder b/src/benchmark/tpcc/tpcc_schemas/t_neworder new file mode 100644 index 000000000..e7ef005e0 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_neworder @@ -0,0 +1,5 @@ +t_neworder { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_order b/src/benchmark/tpcc/tpcc_schemas/t_order new 
file mode 100644 index 000000000..4e7d0139f --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_order @@ -0,0 +1,5 @@ +t_order { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_order_index b/src/benchmark/tpcc/tpcc_schemas/t_order_index new file mode 100644 index 000000000..6d2a47528 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_order_index @@ -0,0 +1,5 @@ +t_order_index { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_orderline b/src/benchmark/tpcc/tpcc_schemas/t_orderline new file mode 100644 index 000000000..d075e7918 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_orderline @@ -0,0 +1,5 @@ +t_orderline { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_stock b/src/benchmark/tpcc/tpcc_schemas/t_stock new file mode 100644 index 000000000..a35115aa0 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_stock @@ -0,0 +1,5 @@ +t_stock { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_schemas/t_warehouse b/src/benchmark/tpcc/tpcc_schemas/t_warehouse new file mode 100644 index 000000000..9102544ff --- /dev/null +++ b/src/benchmark/tpcc/tpcc_schemas/t_warehouse @@ -0,0 +1,5 @@ +t_warehouse { + lg0 { + cf0 + } +} diff --git a/src/benchmark/tpcc/tpcc_types.h b/src/benchmark/tpcc/tpcc_types.h new file mode 100644 index 000000000..c73e9f489 --- /dev/null +++ b/src/benchmark/tpcc/tpcc_types.h @@ -0,0 +1,139 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#ifndef TERA_BENCHMARK_TPCC_TPCC_TYPES_H +#define TERA_BENCHMARK_TPCC_TPCC_TYPES_H + +#include + +#include +#include + +namespace tera { +namespace tpcc { + +const int kTpccTableCnt = 12; + +// t_customer_last_index is the index of t_customer +// +const char* const kTpccTables[] = {"t_item", "t_warehouse", "t_district", + "t_customer", "t_history", "t_stock", + "t_order", "t_orderline", "t_neworder", + "t_customer_last_index", "t_order_index", + "t_history_index"}; + +// StockLevel 4% 4 +// OrderStatus 4% 8 +// Delivery 4% 12 +// Payment 43% 55 +// NewOrder 45% 100 +const int kTpccTransactionRatios[] = {4, 8, 12, 55, 100}; + +// http://www.man7.org/linux/man-pages/man3/initstate.3.html +// Current "optimal" values for the size of the state array n +// are 8, 32, 64, 128, and 256 bytes; +const int kRandomStateSize = 64; + +// YTD +const float kInitYTD = 300000.00f; + +// tax +const float kTaxMax = 0.20f; +const float kTaxMin = 0.10f; +const int kTaxDigits = 2; + +// address +const int kStreetLowerLen = 10; +const int kStreetUpperLen = 20; +const int kCityLowerLen = 10; +const int kCityUpperLen = 20; +const int kStateLen = 2; +const int kZipLen = 9; + +// warehourse +const int kMaxWarehouseId = 100; +const int kWareHouseNameLowerLen = 6; +const int kWareHouseNameUpperLen = 10; + +// stock +const int kMaxQuantity = 100; +const int kMinQuantity = 10; +const int kDistLen = 24; +const int kStockDataLowerLen = 26; +const int kStockDataUpperLen = 50; +const int kMinStockLevelThreshold = 10; +const int kMaxStockLevelThreshold = 20; + +// item +const int kItemCount = 100000; +const int kItemMaxIm = 10000; +const int kItemMinIm = 1; +const float kItemMaxPrice = 100.00; +const float kItemMinPrice = 1.00; +const int kItemPriceDigits = 2; +const int kItemMaxNameLen = 24; +const int kItemMinNameLen = 14; +const int kItemMaxDataLen = 50; +const int kItemMinDataLen = 26; + +// district +const int kDistrictCountPerWarehouse = 10; +const int 
kDistrictNameLowerLen = 6; +const int kDistrictNameUpperLen = 10; + +// customer +const int kCustomerCountPerDistrict = 3000; +const float kInitCreditLimit = 5000.00; +const float kMaxDisCount = 0.5; /* upper bound of the customer discount (c_discount); the Max/Min values were swapped */ +const float kMinDisCount = 0.0; /* lower bound of the customer discount (c_discount) */ +const int kDisCountDigits = 2; +const float kInitBalance = -10.00; +const float kInitYTDPayment = 10.00; +const int kInitPaymentCnt = 1; +const int kInitDeliveryCnt = 0; +const int kFirstLowerLen = 6; +const int kFirstUpperLen = 10; +const int kMiddleLen = 2; +const int kLastLen = 16; +const int kPhoneLen = 16; +const int kCreditLen = 2; +const int kCustomerDataUpperLen = 500; +const int kCustomerDataLowerLen = 300; + +// order +const int kInitOrdersPerDistrict = 3000; +const int kInitAllLocal = 1; +const int kMaxCarrierId = 10; +const int kMinCarrierId = 1; +const int kMaxOrderLineCnt = 15; +const int kMinOrderLineCnt = 5; + +// new order +const int kInitNewOrderCountPerDistrict = 900; + +// order line +const int kMaxItemId = 100000; +const int kMinItemId = 1; +const int kInitQuantity = 5; +const int kMaxOrderLineQuantity = 10; +const float kOrderLineMinAmount = 0.01f; +const float kOrderLineMaxAmount = 9999.99f; +const int kOrderLineAmountDigits = 2; + +// history +const float kInitHistoryAmount = 10.00f; +const int kHistoryDataLowerLen = 12; +const int kHistoryDataUpperLen = 24; + +// runtime h_amount +const float kRuntimeMaxAmount = 5000.00f; +const float kRuntimeMinAmount = 1.00f; +const int kRuntimeAmountDigits = 2; + +} // namespace tpcc +} // namespace tera + +#endif /* TERA_BENCHMARK_TPCC_TPCC_TYPES_H */ diff --git a/src/benchmark/tpcc/tpccdb.cc b/src/benchmark/tpcc/tpccdb.cc new file mode 100644 index 000000000..bb7e0cfb5 --- /dev/null +++ b/src/benchmark/tpcc/tpccdb.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include + +#include "benchmark/tpcc/mock_tpccdb.h" +#include "benchmark/tpcc/tera_tpccdb.h" +#include "benchmark/tpcc/tpccdb.h" + +namespace tera { +namespace tpcc { + +class TeraTpccDb; +class MockTpccDb; + +/// ------------------------- [begin item table] -------------------------- /// +std::string Item::ToString() const { + std::stringstream ss; + ss << "i_id = " << i_id + << ",i_im_id = " << i_im_id + << ",i_price = " << i_price + << ",i_name = " << i_name + << ",i_data = " << i_data; + return ss.str(); +} + +/// ------------------------- [begin warehouse table] --------------------- /// +std::string Warehouse::ToString() const { + std::stringstream ss; + ss << "w_id = " << w_id + << ",w_tax = " << w_tax + << ",w_ytd = " << w_ytd + << ",w_name = " << w_name + << ",w_street_1 = " << w_street_1 + << ",w_street_2 = " << w_street_2 + << ",w_city = " << w_city + << ",w_state = " << w_state + << ",w_zip = " << w_zip; + return ss.str(); +} + +/// ------------------------- [begin district table] ---------------------- /// + +District::District(int32_t id, int32_t w_id, RandomGenerator* rand_gen) + : d_id(id), d_w_id(w_id), d_ytd(kInitYTD), d_next_o_id(kCustomerCountPerDistrict + 1) { + d_tax = GenTax(rand_gen); + d_name = rand_gen->MakeAString(kDistrictNameLowerLen, kDistrictNameUpperLen); + d_street_1 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen); + d_street_2 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen); + d_city = rand_gen->MakeAString(kCityLowerLen, kCityUpperLen); + d_state = rand_gen->MakeAString(kStateLen,kStateLen); + d_zip = GenZip(rand_gen); +} + +std::string District::PrimaryKey() const { + return std::to_string(d_w_id) + "_" + + std::to_string(d_id); +} + +std::string District::ForeignKey() const { + return std::to_string(d_w_id); +} + +std::string District::ToString() const { + std::stringstream ss; + ss << "d_id = " << d_id + << ",d_w_id = " << d_w_id + << ",d_tax = " << d_tax + << ",d_ytd = 
" << d_ytd + << ",d_next_o_id = " << d_next_o_id + << ",d_name = " << d_name + << ",d_street_1 = " << d_street_1 + << ",d_street_2 = " << d_street_2 + << ",d_city = " << d_city + << ",d_state = " << d_state + << ",d_zip = " << d_zip; + return ss.str(); +} + +/// ------------------------- [begin stock table] ------------------------- /// + +Stock::Stock(int32_t id, int32_t w_id, bool is_original, RandomGenerator* rand_gen) + : s_i_id (id), s_w_id(w_id) { + s_quantity = rand_gen->GetRandom(kMinQuantity, kMaxQuantity); + s_ytd = 0; + s_order_cnt = 0; + s_remote_cnt = 0; + for (int i = 0; i < kDistrictCountPerWarehouse; ++i) { + s_dist.push_back(rand_gen->MakeAString(kDistLen, kDistLen)); + } + s_data = GenData(rand_gen, kStockDataLowerLen, kStockDataUpperLen, is_original); +} + +std::string Stock::PrimaryKey() const { + return std::to_string(s_w_id) + "_" + std::to_string(s_i_id); +} + +std::string Stock::ForeignKey() const { + return std::to_string(s_i_id); +} + +std::string Stock::ToString() const { + std::stringstream ss; + ss << "s_w_id = " << s_w_id + << ",s_quantity = " << s_quantity + << ",s_ytd = " << s_ytd + << ",s_order_cnt = " << s_order_cnt + << ",s_remote_cnt = " << s_remote_cnt + << ",s_data = " << s_data + << ",s_dist = ["; + for (auto d : s_dist) { + ss << d << ","; + } + ss << "]"; + return ss.str(); +} + +/// ------------------------- [begin order table] ------------------------- /// + +Order::Order(int32_t id, int32_t c_id, int32_t d_id, int32_t w_id, + bool new_order, const std::string& datetime, + RandomGenerator* rand_gen) + : o_id(id), o_c_id(c_id), o_d_id(d_id), o_w_id(w_id), + o_carrier_id(0), o_all_local(kInitAllLocal), + o_entry_d(datetime) { + + if (!new_order) { + o_carrier_id = rand_gen->GetRandom(kMinCarrierId, kMaxCarrierId); + } + o_ol_cnt = rand_gen->GetRandom(kMinOrderLineCnt, kMaxOrderLineCnt); +} + +std::string Order::PrimaryKey() const { + return std::to_string(o_w_id) + "_" + + std::to_string(o_d_id) + "_" + + 
std::to_string(o_id); +} + +std::string Order::ForeignKey() const { + return std::to_string(o_w_id) + "_" + + std::to_string(o_d_id) + "_" + + std::to_string(o_c_id); +} + +std::string Order::ToString() const { + std::stringstream ss; + ss << "o_id = " << o_id + << ",o_c_id = " << o_c_id + << ",o_d_id = " << o_d_id + << ",o_w_id = " << o_w_id + << ",o_carrier_id = " << o_carrier_id + << ",o_ol_cnt = " << o_ol_cnt + << ",o_all_local = " << o_all_local + << ",o_entry_d = " << o_entry_d; + return ss.str(); +} + +/// ------------------------- [begin neworder table] ---------------------- /// + + +NewOrder::NewOrder(int32_t o_id, int32_t d_id, int32_t w_id) + : no_o_id(o_id), no_d_id(d_id), no_w_id(w_id) { +} + +std::string NewOrder::ToString() const { + std::stringstream ss; + ss << "no_o_id = " << no_o_id + << ",no_d_id = " << no_d_id + << ",no_w_id = " << no_w_id; + return ss.str(); +} + +std::string NewOrder::PrimaryKey() const { + return std::to_string(no_w_id) + + "_" + std::to_string(no_d_id) + + "_" + std::to_string(no_o_id); +} + +std::string NewOrder::ForeignKey() const { + return std::to_string(no_w_id) + + "_" + std::to_string(no_d_id) + + "_" + std::to_string(no_o_id); +} + +/// ------------------------- [begin orderline table] --------------------- /// + +OrderLine::OrderLine(int32_t o_id, int32_t d_id, int32_t w_id, int32_t number, + bool new_order, const std::string& datetime, + RandomGenerator* rand_gen) + : ol_o_id(o_id), ol_d_id(d_id), ol_w_id(w_id), ol_number(number), + ol_supply_w_id(w_id), ol_quantity(kInitQuantity), + ol_amount(0.00f), ol_delivery_d(datetime) { + + ol_i_id = rand_gen->GetRandom(kMinItemId, kMaxItemId); + if (new_order) { + ol_amount = rand_gen->MakeFloat(kOrderLineMinAmount, + kOrderLineMaxAmount, + kOrderLineAmountDigits); + ol_delivery_d = ""; + } + ol_dist_info = rand_gen->MakeAString(kDistLen, kDistLen); +} + +std::string OrderLine::PrimaryKey() const { + return std::to_string(ol_w_id) + "_" + + std::to_string(ol_d_id) + "_" + 
+ std::to_string(ol_o_id) + "_" + + std::to_string(ol_number); +} + +ForeignKeyMap OrderLine::ForeignKeys() const { + ForeignKeyMap foreign_keys; + std::string order_index = std::to_string(ol_w_id) + "_" + + std::to_string(ol_d_id) + "_" + + std::to_string(ol_o_id); + std::string item_index = std::to_string(ol_supply_w_id) + "_" + + std::to_string(ol_i_id); + foreign_keys["order_index"] = order_index; + foreign_keys["item_index"] = item_index; + return foreign_keys; +} + +std::string OrderLine::ToString() const { + std::stringstream ss; + ss << "ol_o_id = " << ol_o_id + << ",ol_d_id = " << ol_d_id + << ",ol_w_id = " << ol_w_id + << ",ol_number = " << ol_number + << ",ol_i_id = " << ol_i_id + << ",ol_supply_w_id = " << ol_supply_w_id + << ",ol_quantity = " << ol_quantity + << ",ol_amount = " << ol_amount + << ",ol_delivery_d = " << ol_delivery_d + << ",ol_dist_info = " << ol_dist_info; + return ss.str(); +} + +/// ------------------------- [begin customer table] ---------------------- /// + +Customer::Customer(int32_t id, int32_t d_id, int32_t w_id, const std::string& datetime, + bool bad_credit, RandomGenerator* rand_gen) + : c_id(id), + c_d_id(d_id), + c_w_id(w_id), + c_credit_lim(kInitCreditLimit), + c_balance(kInitBalance), + c_ytd_payment(kInitYTDPayment), + c_payment_cnt(kInitPaymentCnt), + c_delivery_cnt(kInitDeliveryCnt), + c_middle("OE"), + c_since(datetime) { + c_discount = rand_gen->MakeFloat(kMinDisCount, kMaxDisCount, kDisCountDigits); + c_first = rand_gen->MakeAString(kFirstLowerLen, kFirstUpperLen); + c_last = GenLastName(rand_gen, (id <= 1000 ? id : kCustomerCountPerDistrict)); + c_street_1 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen); + c_street_2 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen); + c_city = rand_gen->MakeAString(kCityLowerLen, kCityUpperLen); + c_state = rand_gen->MakeAString(kStateLen,kStateLen); + c_zip = GenZip(rand_gen); + c_phone = rand_gen->MakeNString(kPhoneLen,kPhoneLen); + c_credit = bad_credit ? 
"BC" : "GC"; + c_data = GenData(rand_gen, kCustomerDataLowerLen, kCustomerDataUpperLen, false); +} + +std::string Customer::PrimaryKey() const { + return std::to_string(c_w_id) + "_" + std::to_string(c_d_id) + + "_" + std::to_string(c_id); +} + +std::string Customer::ForeignKey() const { + return std::to_string(c_w_id) + "_" + std::to_string(c_d_id); +} + +std::string Customer::ToString() const { + std::stringstream ss; + ss << "c_id = " << c_id + << ",c_d_id = " << c_d_id + << ",c_w_id = " << c_w_id + << ",c_credit_lim = " << c_credit_lim + << ",c_discount = " << c_discount + << ",c_balance = " << c_balance + << ",c_ytd_payment = " << c_ytd_payment + << ",c_payment_cnt = " << c_payment_cnt + << ",c_delivery_cnt = " << c_delivery_cnt + << ",c_name = [" << c_first << "," << c_middle << "," << c_last << "]" + << ",c_street_1 = " << c_street_1 + << ",c_street_2 = " << c_street_2 + << ",c_city = " << c_city + << ",c_state = " << c_state + << ",c_zip = " << c_zip + << ",c_phone = " << c_phone + << ",c_since = " << c_since + << ",c_credit = " << c_credit + << ",c_data = " << c_data; + return ss.str(); +} + +/// ------------------------- [begin history table] ----------------------- /// +std::string History::ToString() const { + std::stringstream ss; + ss << "h_c_id = " << h_c_id + << ",h_c_d_id = " << h_c_d_id + << ",h_c_w_id = " << h_c_w_id + << ",h_d_id = " << h_d_id + << ",h_w_id = " << h_w_id + << ",h_amount = " << h_amount + << ",h_date = " << h_date + << ",h_data = " << h_data; + return ss.str(); +} + +/// ------------------------- [end tables] -------------------------------- /// + +bool TxnResult::State() const { + return status_; +} + +void TxnResult::SetState(bool status) { + status_ = status; +} + +void TxnResult::SetReason(const std::string& reason) { + reason_ = reason; +} + +void StockLevelResult::SetLowStock(int low_stock) { + low_stock_ = low_stock; +} + +int StockLevelResult::LowStock() const { + return low_stock_; +} + +void 
PaymentResult::SetSingleLine(const RetTuples& single_line) { + single_line_ = single_line; +} + +void NewOrderResult::AddLine(const RetTuples& line) { + lines_.push_back(line); +} + +void NewOrderResult::SetSingleLine(const RetTuples& single_line) { + single_line_ = single_line; +} + +TpccDb* TpccDb::NewTpccDb(const std::string& db_type) { + if (db_type == "tera") { + return new TeraTpccDb(); + } else { + LOG(ERROR) << "not support db:" << db_type; + } + return NULL; +} + +} // namespace tpcc +} // namespace tera + diff --git a/src/benchmark/tpcc/tpccdb.h b/src/benchmark/tpcc/tpccdb.h new file mode 100644 index 000000000..93b3c32f3 --- /dev/null +++ b/src/benchmark/tpcc/tpccdb.h @@ -0,0 +1,471 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Author: baorenyi@baidu.com + +#ifndef TERA_BENCHMARK_TPCC_TPCCDB_H +#define TERA_BENCHMARK_TPCC_TPCCDB_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "benchmark/tpcc/random_generator.h" +#include "benchmark/tpcc/tpcc_types.h" + +namespace tera { +namespace tpcc { + +typedef std::unordered_set IdSet; +typedef std::unordered_map ForeignKeyMap; +typedef std::unordered_map RetTuples; + + +inline float GenTax(RandomGenerator* rand_gen) { /* random tax rate between kTaxMin and kTaxMax with kTaxDigits decimals */ + return rand_gen->MakeFloat(kTaxMin, kTaxMax, kTaxDigits); /* (lower, upper, digits), matching the other MakeFloat call sites; the bounds were passed as (max, min) */ +} + +inline std::string GenZip(RandomGenerator* rand_gen) { + return rand_gen->MakeNString(kZipLen, kZipLen); +} + +inline std::string GenData(RandomGenerator* rand_gen, + int lower_len, + int upper_len, + bool is_original) { + std::string ret = rand_gen->MakeAString(lower_len, upper_len); + if (is_original) { + int pos = rand_gen->GetRandom(0, ret.size() - 8); + ret = ret.replace(pos, 8, "ORIGINAL"); + } + return ret; +} + +inline std::string GenLastName(RandomGenerator* rand_gen, int id) { + if (id > 999) { + id = rand_gen->NURand(255, 0, std::min(999, id - 
1)); + } + std::vector labels = {"BAR", "OUGHT", "ABLE", "PRI", "PRES", + "ESE", "ANTI", "CALLY", "ATION", "EING"}; + return labels[id / 100] + labels[(id / 10) % 10] + labels[id % 10]; +} + +inline IdSet PickUniqueIdSet(RandomGenerator* rand_gen, size_t cnt, int lower_id, int upper_id) { + IdSet ids; + while(ids.size() < cnt) { + int tmp_id = rand_gen->GetRandom(lower_id, upper_id); + if (ids.find(tmp_id) == ids.end()) { + ids.insert(tmp_id); + } + } + return ids; +} + +struct Item { + int32_t i_id; + int32_t i_im_id; + float i_price; + std::string i_name; + std::string i_data; + + Item(int32_t id, bool is_original, RandomGenerator* rand_gen) : i_id(id) { + i_im_id = rand_gen->GetRandom(kItemMinIm, kItemMaxIm); + i_price = rand_gen->MakeFloat(kItemMinPrice, kItemMaxPrice, kItemPriceDigits); + i_name = rand_gen->MakeAString(kItemMinNameLen, kItemMaxNameLen); + i_data = GenData(rand_gen, kItemMinDataLen, kItemMaxDataLen, is_original); + } + + std::string PrimaryKey() const { return std::to_string(i_id); } + std::string ToString() const; +}; + +struct Warehouse { + int32_t w_id; + float w_tax; + float w_ytd; + std::string w_name; + std::string w_street_1; + std::string w_street_2; + std::string w_city; + std::string w_state; + std::string w_zip; + Warehouse(int32_t id, RandomGenerator* rand_gen) : w_id(id) { + w_tax = GenTax(rand_gen); + w_ytd = kInitYTD; + w_name = rand_gen->MakeAString(kWareHouseNameLowerLen, kWareHouseNameUpperLen); + w_street_1 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen); + w_street_2 = rand_gen->MakeAString(kStreetLowerLen, kStreetUpperLen); + w_city = rand_gen->MakeAString(kCityLowerLen, kCityUpperLen); + w_state = rand_gen->MakeAString(kStateLen,kStateLen); + w_zip = GenZip(rand_gen); + } + std::string PrimaryKey() const { return std::to_string(w_id); } + std::string ToString() const; +}; + +struct District { + int32_t d_id; + int32_t d_w_id; + float d_tax; + float d_ytd; + int32_t d_next_o_id; + std::string d_name; + std::string 
d_street_1; + std::string d_street_2; + std::string d_city; + std::string d_state; + std::string d_zip; + + District(int32_t id, int32_t w_id, RandomGenerator* rand_gen); + std::string PrimaryKey() const; + std::string ForeignKey() const; + std::string ToString() const; +}; + +struct Stock { +int32_t s_i_id; + int32_t s_w_id; + int32_t s_quantity; + int32_t s_ytd; + int32_t s_order_cnt; + int32_t s_remote_cnt; + std::vector s_dist; + std::string s_data; + + Stock(int32_t id, int32_t w_id, bool is_original, RandomGenerator* rand_gen); + std::string PrimaryKey() const; + std::string ForeignKey() const; + std::string ToString() const; +}; + +struct Customer { + int32_t c_id; + int32_t c_d_id; + int32_t c_w_id; + float c_credit_lim; + float c_discount; + float c_balance; + float c_ytd_payment; + int32_t c_payment_cnt; + int32_t c_delivery_cnt; + std::string c_first; + std::string c_middle; + std::string c_last; + std::string c_street_1; + std::string c_street_2; + std::string c_city; + std::string c_state; + std::string c_zip; + std::string c_phone; + std::string c_since; + std::string c_credit; + std::string c_data; + Customer(int32_t id, int32_t d_id, int32_t w_id, const std::string& datetime, + bool bad_credit, RandomGenerator* rand_gen); + std::string PrimaryKey() const; + std::string ForeignKey() const; + std::string ToString() const; +}; + +struct Order { + int32_t o_id; + int32_t o_c_id; + int32_t o_d_id; + int32_t o_w_id; + int32_t o_carrier_id; + int32_t o_ol_cnt; + + // If the order includes only home order-lines, + // then O_ALL_LOCAL is set to 1, otherwise O_ALL_LOCAL is set to 0. 
+ int32_t o_all_local; + std::string o_entry_d; + + Order(int32_t id, int32_t c_id, int32_t d_id, int32_t w_id, bool new_order, + const std::string& datetime, RandomGenerator* rand_gen); + std::string PrimaryKey() const; + std::string ForeignKey() const; + std::string ToString() const; +}; + +// An order-line is said to be 'home' if it is supplied by the home warehouse +// (i.e., when OL_SUPPLY_W_ID equals O_W_ID). +// +// An order-line is said to be remote when it is supplied by a remote warehouse +// (i.e., when OL_SUPPLY_W_ID does not equal O_W_ID). +// +struct OrderLine { + int32_t ol_o_id; + int32_t ol_d_id; + int32_t ol_w_id; + int32_t ol_number; + int32_t ol_i_id; + int32_t ol_supply_w_id; + int32_t ol_quantity; + float ol_amount; + std::string ol_delivery_d; + std::string ol_dist_info; + + OrderLine(int32_t o_id, int32_t d_id, int32_t w_id, int32_t number, + bool new_order, const std::string& datetime, + RandomGenerator* rand_gen); + std::string PrimaryKey() const; + ForeignKeyMap ForeignKeys() const; + std::string ToString() const; +}; + +struct NewOrder { + int32_t no_o_id; + int32_t no_d_id; + int32_t no_w_id; + + NewOrder(int32_t o_id, int32_t d_id, int32_t w_id); + std::string PrimaryKey() const; + std::string ForeignKey() const; + std::string ToString() const; +}; + +struct History { + int32_t h_c_id; + int32_t h_c_d_id; + int32_t h_c_w_id; + int32_t h_d_id; + int32_t h_w_id; + float h_amount; + std::string h_date; + std::string h_data; + + History(int32_t c_id, int32_t d_id, int32_t w_id, const std::string& datetime, + RandomGenerator* rand_gen) + : h_c_id(c_id), h_c_d_id(d_id), h_c_w_id(w_id), h_d_id(d_id), h_w_id(w_id), + h_amount(kInitHistoryAmount), h_date(datetime) { + h_data = rand_gen->MakeAString(kHistoryDataLowerLen, kHistoryDataUpperLen); + } + std::string PrimaryKey() const { return std::to_string(h_c_id); } + std::string ToString() const; +}; + +struct NewOrderInfo { + bool need_failed; + int32_t o_all_local; + int32_t o_ol_cnt; + 
std::vector ol_supply_w_ids; + std::vector ol_i_ids; + std::vector ol_quantities; +}; + +enum TpccTables +{ + kItemTable = 0, + kWarehouseTable = 1, + kDistrictTable = 2, + kCustomerTable = 3, + kHistoryTable = 4, + kStockTable = 5, + kOrderTable = 6, + kOrderLineTable = 7, + kNewOrderTable = 8, + + // the index of table + kCustomerLastIndex = 9, + kOrderIndex = 10, + kHistoryIndex = 11 +}; + +/// ------------------------- transaction result ---------------------------/// + +class TxnResult { +public: + void SetState(bool status); + bool State() const; + void SetReason(const std::string& reason); + const std::string& Reason() const; +private: + bool status_; + std::string reason_; +}; + +class StockLevelResult : public TxnResult { +public: + void SetLowStock(int low_stock); + int LowStock() const; +private: + int low_stock_; +}; + +class PaymentResult : public TxnResult { +public: + void SetSingleLine(const RetTuples& single_line); +private: + RetTuples single_line_; +}; + +class NewOrderResult : public TxnResult { +public: + void AddLine(const RetTuples& line); + void SetSingleLine(const RetTuples& single_line); +private: + std::vector lines_; + RetTuples single_line_; +}; + +class OrderStatusResult : public TxnResult { + +}; + +class DeliveryResult : public TxnResult { + +}; + +class TpccDb { +public: + TpccDb(){} + virtual ~TpccDb(){} + + // init db + virtual bool CreateTables() = 0; + virtual bool CleanTables() = 0; + + // for insert table + virtual bool InsertItem(const Item& i) = 0; + + virtual bool InsertWarehouse(const Warehouse& w) = 0; + + virtual bool InsertDistrict(const District& d) = 0; + + virtual bool InsertCustomer(const Customer& c) = 0; + + virtual bool InsertHistory(const History& h) = 0; + + virtual bool InsertStock(const Stock& s) = 0; + + virtual bool InsertOrder(const Order& o) = 0; + + virtual bool InsertOrderLine(const OrderLine& ol) = 0; + + virtual bool InsertNewOrder(const NewOrder& no) = 0; + + // for transaction + + // The Stock-Level 
Transaction [Revision 5.11 - Page 44] + // + // (warehouse_id, district_id) + // is the primarykey of t_district + // Each terminal must use a unique value of (W_ID, D_ID) that is constant + // over the whole measurement, i.e., D_IDs cannot be re-used within a warehouse + // + // threshold + // The threshold of minimum quantity in stock (threshold) is selected + // at random within [10 .. 20]. + // + virtual void StockLevelTxn(int32_t warehouse_id, int32_t district_id, + int32_t threshold, + StockLevelResult* ret) = 0; + + // The Delivery Transaction [Revision 5.11 - Page 40] + // + // warehouse_id + // For any given terminal, the home warehouse number (W_ID) is constant + // over the whole measurement interval + // + // carrier_id + // The carrier number (O_CARRIER_ID) is randomly selected within [1 .. 10]. + // + // delivery_datetime + // The delivery date (OL_DELIVERY_D) is generated within the + // SUT by using the current system date and time. + // + virtual void DeliveryTxn(int32_t warehouse_id, + int32_t carrier_id, + const std::string& delivery_datetime, + DeliveryResult* ret) = 0; + + // The Order-Status Transaction [Revision 5.11 - Page 37] + // + // warehouse_id + // For any given terminal, the home warehouse number (W_ID) is constant + // over the whole measurement interval + // + // district_id + // The district number (D_ID) is randomly selected within [1 .. 10] + // from the home warehouse (D_W_ID = W_ID). + // + // c_warehouse_id, c_district_id, last_name + // customer is randomly selected + // 60% of the time by last name (C_W_ID, C_D_ID, C_LAST) + // from the selected district (C_D_ID = D_ID) + // and the home warehouse number (C_W_ID = W_ID). + // + // c_warehouse_id, c_district_id, customer_id + // 40% of the time by number (C_W_ID, C_D_ID, C_ID) + // from the selected district (C_D_ID = D_ID) + // and the home warehouse number (C_W_ID = W_ID). 
+ // + virtual void OrderStatusTxn(bool by_last_name, + int32_t warehouse_id, int32_t district_id, + int32_t c_customer_id, + const std::string& last_name, + OrderStatusResult* ret) = 0; + + // The Payment Transaction [Revision 5.11 - Page 33] + // + // warehouse_id + // For any given terminal, the home warehouse number (W_ID) is constant + // over the whole measurement interval + // + // district_id + // The district number (D_ID) is randomly selected within [1 .. 10] + // from the home warehouse (D_W_ID = W_ID). + // + // c_warehouse_id, c_district_id, last_name + // The customer is randomly selected + // 1) 60% of the time by last name (C_W_ID , C_D_ID, C_LAST) + // c_warehouse_id, c_district_id, customer_id + // The customer is randomly selected + // 2) 40% of the time by number (C_W_ID , C_D_ID , C_ID). + // + // h_amount + // The payment amount (H_AMOUNT) is randomly selected within + // [1.00 .. 5,000.00]. + // + virtual void PaymentTxn(bool by_last_name, + int32_t warehouse_id, int32_t district_id, + int32_t c_warehouse_id, int32_t c_district_id, + int32_t c_customer_id, + const std::string& last_name, + int32_t h_amount, + PaymentResult* ret) = 0; + + + // The New-Order Transaction [Revision 5.11 - Page 28] + // warehouse_id + // For any given terminal, the home warehouse number (W_ID) is constant + // over the whole measurement interval + // + // district_id + // The district number (D_ID) is randomly selected within [1 .. 10] + // from the home warehouse (D_W_ID = W_ID). + // + // customer_id + // The non-uniform random customer number (C_ID) is selected using + // the NURand(1023,1,3000) function from the selected district + // number (C_D_ID = D_ID) and the home warehouse number (C_W_ID = W_ID). 
+ // + virtual void NewOrderTxn(int32_t warehouse_id, + int32_t district_id, + int32_t customer_id, const NewOrderInfo& info, + NewOrderResult* ret) = 0; + + static TpccDb* NewTpccDb(const std::string& db_type); +}; + +} // namespace tpcc +} // namespace tera + +#endif /* TERA_BENCHMARK_TPCC_TPCCDB_H */ diff --git a/src/common/atomic.h b/src/common/atomic.h index 6837cb302..195a7b0da 100644 --- a/src/common/atomic.h +++ b/src/common/atomic.h @@ -1,11 +1,10 @@ +#pragma once // Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. +#include -#ifndef TERA_COUNTER_ATOMIC_H_ -#define TERA_COUNTER_ATOMIC_H_ - -namespace common { +namespace tera { static inline int atomic_add(volatile int *mem, int add) { @@ -106,5 +105,4 @@ static inline int64_t atomic_comp_swap64(volatile void *mem, int64_t xchg, int64 return cmp; } -} // namespace common -#endif // TERA_COMMON_ATOMIC_H_ +} diff --git a/src/common/counter.h b/src/common/counter.h index c9869f633..d4687bfd8 100644 --- a/src/common/counter.h +++ b/src/common/counter.h @@ -7,10 +7,10 @@ #include -#include "atomic.h" -#include "timer.h" +#include "common/atomic.h" +#include "common/timer.h" -namespace common { +namespace tera { class Counter { public: @@ -47,19 +47,19 @@ class AutoCounter { : counter_(counter), msg1_(msg1), msg2_(msg2) { - start_ = timer::get_micros(); + start_ = get_micros(); counter_->Inc(); } ~AutoCounter() { - int64_t end = timer::get_micros(); + int64_t end = get_micros(); if (end - start_ > 5000000) { int64_t t = (end - start_) / 1000000; if (!msg2_) { fprintf(stderr, "%s [AutoCounter] %s hang for %ld s\n", - timer::get_curtime_str().data(), msg1_, t); + get_curtime_str().data(), msg1_, t); } else { fprintf(stderr, "%s [AutoCounter] %s %s hang for %ld s\n", - timer::get_curtime_str().data(), msg1_, msg2_, t); + get_curtime_str().data(), msg1_, msg2_, t); } } counter_->Dec(); diff --git 
a/src/common/cpu_profiler.cc b/src/common/cpu_profiler.cc new file mode 100644 index 000000000..758ed674d --- /dev/null +++ b/src/common/cpu_profiler.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include + +#include "common/cpu_profiler.h" + +namespace tera { + +CpuProfiler::CpuProfiler() + : exit_(false), + thread_(&CpuProfiler::run, this) {} + +CpuProfiler::~CpuProfiler() { + exit_ = true; + cv_.notify_one(); + thread_.join(); + ProfilerState ps; + ProfilerGetCurrentState(&ps); + if (ps.enabled) { + ProfilerStop(); + } +} + +void CpuProfiler::run() { + while (!exit_.load()) { + if (enable_) { + ProfilerState ps; + ProfilerGetCurrentState(&ps); + if (ps.enabled == 0) { + ProfilerStart(profiler_file_.c_str()); + } + + ProfilerFlush(); + LOG(INFO) << "[Cpu Profiler] Cpu Profiler Dumped"; + } else { + ProfilerState ps; + ProfilerGetCurrentState(&ps); + if (ps.enabled) { + ProfilerStop(); + } + } + std::unique_lock lock(lock_); + cv_.wait_for(lock, interval_); + } +} + +} // namespace tera \ No newline at end of file diff --git a/src/common/cpu_profiler.h b/src/common/cpu_profiler.h new file mode 100644 index 000000000..ccf0686ab --- /dev/null +++ b/src/common/cpu_profiler.h @@ -0,0 +1,68 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_CPU_PROFILER_H +#define TERA_CPU_PROFILER_H + +#include +#include +#include +#include +#include + +#include +#include + +namespace tera { + +class CpuProfiler { +public: + /** + * @brief Init CpuProfiler and the detect thread will start + **/ + CpuProfiler(); + + ~CpuProfiler(); + + CpuProfiler& SetEnable(bool enable) { + enable_ = enable; + if (enable_) { + LOG(INFO) << "[Cpu Profiler] Cpu Profiler Enabled"; + } else { + LOG(INFO) << "[Cpu Profiler] Cpu Profiler Disabled"; + } + cv_.notify_one(); + return *this; + } + + CpuProfiler& SetInterval(int second) { + interval_ = std::chrono::seconds(second); + cv_.notify_one(); + return *this; + } + + CpuProfiler& SetProfilerFile(const std::string& file) { + profiler_file_ = file; + cv_.notify_one(); + return *this; + } + +private: + void run(); + +private: + std::atomic exit_; + bool enable_{false}; + std::chrono::seconds interval_{10}; + std::string profiler_file_; + std::thread thread_; + std::mutex lock_; + std::condition_variable cv_; +}; + +} // namespace tera + +#endif //TERA_CPU_PROFILER_H + +/* vim: set ts=4 sw=4 sts=4 tw=100 */ diff --git a/src/common/event.h b/src/common/event.h index a289d0278..9a6770ece 100644 --- a/src/common/event.h +++ b/src/common/event.h @@ -46,8 +46,75 @@ class AutoResetEvent { bool signaled_; }; +class CompletedEvent { +public: + CompletedEvent() + : cv_(&mutex_), cnt_(0), triggered_(false) {} + + CompletedEvent(int64_t task_cnt) + : cv_(&mutex_), cnt_(task_cnt), triggered_(false) {} + + // add event source, + // tasks maybe add while others finished or doing, like a task queue + void AddEventSources(int64_t task_cnt) { + MutexLock lock(&mutex_); + if (!triggered_) { + cnt_ += task_cnt; + } + } + + // call after all tasks added to EventSource, + // trigger other thread's Wait() function take effect. 
+ void Trigger() { + MutexLock lock(&mutex_); + triggered_ = true; + if (cnt_ <= 0) { + cv_.Signal(); + } + } + + // wait until cnt_ == 0 and triggered_ == true + void Wait() { + MutexLock lock(&mutex_); + // cnt_ > 0 + while (cnt_ > 0 || !triggered_) { + cv_.Wait(); + } + } + + // wait for 'timeout' ms, don't careful cnt_ and triggered_ + // if last event source completed, this will returned early 'timeout' + bool TimeWait(int64_t timeout) { + MutexLock lock(&mutex_); + if (cnt_ > 0 || !triggered_) { + cv_.TimeWait(timeout); + } + return cnt_ > 0 ? false : true; + } + + // last event source complated and triggered_ == true, will be notify + // Wait or TimeWait + void Complete(int64_t task_cnt = 1) { + MutexLock lock(&mutex_); + cnt_ -= task_cnt; + // use 'triggered_' to make sure all tasks call 'AddEventSources' + if (cnt_ <= 0 && triggered_) { + cv_.Signal(); + } + } + +private: + CompletedEvent(const CompletedEvent&) = delete; + CompletedEvent &operator=(const CompletedEvent&) = delete; + Mutex mutex_; + CondVar cv_; + int64_t cnt_; + bool triggered_; +}; + } // namespace common using common::AutoResetEvent; +using common::CompletedEvent; #endif // TERA_COMMON_EVENT_H_ diff --git a/src/common/file/file_path.cc b/src/common/file/file_path.cc index 44738117f..ea3a8ef08 100644 --- a/src/common/file/file_path.cc +++ b/src/common/file/file_path.cc @@ -146,6 +146,33 @@ bool ListCurrentDir(const std::string& dir_path, return true; } +bool ListCurrentDirWithStat(const std::string& dir_path, + std::vector* file_list) { + DIR *dir = NULL; + struct dirent *ptr = NULL; + dir = opendir(dir_path.c_str()); + if (dir == NULL) { + return false; + } + bool stat_all_succ = true; + while ((ptr = readdir(dir)) != NULL) { + if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) { + struct stat st; + std::string file_name(ptr->d_name); + file_name = dir_path + "/" + file_name; + if (lstat(file_name.c_str(), &st) == 0) { + file_list->push_back(std::make_pair(file_name, 
st)); + } else { + // break if stat fail and return false later + stat_all_succ = false; + break; + } + } + } + closedir(dir); + return stat_all_succ; +} + bool IsExist(const std::string& path) { return access(path.c_str(), R_OK) == 0; } diff --git a/src/common/file/file_path.h b/src/common/file/file_path.h index e0ab5d002..d5e04ea99 100644 --- a/src/common/file/file_path.h +++ b/src/common/file/file_path.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include void SplitStringPath(const std::string& full_path, std::string* dir_part, @@ -28,6 +30,11 @@ std::string UidToName(uid_t uid); bool ListCurrentDir(const std::string& dir_path, std::vector* file_list); +typedef std::pair FileStateInfo; + +bool ListCurrentDirWithStat(const std::string& dir_path, + std::vector* file_list); + bool IsExist(const std::string& path); bool IsDir(const std::string& path); diff --git a/src/common/heap_profiler.cc b/src/common/heap_profiler.cc new file mode 100644 index 000000000..386e314e9 --- /dev/null +++ b/src/common/heap_profiler.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include +#include +#include +#include + +#include "common/heap_profiler.h" + +namespace tera { + +HeapProfiler::HeapProfiler() + : exit_(false), + thread_(&HeapProfiler::run, this) {} + +HeapProfiler::~HeapProfiler() { + exit_ = true; + cv_.notify_one(); + thread_.join(); + if (IsHeapProfilerRunning()) { + HeapProfilerStop(); + } +} + +void HeapProfiler::run() { + while (!exit_.load()) { + if (enable_) { + // "reason" is time + std::time_t t = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + char ts[128]; + ctime_r(&t, ts); + ts[strlen(ts) - 1] = '\0'; // erase \n + + if (IsHeapProfilerRunning() == 0) { + HeapProfilerStart(profiler_file_.c_str()); + } + HeapProfilerDump(ts); + LOG(INFO) << "[Heap Profiler] Heap Profiler Dumped"; + } else { + if (IsHeapProfilerRunning()) { + HeapProfilerStop(); + } + } + std::unique_lock lock(lock_); + cv_.wait_for(lock, interval_); + } +} + +} // namespace tera \ No newline at end of file diff --git a/src/common/heap_profiler.h b/src/common/heap_profiler.h new file mode 100644 index 000000000..f5ffa9c6b --- /dev/null +++ b/src/common/heap_profiler.h @@ -0,0 +1,90 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_HEAP_PROFILER_H +#define TERA_HEAP_PROFILER_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +DECLARE_int64(heap_profile_allocation_interval); +DECLARE_int64(heap_profile_inuse_interval); + +namespace tera { + +class HeapProfiler { +public: + + /** + * @brief Init HeapProfiler and the detect thread will start + **/ + HeapProfiler(); + /** + * @brief: the heap profiler will stop after descontrucor called + * + **/ + ~HeapProfiler(); + + HeapProfiler& SetEnable(bool enable) { + enable_ = enable; + + if (enable_) { + setenv("HEAP_PROFILE_ALLOCATION_INTERVAL", + std::to_string(FLAGS_heap_profile_allocation_interval).c_str(), + 1); + + setenv("HEAP_PROFILE_INUSE_INTERVAL", + std::to_string(FLAGS_heap_profile_inuse_interval).c_str(), + 1); + + LOG(INFO) << "[Heap Profiler] HEAP_PROFILE_ALLOCATION_INTERVAL: " + << getenv("HEAP_PROFILE_ALLOCATION_INTERVAL"); + LOG(INFO) << "[Heap Profiler] HEAP_PROFILE_INUSE_INTERVAL: " + << getenv("HEAP_PROFILE_INUSE_INTERVAL"); + LOG(INFO) << "[Heap Profiler] Heap Profiler Enabled"; + } else { + unsetenv("HEAP_PROFILE_ALLOCATION_INTERVAL"); + unsetenv("HEAP_PROFILE_INUSE_INTERVAL"); + LOG(INFO) << "[Heap Profiler] Heap Profiler Disabled"; + } + cv_.notify_one(); + return *this; + } + + HeapProfiler& SetInterval(int second) { + interval_ = std::chrono::seconds(second); + cv_.notify_one(); + return *this; + } + + HeapProfiler& SetProfilerFile(const std::string& file) { + profiler_file_ = file; + cv_.notify_one(); + return *this; + } + +private: + void run(); +private: + std::atomic exit_; + bool enable_{false}; + std::chrono::seconds interval_{10}; + std::string profiler_file_; + std::thread thread_; + std::mutex lock_; + std::condition_variable cv_; +}; + +} // namespace tera + +#endif //TERA_HEAP_PROFILER + +/* vim: set ts=4 sw=4 sts=4 tw=100 */ \ No newline at end of file diff --git a/src/common/log/log_cleaner.cc b/src/common/log/log_cleaner.cc new file mode 100644 index 
000000000..6b5474a1d --- /dev/null +++ b/src/common/log/log_cleaner.cc @@ -0,0 +1,322 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "common/log/log_cleaner.h" + +#include +#include +#include +#include + +#include + +#include "common/file/file_path.h" +#include "common/timer.h" + +DECLARE_string(log_dir); +DECLARE_string(tera_log_prefix); +DECLARE_string(tera_leveldb_log_path); +DECLARE_int64(tera_info_log_clean_period_second); +DECLARE_int64(tera_info_log_expire_second); +DECLARE_string(ins_log_file); + +namespace common { + +static const int64_t kMinCleanPeriodMs = 1000; // 1s +static const int64_t kMinInfoLogExpireSec = 1; // 1s +static const size_t kPathMaxLen = 64; + +Mutex LogCleaner::inst_init_mutex_; +LogCleaner* LogCleaner::singleton_instance_ = NULL; + +static std::string GetProcFdPath() { + char path_buf[kPathMaxLen]; + snprintf(path_buf, kPathMaxLen, "/proc/%d/fd", getpid()); + return std::string(path_buf); +} + +static std::string GetFileNameFromPath(const std::string& path) { + std::string::size_type pos = path.rfind("/"); + if (pos == std::string::npos) { + return path; + } else { + return path.substr(pos + 1); + } +} + + +LogCleaner* LogCleaner::GetInstance(ThreadPool *thread_pool) { + if (singleton_instance_ == NULL) { + singleton_instance_ = new LogCleaner(FLAGS_log_dir, + FLAGS_tera_info_log_clean_period_second, + FLAGS_tera_info_log_expire_second, + thread_pool); + singleton_instance_->AddPrefix(FLAGS_tera_log_prefix); + singleton_instance_->AddPrefix(GetFileNameFromPath(FLAGS_tera_leveldb_log_path)); + singleton_instance_->AddPrefix(GetFileNameFromPath(FLAGS_ins_log_file)); + } + return singleton_instance_; +} + +bool LogCleaner::StartCleaner(ThreadPool *thread_pool) { + return GetInstance()->Start(); +} + +void LogCleaner::StopCleaner() { + MutexLock l(&inst_init_mutex_, "Destroy log cleaner"); + if 
(singleton_instance_ != NULL) { + singleton_instance_->Stop(); + delete singleton_instance_; + singleton_instance_ = NULL; + } +} + +LogCleaner::LogCleaner(const std::string& log_dir, + int64_t period_second, + int64_t expire_second, + ThreadPool *thread_pool) + : thread_pool_(thread_pool), + thread_pool_own_(false), + mutex_(), + info_log_dir_(log_dir), + log_prefix_list_(), + info_log_clean_period_ms_(std::max(period_second * 1000, kMinCleanPeriodMs)), + info_log_expire_sec_(std::max(expire_second, kMinInfoLogExpireSec)), + stop_(false), + bg_exit_(false), + bg_cond_(&mutex_), + bg_func_(std::bind(&LogCleaner::CleanTaskWrap, this)), + bg_task_id_(-1), + proc_fd_path_(GetProcFdPath()) {} + +LogCleaner::~LogCleaner() { + DestroyOwnThreadPool(); +} + +static bool CheckDirPath(const std::string &dir_path) { + return !dir_path.empty() && IsDir(dir_path); +} + +bool LogCleaner::CheckOptions() const { + return CheckDirPath(info_log_dir_) && + info_log_clean_period_ms_ > 0 && + info_log_expire_sec_ > 0; +} + +bool LogCleaner::Start() { + if (!CheckOptions()) { + return false; + } + + MutexLock l(&mutex_, "Start info log cleaner"); + + // double check + if (IsRunning()) { + return true; + } + + stop_ = false; + bg_exit_ = false; + if (nullptr == thread_pool_) { + NewThreadPool(); + } + + if (bg_task_id_ <= 0) { + // start immediately + bg_task_id_ = thread_pool_->DelayTask(0, bg_func_); + } + return true; +} + +void LogCleaner::Stop() { + MutexLock l(&mutex_, "Stop info log cleaner"); + stop_ = true; + bool is_running = false; + if (bg_task_id_ > 0) { + bg_exit_ = thread_pool_->CancelTask(bg_task_id_, true, &is_running); + } else { + bg_exit_ = true; + } + + CHECK(is_running || bg_exit_); + while(!bg_exit_) { + bg_cond_.Wait(); + } + bg_task_id_ = -1; +} + +void LogCleaner::CleanTaskWrap() { + MutexLock l(&mutex_); + DoCleanLocalLogs(); + if (stop_) { + bg_task_id_ = -1; + bg_exit_ = true; + } else { + bg_task_id_ = thread_pool_->DelayTask(info_log_clean_period_ms_, 
bg_func_); + } + bg_cond_.Signal(); +} + +bool LogCleaner::CheckLogPrefix(const std::string& filename) const { + std::set::const_iterator prefix_iter = log_prefix_list_.begin(); + for (; prefix_iter != log_prefix_list_.end(); ++prefix_iter) { + const std::string& prefix = *prefix_iter; + if (filename.size() < prefix.size()) { + // do not need to compare + continue; + } + + if (strncmp(prefix.c_str(), filename.c_str(), prefix.size()) == 0) { + // return true if match any prefix + return true; + } + } + return false; +} + +bool LogCleaner::DoCleanLocalLogs() { + if (log_prefix_list_.empty()) { + LOG(WARNING) << "[LogCleaner] Log prefix is not set yet."; + return false; + } + if (!CheckDirPath(info_log_dir_) || IsEmpty(info_log_dir_)) { + LOG(WARNING) << "[LogCleaner] Log dir " << info_log_dir_ << " not exsit logs."; + return false; + } + int64_t now_time = tera::get_millis() / 1000; + int64_t clean_time = now_time - info_log_expire_sec_; + LOG(INFO) << "[LogCleaner] Start clean log dir: " << info_log_dir_ + << ", now_time = " << now_time + << ", clean_time = " << clean_time; + + long path_maxlen = pathconf(info_log_dir_.c_str(), _PC_PATH_MAX); + std::vector log_file_list; + if (!ListCurrentDir(info_log_dir_, &log_file_list)) { + // list failed + LOG(WARNING) << "[LogCleaner] List log dir " << info_log_dir_ + << " failed. Cancel clean."; + return false; + } + + // reserved_set: filenames that should not to be clean + std::set reserved_set; + if (!GetCurrentOpendLogs(&reserved_set)) { + LOG(WARNING) << "[LogCleaner] GetCurrentOpendLogs failed. 
Cancel clean."; + return false; + } + + std::vector::const_iterator it = log_file_list.begin(); + for (; it != log_file_list.end(); ++it) { + if (reserved_set.find(*it) != reserved_set.end()) { + // already reserved + continue; + } + + const std::string& file_name = *it; + + // check if filename start with log_prefix_ + // if leveldb_log_prefix_ is not empty, check also + if (!CheckLogPrefix(file_name)) { + VLOG(16) << "[LogCleaner] Reserve log file: " << file_name + << ", which not match prefix."; + reserved_set.insert(file_name); + continue; + } + + // get file stat + std::string file_path = info_log_dir_ + "/" + file_name; + struct stat file_st; + if (lstat(file_path.c_str(), &file_st) != 0) { + // cancel clean if any file stat failed + LOG(WARNING) << "[LogCleaner] Stat log file: " << file_path << " fail. Cancel log clean."; + return false; + } + + if (S_ISLNK(file_st.st_mode)) { + // handle symbolic link + VLOG(16) << "[LogCleaner] Reserve symbolic link log: " << file_name; + reserved_set.insert(file_name); + char path_buf[path_maxlen]; + int ret = readlink(file_path.c_str(), path_buf, path_maxlen); + if (ret < 0 || ret >= path_maxlen) { + continue; + } else { + // reserve link target + path_buf[ret] = '\0'; + std::string target_filename = GetFileNameFromPath(path_buf); + VLOG(16) << "[LogCleaner] Reserve link target: " << target_filename + << " for link: " << file_path; + reserved_set.insert(target_filename); + } + } else if (!S_ISREG(file_st.st_mode)) { + VLOG(16) << "[LogCleaner] Reserve not regular file: " << file_name; + reserved_set.insert(file_name); + } else if (file_st.st_mtime >= clean_time) { + VLOG(16) << "[LogCleaner] Reserve not expire log: " << file_name + << ", mtime: " << file_st.st_mtime << ", clean_time: " << clean_time; + reserved_set.insert(file_name); + } + VLOG(16) << "stat filename: " << file_name + << ", is_symbolic_link: " << S_ISLNK(file_st.st_mode) + << ", is_dir: " << S_ISDIR(file_st.st_mode) + << ", is_regular_file: " << 
S_ISREG(file_st.st_mode) + << ", last mod time: " << file_st.st_mtime + << ", link number: " << file_st.st_nlink + << ", reserve: " << (reserved_set.find(file_name) != reserved_set.end()); + } + + // clean log + size_t clean_cnt = 0; + it = log_file_list.begin(); + for (; it != log_file_list.end(); ++it) { + const std::string &file_name = *it; + std::string file_path = info_log_dir_ + "/" + file_name; + if (reserved_set.find(file_name) == reserved_set.end()) { + LOG(INFO) << "[LogCleaner] log: " << file_path << " will be clean"; + if (!RemoveLocalFile(file_path)){ + LOG(WARNING) << "[LogCleaner] log clean fail: " << file_path; + } else { + ++clean_cnt; + } + } + } + LOG(INFO) << "[LogCleaner] Found log: " << log_file_list.size() + << ", clean: " << clean_cnt; + return true; +} + +bool LogCleaner::GetCurrentOpendLogs(std::set* opend_logs) { + long path_maxlen = pathconf(proc_fd_path_.c_str(), _PC_PATH_MAX); + if (path_maxlen < 0) { + LOG(ERROR) << "[LogCleaner] Get Path Max Len Failed"; + return false; + } + std::vector opend_logs_list; + VLOG(16) << "[LogCleaner] Search fd_path: " << proc_fd_path_; + if (!ListCurrentDirWithStat(proc_fd_path_, &opend_logs_list)) { + VLOG(16) << "[LogCleaner] list fd_path: " << proc_fd_path_ << " failed."; + return false; + } + + std::vector::const_iterator it = opend_logs_list.begin(); + for (; it != opend_logs_list.end(); ++it) { + const std::string& filename = it->first; + const struct stat& st = it->second; + if (S_ISLNK(st.st_mode)) { + char path_buf[path_maxlen]; + int ret = readlink(filename.c_str(), path_buf, path_maxlen); + if (ret > 0 && ret < path_maxlen && path_buf[0] == '/') { + path_buf[ret] = '\0'; + std::string target_filename = GetFileNameFromPath(path_buf); + VLOG(16) << "[LogCleaner] Reserve log in use: " << target_filename; + opend_logs->insert(target_filename); + } + } + } + return true; +} + +} // end namespace common + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/common/log/log_cleaner.h 
b/src/common/log/log_cleaner.h new file mode 100644 index 000000000..53830a733 --- /dev/null +++ b/src/common/log/log_cleaner.h @@ -0,0 +1,114 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_COMMON_LOG_CLEANER_H_ +#define TERA_COMMON_LOG_CLEANER_H_ + +#include +#include +#include + +#include "common/mutex.h" +#include "common/thread_pool.h" + +namespace common { + +class LogCleaner { +private: + // set private since singleton + LogCleaner(const std::string& log_dir, + int64_t period_second, + int64_t expire_second, + ThreadPool* thread_pool); + ~LogCleaner(); + // disallow copy + LogCleaner(const LogCleaner& other) = delete; + LogCleaner & operator = (const LogCleaner& other) = delete; + +public: + bool CheckOptions() const; + bool Start(); + void Stop(); + bool IsRunning() const { return bg_task_id_ > 0; } + + bool AddPrefix(const std::string& prefix) { + if (prefix.empty()) { + // empty prefix is not allowed + return false; + } else { + MutexLock l(&mutex_); + log_prefix_list_.insert(prefix); + return true; + } + } + + void RemovePrefix(const std::string& prefix) { + MutexLock l(&mutex_); + log_prefix_list_.erase(prefix); + } + +private: + // singleton + static Mutex inst_init_mutex_; + static LogCleaner* singleton_instance_; + + // get singleton instance but not start + // for unittest + static LogCleaner* GetInstance(ThreadPool *thread_pool = NULL); + +public: + static bool StartCleaner(ThreadPool *thread_pool = NULL); + static void StopCleaner(); + +private: + // do under lock + void NewThreadPool() { + if (NULL == thread_pool_) { + thread_pool_ = new ThreadPool(1); + thread_pool_own_ = true; + } + } + void DestroyOwnThreadPool() { + if (thread_pool_own_ && NULL != thread_pool_) { + thread_pool_->Stop(true); + delete thread_pool_; + thread_pool_ = NULL; + thread_pool_own_ = false; + } + } + + void CleanTaskWrap(); + + bool 
CheckLogPrefix(const std::string& filename) const; + + bool DoCleanLocalLogs(); + + bool GetCurrentOpendLogs(std::set* opend_logs); + +private: + ThreadPool* thread_pool_; + bool thread_pool_own_; + mutable Mutex mutex_; + + // options + std::string info_log_dir_; + std::set log_prefix_list_; + int64_t info_log_clean_period_ms_; // milli second + int64_t info_log_expire_sec_; // second + + bool stop_; + bool bg_exit_; + CondVar bg_cond_; + const ThreadPool::Task bg_func_; + int64_t bg_task_id_; + + std::string proc_fd_path_; +}; + +} // end namespace common + +#endif // TERA_COMMON_LOG_CLEANER_H_ + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/metric/cache_collector.h b/src/common/metric/cache_collector.h new file mode 100644 index 000000000..ae415b0d8 --- /dev/null +++ b/src/common/metric/cache_collector.h @@ -0,0 +1,108 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_COMMOM_METRIC_CACHE_COLLECTOR_H_ +#define TERA_COMMOM_METRIC_CACHE_COLLECTOR_H_ + +#include +#include + +#include "common/metric/collector_report_publisher.h" +#include "common/metric/collector.h" +#include "db/table_cache.h" +#include "leveldb/cache.h" + +namespace tera { + +enum class CacheCollectType { + kHitRate, + kEntries, + kCharge, +}; + +class BaseCacheCollector : public Collector { +public: + explicit BaseCacheCollector(CacheCollectType cache_type) : cache_type_(cache_type) {} + virtual ~BaseCacheCollector() {} + + virtual int64_t Collect() { + switch (cache_type_) { + case CacheCollectType::kHitRate: + return HitRate(); + case CacheCollectType::kEntries: + return Entries(); + case CacheCollectType::kCharge: + return TotalCharge(); + default: + return 0; + } + } + +protected: + virtual int64_t HitRate() = 0; + virtual int64_t Entries() = 0; + virtual int64_t TotalCharge() = 0; + +protected: + CacheCollectType cache_type_; +}; + +class LRUCacheCollector : public BaseCacheCollector { +public: + LRUCacheCollector(leveldb::Cache* cache, + CacheCollectType cache_type): + BaseCacheCollector(cache_type), + cache_(cache) {} + + virtual ~LRUCacheCollector() {} + +protected: + int64_t HitRate() override { + if (cache_ == NULL) { + return 0; + } + + double hit_rate = cache_->HitRate(true); + return isnan(hit_rate) ? -1 : static_cast(hit_rate * 100.0d); + } + + int64_t Entries() override { return cache_ == NULL ? 0 : static_cast(cache_->Entries()); } + + int64_t TotalCharge() override { return cache_ == NULL ? 
0 : static_cast(cache_->TotalCharge()); } +private: + leveldb::Cache* cache_; +}; + +class TableCacheCollector : public BaseCacheCollector { +public: + TableCacheCollector(leveldb::TableCache* cache, + CacheCollectType cache_type): + BaseCacheCollector(cache_type), + cache_(cache) {} + + virtual ~TableCacheCollector() {} + +protected: + int64_t HitRate() override { + if (cache_ == NULL) { + return 0; + } + + double hit_rate = cache_->HitRate(true); + return isnan(hit_rate) ? -1 : static_cast(hit_rate * 100.0d); + } + + int64_t Entries() override { return cache_ == NULL ? 0 : static_cast(cache_->TableEntries()); } + + int64_t TotalCharge() override { return cache_ == NULL ? 0 : static_cast(cache_->ByteSize()); } +private: + leveldb::TableCache* cache_; +}; + +} // end namespace tera + +#endif // TERA_COMMOM_METRIC_CACHE_COLLECTOR_H_ + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/metric/collector.h b/src/common/metric/collector.h new file mode 100644 index 000000000..0b31bb446 --- /dev/null +++ b/src/common/metric/collector.h @@ -0,0 +1,15 @@ +#pragma once +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#include +#include + +namespace tera{ +class Collector { +public: + virtual ~Collector() {} + // return a instant value of the metric for tera to dump log and other usage + virtual int64_t Collect() = 0; +}; +} diff --git a/src/common/metric/collector_report.h b/src/common/metric/collector_report.h new file mode 100644 index 000000000..8c453dcaa --- /dev/null +++ b/src/common/metric/collector_report.h @@ -0,0 +1,49 @@ +#pragma once +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+#include +#include +#include + +#include "common/metric/metric_id.h" +#include "common/mutex.h" +#include "common/metric/collector.h" +#include "common/metric/subscriber.h" + +namespace tera { + +using CollectorReportMap = std::unordered_map; + +struct CollectorReport { + int64_t timestamp_ms; // timestamp of the report + int64_t interval_ms; // time interval since last report + + // metric_id to metric snapshot + CollectorReport() : timestamp_ms(get_millis()) {} + + // find methods, return 0 if not found + int64_t FindMetricValue(const MetricId& metric_id) const { + auto iter = report.find(metric_id); + return iter == report.end() ? 0 : iter->second; + }; + + int64_t FindMetricValue(const std::string& metric_name) const { + return FindMetricValue(MetricId(metric_name)); + } + + int64_t FindMetricValue(const std::string& metric_name, const std::string& label_str) const { + MetricId metric_id; + if (!MetricId::ParseFromString(metric_name, label_str, &metric_id)) { + return 0; + } else { + return FindMetricValue(metric_id); + } + } + + CollectorReportMap report; +}; +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/metric/collector_report_publisher.cc b/src/common/metric/collector_report_publisher.cc new file mode 100644 index 000000000..620cc4107 --- /dev/null +++ b/src/common/metric/collector_report_publisher.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#include "common/metric/collector_report_publisher.h"
+
+#include "glog/logging.h"
+
+#include "common/metric/hardware_collectors.h"
+#include "common/timer.h"
+#include "common/metric/collector.h"
+#include "common/metric/prometheus_subscriber.h"
+
+namespace tera {
+
+CollectorReportPublisher& CollectorReportPublisher::GetInstance() {
+    static CollectorReportPublisher instance;
+    return instance;
+}
+
+// Starts with an empty report and registers the built-in hardware collectors.
+CollectorReportPublisher::CollectorReportPublisher():
+    last_report_timestamp_(get_millis()),
+    last_collector_report_(new CollectorReport) {
+    AddHardwareCollectors();
+}
+
+CollectorReportPublisher::~CollectorReportPublisher() {}
+
+// Builds a fresh report by calling Collect() on every registered subscriber.
+std::shared_ptr<SubscriberReport> CollectorReportPublisher::GetSubscriberReport() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+
+    std::shared_ptr<SubscriberReport> new_report(new SubscriberReport());
+    int64_t start_ts = get_millis();
+    // do collect
+    for (auto& subscriber_pair : subscribers_) {
+        const MetricId& metric_id = subscriber_pair.first;
+        new_report->insert(std::make_pair(metric_id, subscriber_pair.second->Collect()));
+    }
+
+    int64_t end_ts = get_millis();
+    VLOG(12) << "[Metric] Get Subscriber Summary Cost: " << (end_ts - start_ts) << " ms.";
+    return new_report;
+}
+
+// Returns the report produced by the most recent Refresh().
+std::shared_ptr<CollectorReport> CollectorReportPublisher::GetCollectorReport() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    return last_collector_report_;
+}
+
+// Collects every registered collector into a new report, publishes it as the
+// latest report and pushes it to all subscribers.
+void CollectorReportPublisher::Refresh() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+
+    std::shared_ptr<CollectorReport> new_report(new CollectorReport());
+    int64_t start_ts = new_report->timestamp_ms;
+    new_report->interval_ms = new_report->timestamp_ms - last_report_timestamp_;
+
+    // do collect
+    for (auto& metric_pair : collectors_) {
+        const MetricId& metric_id = metric_pair.first;
+        int64_t value = metric_pair.second->Collect();
+        new_report->report[metric_id] = value;
+    }
+
+    last_report_timestamp_ = start_ts;
+    int64_t end_ts = get_millis();
+    VLOG(12) << "[Metric] Refresh Collectors Cost: " << (end_ts - start_ts) << " ms.";
+    last_collector_report_ = new_report;
+    // mutex_ is recursive, so NotifySubscribers() may lock it again here
+    NotifySubscribers();
+}
+
+// Registers a collector and one PrometheusSubscriber per entry of type_list.
+// Returns false for an invalid id, a null collector or an already used id.
+bool CollectorReportPublisher::AddCollector(const MetricId& metric_id,
+                                            std::unique_ptr<Collector>&& metric_collector,
+                                            SubscriberTypeList type_list) {
+    if (!metric_id.IsValid() || !metric_collector) {
+        return false;
+    }
+
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    auto insert_ret = collectors_.insert(std::make_pair(metric_id, std::move(metric_collector)));
+    if (!insert_ret.second) {
+        return false;
+    }
+
+    // a failed subscriber registration is logged but does not undo the collector
+    for (auto type : type_list) {
+        if (!AddSubscriber(std::unique_ptr<Subscriber>(new PrometheusSubscriber(metric_id, type)))) {
+            LOG(ERROR) << "[METRIC] Add Subscriber For " << metric_id.ToString() << " Failed!";
+        }
+    }
+
+    return true;
+}
+
+// Takes ownership of the subscriber and files it under its own metric id.
+bool CollectorReportPublisher::AddSubscriber(std::unique_ptr<Subscriber>&& prometheus_subscriber_ptr) {
+    if (!prometheus_subscriber_ptr ||
+        !prometheus_subscriber_ptr->GetMetricId().IsValid()) {
+        // invalid arguments
+        return false;
+    }
+
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    subscribers_.insert(std::make_pair(prometheus_subscriber_ptr->GetMetricId(),
+                                       std::move(prometheus_subscriber_ptr)));
+
+    return true;
+}
+
+// Hands the latest collector report to every subscriber.
+void CollectorReportPublisher::NotifySubscribers() {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    for (auto& subscriber_pair : subscribers_) {
+        subscriber_pair.second->OnUpdate(last_collector_report_);
+    }
+}
+
+bool CollectorReportPublisher::HasCollector(const MetricId& metric_id) const {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    return collectors_.find(metric_id) != collectors_.end();
+}
+
+// Removes the collector together with every subscriber filed under the same id.
+bool CollectorReportPublisher::DeleteCollector(const MetricId& metric_id) {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    DeleteSubscriber(metric_id);
+    return collectors_.erase(metric_id) > 0;
+}
+
+bool CollectorReportPublisher::DeleteSubscriber(const MetricId& metric_id) {
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    return subscribers_.erase(metric_id) > 0;
+}
+
+void CollectorReportPublisher::DeleteSubscribers() {
+    // subscribers_ is guarded by mutex_ everywhere else; clearing it unlocked
+    // would race with Refresh() / GetSubscriberReport() on other threads
+    std::lock_guard<std::recursive_mutex> lock(mutex_);
+    subscribers_.clear();
+}
+
+void CollectorReportPublisher::AddHardwareCollectors() {
+    // register hardware metrics
+    AddCollector(MetricId(kInstCpuMetricName), std::unique_ptr<Collector>(new CpuUsageCollector()));
+    AddCollector(MetricId(kInstMemMetricName), std::unique_ptr<Collector>(new MemUsageCollector()));
+
+    AddCollector(MetricId(kInstNetRXMetricName),
+                 std::unique_ptr<Collector>(new NetUsageCollector(RECEIVE)),
+                 {SubscriberType::MAX});
+
+    AddCollector(MetricId(kInstNetTXMetricName),
+                 std::unique_ptr<Collector>(new NetUsageCollector(TRANSMIT)),
+                 {SubscriberType::MAX});
+}
+} // end namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/collector_report_publisher.h b/src/common/metric/collector_report_publisher.h
new file mode 100644
index 000000000..1290f2000
--- /dev/null
+++ b/src/common/metric/collector_report_publisher.h
@@ -0,0 +1,162 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_METRICS_H_
+#define TERA_COMMON_METRIC_METRICS_H_
+
+#include <stdint.h>
+#include <functional>
+#include <initializer_list>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+#include "common/metric/metric_id.h"
+#include "common/metric/collector_report.h"
+#include "common/metric/collector.h"
+#include "common/metric/subscriber.h"
+
+namespace tera {
+// Subscriber types that get attached to a collector when it is registered
+
+using SubscriberTypeList = std::initializer_list<SubscriberType>;
+
+class CollectorReportPublisher {
+private:
+    // set private for singleton
+    CollectorReportPublisher();
+    ~CollectorReportPublisher();
+
+    // disallow copy
+    CollectorReportPublisher(const CollectorReportPublisher&) = delete;
+    CollectorReportPublisher& operator = (const CollectorReportPublisher&) = delete;
+
+public:
+    static CollectorReportPublisher& GetInstance();
+
+    void Refresh();
+    /// report the instant values of collectors
+    std::shared_ptr<CollectorReport> GetCollectorReport();
+    std::shared_ptr<SubscriberReport> GetSubscriberReport();
+
+    /// Add a collector with a given metric_id
+    /// collector should be a right value reference of std::unique_ptr
+    /// return true if register success,
+    /// return false if argument is invalid or metric_id has been registered already.
+    bool AddCollector(const MetricId& metric_id,
+                      std::unique_ptr<Collector>&& metric_collector,
+                      SubscriberTypeList type_list = {SubscriberType::LATEST});
+
+
+    /// whether a collector has been added
+    bool HasCollector(const MetricId& metric_id) const;
+    /// Delete a collector
+    bool DeleteCollector(const MetricId& metric_id);
+
+
+    /// Add a subscriber to a given metricId.
+    /// Different type of subscribers can be registered to a same metricId.
+    bool AddSubscriber(std::unique_ptr<Subscriber>&& subscriber);
+    /// Delete a subscriber
+    bool DeleteSubscriber(const MetricId& metric_id);
+    void DeleteSubscribers();
+
+private:
+    void NotifySubscribers();
+    void AddHardwareCollectors();
+
+private:
+    mutable std::recursive_mutex mutex_;
+
+    using CollectorMap = std::unordered_map<MetricId, std::unique_ptr<Collector>>;
+
+    using SubscriberMap = std::unordered_multimap<MetricId, std::unique_ptr<Subscriber>>;
+    CollectorMap collectors_;
+    SubscriberMap subscribers_;
+
+    int64_t last_report_timestamp_;
+
+    std::shared_ptr<CollectorReport> last_collector_report_;
+};
+
+class AutoCollectorRegister {
+public:
+    AutoCollectorRegister(const MetricId& id,
+                          std::unique_ptr<Collector>&& collector,
+                          SubscriberTypeList type_list = {SubscriberType::LATEST}):
+        registered_(false),
+        id_(id) {
+        registered_ = CollectorReportPublisher::GetInstance().AddCollector(id_, std::move(collector), type_list);
+    }
+
+    // create a metric with empty label
+    AutoCollectorRegister(const std::string& name,
+                          std::unique_ptr<Collector>&& collector,
+                          SubscriberTypeList type_list = {SubscriberType::LATEST}):
+        registered_(false),
+        id_(name) {
+        if (name.empty()) {
+            throw std::invalid_argument("name");
+        }
+        registered_ = CollectorReportPublisher::GetInstance().AddCollector(id_, std::move(collector), type_list);
+    }
+
+    // create a metric with name and label
+    // label_str format: k1:v1,k2:v2,...
+    // can build by LabelStringBuilder().Append("k1", "v1").Append("k2","v2").ToString();
+    AutoCollectorRegister(const std::string& name,
+                          const std::string& label_str,
+                          std::unique_ptr<Collector>&& collector,
+                          SubscriberTypeList type_list = {SubscriberType::LATEST}):
+        registered_(false) {
+        // parse metric id
+        MetricId::ParseFromStringWithThrow(name, label_str, &id_);
+        registered_ = CollectorReportPublisher::GetInstance().AddCollector(id_, std::move(collector), type_list);
+    }
+
+    ~AutoCollectorRegister() {
+        if (registered_) {
+            CollectorReportPublisher::GetInstance().DeleteCollector(id_);
+        }
+    }
+
+    const MetricId& GetId() const {
+        return id_;
+    }
+
+    bool IsRegistered() const {
+        return registered_;
+    }
+
+private:
+    bool registered_;
+    MetricId id_;
+};
+
+
+class AutoSubscriberRegister {
+public:
+    AutoSubscriberRegister(std::unique_ptr<Subscriber>&& subscriber_ptr):registered_(false) {
+        if (subscriber_ptr) {
+            metric_id_ = subscriber_ptr->GetMetricId();
+            registered_ = CollectorReportPublisher::GetInstance().AddSubscriber(std::move(subscriber_ptr));
+        }
+    }
+    ~AutoSubscriberRegister(){
+        if (registered_) {
+            CollectorReportPublisher::GetInstance().DeleteSubscriber(metric_id_);
+        }
+    }
+private:
+    bool registered_;
+    MetricId metric_id_;
+};
+} // end namespace tera
+
+#endif // TERA_COMMON_METRIC_METRICS_H_
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/counter_collector.h b/src/common/metric/counter_collector.h
new file mode 100644
index 000000000..1a5ea981b
--- /dev/null
+++ b/src/common/metric/counter_collector.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_COUNTER_COLLECTOR_H_
+#define TERA_COMMON_METRIC_COUNTER_COLLECTOR_H_
+
+#include "common/metric/collector.h"
+#include "common/counter.h"
+
+namespace tera {
+
+// Collector that reports the value of a Counter it points to but does not own.
+class CounterCollector : public Collector {
+public:
+    /// counter:     the counter to report; a null counter makes Collect() return -1
+    /// is_periodic: true  -> Collect() returns counter->Clear()
+    ///              false -> Collect() returns counter->Get(); meant for
+    ///              instantaneous values such as read_pending_count or
+    ///              scan_pending_count, which can't be cleared during collect.
+    explicit CounterCollector(Counter* counter, bool is_periodic = true)
+        : counter_(counter), is_periodic_(is_periodic) {}
+
+    ~CounterCollector() override {}
+
+    int64_t Collect() override {
+        if (counter_ == nullptr) {
+            return -1;
+        }
+        if (is_periodic_) {
+            return counter_->Clear();
+        }
+        return counter_->Get();
+    }
+
+private:
+    Counter* const counter_;
+    const bool is_periodic_;
+};
+} // end namespace tera
+
+#endif // TERA_COMMON_METRIC_COUNTER_COLLECTOR_H_
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/hardware_collectors.cc b/src/common/metric/hardware_collectors.cc
new file mode 100644
index 000000000..cddfd6ee6
--- /dev/null
+++ b/src/common/metric/hardware_collectors.cc
@@ -0,0 +1,250 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <algorithm>
+#include <memory>
+#include <string>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "common/timer.h"
+
+#include "common/metric/hardware_collectors.h"
+
+DECLARE_int64(tera_hardware_collect_period_second);
+
+namespace tera {
+
+// return number of cpu(cores), never less than 1
+static uint32_t GetCpuCount() {
+#if defined(_SC_NPROCESSORS_ONLN)
+    // sysconf reports failure as -1; do not let that wrap into a huge core count
+    long cpu_num = sysconf(_SC_NPROCESSORS_ONLN);
+    return cpu_num > 0 ? static_cast<uint32_t>(cpu_num) : 1;
+#else
+    FILE *fp = fopen("/proc/stat", "r");
+    if (fp == NULL) {
+        LOG(ERROR) << "[HardWare Metric] open /proc/stat failed.";
+        return 1;
+    }
+    static const size_t kLineMaxLen = 256; // enough in here
+    char line[kLineMaxLen];
+    uint32_t cpu_num = 0;
+    // count the per-core "cpuN ..." lines; the aggregate line is "cpu  ..."
+    while (fgets(line, sizeof(line), fp) != NULL) {
+        if (strncmp(line, "cpu", 3) == 0 && line[3] >= '0' && line[3] <= '9') {
+            cpu_num++;
+        }
+    }
+    fclose(fp);
+    return std::max<uint32_t>(cpu_num, 1);
+#endif
+}
+
+// return the number of ticks(jiffies) that this process
+// has been scheduled in user and kernel mode.
+// The values are read as the 14th and 15th whitespace separated fields of
+// stat_path; returns false (and leaves *tick untouched) on any failure.
+static bool ProcessCpuTick(const std::string& stat_path, int64_t* tick) {
+    if (tick == NULL) {
+        return false;
+    }
+    FILE *fp = fopen(stat_path.c_str(), "r");
+    if (fp == NULL) {
+        LOG(ERROR) << "[HardWare Metric] open " << stat_path << " failed.";
+        return false;
+    }
+    long long utime = 0;
+    long long stime = 0;
+    if (fscanf(fp, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %lld %lld",
+               &utime, &stime) < 2) {
+        fclose(fp);
+        LOG(ERROR) << "[HardWare Metric] get cpu tick from " << stat_path << " failed.";
+        return false;
+    }
+    fclose(fp);
+    *tick = utime + stime;
+    return true;
+}
+
+CpuUsageCollector::CpuUsageCollector():
+    pid_(getpid()),
+    cpu_core_num_(GetCpuCount()),
+    cpu_hertz_(sysconf(_SC_CLK_TCK)),
+    stat_path_(std::string("/proc/") + std::to_string(pid_) + "/stat"),
+    last_check_time_ms_(get_millis()),
+    last_tick_total_(0),
+    cpu_usage_(0) {
+    // take a baseline so the first sample only covers ticks spent since
+    // construction (same approach as NetInfoChecker); stays 0 if the read fails
+    ProcessCpuTick(stat_path_, &last_tick_total_);
+}
+
+CpuUsageCollector::~CpuUsageCollector() {}
+
+// Returns the cached value until the configured collect period has elapsed.
+int64_t CpuUsageCollector::Collect() {
+    int64_t cur_ts = get_millis();
+    int64_t collect_period_ms = FLAGS_tera_hardware_collect_period_second * 1000;
+    if (collect_period_ms > 0 && cur_ts < last_check_time_ms_ + collect_period_ms) {
+        return cpu_usage_;
+    } else {
+        return CheckCpuUsage(cur_ts, false);
+    }
+}
+
+// Recomputes the cpu usage percentage over [last_check_time_ms_, cur_ts].
+// When is_irix_on is false the result is divided by the number of cores.
+int64_t CpuUsageCollector::CheckCpuUsage(int64_t cur_ts, bool is_irix_on) {
+    int64_t new_tick_total = 0;
+    if (!ProcessCpuTick(stat_path_, &new_tick_total)) {
+        // read proc file failed.
+        return 0;
+    }
+
+    float interval_sec = static_cast<float>(cur_ts - last_check_time_ms_) / 1000.0f;
+    // percentage per tick during time interval
+    float interval_total_ticks = static_cast<float>(cpu_hertz_) * interval_sec;
+    if (!is_irix_on) {
+        interval_total_ticks *= cpu_core_num_;
+    }
+    if (!(interval_total_ticks > 0.0f)) {
+        // no time elapsed (or clock info unavailable): dividing would yield
+        // inf/NaN, so keep the previous sample and baseline untouched
+        return cpu_usage_;
+    }
+
+    float usage_percentage = static_cast<float>(new_tick_total - last_tick_total_) * 100.0f / interval_total_ticks;
+    usage_percentage = std::min(usage_percentage, 99.9f);
+
+    // update
+    last_tick_total_ = new_tick_total;
+    cpu_usage_ = static_cast<int64_t>(usage_percentage);
+    last_check_time_ms_ = cur_ts;
+    VLOG(15) << "[Hardware Metric] %CPU: " << usage_percentage;
+    return cpu_usage_;
+}
+
+MemUsageCollector::MemUsageCollector():
+    pid_(getpid()),
+    stat_path_(std::string("/proc/") + std::to_string(pid_) + "/statm"),
+    last_check_time_ms_(get_millis()),
+    mem_usage_(0) {}
+
+
+MemUsageCollector::~MemUsageCollector() {}
+
+// Returns the cached value until the configured collect period has elapsed.
+int64_t MemUsageCollector::Collect() {
+    int64_t cur_ts = get_millis();
+    int64_t collect_period_ms = FLAGS_tera_hardware_collect_period_second * 1000;
+    if (collect_period_ms > 0 && cur_ts < last_check_time_ms_ + collect_period_ms) {
+        return mem_usage_;
+    } else {
+        return CheckMemUsage(cur_ts);
+    }
+}
+
+// Reads the second field of stat_path_ (a page count) and converts it to bytes.
+int64_t MemUsageCollector::CheckMemUsage(int64_t cur_ts) {
+    FILE* stat_file = fopen(stat_path_.c_str(), "r");
+    if (stat_file == NULL) {
+        LOG(ERROR) << "[Hardware Metric] open " << stat_path_ << " failed.";
+        return 0;
+    }
+
+    long long mem_pages = 0;
+    int matched = fscanf(stat_file, "%*s %lld", &mem_pages);
+    fclose(stat_file);
+    if (matched < 1) {
+        // keep the last good sample instead of reporting a bogus zero
+        LOG(ERROR) << "[Hardware Metric] parse " << stat_path_ << " failed.";
+        return mem_usage_;
+    }
+
+    // ask the system for the page size; fall back to 4 KiB if it is unavailable
+    long page_size = sysconf(_SC_PAGESIZE);
+    if (page_size <= 0) {
+        page_size = 4 * 1024;
+    }
+    mem_usage_ = static_cast<int64_t>(mem_pages) * page_size;
+    last_check_time_ms_ = cur_ts;
+    VLOG(15) << "[Hardware Metric] Memory: " << mem_usage_;
+    return mem_usage_;
+}
+
+NetUsageCollector::NetInfoChecker NetUsageCollector::net_info_checker_;
+
+NetUsageCollector::NetUsageCollector(NetUsageType n_type):
+    net_usage_type_(n_type) {}
+
+NetUsageCollector::~NetUsageCollector() {}
+
+// RX and TX collectors share one NetInfoChecker, so whichever runs first after
+// the collect period refreshes both cached values.
+int64_t NetUsageCollector::Collect() {
+    int64_t cur_ts = get_millis();
+    int64_t collect_period_ms = FLAGS_tera_hardware_collect_period_second * 1000;
+    if (collect_period_ms > 0 &&
+        cur_ts < net_info_checker_.last_check_time_ms_ + collect_period_ms) {
+        return net_usage_type_ == RECEIVE ? net_info_checker_.net_rx_usage_ : net_info_checker_.net_tx_usage_;
+    } else {
+        int64_t value = 0;
+        if (net_usage_type_ == RECEIVE) {
+            // check net info and get receive usage
+            net_info_checker_.CheckNetUsage(cur_ts, &value, NULL);
+        } else {
+            // check net info and get transmit usage
+            net_info_checker_.CheckNetUsage(cur_ts, NULL, &value);
+        }
+        return value;
+    }
+}
+
+NetUsageCollector::NetInfoChecker::NetInfoChecker()
+    : pid_(getpid()),
+      stat_path_(std::string("/proc/") + std::to_string(pid_) + "/net/dev"),
+      last_check_time_ms_(get_millis()),
+      last_rx_total_(0),
+      last_tx_total_(0),
+      net_rx_usage_(0),
+      net_tx_usage_(0) {
+    GetCurrentTotal(&last_rx_total_, &last_tx_total_);
+}
+
+// Reads cumulative rx/tx byte counters from stat_path_.
+// Scans at most 10 "name: ..." entries and stops at the first one whose rx and
+// tx counters are both positive; otherwise the last parsed entry is reported.
+// Returns false if the file cannot be read or no entry could be parsed.
+bool NetUsageCollector::NetInfoChecker::GetCurrentTotal(int64_t *rx_total, int64_t *tx_total) {
+    if (rx_total == NULL || tx_total == NULL) {
+        return false;
+    }
+    FILE* stat_file = fopen(stat_path_.c_str(), "r");
+    if (stat_file == NULL) {
+        LOG(ERROR) << "[Hardware Metric] open " << stat_path_ << " failed.";
+        return false;
+    }
+    // NOTE(review): fixed byte offset kept from the original code; presumably it
+    // skips the table header (and maybe the loopback entry) - confirm against
+    // the real /proc/<pid>/net/dev layout.
+    if (fseek(stat_file, 327, SEEK_SET) != 0) {
+        LOG(ERROR) << "[Hardware Metric] seek " << stat_path_ << " failed.";
+        fclose(stat_file);
+        return false;
+    }
+    bool parsed = false;
+    for (int i = 0; i < 10; i++) {
+        // advance to the ':' that ends an interface name, stopping at EOF
+        int ch = fgetc(stat_file);
+        while (ch != ':' && ch != EOF) {
+            ch = fgetc(stat_file);
+        }
+        if (ch == EOF) {
+            break;
+        }
+        long long rx = 0;
+        long long tx = 0;
+        // first field is rx bytes, the ninth is tx bytes; skip the 7 in between
+        if (fscanf(stat_file, "%lld%*s%*s%*s%*s%*s%*s%*s%lld", &rx, &tx) < 2) {
+            continue;
+        }
+        *rx_total = rx;
+        *tx_total = tx;
+        parsed = true;
+        // compare the counter values (the original compared the pointers)
+        if (rx > 0 && tx > 0) {
+            break;
+        }
+    }
+    fclose(stat_file);
+
+    return parsed;
+}
+
+// Refreshes the cached rx/tx rates (bytes per second) and copies them to the
+// non-NULL output arguments.
+bool NetUsageCollector::NetInfoChecker::CheckNetUsage(int64_t cur_ts, int64_t* rx_usage, int64_t *tx_usage) {
+    int64_t new_rx_total = 0;
+    int64_t new_tx_total = 0;
+
+    if (!GetCurrentTotal(&new_rx_total, &new_tx_total)) {
+        return false;
+    }
+    int64_t interval_ms = cur_ts - last_check_time_ms_;
+    if (interval_ms > 0) {
+        // update
+        net_rx_usage_ = (new_rx_total - last_rx_total_) * 1000 / interval_ms;
+        net_tx_usage_ = (new_tx_total - last_tx_total_) * 1000 / interval_ms;
+        last_rx_total_ = new_rx_total;
+        last_tx_total_ = new_tx_total;
+        last_check_time_ms_ = cur_ts;
+    }
+    // else: called twice within the same millisecond (e.g. RX then TX collector
+    // with caching disabled); dividing by zero would crash, so reuse the cache
+
+    if (rx_usage) {
+        *rx_usage = net_rx_usage_;
+    }
+
+    if (tx_usage) {
+        *tx_usage = net_tx_usage_;
+    }
+
+    VLOG(15) << "[Hardware Metric] Network RX/TX: " << last_rx_total_ << " / " << last_tx_total_;
+    return true;
+}
+
+} // end namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/hardware_collectors.h b/src/common/metric/hardware_collectors.h
new file mode 100644
index 000000000..be04e4165
--- /dev/null
+++ b/src/common/metric/hardware_collectors.h
@@ -0,0 +1,104 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_COMMON_METRIC_HARDWARE_METRICS_H_
+#define TERA_COMMON_METRIC_HARDWARE_METRICS_H_
+
+#include <string>
+
+#include "common/metric/collector_report_publisher.h"
+#include "common/metric/collector.h"
+
+namespace tera {
+
+const char* const kInstCpuMetricName = "tera_instance_cpu_usage_percent";
+const char* const kInstMemMetricName = "tera_instance_mem_usage_bytes";
+const char* const kInstNetRXMetricName = "tera_instance_net_receive_bytes";
+const char* const kInstNetTXMetricName = "tera_instance_net_transmit_bytes";
+
+class CpuUsageCollector : public Collector {
+public:
+    CpuUsageCollector();
+    virtual ~CpuUsageCollector();
+
+    virtual int64_t Collect();
+private:
+    int64_t CheckCpuUsage(int64_t cur_ts, bool is_irix_on);
+
+private:
+    // proc info
+    int pid_;
+    uint32_t cpu_core_num_;
+    int64_t cpu_hertz_;
+    std::string stat_path_;
+
+    // last check info
+    int64_t last_check_time_ms_;
+    int64_t last_tick_total_; // cpu total ticks at last check
+    int64_t cpu_usage_; // (new_tick_total - last_tick_total_) / (total ticks in interval)
+};
+
+class MemUsageCollector : public Collector {
+public:
+    MemUsageCollector();
+    virtual ~MemUsageCollector();
+
+    virtual int64_t Collect();
+private:
+    int64_t CheckMemUsage(int64_t cur_ts);
+
+private:
+    // proc info
+    int pid_;
+    std::string stat_path_;
+
+    // last check info
+    int64_t last_check_time_ms_;
+    int64_t mem_usage_;
+};
+
+enum NetUsageType {
+    RECEIVE, // net_rx
+    TRANSMIT, // net_tx
+};
+
+class NetUsageCollector : public Collector {
+public:
+    explicit NetUsageCollector(NetUsageType n_type);
+    virtual ~NetUsageCollector();
+
+    virtual int64_t Collect();
+private:
+    struct NetInfoChecker {
+        // proc info
+        int pid_;
+        std::string stat_path_;
+
+        // last check info
+        int64_t last_check_time_ms_;
+        int64_t last_rx_total_; // total rx bytes at last check
+        int64_t last_tx_total_; // total tx bytes at last check
+
+        // metric value cache
+        int64_t net_rx_usage_; // (new_rx_total - last_rx_total_) / check_interval
+        int64_t net_tx_usage_; // (new_tx_total - last_tx_total_) / check_interval
+
+        NetInfoChecker();
+
+        bool GetCurrentTotal(int64_t*, int64_t*);
+        bool CheckNetUsage(int64_t cur_ts, int64_t* rx_usage, int64_t *tx_usage);
+    };
+
+    static NetInfoChecker net_info_checker_;
+
+private:
+    NetUsageType net_usage_type_;
+};
+
+} // end namespace tera
+
+#endif // TERA_COMMON_METRIC_HARDWARE_METRICS_H_
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
diff --git a/src/common/metric/metric_counter.h b/src/common/metric/metric_counter.h
new file mode 100644
index 000000000..55b4c59fe
--- /dev/null
+++ b/src/common/metric/metric_counter.h
@@ -0,0 +1,93 @@
+#pragma once
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#include +#include + +#include "common/metric/collector_report_publisher.h" +#include "common/metric/counter_collector.h" +#include "common/counter.h" + +namespace tera{ +class MetricCounter : public Counter { +public: + // create a metric with empty label + explicit MetricCounter(const std::string& name, + SubscriberTypeList type_list = {SubscriberType::LATEST}, + bool is_periodic = true): + Counter(), + registered_(false), + metric_id_(name), + type_list_(type_list), + is_periodic_(is_periodic) { + if (name.empty()) { + // throw a exception and make process exit with coredump + throw std::invalid_argument("metric name is empty"); + } + registered_ = CollectorReportPublisher::GetInstance().AddCollector( + metric_id_, + std::unique_ptr(new CounterCollector(this, is_periodic_)), + type_list_); + } + + // create a metric with name and label + // label_str format: k1:v1,k2:v2,... + // can build by LabelStringBuilder().Append("k1", "v1").Append("k2","v2").ToString(); + MetricCounter(const std::string& name, + const std::string& label_str, + SubscriberTypeList type_list = {SubscriberType::LATEST}, + bool is_periodic = true): + Counter(), + registered_(false), + type_list_(type_list), + is_periodic_(is_periodic) { + // parse metric id + MetricId::ParseFromStringWithThrow(name, label_str, &metric_id_); + // legal label str format, do register + registered_ = CollectorReportPublisher::GetInstance().AddCollector( + metric_id_, + std::unique_ptr(new CounterCollector(this, is_periodic_)), + type_list); + } + + MetricCounter(MetricCounter&& counter) { + // parse metric id + if (counter.registered_) { + CollectorReportPublisher::GetInstance().DeleteCollector(counter.metric_id_); + } + registered_ = counter.registered_; + metric_id_ = counter.metric_id_; + is_periodic_ = counter.is_periodic_; + type_list_ = counter.type_list_; + Set(counter.Get()); + counter.registered_ = false; + registered_ = CollectorReportPublisher::GetInstance().AddCollector( + metric_id_, + 
std::unique_ptr(new CounterCollector(this, is_periodic_)), + type_list_); + } + + virtual ~MetricCounter() { + if (registered_) { + // do unregister + CollectorReportPublisher::GetInstance().DeleteCollector(metric_id_); + } + } + + bool IsRegistered() const { + return registered_; + } + + //Never copyied + MetricCounter(const MetricCounter&) = delete; + MetricCounter& operator=(const MetricCounter&) = delete; + +private: + bool registered_; + MetricId metric_id_; + SubscriberTypeList type_list_; + bool is_periodic_; +}; +} diff --git a/src/common/metric/metric_http_server.cc b/src/common/metric/metric_http_server.cc new file mode 100644 index 000000000..fdb01910c --- /dev/null +++ b/src/common/metric/metric_http_server.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "common/metric/metric_http_server.h" + +#include +#include +#include + +#include "glog/logging.h" + +#include "common/timer.h" +#include "common/metric/collector_report.h" + +using std::string; + +namespace tera { + +void ResponseBodyBuilder::BuildType(string* body, const string& metric_name, const string& type) { + body->append("# TYPE " + metric_name + " " + type + "\n"); +} + +void ResponseBodyBuilder::BuildHelp(string* body, const string& metric_name, const string& help_info) { + body->append("# HELP " + metric_name + " " + help_info + "\n"); +} + +void ResponseBodyBuilder::BuildMetricItem(string* body, const MetricId& metric_id, const ReportItem& report_item) { + + VLOG(12) << "[Building Metric] name: " << metric_id.GetName() + << "\tValue: " << static_cast(report_item.Value()) + << "\tTimeStamp: " << report_item.Time() + << "\tType: " << report_item.Type(); + + if (report_item.Time() == -1) { + return; + } + + body->append(metric_id.GetName() + "{"); + const auto& label_map = metric_id.GetLabelMap(); + auto iter = label_map.begin(); + bool has_label 
= false; + if (iter != label_map.end()) { + body->append(iter->first + "=" + "\"" + iter->second + "\""); + has_label = true; + ++iter; + } + while (iter != label_map.end()) { + body->append("," + iter->first + "=" + "\"" + iter->second + "\""); + ++iter; + } + + if (has_label) { + body->append(",value_type=\"" + report_item.Type() + "\""); + } else { + body->append("value_type=\"" + report_item.Type() + "\""); + } + + body->append("} " + std::to_string(report_item.Value()) + " " + std::to_string(report_item.Time())); + body->append("\n"); +} + +static const int kMongoosePollTimeoutMs = 1000; + +static void LogRequest(struct http_message *request) { + VLOG(16) << "[MetricHttpServer] Recv http request." + << " method [" << std::string(request->method.p, request->method.len) << "]" + << " uri [" << std::string(request->uri.p, request->uri.len) << "]" + << " proto [" << std::string(request->proto.p, request->proto.len) << "]" + << " query [" << std::string(request->query_string.p, request->query_string.len) << "]" + << " body [" << std::string(request->body.p, request->body.len) << "]"; +} + +void MetricHttpServer::EventHandler(struct mg_connection *conn, int event, void *p_data) { + if (event == MG_EV_HTTP_REQUEST) { + if (conn == NULL || conn->mgr == NULL || p_data == NULL) { + LOG(WARNING) << "[MetricHttpServer] handle invalid request."; + return; + } + + // get user data + void* user_data = conn->mgr->user_data; + if (user_data == NULL) { + LOG(WARNING) << "[MetricHttpServer] Connection missing user data."; + return; + } + + MetricHttpServer *server = reinterpret_cast(user_data); + struct http_message *request = reinterpret_cast(p_data); + server->HandleHttpRequest(conn, request); + } + // ignore other events +} + +MetricHttpServer::MetricHttpServer(): + is_running_(false), + stop_(false), + listen_port_(-1) {} + +MetricHttpServer::~MetricHttpServer() {} + +bool MetricHttpServer::Start(int32_t listen_port) { + if (listen_port <= 0) { + LOG(WARNING) << 
"[MetricHttpServer] Start got invalid listen port: " << listen_port; + return false; + } + + MutexLock lock(&mutex_); + if (IsRunning()) { + LOG(WARNING) << "[MetricHttpServer] Server is already running, listening: " << listen_port_; + return false; + } + + // init mongoose use this as user_data + mg_mgr_init(&mongoose_mgr_, this); + + // bind listen port + std::string bind_addr = std::to_string(listen_port); + struct mg_connection *conn = mg_bind(&mongoose_mgr_, bind_addr.c_str(), &MetricHttpServer::EventHandler); + + if (conn == NULL) { + LOG(WARNING) << "[MetricHttpServer] Bind port [" << listen_port << "] failed."; + mg_mgr_free(&mongoose_mgr_); + return false; + } + + mg_set_protocol_http_websocket(conn); + LOG(INFO) << "[MetricHttpServer] Bind port [" << listen_port << "] success."; + + stop_.store(false); + if (!bg_thread_.Start(std::bind(&MetricHttpServer::BackgroundWorkWrapper, this))) { + mg_mgr_free(&mongoose_mgr_); + LOG(WARNING) << "[MetricHttpServer] Start background thread failed."; + return false; + } + return true; +} + +void MetricHttpServer::Stop() { + MutexLock lock(&mutex_); + if (!IsRunning()) { + return; + } + + stop_.store(true); + bg_thread_.Join(); + listen_port_ = -1; +} + +void MetricHttpServer::BackgroundWorkWrapper() { + LOG(INFO) << "[MetricHttpServer] Start background work"; + is_running_.store(true); + while (!stop_.load()) { + mg_mgr_poll(&mongoose_mgr_, kMongoosePollTimeoutMs); + } + is_running_.store(false); + mg_mgr_free(&mongoose_mgr_); + LOG(INFO) << "[MetricHttpServer] Exit background work"; +} + +void MetricHttpServer::HandleHttpRequest(struct mg_connection *conn, struct http_message *request) { + int64_t start_ts = get_micros(); + LogRequest(request); + + // select real handler based on uri + std::string uri(request->uri.p, request->uri.len); + if (uri == "/metrics") { + HandleMetrics(conn, request); + } else { + HandleUnknowUri(conn, request); + } + int64_t end_ts = get_micros(); + VLOG(16) << "[MetricHttpServer] Handle 
uri [" << uri << "] cost [" << (end_ts - start_ts) << "] us."; +} + +void MetricHttpServer::HandleUnknowUri(struct mg_connection *conn, struct http_message *request) { + VLOG(16) << "[MetricHttpServer] Handle unknow uri [" + << std::string(request->uri.p, request->uri.len) << "] ..."; + mg_send_head(conn, 404, 0, "Content-Type: text/plain"); +} + +void MetricHttpServer::HandleMetrics(struct mg_connection *conn, struct http_message *request) { + std::string body(GetResponseBody()); + mg_printf(conn, "HTTP/1.1 200 OK\r\nContent-Type: %s\r\n", "text/plain"); + mg_printf(conn, "Content-Length: %lu\r\n\r\n", static_cast(body.size())); + mg_send(conn, body.data(), body.size()); +} + +string MetricHttpServer::GetResponseBody() { + int64_t start_ts = get_millis(); + std::shared_ptr cur_report = + CollectorReportPublisher::GetInstance().GetSubscriberReport(); + + if (!cur_report) { + LOG(WARNING) << "[MetricHttpServer] Subscriber Report Is Empty"; + return ""; + } + + //pair + using MetricIdValuePair = SubscriberReport::value_type; + //Vector of pair + using MetricIdValueVec = std::vector; + // MetricNameMap: map< metric_name, vector< pair > > + using MetricNameMap = std::unordered_map; + + MetricNameMap metric_name_map; + + for (const auto& report_item : *cur_report) { + const std::string& metric_name = report_item.first.GetName(); + metric_name_map[metric_name].push_back(&report_item); + } + + std::string body; + // fill MetricFamilyVec + for (const auto& metric_item : metric_name_map) { + ResponseBodyBuilder::BuildHelp(&body, metric_item.first, metric_item.first); + ResponseBodyBuilder::BuildType(&body, metric_item.first, "gauge"); + + const MetricIdValueVec& metric_vec = metric_item.second; + + std::for_each(metric_vec.begin(), metric_vec.end(), [&body, this](const MetricIdValuePair* x) { + ResponseBodyBuilder::BuildMetricItem(&body, x->first, x->second); + }); + } + VLOG(12) << "[MetricHttpServer] Get Response Body cost: " << + get_millis() - start_ts << " ms"; + 
return std::move(body); +} +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/metric/metric_http_server.h b/src/common/metric/metric_http_server.h new file mode 100644 index 000000000..a0b735450 --- /dev/null +++ b/src/common/metric/metric_http_server.h @@ -0,0 +1,84 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_COMMON_METRIC_METRIC_HTTP_SERVER_H_ +#define TERA_COMMON_METRIC_METRIC_HTTP_SERVER_H_ + +#include +#include +#include + +#include "mongoose.h" + +#include "common/metric/collector_report_publisher.h" +#include "common/mutex.h" +#include "common/thread.h" + +namespace tera { + +struct ResponseBodyBuilder { + static void BuildType(std::string* body, + const std::string& metric_name, + const std::string& type); + + static void BuildHelp(std::string* body, + const std::string& metric_name, + const std::string& help_info); + + static void BuildMetricItem(std::string* body, + const MetricId& metric_id, + const ReportItem& report_item); +}; + +// a simple http server based on mongoose +class MetricHttpServer { +public: + MetricHttpServer(); + ~MetricHttpServer(); + +private: + // disallow copy + MetricHttpServer(const MetricHttpServer&) = delete; + MetricHttpServer& operator = (const MetricHttpServer&) = delete; + +private: + static void EventHandler(struct mg_connection *conn, int event, void *p_data); + +public: + bool Start(int32_t listen_port); + void Stop(); + + bool IsRunning() const { + return is_running_.load(); + } + +private: + void BackgroundWorkWrapper(); + + // http request handlers + void HandleHttpRequest(struct mg_connection *conn, struct http_message *request); + void HandleMetrics(struct mg_connection *conn, struct http_message *request); + void HandleUnknowUri(struct mg_connection *conn, struct http_message *request); + + // prometheus handle functions + 
std::string GetResponseBody(); + +private: + mutable Mutex mutex_; + std::atomic is_running_; + std::atomic stop_; + int32_t listen_port_; + + // background thread + common::Thread bg_thread_; + + // mongoose info + struct mg_mgr mongoose_mgr_; +}; + +} // end namespace tera + +#endif // TERA_COMMON_METRIC_METRIC_HTTP_SERVER_H_ + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/common/metric/metric_id.cc b/src/common/metric/metric_id.cc new file mode 100644 index 000000000..b77ee095c --- /dev/null +++ b/src/common/metric/metric_id.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "common/metric/metric_id.h" + +#include +#include + +namespace tera { + +static const std::string kInvalidLabel = ""; + +MetricId::MetricId(const std::string& name, const std::string& label_str) { + ParseFromStringWithThrow(name, label_str, this); +} + +static std::string MetricLabelsToString(const MetricLabels& label_map) { + if (label_map.empty()) { + return ""; + } + std::ostringstream label_oss; + auto iter = label_map.begin(); + // do not append kLabelPairDelimiter for the first pair + label_oss << iter->first << kLabelKVDelimiter << iter->second; + ++iter; + + for (; iter != label_map.end(); ++iter) { + label_oss << kLabelPairDelimiter << iter->first << kLabelKVDelimiter << iter->second; + } + return label_oss.str(); +} + +std::string MetricId::GenMetricIdStr(const std::string& name, const MetricLabels& label_map) { + if (label_map.empty()) { + return name; + } + + std::ostringstream id_oss; + id_oss << name << kNameLabelsDelimiter << MetricLabelsToString(label_map); + return id_oss.str(); +} + +void MetricId::ParseFromStringWithThrow(const std::string& name, + const std::string& label_str, + MetricId* metric_id) throw(std::invalid_argument) { + if (metric_id == NULL) { + throw std::invalid_argument("metric_id is 
invalid"); + } + if (name.empty()) { + throw std::invalid_argument("metric name is invalid"); + } + + metric_id->name_ = name; + metric_id->labels_.clear(); + + if (label_str.empty()) { + metric_id->id_str_ = metric_id->name_; + return; + } + + // label_str format: k1:v1,k2:v2,... + std::vector label_str_splits; + boost::algorithm::split(label_str_splits, label_str, + boost::algorithm::is_any_of(kLabelPairDelimiter)); + for (const std::string& label_kv_str : label_str_splits) { + std::vector label_kv_splits; + boost::algorithm::split(label_kv_splits, label_kv_str, + boost::algorithm::is_any_of(kLabelKVDelimiter)); + if (label_kv_splits.size() != 2) { + // invalid label str format + throw std::invalid_argument("label_str"); + } + + metric_id->labels_.insert(std::make_pair(label_kv_splits[0], label_kv_splits[1])); + } + + // gen identifier string + metric_id->id_str_ = metric_id->name_ + kNameLabelsDelimiter + label_str; + return; +} + +bool MetricId::ParseFromString(const std::string& name, + const std::string& label_str, + MetricId* metric_id) throw() { + try { + ParseFromStringWithThrow(name, label_str, metric_id); + return true; + } catch (std::invalid_argument&) { + return false; + } +} + +MetricId::MetricId() : name_(), labels_(), id_str_() {} + +MetricId::MetricId(const std::string& name) + : name_(name), + labels_(), + id_str_(GenMetricIdStr(name_, labels_)) {} + +MetricId::MetricId(const std::string& name, const MetricLabels& label_map) + : name_(name), + labels_(label_map), + id_str_(GenMetricIdStr(name_, labels_)) {} + +MetricId::MetricId(const MetricId& other) + : name_(other.name_), + labels_(other.labels_), + id_str_(other.id_str_) {} + +MetricId::~MetricId() {} + +MetricId& MetricId::operator = (const MetricId& other) { + name_ = other.name_; + labels_ = other.labels_; + id_str_ = other.id_str_; + return *this; +} + +const std::string& MetricId::GetLabel(const std::string& name) const { + auto iter = labels_.find(name); + if (iter == labels_.end()) { + 
return kInvalidLabel; + } else { + return iter->second; + } +} + +bool MetricId::ExistLabel(const std::string& name) const { + return labels_.find(name) != labels_.end(); +} + +bool MetricId::CheckLabel(const std::string& name, const std::string& expected_value) const { + auto iter = labels_.find(name); + if (iter == labels_.end()) { + return false; + } else { + return (iter->second == expected_value); + } +} + +LabelStringBuilder& LabelStringBuilder::Append(const std::string& name, const std::string& value) { + if (!name.empty() && !value.empty()) { + labels_[name] = value; + } + return *this; +} + +std::string LabelStringBuilder::ToString() const { + return MetricLabelsToString(labels_); +} + +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/metric/metric_id.h b/src/common/metric/metric_id.h new file mode 100644 index 000000000..cff30448e --- /dev/null +++ b/src/common/metric/metric_id.h @@ -0,0 +1,143 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_COMMON_METRIC_METRIC_ID_H_ +#define TERA_COMMON_METRIC_METRIC_ID_H_ + +#include +#include +#include +#include +#include + +namespace tera { + +// use ordered map to ensure the order of labels in id_str +typedef std::map MetricLabels; + +const char* const kNameLabelsDelimiter = "#"; +const char* const kLabelPairDelimiter = ","; +const char* const kLabelKVDelimiter = ":"; + +// A metric identifiered by name and all labels +// name: necessary, and should not be empty +// labels: optional +// +// Can get name and labels from MetricId +class MetricId { +public: + MetricId(); + explicit MetricId(const std::string& name); + MetricId(const std::string& name, const MetricLabels& label_map); + MetricId(const std::string& name, const std::string& label_str); + MetricId(const MetricId& other); + ~MetricId(); + + MetricId& operator = (const MetricId& other); + + bool IsValid() const { + return !name_.empty(); + } + + const std::string& GetName() const { + return name_; + } + + const MetricLabels& GetLabelMap() const { + return labels_; + } + + const std::string& ToString() const { + return id_str_; + } + + // access labels + const std::string& GetLabel(const std::string& name) const; + bool ExistLabel(const std::string& name) const; + bool CheckLabel(const std::string& name, const std::string& expected_value) const; + +public: + // Parse MetricId from name and formated label string + // nothrow std::invalid_argument if got illegal format arguments + static void ParseFromStringWithThrow(const std::string& name, + const std::string& label_str, + MetricId* metric_id) throw(std::invalid_argument); + // Parse MetricId from name and formated label string + // nothrow version + static bool ParseFromString(const std::string& name, + const std::string& label_str, + MetricId* metric_id) throw(); + +private: + static std::string GenMetricIdStr(const std::string& name, const MetricLabels& label_map); +private: + std::string name_; + MetricLabels labels_; + std::string 
id_str_; +}; + +// relational operators +// make MetricId can be the key of std::map and std::unordered_map +inline bool operator == (const MetricId& id1, const MetricId& id2) { + return id1.ToString() == id2.ToString(); +} + +inline bool operator != (const MetricId& id1, const MetricId& id2) { + return id1.ToString() != id2.ToString(); +} + +inline bool operator < (const MetricId& id1, const MetricId& id2) { + return id1.ToString() < id2.ToString(); +} + +inline bool operator <= (const MetricId& id1, const MetricId& id2) { + return id1.ToString() <= id2.ToString(); +} + +inline bool operator > (const MetricId& id1, const MetricId& id2) { + return id1.ToString() > id2.ToString(); +} + +inline bool operator >= (const MetricId& id1, const MetricId& id2) { + return id1.ToString() >= id2.ToString(); +} + +// A helper class to build formated label string +// Usage: label_str = LabelStringBuilder().Append("k1","v1").Append("k2","v2").ToString(); +class LabelStringBuilder { +public: + LabelStringBuilder() {} + ~LabelStringBuilder() {} + + // append a k-v pair + LabelStringBuilder& Append(const std::string& name, const std::string& value); + + // build formated string + std::string ToString() const; + +private: + MetricLabels labels_; +}; + +} // end namespace tera + +namespace std { +// specialization std::hash for tera::MetricId +// make MetricId can be the key of unordered_map +template<> +struct hash<::tera::MetricId> { +public: + size_t operator () (const ::tera::MetricId& id) const { + return str_hash_(id.ToString()); + } +private: + hash str_hash_; +}; + +} // end namespace std + +#endif // TERA_COMMON_METRIC_METRIC_ID_H_ + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/metric/prometheus_subscriber.cc b/src/common/metric/prometheus_subscriber.cc new file mode 100644 index 000000000..9aca684df --- /dev/null +++ b/src/common/metric/prometheus_subscriber.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2017, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "common/metric/prometheus_subscriber.h" +#include "common/metric/collector_report.h" + +DECLARE_int64(tera_metric_hold_max_time); + +namespace tera{ + +void PrometheusSubscriber::OnUpdate(std::shared_ptr report) { + int64_t value = report->FindMetricValue(metric_id_); + Append(report->timestamp_ms, value); +} + +ReportItem PrometheusSubscriber::Collect() { + ReportItem ret; + std::shared_ptr tera_queue_ptr; + int64_t last_collect_ts; + + { + std::lock_guard lock_mtx(mtx_); + if (tera_queue_ptr_->empty()) { + VLOG(12) << "[PROMETHEUS SUBSCRIBER] Empty Tera Queue"; + return ret; + } + + last_collect_ts = last_collect_ts_; + last_collect_ts_ = tera_queue_ptr_->back().first; + tera_queue_ptr = tera_queue_ptr_; + tera_queue_ptr_.reset(new TimeValueQueue); + } + + int64_t value = GetSpecificValue(tera_queue_ptr); + + if (type_ == SubscriberType::QPS || + type_ == SubscriberType::THROUGHPUT) { + int64_t time_interval = tera_queue_ptr->back().first - last_collect_ts; + value = (time_interval != 0 ? 
value * 1000 / time_interval : 0); + } + + ret.SetTimeValue({tera_queue_ptr->back().first, value}); + ret.SetType(GetTypeName()); + + return ret; +} + +void PrometheusSubscriber::Append(int64_t time_stamp, int64_t current_value) { + std::lock_guard mtx_lock(mtx_); + tera_queue_ptr_->emplace_back(time_stamp, current_value); + VLOG(12) << "[PROMETHEUS APPEND] " << metric_id_.GetName() + << "\tValue: " << current_value + << "\tQueue Size:" << tera_queue_ptr_->size(); + if (has_inited_) { + DropExpiredValue(); + } else { + last_collect_ts_ = time_stamp; + has_inited_ = true; + } +} + +std::string PrometheusSubscriber::GetTypeName() { + switch (type_) + { + + case SubscriberType::LATEST: + return "Latest"; + + case SubscriberType::MAX: + return "Max"; + + case SubscriberType::MIN: + return "Min"; + + case SubscriberType::SUM: + return "Sum"; + + case SubscriberType::QPS: + return "Qps"; + + case SubscriberType::THROUGHPUT: + return "ThroughPut"; + + default: + LOG(ERROR) << "Unknown collector type: "; + abort(); + + } + //Never reach here + return ""; +} + +void PrometheusSubscriber::DropExpiredValue() { + if (tera_queue_ptr_->empty()) { + return; + } + + auto last_enqueue_ts = tera_queue_ptr_->back().first; + int64_t drop_cnt = 0; + while (last_enqueue_ts - tera_queue_ptr_->front().first >= FLAGS_tera_metric_hold_max_time) { + VLOG(12) << "[PROMETHEUS SUBSCRIBER] drop last_enqueue_ts: " << last_enqueue_ts + << "first_ts: " << tera_queue_ptr_->front().first; + ++drop_cnt; + last_collect_ts_ = tera_queue_ptr_->front().first; + tera_queue_ptr_->pop_front(); + } + if (drop_cnt != 0) { + VLOG(12) << "[PROMETHEUS SUBSCRIBER] drop " << drop_cnt << "values"; + } +} + +int64_t PrometheusSubscriber::GetSpecificValue(std::shared_ptr tera_queue_ptr) { + switch (type_) + { + + case SubscriberType::LATEST: + return GetLatest(tera_queue_ptr); + + case SubscriberType::MAX: + return GetMax(tera_queue_ptr); + + case SubscriberType::MIN: + return GetMin(tera_queue_ptr); + + //Both of 
SUM, Qps, and THROUGHPUT use GetSum here + case SubscriberType::SUM: + case SubscriberType::QPS: + case SubscriberType::THROUGHPUT: + return GetSum(tera_queue_ptr); + + default: + LOG(ERROR) << "Unknown collector type"; + abort(); + + } + //Never reach here + return -1; +} +} diff --git a/src/common/metric/prometheus_subscriber.h b/src/common/metric/prometheus_subscriber.h new file mode 100644 index 000000000..67affa7bb --- /dev/null +++ b/src/common/metric/prometheus_subscriber.h @@ -0,0 +1,81 @@ +#pragma once +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#include +#include +#include +#include +#include +#include +#include + +#include "common/metric/subscriber.h" + +namespace tera { + +using TimeValueQueue = std::deque; + +class PrometheusSubscriber : public Subscriber { +public: + PrometheusSubscriber(const MetricId& metric_id, SubscriberType type = SubscriberType::LATEST): + tera_queue_ptr_(new TimeValueQueue), + last_collect_ts_(0), + has_inited_(false), + type_(type), + metric_id_(metric_id) { } + + ~PrometheusSubscriber() override {} + ReportItem Collect() override; + void OnUpdate(const std::shared_ptr) override; + + std::string GetTypeName() override; + + const MetricId& GetMetricId() override { + return metric_id_; + } + +private: + void Append(int64_t time_stamp, int64_t current_value); + void DropExpiredValue(); + int64_t GetSpecificValue(std::shared_ptr); + + int64_t GetMax(std::shared_ptr tera_queue_ptr) { + return std::max_element(tera_queue_ptr->begin(), tera_queue_ptr->end(), + [](const TimeValuePair& x, const TimeValuePair& y) { + return x.second < y.second; + })->second; + } + + int64_t GetMin(std::shared_ptr tera_queue_ptr) { + return std::min_element(tera_queue_ptr->begin(), tera_queue_ptr->end(), + [](const TimeValuePair& x, const TimeValuePair& y) { + return x.second < y.second; + })->second; + } + + int64_t 
GetLatest(std::shared_ptr tera_queue_ptr) { + return tera_queue_ptr->back().second; + } + + int64_t GetSum(std::shared_ptr tera_queue_ptr) { + return std::accumulate(tera_queue_ptr->begin(), tera_queue_ptr->end(), (int64_t)0, + [](const int64_t val, const TimeValuePair& x) { + return val + x.second; + }); + } + + + std::mutex mtx_; + //queue of tera timestamp-value + std::shared_ptr tera_queue_ptr_; + //timestamp of prometheus_queue_ptr_'s last enqueue operation + int64_t last_collect_ts_; + //Is this class inited? + bool has_inited_; + //subscriber type + const SubscriberType type_; + MetricId metric_id_; +}; + +} \ No newline at end of file diff --git a/src/common/metric/ratio_collector.h b/src/common/metric/ratio_collector.h new file mode 100644 index 000000000..3a933adef --- /dev/null +++ b/src/common/metric/ratio_collector.h @@ -0,0 +1,45 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_COMMOM_METRIC_RATIO_COLLECTOR_H_ +#define TERA_COMMOM_METRIC_RATIO_COLLECTOR_H_ + +#include +#include "common/metric/collector_report_publisher.h" + +namespace tera { + +class RatioCollector : public Collector { +public: + explicit RatioCollector(Counter* first_counter, + Counter* second_counter, + bool is_periodic = true): + first_counter_(first_counter), + second_counter_(second_counter), + is_periodic_(is_periodic) {} + + int64_t Collect() override { + if (NULL == first_counter_ || NULL == second_counter_) { + return 0; + } else { + double ratio = (double)first_counter_->Get() / second_counter_->Get(); + if (is_periodic_) { + first_counter_->Clear(); + second_counter_->Clear(); + } + return isnan(ratio) ? 
-1 : static_cast(ratio * 100); + } + } +private: + Counter* const first_counter_; + Counter* const second_counter_; + const bool is_periodic_; +}; + +} // end namespace tera + +#endif // TERA_COMMOM_METRIC_RATIO_COLLECTOR_H_ + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/metric/ratio_subscriber.h b/src/common/metric/ratio_subscriber.h new file mode 100644 index 000000000..32656b46b --- /dev/null +++ b/src/common/metric/ratio_subscriber.h @@ -0,0 +1,58 @@ +#pragma once +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#include "common/metric/subscriber.h" +#include +#include + +namespace tera { +class RatioSubscriber : public Subscriber { +public: + RatioSubscriber(const MetricId& metric_id, + std::unique_ptr&& subscriber1, + std::unique_ptr&& subscriber2): + metric_id_(metric_id), + subscriber1_(std::move(subscriber1)), + subscriber2_(std::move(subscriber2)) { + type_name_ = "Ratio: (" + + subscriber1_->GetMetricId().GetName() + ":" + subscriber1_->GetTypeName() + " / " + + subscriber2_->GetMetricId().GetName() + ":" + subscriber2_->GetTypeName() + ")"; + } + + virtual std::string GetTypeName() override { + return type_name_; + } + + virtual void OnUpdate(const std::shared_ptr report_ptr) override { + subscriber1_->OnUpdate(report_ptr); + subscriber2_->OnUpdate(report_ptr); + } + + virtual ReportItem Collect() override { + ReportItem ret; + auto subscriber1_ret = subscriber1_->Collect(); + auto subscriber2_ret = subscriber2_->Collect(); + //timestamp should be equal; + assert(subscriber1_ret.Time() == subscriber2_ret.Time()); + double ratio = (double)subscriber1_ret.Value() / subscriber2_ret.Value(); + ret.SetTimeValue({subscriber1_ret.Time(), + (isnan(ratio) ? 
-1 : static_cast(ratio))}); + ret.SetType(GetTypeName()); + return ret; + } + + const MetricId& GetMetricId() override { + return metric_id_; + } + + virtual ~RatioSubscriber() override {} + +private: + MetricId metric_id_; + std::unique_ptr subscriber1_; + std::unique_ptr subscriber2_; + std::string type_name_; +}; +} + diff --git a/src/common/metric/subscriber.h b/src/common/metric/subscriber.h new file mode 100644 index 000000000..6b0eb394b --- /dev/null +++ b/src/common/metric/subscriber.h @@ -0,0 +1,66 @@ +#pragma once +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#include +#include +#include +#include "common/metric/metric_id.h" + +namespace tera { + +using TimeValuePair = std::pair; + +class CollectorReport; + +struct ReportItem { + TimeValuePair time_value_pair; + std::string type; + ReportItem(TimeValuePair tvp = {-1, -1}, const std::string& t = ""): + time_value_pair(tvp), + type(t) { } + + int64_t Value() const { + return time_value_pair.second; + } + + int64_t Time() const { + return time_value_pair.first; + } + + void SetTimeValue(const TimeValuePair& tvp) { + time_value_pair = tvp; + } + + void SetType(const std::string& tp) { + type = tp; + } + + std::string Type() const { + return type; + } +}; + +class Subscriber { +public: + enum class SubscriberType { + LATEST, + MAX, + MIN, + QPS, + SUM, + THROUGHPUT + }; + virtual ~Subscriber() {} + // return a pair of to Prometheus + virtual ReportItem Collect() = 0; + // Update subscriber, depends to subscriber type + // Called in CollectorReportPublisher::Report() + virtual void OnUpdate(const std::shared_ptr) = 0; + virtual std::string GetTypeName() = 0; + virtual const MetricId& GetMetricId() = 0; +}; + +using SubscriberType = Subscriber::SubscriberType; +using SubscriberReport = std::unordered_multimap; +} \ No newline at end of file diff --git a/src/common/mutex.h b/src/common/mutex.h 
old mode 100644 new mode 100755 index 46e89044f..381a69218 --- a/src/common/mutex.h +++ b/src/common/mutex.h @@ -12,7 +12,8 @@ #include #include #include -#include "timer.h" +#include +#include "common/timer.h" namespace common { @@ -45,7 +46,7 @@ class Mutex { #ifdef MUTEX_DEBUG int64_t s = 0; if (msg) { - s = timer::get_micros(); + s = get_micros(); } #endif PthreadCall("mutex lock", pthread_mutex_lock(&mu_)); @@ -74,16 +75,16 @@ class Mutex { msg_ = msg; msg_threshold_ = msg_threshold; if (msg_) { - lock_time_ = timer::get_micros(); + lock_time_ = get_micros(); } #endif owner_ = pthread_self(); } void BeforeUnlock() { #ifdef MUTEX_DEBUG - if (msg_ && timer::get_micros() - lock_time_ > msg_threshold_) { + if (msg_ && get_micros() - lock_time_ > msg_threshold_) { printf("%s locked %.3f ms\n", - msg_, (timer::get_micros() - lock_time_) / 1000.0); + msg_, (get_micros() - lock_time_) / 1000.0); } msg_ = NULL; #endif @@ -137,11 +138,14 @@ class CondVar { } // Time wait in us // timeout < 0 would cause ETIMEOUT and return false immediately - bool TimeWaitInUs(int timeout, const char* msg = NULL) { + bool TimeWaitInUs(int64_t timeout, const char* msg = NULL) { // ref: http://www.qnx.com/developers/docs/6.5.0SP1.update/com.qnx.doc.neutrino_lib_ref/p/pthread_cond_timedwait.html struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); - int64_t nsec = ((int64_t)timeout) * 1000 + ts.tv_nsec; + int64_t nsec = timeout * 1000 + ts.tv_nsec; + + assert(nsec > 0); + ts.tv_sec += nsec / 1000000000; ts.tv_nsec = nsec % 1000000000; diff --git a/src/common/request_done_wrapper.h b/src/common/request_done_wrapper.h new file mode 100644 index 000000000..cd6b7b3b7 --- /dev/null +++ b/src/common/request_done_wrapper.h @@ -0,0 +1,29 @@ +#pragma once +#include + +namespace tera { +class RequestDoneWrapper : public google::protobuf::Closure { +public: + static google::protobuf::Closure* NewInstance(google::protobuf::Closure* done) { + return new RequestDoneWrapper(done); + } + + 
//Self-Deleted, never access it after Run(); + //Default do nothing; + virtual void Run() override { + delete this; + } + + virtual ~RequestDoneWrapper() { + done_->Run(); + } + +protected: + //Can Only Create on Heap; + RequestDoneWrapper(google::protobuf::Closure* done): + done_(done) { } + +private: + google::protobuf::Closure* done_; +}; +} \ No newline at end of file diff --git a/src/common/test/collector_report_test.cc b/src/common/test/collector_report_test.cc new file mode 100644 index 000000000..e01972cc9 --- /dev/null +++ b/src/common/test/collector_report_test.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "common/metric/metric_counter.h" +#include "common/metric/collector_report.h" +#include "common/this_thread.h" + +namespace tera { + +class CollectorReportTest : public ::testing::Test { +public: + CollectorReportTest() + : nonperiod_counter1_label(LabelStringBuilder().Append("key1", "value1").ToString()), + nonperiod_counter1("counter1", nonperiod_counter1_label, {}, false), + nonperiod_counter2("counter2", {}, false), + period_counter1_label(LabelStringBuilder().Append("key2", "value2").ToString()), + period_counter1("counter1", period_counter1_label, {}, true), + period_counter3("counter3", {}, true) { + other_whatever_ids.push_back(MetricId()); + other_whatever_ids.push_back(MetricId("whatevername")); + + MetricLabels whatever_labels; + whatever_labels["haha"] = "hehe"; + whatever_labels["heihei"] = "hoho"; + other_whatever_ids.push_back(MetricId("", whatever_labels)); + other_whatever_ids.push_back(MetricId("whatevername", whatever_labels)); + } + + virtual void SetUp() { + nonperiod_counter1.Set(1); + nonperiod_counter2.Set(2); + period_counter1.Set(3); + period_counter3.Set(4); + } + 
+ virtual void TearDown() { + // reset cache to initial status + CollectorReportPublisher::GetInstance().last_collector_report_.reset(new CollectorReport()); + } +private: + std::string nonperiod_counter1_label; + MetricCounter nonperiod_counter1; + MetricCounter nonperiod_counter2; + std::string period_counter1_label; + MetricCounter period_counter1; + MetricCounter period_counter3; + + std::vector other_whatever_ids; +}; + +TEST_F(CollectorReportTest, FindTest) { + int64_t value = 0; + CollectorReportPublisher::GetInstance().Refresh(); + std::shared_ptr report = CollectorReportPublisher::GetInstance().GetCollectorReport(); + + // check report + EXPECT_EQ(report->report.size(), CollectorReportPublisher::GetInstance().collectors_.size()); + + // nonperiod_counter1 + value = report->FindMetricValue("counter1", nonperiod_counter1_label); + EXPECT_EQ(value, 1); + value = report->FindMetricValue(nonperiod_counter1.metric_id_); + EXPECT_EQ(value, 1); + value = report->FindMetricValue("counter1"); + EXPECT_EQ(value, 0); + value = report->FindMetricValue("counter1", "other not exist label"); + EXPECT_EQ(value, 0); + value = report->FindMetricValue("not exist name", nonperiod_counter1_label); + EXPECT_EQ(value, 0); + value = report->FindMetricValue(MetricId("counter1")); + EXPECT_EQ(value, 0); + + // nonperiod_counter2 + value = report->FindMetricValue("counter2"); + EXPECT_EQ(value, 2); + value = report->FindMetricValue("counter2", ""); + EXPECT_EQ(value, 2); + value = report->FindMetricValue(MetricId("counter2")); + EXPECT_EQ(value, 2); + value = report->FindMetricValue("counter2", "whatever_label"); + EXPECT_EQ(value, 0); + + // period_counter1 + value = report->FindMetricValue("counter1", period_counter1_label); + EXPECT_EQ(value, 3); + value = report->FindMetricValue(period_counter1.metric_id_); + EXPECT_EQ(value, 3); + + // period_counter3 + value = report->FindMetricValue("counter3"); + EXPECT_EQ(value, 4); + value = 
report->FindMetricValue(period_counter3.metric_id_); + EXPECT_EQ(value, 4); + + // invalid + for (const MetricId& not_exist_id : other_whatever_ids) { + value = report->FindMetricValue(not_exist_id.GetName()); + EXPECT_EQ(value, 0); + value = report->FindMetricValue(not_exist_id.ToString()); + EXPECT_EQ(value, 0); + value = report->FindMetricValue(not_exist_id); + EXPECT_EQ(value, 0); + } + + // report again + nonperiod_counter1.Inc(); + nonperiod_counter2.Inc(); + period_counter1.Inc(); + period_counter3.Inc(); + MetricCounter another_counter1("another1"); + MetricCounter another_counter2("another2"); + another_counter1.Inc(); + CollectorReportPublisher::GetInstance().Refresh(); + report = CollectorReportPublisher::GetInstance().GetCollectorReport(); + EXPECT_EQ(report->report.size(), CollectorReportPublisher::GetInstance().collectors_.size()); + + value = report->FindMetricValue(nonperiod_counter1.metric_id_); + EXPECT_EQ(value, 2); + value = report->FindMetricValue(nonperiod_counter2.metric_id_); + EXPECT_EQ(value, 3); + value = report->FindMetricValue(period_counter1.metric_id_); + EXPECT_EQ(value, 1); + value = report->FindMetricValue(period_counter3.metric_id_); + EXPECT_EQ(value, 1); + value = report->FindMetricValue(another_counter1.metric_id_); + EXPECT_EQ(value, 1); + value = report->FindMetricValue(another_counter2.metric_id_); + EXPECT_EQ(value, 0); +} + +TEST_F(CollectorReportTest, CacheTest) { + // do not update yet + std::shared_ptr initial_report = CollectorReportPublisher::GetInstance().GetCollectorReport(); + EXPECT_TRUE(initial_report.get() != NULL); + EXPECT_TRUE(initial_report->report.empty()); + + // update + CollectorReportPublisher::GetInstance().Refresh(); + std::shared_ptr report1 = CollectorReportPublisher::GetInstance().GetCollectorReport(); + EXPECT_EQ(report1->report.size(), CollectorReportPublisher::GetInstance().collectors_.size()); + EXPECT_TRUE(report1.get() == CollectorReportPublisher::GetInstance().last_collector_report_.get()); 
+ + // modify counters and report again + nonperiod_counter1.Inc(); + nonperiod_counter2.Inc(); + period_counter1.Inc(); + period_counter3.Inc(); + MetricCounter another_counter1("another1"); + MetricCounter another_counter2("another2"); + another_counter1.Inc(); + + // get report before update, return same ptr + std::shared_ptr report2 = CollectorReportPublisher::GetInstance().GetCollectorReport(); + EXPECT_TRUE(report2.get() == CollectorReportPublisher::GetInstance().last_collector_report_.get()); + EXPECT_TRUE(report2.get() == report1.get()); + EXPECT_EQ(report2->FindMetricValue(period_counter3.metric_id_), 4); + + // update and get + CollectorReportPublisher::GetInstance().Refresh(); + std::shared_ptr report3 = CollectorReportPublisher::GetInstance().GetCollectorReport(); + EXPECT_TRUE(report3.get() == CollectorReportPublisher::GetInstance().last_collector_report_.get()); + EXPECT_FALSE(report3.get() == report1.get()); + EXPECT_EQ(report3->report.size(), report2->report.size() + 2); + EXPECT_EQ(report3->FindMetricValue(period_counter3.metric_id_), 1); +} + +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/test/common_test_main.cc b/src/common/test/common_test_main.cc new file mode 100644 index 000000000..90c3b06dd --- /dev/null +++ b/src/common/test/common_test_main.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "utils/utils_cmd.h" + +int main(int argc, char** argv) { + ::google::InitGoogleLogging(argv[0]); + FLAGS_v = 16; + FLAGS_minloglevel=0; + FLAGS_log_dir = "./log"; + if (access(FLAGS_log_dir.c_str(), F_OK)) { + mkdir(FLAGS_log_dir.c_str(), 0777); + } + std::string pragram_name("tera"); + tera::utils::SetupLog(pragram_name); + ::google::ParseCommandLineFlags(&argc, &argv, true); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/utils/test/counter_test.cc b/src/common/test/counter_test.cc similarity index 95% rename from src/utils/test/counter_test.cc rename to src/common/test/counter_test.cc index 526f9cae6..598c98f04 100644 --- a/src/utils/test/counter_test.cc +++ b/src/common/test/counter_test.cc @@ -11,7 +11,7 @@ #include "common/mutex.h" #include "common/thread_pool.h" -#include "counter.h" +#include "common/counter.h" namespace tera { @@ -69,7 +69,7 @@ TEST(CounterTest, Basic) { Counter counter; ThreadPool* pool = new ThreadPool(thread_num); for (int i = 0; i < thread_num / 4; ++i) { - std::function callback = + std::function callback = std::bind(&callback_add, &counter); pool->AddTask(callback); @@ -99,7 +99,7 @@ TEST(CounterTest, Clear) { Counter counter; ThreadPool* pool = new ThreadPool(thread_num); for (int i = 0; i < thread_num / 3; ++i) { - std::function callback = + std::function callback = std::bind(&callback_add, &counter); pool->AddTask(callback); diff --git a/src/common/test/log_cleaner_test.cc b/src/common/test/log_cleaner_test.cc new file mode 100644 index 000000000..8fbf3ef9f --- /dev/null +++ b/src/common/test/log_cleaner_test.cc @@ -0,0 +1,246 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ + +#include +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "common/file/file_path.h" +#include "common/log/log_cleaner.h" +#include "common/this_thread.h" +#include "utils/utils_cmd.h" + +DECLARE_string(log_dir); +DECLARE_string(tera_log_prefix); +DECLARE_string(tera_leveldb_log_path); +DECLARE_int64(tera_info_log_clean_period_second); +DECLARE_int64(tera_info_log_expire_second); + +using namespace std::placeholders; + +namespace common { + +static size_t g_touch_file_count = 0; +static size_t g_expect_clean_count = 0; +const static int64_t kTestLogExpireSecond = 5; + +std::string TouchFile(const std::string& dir_path, const std::string& filename, bool need_close = true) { + std::string full_path = dir_path + "/" + filename; + int fd = open(full_path.c_str(), O_RDWR | O_CREAT | O_TRUNC, 0777); + if (need_close && fd > 0) { + close(fd); + } + ++g_touch_file_count; + return full_path; +} + +void SetupTestEnv() { + std::string leveldb_log_prefix = "leveldb.log"; + FLAGS_tera_leveldb_log_path = "./log/" + leveldb_log_prefix; + // fake options, change log dir for cleaner + FLAGS_log_dir = "./test_log"; + FLAGS_tera_log_prefix = "tera_test"; + FLAGS_tera_info_log_clean_period_second = 1; + FLAGS_tera_info_log_expire_second = kTestLogExpireSecond; + std::string other_prefix = "tera_other_prefix"; + + // make test log dir, ignore failture + mkdir(FLAGS_log_dir.c_str(), 0777); + g_touch_file_count = 0; + g_expect_clean_count = 0; + + // touch file unlinked + std::string unlinked_info = FLAGS_tera_log_prefix + ".INFO.unlink"; + TouchFile(FLAGS_log_dir, unlinked_info); + std::string unlinked_warn = FLAGS_tera_log_prefix + ".WARNING.unlink"; + TouchFile(FLAGS_log_dir, unlinked_warn); + std::string unlinked_err = FLAGS_tera_log_prefix + ".stderr.unlink"; + TouchFile(FLAGS_log_dir, unlinked_err); + g_expect_clean_count += 3; // expect clean unlinked file + + // touch file linked + 
std::string linked_info = FLAGS_tera_log_prefix + ".INFO.linked"; + std::string info_link_path = FLAGS_log_dir + "/" + FLAGS_tera_log_prefix + ".INFO"; + std::string linked_info_path = TouchFile(FLAGS_log_dir, linked_info); + // link full path + remove(info_link_path.c_str()); + symlink(linked_info_path.c_str(), info_link_path.c_str()); + ++g_touch_file_count; + + std::string linked_warn = FLAGS_tera_log_prefix + ".WARNING.linked"; + std::string warn_link_path = FLAGS_log_dir + "/" + FLAGS_tera_log_prefix + ".WARNING"; + TouchFile(FLAGS_log_dir, linked_warn); + // link filename only + remove(warn_link_path.c_str()); + symlink(linked_warn.c_str(), warn_link_path.c_str()); + ++g_touch_file_count; + + // touch file opened + std::string opened_info = FLAGS_tera_log_prefix + ".INFO.opened"; + TouchFile(FLAGS_log_dir, opened_info, false); + std::string opened_warn = FLAGS_tera_log_prefix + ".WARNING.opened"; + TouchFile(FLAGS_log_dir, opened_warn, false); + std::string opened_err = FLAGS_tera_log_prefix + ".stderr.opened"; + TouchFile(FLAGS_log_dir, opened_err, false); + + // touch file not start with prefix + std::string other_pre_info = other_prefix + ".INFO.otherpre"; + TouchFile(FLAGS_log_dir, other_pre_info); + std::string other_pre_warn = other_prefix + ".WARNING.otherpre"; + TouchFile(FLAGS_log_dir, other_pre_warn); + std::string other_pre_err = other_prefix + ".stderr.otherpre"; + TouchFile(FLAGS_log_dir, other_pre_err); + + // touch file start with leveldb_log_prefix and open one of them + std::string ldb_pre_info = leveldb_log_prefix; + TouchFile(FLAGS_log_dir, ldb_pre_info, false); + std::string ldb_pre_info_lod = leveldb_log_prefix + ".old"; + TouchFile(FLAGS_log_dir, ldb_pre_info_lod); + g_expect_clean_count++; // expect clean leveldb_log_prefix.old +} + +TEST(LogCleanerTest, InitialStatus) { + // ensure stop firstly + LogCleaner::StopCleaner(); + ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL); + SetupTestEnv(); + LogCleaner *cleaner = 
LogCleaner::GetInstance(); + + ASSERT_FALSE(cleaner == NULL); + ASSERT_FALSE(cleaner->IsRunning()); + ASSERT_TRUE(cleaner->CheckOptions()); + ASSERT_FALSE(cleaner->stop_); +} + +TEST(LogCleanerTest, Basic) { + SetupTestEnv(); + // get instance + LogCleaner *cleaner = LogCleaner::GetInstance(); + ASSERT_FALSE(cleaner == NULL); + + // check log dir before clean + std::vector reserved_file_list; + bool list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list); + ASSERT_TRUE(list_ret); + + // print filelist before clean + std::cout << "before clean. file count: " << reserved_file_list.size() << std::endl; + for (size_t i = 0; i < reserved_file_list.size(); ++i) { + std::cout << reserved_file_list[i] << std::endl; + } + ASSERT_EQ(reserved_file_list.size(), g_touch_file_count); + + // start and stop + cleaner->Start(); + ASSERT_TRUE(cleaner->IsRunning()); + ASSERT_FALSE(cleaner->stop_); + + { + // wait schedule clean first times + MutexLock l(&(cleaner->mutex_), "log cleaner unittest"); + cleaner->bg_cond_.Wait(); + } + + // check clean result + reserved_file_list.clear(); + list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list); + ASSERT_TRUE(list_ret); + // print filelist after clean + std::cout << "first clean. expect clean nothing since not expire yet" << std::endl; + EXPECT_EQ(reserved_file_list.size(), g_touch_file_count); + + { + // wait schedule clean second times + MutexLock l(&(cleaner->mutex_), "log cleaner unittest"); + cleaner->bg_cond_.Wait(); + } + // check clean result + reserved_file_list.clear(); + list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list); + ASSERT_TRUE(list_ret); + std::cout << "second clean. expect clean nothing since not expire yet" << std::endl; + EXPECT_EQ(reserved_file_list.size(), g_touch_file_count); + + for (size_t i = 3; i < kTestLogExpireSecond + 5; ++i) { + // wait schedule clean several times + std::cout << "wait " << i << " times clean." 
<< std::endl; + MutexLock l(&(cleaner->mutex_), "log cleaner unittest"); + cleaner->bg_cond_.Wait(); + } + // check clean result + reserved_file_list.clear(); + list_ret = ListCurrentDir(cleaner->info_log_dir_, &reserved_file_list); + ASSERT_TRUE(list_ret); + std::cout << "after " << kTestLogExpireSecond + << " times clean. expect clean " << g_expect_clean_count + << " logs: " << std::endl; + // print filelist after clean + for (size_t i = 0; i < reserved_file_list.size(); ++i) { + std::cout << reserved_file_list[i] << std::endl; + } + EXPECT_EQ(reserved_file_list.size(), g_touch_file_count - g_expect_clean_count); + + // stop cleaner + cleaner->Stop(); + ASSERT_FALSE(cleaner->IsRunning()); + ASSERT_TRUE(cleaner->stop_); + ASSERT_FALSE(cleaner == NULL); + + // destroy + LogCleaner::StopCleaner(); + ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL); +} + +TEST(LogCleanerTest, MultiStartAndStop) { + // ensure stop firstly + LogCleaner::StopCleaner(); + ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL); + + SetupTestEnv(); + // get instance + LogCleaner *cleaner = LogCleaner::GetInstance(); + + // stop while not start + cleaner->Stop(); + ASSERT_FALSE(cleaner->IsRunning()); + ASSERT_TRUE(cleaner->stop_); + + // start three times + cleaner->Start(); + ASSERT_TRUE(cleaner->IsRunning()); + cleaner->Start(); + ASSERT_TRUE(cleaner->IsRunning()); + cleaner->Start(); + ASSERT_TRUE(cleaner->IsRunning()); + + { + // wait schedule clean + MutexLock l(&(cleaner->mutex_), "log cleaner unittest"); + cleaner->bg_cond_.Wait(); + } + + // stop twice + cleaner->Stop(); + ASSERT_FALSE(cleaner->IsRunning()); + cleaner->Stop(); + ASSERT_FALSE(cleaner->IsRunning()); + + // start again + cleaner->Start(); + ASSERT_TRUE(cleaner->IsRunning()); + + // stop and destroy + LogCleaner::StopCleaner(); + ASSERT_TRUE(LogCleaner::singleton_instance_ == NULL); +} + +} // end namespace common + diff --git a/src/common/test/metric_counter_test.cc b/src/common/test/metric_counter_test.cc new file 
mode 100644 index 000000000..00062b8ff --- /dev/null +++ b/src/common/test/metric_counter_test.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "common/metric/metric_counter.h" + +namespace tera { + +class MetricCounterTest : public ::testing::Test { +public: + virtual void SetUp() { + label_str_ = LabelStringBuilder() + .Append("test_label1", "test_value1") + .Append("test_label2", "test_value2") + .ToString(); + } + + virtual void TearDown() {} + +private: + std::string label_str_; +}; + +TEST_F(MetricCounterTest, RegisterTest) { + MetricId test_id; + { + // with name and labels + MetricCounter counter1("counter1", label_str_); + test_id = counter1.metric_id_; + + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(counter1.metric_id_)) + << "metric_id " << counter1.metric_id_.ToString() << std::endl; + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id)) + << "metric_id " << test_id.ToString() << std::endl; + EXPECT_TRUE(counter1.IsRegistered()); + } + EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(test_id)) + << "metric_id " << test_id.ToString() << std::endl; + + { + // with name only + MetricCounter counter2("counter2", {}, true); + test_id = counter2.metric_id_; + + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(counter2.metric_id_)) + << "metric_id " << counter2.metric_id_.ToString() << std::endl; + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id)) + << "metric_id " << test_id.ToString() << std::endl; + EXPECT_TRUE(counter2.IsRegistered()); + } + EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(test_id)) + << "metric_id " << test_id.ToString() << std::endl; + + // with illegal 
label string + ASSERT_THROW(MetricCounter("counter3", "illegal_label_string", {}, true), std::invalid_argument); + + // with empty name + ASSERT_THROW(MetricCounter("", label_str_, {}, true), std::invalid_argument); + ASSERT_THROW(MetricCounter("", {}, true), std::invalid_argument); +} + +TEST_F(MetricCounterTest, CollectTest) { + MetricCounter periodic_counter("periodic", label_str_, {}, true); + MetricCounter nonperiodic_counter("nonperiodic", label_str_, {}, false); + + for (size_t i = 0; i < 3; ++i) { + periodic_counter.Inc(); + nonperiodic_counter.Inc(); + } + EXPECT_EQ(periodic_counter.Get(), 3); + EXPECT_EQ(nonperiodic_counter.Get(), 3); + + // do collect + CollectorReportPublisher::GetInstance().Refresh(); + + EXPECT_EQ(periodic_counter.Get(), 0); + EXPECT_EQ(nonperiodic_counter.Get(), 3); + + periodic_counter.Inc(); + nonperiodic_counter.Inc(); + EXPECT_EQ(periodic_counter.Get(), 1); + EXPECT_EQ(nonperiodic_counter.Get(), 4); +} + +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/test/metric_http_server_test.cc b/src/common/test/metric_http_server_test.cc new file mode 100644 index 000000000..c911b438e --- /dev/null +++ b/src/common/test/metric_http_server_test.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "common/metric/metric_counter.h" +#include "common/metric/metric_http_server.h" +#include "common/metric/collector_report.h" +#include "common/base/string_ext.h" + +namespace tera { + +class MetricHttpServerTest : public ::testing::Test { +public: + virtual void SetUp() { + // register metrics + test_counter = new MetricCounter("counter", {SubscriberType::LATEST}); + server = new MetricHttpServer; + test_counter->Set(1); + } + virtual void TearDown() { + delete test_counter; + delete server; + } + +private: + MetricCounter* test_counter; + MetricHttpServer* server; +}; + +TEST_F(MetricHttpServerTest, BuildType) { + std::string body; + ResponseBodyBuilder::BuildType(&body, "good", "gauge"); + EXPECT_STREQ(body.c_str(), "# TYPE good gauge\n"); + ResponseBodyBuilder::BuildType(&body, "bad", "summary"); + EXPECT_STREQ(body.c_str(), "# TYPE good gauge\n" + "# TYPE bad summary\n"); +} + +TEST_F(MetricHttpServerTest, BuildHelp) { + std::string body; + ResponseBodyBuilder::BuildHelp(&body, "good", "good"); + EXPECT_STREQ(body.c_str(), "# HELP good good\n"); + ResponseBodyBuilder::BuildHelp(&body, "bad", "bad"); + EXPECT_STREQ(body.c_str(), "# HELP good good\n" + "# HELP bad bad\n"); +} + +TEST_F(MetricHttpServerTest, BuildMetricItem) { + CollectorReportPublisher::GetInstance().Refresh(); + auto report = CollectorReportPublisher::GetInstance().GetSubscriberReport(); + + std::string body; + int64_t time_stamp; + + for (const auto& item : *report) { + if (item.first.GetName() == "counter") { + ResponseBodyBuilder::BuildMetricItem(&body, item.first, item.second); + time_stamp = item.second.Time(); + } + } + std::string expect_body = "counter{value_type=\"Latest\"} 1 " + + std::to_string(time_stamp) + "\n"; + + EXPECT_EQ(body, expect_body); + EXPECT_EQ(test_counter->Get(), 0); + test_counter->Set(2); + + 
CollectorReportPublisher::GetInstance().Refresh(); + report = CollectorReportPublisher::GetInstance().GetSubscriberReport(); + + for (const auto& item : *report) { + if (item.first.GetName() == "counter") { + ResponseBodyBuilder::BuildMetricItem(&body, item.first, item.second); + time_stamp = item.second.Time(); + } + } + + expect_body += "counter{value_type=\"Latest\"} 2 " + + std::to_string(time_stamp) + "\n"; + + EXPECT_EQ(body, expect_body); +} + +TEST_F(MetricHttpServerTest, GetResponseBody) { + CollectorReportPublisher::GetInstance().Refresh(); + int64_t timestamp = CollectorReportPublisher::GetInstance().GetCollectorReport()->timestamp_ms; + std::string body = server->GetResponseBody(); + std::vector splited_string; + SplitString(body, "\n", &splited_string); + bool find_counter = false; + for (int idx = 0; idx != splited_string.size(); ++ idx) { + if (splited_string[idx].substr(0, 8) == "counter{") { + find_counter = true; + EXPECT_STREQ(splited_string[idx - 2].c_str(), + "# HELP counter counter"); + EXPECT_STREQ(splited_string[idx - 1].c_str(), + "# TYPE counter gauge"); + std::string expected_line = "counter{value_type=\"Latest\"} 1 " + std::to_string(timestamp); + EXPECT_EQ(expected_line, splited_string[idx]); + } + } + EXPECT_TRUE(find_counter); + EXPECT_EQ(test_counter->Get(), 0); + test_counter->Set(19); + find_counter = false; + + CollectorReportPublisher::GetInstance().Refresh(); + timestamp = CollectorReportPublisher::GetInstance().GetCollectorReport()->timestamp_ms; + body = server->GetResponseBody(); + splited_string.clear(); + SplitString(body, "\n", &splited_string); + for (int idx = 0; idx != splited_string.size(); ++ idx) { + if (splited_string[idx].substr(0, 8) == "counter{") { + find_counter = true; + EXPECT_STREQ(splited_string[idx - 2].c_str(), + "# HELP counter counter"); + EXPECT_STREQ(splited_string[idx - 1].c_str(), + "# TYPE counter gauge"); + std::string expected_line = "counter{value_type=\"Latest\"} 19 " + 
std::to_string(timestamp); + EXPECT_EQ(expected_line, splited_string[idx]); + } + } + + EXPECT_TRUE(find_counter); + EXPECT_EQ(test_counter->Get(), 0); +} +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/test/metric_id_test.cc b/src/common/test/metric_id_test.cc new file mode 100644 index 000000000..ad2795073 --- /dev/null +++ b/src/common/test/metric_id_test.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "common/metric/metric_id.h" + +namespace tera { + +static const std::string kTestMetricName = "test_name"; + +class MetricIdTest : public ::testing::Test { +public: + virtual void SetUp() { + empty_id_ = new MetricId(); + id_with_name_ = new MetricId(kTestMetricName); + + MetricLabels label_map; + label_map.insert(std::make_pair("test_label1", "test_value1")); + label_map.insert(std::make_pair("test_label2", "test_value2")); + label_str_ = "test_label1:test_value1,test_label2:test_value2"; + + id_with_label_ = new MetricId("", label_map); + id_with_name_and_label_ = new MetricId(kTestMetricName, label_map); + } + + virtual void TearDown() { + delete empty_id_; + delete id_with_name_; + delete id_with_label_; + delete id_with_name_and_label_; + } + +private: + MetricId *empty_id_; + MetricId *id_with_name_; + MetricId *id_with_label_; + MetricId *id_with_name_and_label_; + std::string label_str_; +}; + +TEST_F(MetricIdTest, BasicTest) { + // empty id + ASSERT_FALSE(empty_id_->IsValid()); + ASSERT_TRUE(empty_id_->GetName().empty()); + ASSERT_TRUE(empty_id_->GetLabelMap().empty()); + ASSERT_TRUE(empty_id_->ToString().empty()); + ASSERT_TRUE(empty_id_->GetLabel("whatever_label").empty()); + ASSERT_FALSE(empty_id_->ExistLabel("whatever_label")); + 
ASSERT_FALSE(empty_id_->CheckLabel("whatever_label", "whatever_value")); + + // id with name, empty label + ASSERT_TRUE(id_with_name_->IsValid()); + ASSERT_STREQ(id_with_name_->GetName().c_str(), kTestMetricName.c_str()); + ASSERT_TRUE(id_with_name_->GetLabelMap().empty()); + ASSERT_STREQ(id_with_name_->ToString().c_str(), kTestMetricName.c_str()); + ASSERT_TRUE(id_with_name_->GetLabel("whatever_label").empty()); + ASSERT_FALSE(id_with_name_->ExistLabel("whatever_label")); + ASSERT_FALSE(id_with_name_->CheckLabel("whatever_label", "whatever_value")); + + // id with name and label + ASSERT_TRUE(id_with_name_and_label_->IsValid()); + ASSERT_STREQ(id_with_name_and_label_->GetName().c_str(), kTestMetricName.c_str()); + ASSERT_EQ(id_with_name_and_label_->GetLabelMap().size(), 2); + + std::string expected_id_str = kTestMetricName + kNameLabelsDelimiter + label_str_; + ASSERT_STREQ(id_with_name_and_label_->ToString().c_str(), expected_id_str.c_str()); + ASSERT_STREQ(id_with_name_and_label_->GetLabel("test_label1").c_str(), "test_value1"); + ASSERT_TRUE(id_with_name_and_label_->ExistLabel("test_label1")); + ASSERT_TRUE(id_with_name_and_label_->CheckLabel("test_label1", "test_value1")); + + ASSERT_TRUE(id_with_name_and_label_->GetLabel("not_exist_label").empty()); + ASSERT_FALSE(id_with_name_and_label_->ExistLabel("not_exist_label")); + ASSERT_FALSE(id_with_name_and_label_->CheckLabel("not_exist_label", "test_value1")); + ASSERT_FALSE(id_with_name_and_label_->CheckLabel("test_label1", "test_value2")); + + // id with label, empty name + ASSERT_FALSE(id_with_label_->IsValid()); +} + +TEST_F(MetricIdTest, CopyTest) { + // copy id + MetricId copy_id(*id_with_name_and_label_); + ASSERT_TRUE(copy_id.IsValid()); + ASSERT_STREQ(copy_id.GetName().c_str(), id_with_name_and_label_->GetName().c_str()); + ASSERT_EQ(copy_id.GetLabelMap().size(), id_with_name_and_label_->GetLabelMap().size()); + ASSERT_STREQ(copy_id.ToString().c_str(), id_with_name_and_label_->ToString().c_str()); + 
ASSERT_STREQ(copy_id.GetLabel("test_label1").c_str(), "test_value1"); + ASSERT_TRUE(copy_id.ExistLabel("test_label1")); + ASSERT_TRUE(copy_id.CheckLabel("test_label1", "test_value1")); + + ASSERT_TRUE(copy_id.GetLabel("not_exist_label").empty()); + ASSERT_FALSE(copy_id.ExistLabel("not_exist_label")); + ASSERT_FALSE(copy_id.CheckLabel("not_exist_label", "test_value1")); + ASSERT_FALSE(copy_id.CheckLabel("test_label1", "test_value2")); + ASSERT_TRUE(copy_id == *id_with_name_and_label_); + + // assign id + MetricId assign_id; + assign_id = *id_with_name_and_label_; + ASSERT_TRUE(assign_id.IsValid()); + ASSERT_STREQ(assign_id.GetName().c_str(), id_with_name_and_label_->GetName().c_str()); + ASSERT_EQ(assign_id.GetLabelMap().size(), id_with_name_and_label_->GetLabelMap().size()); + ASSERT_STREQ(assign_id.ToString().c_str(), id_with_name_and_label_->ToString().c_str()); + ASSERT_STREQ(assign_id.GetLabel("test_label1").c_str(), "test_value1"); + ASSERT_TRUE(assign_id.ExistLabel("test_label1")); + ASSERT_TRUE(assign_id.CheckLabel("test_label1", "test_value1")); + + ASSERT_TRUE(assign_id.GetLabel("not_exist_label").empty()); + ASSERT_FALSE(assign_id.ExistLabel("not_exist_label")); + ASSERT_FALSE(assign_id.CheckLabel("not_exist_label", "test_value1")); + ASSERT_FALSE(assign_id.CheckLabel("test_label1", "test_value2")); + ASSERT_TRUE(assign_id == *id_with_name_and_label_); +} + +TEST_F(MetricIdTest, BuildTest) { + MetricId test_id; + bool ret = false; + + std::string legal_label_str = LabelStringBuilder() + .Append("test_label1", "test_value1") + .Append("test_label2", "test_value2") + .ToString(); + ASSERT_STREQ(legal_label_str.c_str(), label_str_.c_str()); + + ret = MetricId::ParseFromString(kTestMetricName, legal_label_str, &test_id); + ASSERT_TRUE(ret) << "Parse label string: " << legal_label_str << ", failed" << std::endl; + ASSERT_TRUE(test_id.IsValid()); + ASSERT_STREQ(test_id.GetName().c_str(), kTestMetricName.c_str()); + ASSERT_EQ(test_id.GetLabelMap().size(), 
id_with_name_and_label_->GetLabelMap().size()); + std::string expected_id_str = kTestMetricName + kNameLabelsDelimiter + legal_label_str; + ASSERT_STREQ(test_id.ToString().c_str(), expected_id_str.c_str()); + + std::string single_label_str = LabelStringBuilder() + .Append("test_label1", "test_value1") + .ToString(); + ASSERT_STREQ(single_label_str.c_str(), "test_label1:test_value1"); + ret = MetricId::ParseFromString(kTestMetricName, single_label_str, &test_id); + ASSERT_TRUE(ret) << "Parse label string: " << single_label_str << ", failed" << std::endl; + ASSERT_TRUE(test_id.IsValid()); + ASSERT_STREQ(test_id.GetName().c_str(), kTestMetricName.c_str()); + ASSERT_EQ(test_id.GetLabelMap().size(), 1); + expected_id_str = kTestMetricName + kNameLabelsDelimiter + single_label_str; + ASSERT_STREQ(test_id.ToString().c_str(), expected_id_str.c_str()); + + std::string empty_label_str = LabelStringBuilder().ToString(); + ASSERT_STREQ(empty_label_str.c_str(), ""); + ret = MetricId::ParseFromString(kTestMetricName, empty_label_str, &test_id); + ASSERT_TRUE(ret); + ASSERT_TRUE(test_id.IsValid()); + ASSERT_STREQ(test_id.GetName().c_str(), kTestMetricName.c_str()); + ASSERT_TRUE(test_id.GetLabelMap().empty()); + ASSERT_STREQ(test_id.ToString().c_str(), kTestMetricName.c_str()); + + std::vector illegal_label_str_vec; + illegal_label_str_vec.push_back("haha:hehe,,,,"); + illegal_label_str_vec.push_back("haha:hehe,hoho"); + illegal_label_str_vec.push_back("haha:hehe,hoho:heihei,"); + illegal_label_str_vec.push_back("haha"); + illegal_label_str_vec.push_back(",lalala"); + + for (const std::string& illegal_label : illegal_label_str_vec) { + ret = MetricId::ParseFromString(kTestMetricName, illegal_label, &test_id); + ASSERT_FALSE(ret); + } +} + +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/test/metrics_test.cc b/src/common/test/metrics_test.cc new file mode 100644 index 000000000..7bc5e9abb --- /dev/null +++ 
b/src/common/test/metrics_test.cc @@ -0,0 +1,187 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "common/metric/metric_counter.h" +#include "common/metric/hardware_collectors.h" +#include "common/metric/collector_report_publisher.h" +#include "common/this_thread.h" + +DECLARE_int64(tera_hardware_collect_period_second); + +namespace tera { + +class MetricsTest : public ::testing::Test { +public: + virtual void SetUp() { + // shorter period for test + FLAGS_tera_hardware_collect_period_second = 1; + CollectorReportPublisher::GetInstance().AddHardwareCollectors(); + + label_map_["test_label1"] = "test_value1"; + label_map_["test_label2"] = "test_value2"; + } + + virtual void TearDown() { + CollectorReportPublisher::GetInstance().collectors_.clear(); + label_map_.clear(); + } + +private: + MetricLabels label_map_; +}; + +static void PrintCollectorReportPublisher() { + std::cout << "Print Metric Registry: " << std::endl; + auto& metric_map = CollectorReportPublisher::GetInstance().collectors_; + auto metric_iter = metric_map.begin(); + for (; metric_iter != metric_map.end(); ++metric_iter) { + std::cout << metric_iter->first.ToString() << std::endl; + } +} + +TEST_F(MetricsTest, RegisterTest) { + // hardware metrics + ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstCpuMetricName))); + ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstMemMetricName))); + ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstNetRXMetricName))); + ASSERT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId(kInstNetTXMetricName))); + + bool ret = false; + Counter* test_counters = new Counter[5]; + // register a counter + MetricId 
test_id_1("test_counter", label_map_); + ret = CollectorReportPublisher::GetInstance().AddCollector( + test_id_1, std::unique_ptr(new CounterCollector(&test_counters[0]))); + EXPECT_TRUE(ret); + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_1)); + PrintCollectorReportPublisher(); + + // register a counter with different name + MetricId test_id_2("test_counter_2", label_map_); + ret = CollectorReportPublisher::GetInstance().AddCollector( + test_id_2, std::unique_ptr(new CounterCollector(&test_counters[0]))); + EXPECT_TRUE(ret); + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_2)); + PrintCollectorReportPublisher(); + + // register a counter with name only + ret = CollectorReportPublisher::GetInstance().AddCollector( + MetricId("test_counter3"), std::unique_ptr(new CounterCollector(&test_counters[2]))); + EXPECT_TRUE(ret); + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId("test_counter3"))); + PrintCollectorReportPublisher(); + + // register a counter with same name and different labels + label_map_["test_label2"] = "other_label_value"; + MetricId test_id_4("test_counter", label_map_); + ret = CollectorReportPublisher::GetInstance().AddCollector( + test_id_4, std::unique_ptr(new CounterCollector(&test_counters[3]))); + EXPECT_TRUE(ret); + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_4)); + PrintCollectorReportPublisher(); + + // register a counter with same id + ret = CollectorReportPublisher::GetInstance().AddCollector( + test_id_1, std::unique_ptr(new CounterCollector(&test_counters[4]))); + EXPECT_FALSE(ret); + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_1)); + PrintCollectorReportPublisher(); + + ret = CollectorReportPublisher::GetInstance().AddCollector( + MetricId("test_counter3"), std::unique_ptr(new CounterCollector(&test_counters[4]))); + EXPECT_FALSE(ret); + 
EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(MetricId("test_counter3"))); + PrintCollectorReportPublisher(); + + // unregister + ret = CollectorReportPublisher::GetInstance().DeleteCollector(test_id_1); + EXPECT_TRUE(ret); + EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(test_id_1)); + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_2)); + + ret = CollectorReportPublisher::GetInstance().DeleteCollector(MetricId("test_counter3")); + EXPECT_TRUE(ret); + EXPECT_FALSE(CollectorReportPublisher::GetInstance().HasCollector(MetricId("test_counter3"))); + EXPECT_TRUE(CollectorReportPublisher::GetInstance().HasCollector(test_id_2)); + + MetricId not_registered_id("not_registered_name", label_map_); + ret = CollectorReportPublisher::GetInstance().DeleteCollector(not_registered_id); + EXPECT_FALSE(ret); + + label_map_["test_label2"] = "not_registered_value"; + MetricId not_registered_id_2("test_counter", label_map_); + ret = CollectorReportPublisher::GetInstance().DeleteCollector(not_registered_id_2); + EXPECT_FALSE(ret); + + ret = CollectorReportPublisher::GetInstance().DeleteCollector(MetricId("not_registered_name")); + EXPECT_FALSE(ret); + + delete[] test_counters; +} + +TEST_F(MetricsTest, ReportTest) { + // check report cache + int64_t value = 0; + + // register 2 counter + std::string label_str = LabelStringBuilder() + .Append("test_label1", "test_value1") + .Append("test_label2", "test_value2") + .ToString(); + MetricCounter periodic_counter("periodic", label_str, {}, true); + MetricCounter nonperiodic_counter("nonperiodic", label_str, {}, false); + + for (size_t i = 0; i < 3; ++i) { + periodic_counter.Inc(); + nonperiodic_counter.Inc(); + } + EXPECT_EQ(periodic_counter.Get(), 3); + EXPECT_EQ(nonperiodic_counter.Get(), 3); + + // do collect + ThisThread::Sleep(10); + + CollectorReportPublisher::GetInstance().Refresh(); + std::shared_ptr report = CollectorReportPublisher::GetInstance().GetCollectorReport(); 
+ + EXPECT_EQ(periodic_counter.Get(), 0); + EXPECT_EQ(nonperiodic_counter.Get(), 3); + + // check report + EXPECT_EQ(report->report.size(), CollectorReportPublisher::GetInstance().collectors_.size()); + value = report->FindMetricValue("periodic", label_str); + EXPECT_EQ(value, 3); + value = report->FindMetricValue("nonperiodic", label_str); + EXPECT_EQ(value, 3); + + // change counter value + periodic_counter.Inc(); + nonperiodic_counter.Dec(); + EXPECT_EQ(periodic_counter.Get(), 1); + EXPECT_EQ(nonperiodic_counter.Get(), 2); + + // report again + CollectorReportPublisher::GetInstance().Refresh(); + report = CollectorReportPublisher::GetInstance().GetCollectorReport(); + EXPECT_EQ(periodic_counter.Get(), 0); + EXPECT_EQ(nonperiodic_counter.Get(), 2); + + value = report->FindMetricValue("periodic", label_str); + EXPECT_EQ(value, 1); + value = report->FindMetricValue("nonperiodic", label_str); + EXPECT_EQ(value, 2); +} + +} // end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/test/profiler_test.cc b/src/common/test/profiler_test.cc new file mode 100644 index 000000000..623d1c0f4 --- /dev/null +++ b/src/common/test/profiler_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include + +#include +#include + +#include "gtest/gtest.h" + +#include "common/cpu_profiler.h" +#include "common/heap_profiler.h" +#include "common/this_thread.h" + +namespace tera { + +class ProfilerTest : public ::testing::Test { +public: + virtual void SetUp() {} + + virtual void TearDown() {} + +private: + CpuProfiler cpu_profiler_; + HeapProfiler heap_profiler_; +}; + +TEST_F(ProfilerTest, SetEnableTest) { + ProfilerState ps; + EXPECT_FALSE(cpu_profiler_.enable_); + EXPECT_FALSE(heap_profiler_.enable_); + ProfilerGetCurrentState(&ps); + EXPECT_FALSE(ps.enabled); + EXPECT_FALSE(IsHeapProfilerRunning()); + + cpu_profiler_.SetProfilerFile("Cpu") + .SetEnable(true); + + heap_profiler_.SetProfilerFile("Heap") + .SetEnable(true); + + EXPECT_TRUE(cpu_profiler_.enable_); + EXPECT_TRUE(heap_profiler_.enable_); + + ThisThread::Sleep(2000); + ProfilerGetCurrentState(&ps); + EXPECT_TRUE(ps.enabled); + EXPECT_TRUE(IsHeapProfilerRunning()); + + cpu_profiler_.SetEnable(false); + heap_profiler_.SetEnable(false); + + EXPECT_FALSE(cpu_profiler_.enable_); + EXPECT_FALSE(heap_profiler_.enable_); + + ThisThread::Sleep(2000); + ProfilerGetCurrentState(&ps); + EXPECT_FALSE(ps.enabled); + EXPECT_FALSE(IsHeapProfilerRunning()); +} + +TEST_F(ProfilerTest, SetInvervalTest) { + EXPECT_EQ(cpu_profiler_.interval_, std::chrono::seconds(10)); + EXPECT_EQ(heap_profiler_.interval_, std::chrono::seconds(10)); + cpu_profiler_.SetInterval(1000); + heap_profiler_.SetInterval(2000); + EXPECT_EQ(cpu_profiler_.interval_, std::chrono::seconds(1000)); + EXPECT_EQ(heap_profiler_.interval_, std::chrono::seconds(2000)); +} + +TEST_F(ProfilerTest, SetProfilerFileTest) { + EXPECT_EQ(cpu_profiler_.profiler_file_, std::string("")); + EXPECT_EQ(heap_profiler_.profiler_file_, std::string("")); + cpu_profiler_.SetProfilerFile("Good"); + heap_profiler_.SetProfilerFile("Bad"); + EXPECT_EQ(cpu_profiler_.profiler_file_, std::string("Good")); + EXPECT_EQ(heap_profiler_.profiler_file_, std::string("Bad")); +} +} // 
end namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git a/src/common/test/thread_pool_test.cc b/src/common/test/thread_pool_test.cc index 6c1e421cb..8462b78a5 100644 --- a/src/common/test/thread_pool_test.cc +++ b/src/common/test/thread_pool_test.cc @@ -56,7 +56,7 @@ TEST(TimerTest, test1) { clock_gettime(CLOCK_REALTIME, &ts1); gettimeofday(&tv, NULL); - int64_t ts = common::timer::get_micros(); + int64_t ts = get_micros(); int delta = 0; delta = ts1.tv_sec - tv.tv_sec; diff --git a/src/common/timer.h b/src/common/timer.h index 1b335bb6b..b035e18c9 100644 --- a/src/common/timer.h +++ b/src/common/timer.h @@ -1,18 +1,31 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +#pragma once +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // // Author: yanshiguang02@baidu.com -#ifndef TERA_COMMON_TIMER_H_ -#define TERA_COMMON_TIMER_H_ #include #include #include +#include -namespace common { -namespace timer { +namespace tera{ + +static inline int64_t get_timestamp_from_str(const std::string& time) { + struct tm tm; + memset(&tm, 0, sizeof(tm)); + + sscanf(time.c_str(), "%4d%2d%2d-%d:%d:%d", + &tm.tm_year, &tm.tm_mon, &tm.tm_mday, + &tm.tm_hour, &tm.tm_min, &tm.tm_sec); + + tm.tm_year -= 1900; + tm.tm_mon--; + + return mktime(&tm); +} static inline std::string get_time_str(int64_t timestamp) { struct tm tt; @@ -26,12 +39,24 @@ static inline std::string get_curtime_str() { return get_time_str(time(NULL)); } +static inline std::string get_curtime_str_plain() { + struct tm tt; + char buf[20]; + time_t t = time(NULL); + strftime(buf, 20, "%Y%m%d%H%M%S", localtime_r(&t, &tt)); + return std::string(buf); +} + static inline int64_t get_micros() { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); return static_cast(ts.tv_sec) * 1000000 + static_cast(ts.tv_nsec) / 1000; } +static inline int64_t get_millis() { + return 
get_micros() / 1000; +} + static inline int64_t get_unique_micros(int64_t ref) { int64_t now; do { @@ -40,7 +65,12 @@ static inline int64_t get_unique_micros(int64_t ref) { return now; } -} // namespace timer -} // namespace common +static inline int64_t GetTimeStampInUs() { + return get_micros(); +} + +static inline int64_t GetTimeStampInMs() { + return get_millis(); +} +} -#endif // TERA_COMMON_TIMER_H_ diff --git a/src/io/default_compact_strategy.cc b/src/io/default_compact_strategy.cc index b667b8e6e..4e34a6060 100644 --- a/src/io/default_compact_strategy.cc +++ b/src/io/default_compact_strategy.cc @@ -256,6 +256,7 @@ bool DefaultCompactStrategy::InternalMergeProcess(leveldb::Iterator* it, } bool DefaultCompactStrategy::ScanDrop(const Slice& tera_key, uint64_t n) { + bool key_col_qual_same = false; Slice key, col, qual; int64_t ts = -1; leveldb::TeraKeyType type; @@ -345,6 +346,7 @@ bool DefaultCompactStrategy::ScanDrop(const Slice& tera_key, uint64_t n) { } return true; } else { + key_col_qual_same = true; last_type_ = type; } @@ -362,8 +364,7 @@ bool DefaultCompactStrategy::ScanDrop(const Slice& tera_key, uint64_t n) { CHECK(cf_id >= 0) << "illegel column family"; if (type == leveldb::TKT_VALUE) { - if (cur_ts_ == last_ts_ && last_qual_ == qual.ToString() && - last_col_ == col.ToString() && last_key_ == key.ToString()) { + if (cur_ts_ == last_ts_ && key_col_qual_same) { // this is the same key, do not chang version num } else { version_num_++; diff --git a/src/io/tablet_io.cc b/src/io/tablet_io.cc index 81222e447..de97994c7 100644 --- a/src/io/tablet_io.cc +++ b/src/io/tablet_io.cc @@ -27,11 +27,14 @@ #include "leveldb/filter_policy.h" #include "leveldb/raw_key_operator.h" #include "types.h" -#include "utils/counter.h" +#include "common/counter.h" #include "utils/scan_filter.h" #include "utils/string_util.h" -#include "utils/timer.h" +#include "common/timer.h" #include "utils/utils_cmd.h" +#include "common/metric/prometheus_subscriber.h" +#include 
"common/metric/ratio_subscriber.h" +#include "tabletnode/tabletnode_metric_name.h" DECLARE_string(tera_leveldb_env_type); DECLARE_int64(tera_tablet_log_file_size); @@ -69,11 +72,47 @@ DECLARE_bool(tera_tablet_use_memtable_on_leveldb); DECLARE_int64(tera_tablet_memtable_ldb_write_buffer_size); DECLARE_int64(tera_tablet_memtable_ldb_block_size); -tera::Counter row_read_delay; +DECLARE_bool(tera_leveldb_ignore_corruption_in_open); +DECLARE_int32(tera_leveldb_slow_down_level0_score_limit); +DECLARE_int32(tera_leveldb_max_background_compactions); +DECLARE_int32(tera_tablet_max_sub_parallel_compaction); namespace tera { namespace io { +using tera::tabletnode::kRowDelayMetric; +using tera::tabletnode::kRowCountMetric; +using tera::tabletnode::kRowThroughPutMetric; + +using tera::tabletnode::kApiLabelRead; +using tera::tabletnode::kApiLabelScan; +using tera::tabletnode::kApiLabelWrite; + +using tera::tabletnode::kLowLevelReadMetric; + +tera::MetricCounter low_level_read_count(kLowLevelReadMetric, {SubscriberType::QPS}); + +tera::MetricCounter row_read_delay(kRowDelayMetric, kApiLabelRead, {}); +tera::MetricCounter row_read_count(kRowCountMetric, kApiLabelRead, {SubscriberType::QPS}); +tera::MetricCounter row_read_bytes(kRowThroughPutMetric, kApiLabelRead, {SubscriberType::THROUGHPUT}); + +tera::MetricCounter row_scan_delay(kRowDelayMetric, kApiLabelScan, {}); +tera::MetricCounter row_scan_count(kRowCountMetric, kApiLabelScan, {SubscriberType::QPS}); +tera::MetricCounter row_scan_bytes(kRowThroughPutMetric, kApiLabelScan, {SubscriberType::THROUGHPUT}); + +tera::MetricCounter row_write_bytes(kRowThroughPutMetric, kApiLabelWrite, {SubscriberType::THROUGHPUT}); + +tera::AutoSubscriberRegister row_read_delay_per_row(std::unique_ptr(new tera::RatioSubscriber( + MetricId("tera_ts_row_read_delay_us_per_row"), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRowDelayMetric, kApiLabelRead), SubscriberType::SUM)), + std::unique_ptr(new 
tera::PrometheusSubscriber(MetricId(kRowCountMetric, kApiLabelRead), SubscriberType::SUM))))); + +tera::AutoSubscriberRegister row_scan_delay_per_row(std::unique_ptr(new tera::RatioSubscriber( + MetricId("tera_ts_row_scan_delay_us_per_row"), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRowDelayMetric, kApiLabelScan), SubscriberType::SUM)), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRowCountMetric, kApiLabelScan), SubscriberType::SUM))))); + + std::ostream& operator << (std::ostream& o, const TabletIO& tablet_io) { o << tablet_io.short_path_ << " [" << DebugString(tablet_io.start_key_) @@ -81,6 +120,17 @@ std::ostream& operator << (std::ostream& o, const TabletIO& tablet_io) { return o; } +std::string MetricLabelToString(const std::string& tablet_path) { + size_t sep_pos = tablet_path.find_last_of("/"); + if (sep_pos == std::string::npos) { + // meta tablet + return LabelStringBuilder().Append("table", tablet_path).Append("tablet", tablet_path).ToString(); + } else { + std::string table_name = tablet_path.substr(0, sep_pos); + return LabelStringBuilder().Append("table", table_name).Append("tablet", tablet_path).ToString(); + } +} + TabletIO::TabletIO(const std::string& key_start, const std::string& key_end, const std::string& path) : async_writer_(NULL), @@ -90,10 +140,12 @@ TabletIO::TabletIO(const std::string& key_start, const std::string& key_end, short_path_(path), compact_status_(kTableNotCompact), status_(kNotInit), + tablet_status_(static_cast(kTabletReady)), ref_count_(1), db_ref_count_(0), db_(NULL), m_memory_cache(NULL), kv_only_(false), key_operator_(NULL), + counter_(short_path_), mock_env_(NULL) { } @@ -138,6 +190,10 @@ std::string TabletIO::GetEndKey() const { return end_key_; } +const std::string& TabletIO::GetMetricLabel() const { + return counter_.label; +} + CompactStatus TabletIO::GetCompactStatus() const { return compact_status_; } @@ -167,6 +223,7 @@ void TabletIO::SetMemoryCache(leveldb::Cache* cache) { bool 
TabletIO::Load(const TableSchema& schema, const std::string& path, const std::vector& parent_tablets, + const std::set& ignore_err_lgs, std::map snapshots, std::map rollbacks, leveldb::Logger* logger, @@ -226,6 +283,7 @@ bool TabletIO::Load(const TableSchema& schema, ldb_options_.key_start = raw_start_key_; ldb_options_.key_end = raw_end_key_; ldb_options_.l0_slowdown_writes_trigger = FLAGS_tera_tablet_level0_file_limit; + ldb_options_.max_sub_parallel_compaction = FLAGS_tera_tablet_max_sub_parallel_compaction; ldb_options_.ttl_percentage = FLAGS_tera_tablet_ttl_percentage; ldb_options_.del_percentage = FLAGS_tera_tablet_del_percentage; ldb_options_.block_size = FLAGS_tera_tablet_write_block_size * 1024; @@ -234,6 +292,9 @@ bool TabletIO::Load(const TableSchema& schema, ldb_options_.log_async_mode = FLAGS_tera_log_async_mode; ldb_options_.info_log = logger; ldb_options_.max_open_files = FLAGS_tera_memenv_table_cache_size; + ldb_options_.max_background_compactions = FLAGS_tera_leveldb_max_background_compactions; + ldb_options_.slow_down_level0_score_limit = FLAGS_tera_leveldb_slow_down_level0_score_limit; + ldb_options_.ignore_corruption_in_open = FLAGS_tera_leveldb_ignore_corruption_in_open; ldb_options_.use_memtable_on_leveldb = FLAGS_tera_tablet_use_memtable_on_leveldb; ldb_options_.memtable_ldb_write_buffer_size = @@ -277,7 +338,7 @@ bool TabletIO::Load(const TableSchema& schema, ldb_options_.ignore_corruption_in_compaction = FLAGS_tera_leveldb_ignore_corruption_in_compaction; ldb_options_.use_file_lock = FLAGS_tera_leveldb_use_file_lock; ldb_options_.disable_wal = table_schema_.disable_wal(); - SetupOptionsForLG(); + SetupOptionsForLG(ignore_err_lgs); std::string path_prefix = FLAGS_tera_tabletnode_path_prefix; if (*path_prefix.rbegin() != '/') { @@ -328,6 +389,23 @@ bool TabletIO::Load(const TableSchema& schema, return true; } +bool TabletIO::ShouldForceUnloadOnError() { + { + MutexLock lock(&mutex_); + if (status_ != kReady) { + return false; + } + 
db_ref_count_++; + } + // If TabletIO is Ready but has encountered some fatal errors + bool ret = db_->ShouldForceUnloadOnError(); + { + MutexLock lock(&mutex_); + db_ref_count_--; + } + return ret; +} + bool TabletIO::Unload(StatusCode* status) { { MutexLock lock(&mutex_); @@ -341,7 +419,6 @@ bool TabletIO::Unload(StatusCode* status) { LOG(INFO) << "[Unload] start shutdown1 " << tablet_path_; leveldb::Status s = db_->Shutdown1(); - { MutexLock lock(&mutex_); status_ = kUnLoading2; @@ -566,13 +643,13 @@ bool TabletIO::IsBusy() { db_ref_count_++; } bool is_busy = db_->BusyWrite(); + is_busy = is_busy ? true : async_writer_->IsBusy(); { MutexLock lock(&mutex_); db_ref_count_--; } return is_busy; } - bool TabletIO::Workload(double* write_workload) { { MutexLock lock(&mutex_); @@ -581,7 +658,14 @@ bool TabletIO::Workload(double* write_workload) { } db_ref_count_++; } + + // if busy cause by write log, set workload score more than 10, because level 0 + // limits to 20 sst files by default, which score is 10. 
db_->Workload(write_workload); + if (*write_workload < 10.618 && async_writer_->IsBusy()) { + *write_workload = 10.618; + } + { MutexLock lock(&mutex_); db_ref_count_--; @@ -700,6 +784,7 @@ bool TabletIO::LowLevelScan(const std::string& start_tera_key, ScanContext* context = new ScanContext; context->compact_strategy = ldb_options_.compact_strategy_factory->NewInstance(); context->version_num = 1; + context->qu_num = 1; bool ret = LowLevelScan(start_tera_key, end_row_key, scan_options, it, context, value_list, next_start_point, read_row_count, read_bytes, is_complete, status); @@ -849,6 +934,7 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, std::string& last_col = scan_context->last_col; std::string& last_qual = scan_context->last_qual; uint32_t& version_num = scan_context->version_num; + uint64_t& qu_num = scan_context->qu_num; std::list row_buf; uint32_t buffer_size = 0; @@ -861,13 +947,18 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, KeyValuePair next_start_kv_pair; VLOG(9) << "ll-scan timeout set to be " << scan_options.timeout << ", start_tera_key " << DebugString(start_tera_key) - << ", end_row_key " << DebugString(end_row_key); + << ", end_row_key " << DebugString(end_row_key) + << ", max_size " << scan_options.max_size + << ", number_limit " << scan_options.number_limit + << ", max_versions " << scan_options.max_versions + << ", max_qualifiers " << scan_options.max_qualifiers; *is_complete = false; for (; it->Valid();) { bool has_merged = false; std::string merged_value; counter_.low_read_cell.Inc(); + low_level_read_count.Inc(); *read_bytes += it->key().size() + it->value().size(); now_time = GetTimeStampInMs(); @@ -886,7 +977,21 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, << "] key=[" << DebugString(key.ToString()) << "] column=[" << DebugString(col.ToString()) << ":" << DebugString(qual.ToString()) - << "] ts=[" << ts << "] type=[" << type << "]"; + << "] ts=[" << ts << "] 
type=[" << type << "]" + << " buffer_size=[" << buffer_size << "]" + << " number_limit=[" << number_limit << "]" + << " read_bytes=[" << *read_bytes << "]" + << " qu_num=[" << qu_num << "]"; + + if (now_time > time_out) { + VLOG(9) << "ll-scan timeout, now_time: " << now_time << ", time_out: " << time_out; + if (next_start_point != NULL) { + VLOG(9) << "Mark next start key: " << DebugString(tera_key.ToString()); + MakeKvPair(key, col, qual, ts, "", next_start_point); + } + SetStatusCode(kRPCTimeout, status); + break; + } if (end_row_key.size() && key.compare(end_row_key) >= 0) { // scan finished @@ -932,15 +1037,8 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, *read_row_count += 1; ProcessRowBuffer(row_buf, scan_options, value_list, &buffer_size, &number_limit); row_buf.clear(); - - if (now_time > time_out && (next_start_point != NULL)) { - VLOG(9) << "ll-scan timeout. Mark next start key: " << DebugString(tera_key.ToString()); - MakeKvPair(key, col, qual, ts, "", next_start_point); - break; - } } - // max version filter if (key.compare(last_key) == 0 && col.compare(last_col) == 0 && qual.compare(last_qual) == 0) { @@ -949,6 +1047,16 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, continue; } } else { + if (key.compare(last_key) == 0 && col.compare(last_col) == 0 ) { + if (++qu_num > scan_options.max_qualifiers) { + VLOG(10) << "max_qualifiers triggered, max_qualifiers: " << scan_options.max_qualifiers; + it->Next(); + continue; + } + } else { + qu_num = 1; + } + last_key.assign(key.data(), key.size()); last_col.assign(col.data(), col.size()); last_qual.assign(qual.data(), qual.size()); @@ -957,6 +1065,7 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, has_merged = compact_strategy->ScanMergedValue(it, &merged_value, &merged_num); if (has_merged) { counter_.low_read_cell.Add(merged_num - 1); + low_level_read_count.Add(merged_num - 1); value = merged_value; key = last_key; col = last_col; @@ 
-977,7 +1086,9 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, // check scan buffer if (buffer_size >= scan_options.max_size || number_limit >= scan_options.number_limit) { - VLOG(10) << "stream scan, break scan context, version_num " << version_num + VLOG(10) << "stream scan, break scan context" + <<", buffer_size " << buffer_size + <<", number_limit " << number_limit << ", key " << DebugString(key.ToString()) << ", col " << DebugString(col.ToString()) << ", qual " << DebugString(qual.ToString()); it->Next(); @@ -1000,6 +1111,9 @@ inline bool TabletIO::LowLevelScan(const std::string& start_tera_key, ProcessRowBuffer(row_buf, scan_options, value_list, &buffer_size, &number_limit); } + if (*status == kRPCTimeout) { + return false; + } if (!it->Valid() && !(it->status().ok())) { SetStatusCode(it->status(), status); VLOG(10) << "ll-scan fail: " << "tablet=[" << tablet_path_ << "], " @@ -1054,6 +1168,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, leveldb::TKT_FORSEEK, &row_seek_key); it_data->Seek(row_seek_key); counter_.low_read_cell.Inc(); + low_level_read_count.Inc(); if (it_data->Valid()) { VLOG(10) << "ll-seek: " << "tablet=[" << tablet_path_ << "] row_key=[" << row_key << "]"; @@ -1087,6 +1202,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, leveldb::TKT_FORSEEK, &cf_seek_key); it_data->Seek(cf_seek_key); counter_.low_read_cell.Inc(); + low_level_read_count.Inc(); if (it_data->Valid()) { VLOG(10) << "ll-seek: " << "tablet=[" << tablet_path_ << "] row_key=[" << row_key @@ -1122,6 +1238,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, uint32_t version_num = 0; for (; it_data->Valid();) { counter_.low_read_cell.Inc(); + low_level_read_count.Inc(); VLOG(10) << "ll-seek: " << "tablet=[" << tablet_path_ << "] row_key=[" << row_key << "] cf=[" << cf_name << "] qu=[" << qu_name << "]"; @@ -1134,7 +1251,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, break; } - // skip qu delete mark + // skip qu 
delete mark and out-of-range version if (compact_strategy->ScanDrop(it_data->key(), 0)) { VLOG(10) << "ll-seek: scan drop " << "tablet=[" << tablet_path_ << "] row_key=[" << row_key << "] cf=[" << cf_name @@ -1143,6 +1260,14 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, continue; } + if (scan_options.ts_start > timestamp) { + break; + } + if (scan_options.ts_end < timestamp) { + it_data->Next(); + continue; + } + // version filter if (++version_num > scan_options.max_versions) { break; @@ -1160,6 +1285,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, compact_strategy->ScanMergedValue(it_data, &merged_value, &merged_num); if (has_merged) { counter_.low_read_cell.Add(merged_num - 1); + low_level_read_count.Add(merged_num - 1); kv->set_value(merged_value); VLOG(10) << "ll-seek merge: " << "key=[" << DebugString(row_key) << "] column=[" << DebugString(cf_name) @@ -1188,7 +1314,7 @@ bool TabletIO::LowLevelSeek(const std::string& row_key, } bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, - uint64_t snapshot_id, StatusCode* status) { + uint64_t snapshot_id, StatusCode* status, int64_t timeout_ms) { { MutexLock lock(&mutex_); if (status_ != kReady && status_ != kOnSplit @@ -1205,7 +1331,7 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, db_ref_count_++; } - int64_t read_ms = get_micros(); + int64_t start_read_us = get_micros(); if (kv_only_) { std::string key(row_reader.key()); @@ -1215,7 +1341,8 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, } if (!Read(key, &value, snapshot_id, status)) { counter_.read_rows.Inc(); - row_read_delay.Add(get_micros() - read_ms); + row_read_count.Inc(); + row_read_delay.Add(get_micros() - start_read_us); { MutexLock lock(&mutex_); db_ref_count_--; @@ -1226,8 +1353,10 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, result->set_key(row_reader.key()); result->set_value(value); 
counter_.read_rows.Inc(); + row_read_count.Inc(); counter_.read_size.Add(result->ByteSize()); - row_read_delay.Add(get_micros() - read_ms); + row_read_bytes.Add(result->ByteSize()); + row_read_delay.Add(get_micros() - start_read_us); { MutexLock lock(&mutex_); db_ref_count_--; @@ -1258,12 +1387,23 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, if (row_reader.has_max_version()) { scan_options.max_versions = row_reader.max_version(); } + + if (row_reader.has_max_qualifiers()) { + scan_options.max_qualifiers = row_reader.max_qualifiers(); + } else { + scan_options.max_qualifiers = std::numeric_limits::max(); + } + if (row_reader.has_time_range()) { scan_options.ts_start = row_reader.time_range().ts_start(); scan_options.ts_end = row_reader.time_range().ts_end(); + VLOG(10) << "ReadCells: " << "timerange=[" << scan_options.ts_start + << "," << scan_options.ts_end << "]"; } scan_options.snapshot_id = snapshot_id; + scan_options.timeout = timeout_ms; + VLOG(10) << "ReadCells: " << "key=[" << DebugString(row_reader.key()) << "]"; @@ -1284,7 +1424,8 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, &is_complete, status); } counter_.read_rows.Inc(); - row_read_delay.Add(get_micros() - read_ms); + row_read_count.Inc(); + row_read_delay.Add(get_micros() - start_read_us); { MutexLock lock(&mutex_); db_ref_count_--; @@ -1293,6 +1434,7 @@ bool TabletIO::ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, return false; } else { counter_.read_size.Add(value_list->ByteSize()); + row_read_bytes.Add(value_list->ByteSize()); } if (value_list->key_values_size() == 0) { @@ -1310,7 +1452,6 @@ bool TabletIO::WriteBatch(leveldb::WriteBatch* batch, bool disable_wal, bool syn CHECK_NOTNULL(db_); - counter_.write_size.Add(batch->DataSize()); leveldb::Status db_status = db_->Write(options, batch); if (!db_status.ok()) { LOG(ERROR) << "fail to batch write to tablet: " << tablet_path_ @@ -1318,6 +1459,8 @@ bool 
TabletIO::WriteBatch(leveldb::WriteBatch* batch, bool disable_wal, bool syn SetStatusCode(kIOError, status); return false; } + counter_.write_size.Add(batch->DataSize()); + row_write_bytes.Add(batch->DataSize()); SetStatusCode(kTabletNodeOk, status); return true; } @@ -1349,6 +1492,10 @@ bool TabletIO::Write(std::vector* row_mutation_vec, } bool ret = async_writer_->Write(row_mutation_vec, status_vec, is_instant, callback, status); + if (!ret) { + counter_.write_reject_rows.Add(row_mutation_vec->size()); + } + { MutexLock lock(&mutex_); db_ref_count_--; @@ -1426,12 +1573,18 @@ bool TabletIO::ScanRowsRestricted(const ScanTabletRequest* request, StatusCode status = kTabletNodeOk; bool ret = false; + + int64_t start_scan_us = get_micros(); + if (LowLevelScan(start_tera_key, end_row_key, scan_options, response->mutable_results(), response->mutable_next_start_point(), &read_row_count, &read_bytes, &is_complete, &status)) { response->set_complete(is_complete); counter_.scan_rows.Add(read_row_count); counter_.scan_size.Add(read_bytes); + row_scan_count.Add(read_row_count); + row_scan_bytes.Add(read_bytes); + row_scan_delay.Add(get_micros() - start_scan_us); ret = true; } @@ -1464,17 +1617,26 @@ bool TabletIO::HandleScan(const ScanTabletRequest* request, void TabletIO::ProcessScan(ScanContext* context) { uint32_t rows_scan_num = 0; uint32_t size_scan_bytes = 0; + + int64_t start_scan_us = get_micros(); + if (LowLevelScan(context->start_tera_key, context->end_row_key, context->scan_options, context->it, context, context->result, NULL, &rows_scan_num, &size_scan_bytes, &context->complete, &context->ret_code)) { counter_.scan_rows.Add(rows_scan_num); counter_.scan_size.Add(size_scan_bytes); + row_scan_count.Add(rows_scan_num); + row_scan_bytes.Add(size_scan_bytes); + row_scan_delay.Add(get_micros() - start_scan_us); } } bool TabletIO::Scan(const ScanOption& option, KeyValueList* kv_list, bool* complete, StatusCode* status) { + + int64_t start_scan_us = get_micros(); + 
std::string start = option.key_range().key_start(); std::string end = option.key_range().key_end(); if (start < start_key_) { @@ -1558,8 +1720,13 @@ bool TabletIO::Scan(const ScanOption& option, KeyValueList* kv_list, if (!it->Valid()) { *complete = true; } + counter_.scan_rows.Add(kv_list->size()); counter_.scan_size.Add(pack_size); + row_scan_count.Add(kv_list->size()); + row_scan_bytes.Add(pack_size); + row_scan_delay.Add(get_micros() - start_scan_us); + delete it; delete strategy; @@ -1618,6 +1785,11 @@ void TabletIO::SetupScanRowOptions(const ScanTabletRequest* request, if (request->has_max_version()) { scan_options->max_versions = request->max_version(); } + if (request->has_max_qualifiers()) { + scan_options->max_qualifiers = request->max_qualifiers(); + } else { + scan_options->max_qualifiers = std::numeric_limits::max(); + } if (request->has_timerange()) { scan_options->ts_start = request->timerange().ts_start(); scan_options->ts_end = request->timerange().ts_end(); @@ -1635,7 +1807,7 @@ void TabletIO::SetupScanRowOptions(const ScanTabletRequest* request, } // no concurrent, so no lock on schema_mutex_ -void TabletIO::SetupOptionsForLG() { +void TabletIO::SetupOptionsForLG(const std::set& ignore_err_lgs) { if (kv_only_) { if (RawKeyType() == TTLKv) { ldb_options_.compact_strategy_factory = @@ -1656,6 +1828,7 @@ void TabletIO::SetupOptionsForLG() { std::set* exist_lg_list = new std::set; std::map* lg_info_list = new std::map; + std::set ignore_corruption_in_open_lg_list; int64_t triggered_log_size = 0; for (int32_t lg_i = 0; lg_i < table_schema_.locality_groups_size(); @@ -1721,6 +1894,9 @@ void TabletIO::SetupOptionsForLG() { triggered_log_size += lg_info->write_buffer_size; exist_lg_list->insert(lg_i); (*lg_info_list)[lg_i] = lg_info; + if (ignore_err_lgs.find(lg_schema.name()) != ignore_err_lgs.end()) { + ignore_corruption_in_open_lg_list.insert(lg_i); + } } if (mock_env_ != NULL) { ldb_options_.env = LeveldbMockEnv(); @@ -1738,6 +1914,8 @@ void 
TabletIO::SetupOptionsForLG() { delete lg_info_list; } else { ldb_options_.lg_info_list = lg_info_list; + ldb_options_.ignore_corruption_in_open_lg_list + = ignore_corruption_in_open_lg_list; } IndexingCfToLG(); @@ -1994,23 +2172,6 @@ const leveldb::RawKeyOperator* TabletIO::GetRawKeyOperator() { return key_operator_; } -void TabletIO::GetAndClearCounter(TabletCounter* counter) { - counter->set_low_read_cell(counter_.low_read_cell.Clear()); - counter->set_scan_rows(counter_.scan_rows.Clear()); - counter->set_scan_kvs(counter_.scan_kvs.Clear()); - counter->set_scan_size(counter_.scan_size.Clear()); - counter->set_read_rows(counter_.read_rows.Clear()); - counter->set_read_kvs(counter_.read_kvs.Clear()); - counter->set_read_size(counter_.read_size.Clear()); - counter->set_write_rows(counter_.write_rows.Clear()); - counter->set_write_kvs(counter_.write_kvs.Clear()); - counter->set_write_size(counter_.write_size.Clear()); - counter->set_is_on_busy(IsBusy()); - double write_workload = 0; - Workload(&write_workload); - counter->set_write_workload(write_workload); -} - int32_t TabletIO::AddRef() { MutexLock lock(&mutex_); ++ref_count_; @@ -2040,6 +2201,36 @@ void TabletIO::ApplySchema(const TableSchema& schema) { ldb_options_.compact_strategy_factory->SetArg(&schema); } +bool TabletIO::PutIfAbsentCheck(const std::string& row_key, + const Mutation& mutation) { + RowResult value_list; + ScanOptions scan_options; + std::set& qualifier_list = scan_options.column_family_list[mutation.family()]; + qualifier_list.insert(mutation.qualifier()); + scan_options.iter_cf_set.insert(mutation.family()); + scan_options.max_versions = 1; + StatusCode status; + if (!LowLevelSeek(row_key, scan_options, &value_list, &status)) { + if (status == kKeyNotExist) { + return true; + } + VLOG(9) << "txn of row (PutIfAbsent) " << DebugString(row_key) + << ":" << DebugString(mutation.family()) + << ":" << DebugString(mutation.qualifier()) + << " is interrupted: lowlevelseek fail"; + return false; + } + 
+ if (value_list.key_values_size() > 0) { + VLOG(9) << "txn of row (PutIfAbsent) " << DebugString(row_key) + << ":" << DebugString(mutation.family()) + << ":" << DebugString(mutation.qualifier()) + << " is interrupted: already exist"; + return false; + } + return true; +} + bool TabletIO::SingleRowTxnCheck(const std::string& row_key, const SingleRowTxnReadInfo& txn_read_info, StatusCode* status) { @@ -2098,5 +2289,33 @@ bool TabletIO::SingleRowTxnCheck(const std::string& row_key, return true; } +bool TabletIO::GetDBStatus(tera::TabletStatus* tablet_status, bool slow_check) { + *tablet_status = static_cast(kTabletReady); + { + MutexLock lock(&mutex_); + if (status_ != kReady) { + return false; + } + db_ref_count_++; + } + + std::string db_property_key = "leveldb.verify-db-integrity"; + std::string db_property_val; + if (slow_check && db_->GetProperty(db_property_key, &db_property_val)) { + if (db_property_val.find("verify_fail") != std::string::npos) { + tablet_status_ = kTabletCorruption; + } else { + tablet_status_ = static_cast(kTabletReady); + } + } + *tablet_status = tablet_status_; + + { + MutexLock lock(&mutex_); + db_ref_count_--; + } + return true; +} + } // namespace io } // namespace tera diff --git a/src/io/tablet_io.h b/src/io/tablet_io.h index ba5cd99cf..9ce73d96a 100644 --- a/src/io/tablet_io.h +++ b/src/io/tablet_io.h @@ -13,6 +13,7 @@ #include #include "common/base/scoped_ptr.h" +#include "common/metric/metric_counter.h" #include "common/mutex.h" #include "io/tablet_scanner.h" #include "leveldb/db.h" @@ -26,9 +27,23 @@ #include "proto/table_schema.pb.h" #include "proto/tabletnode_rpc.pb.h" #include "types.h" -#include "utils/counter.h" +#include "common/counter.h" namespace tera { + +// metric name constants +const char* const kLowReadCellMetricName = "tera_ts_tablet_low_read_cell_count"; +const char* const kScanRowsMetricName = "tera_ts_tablet_scan_row_count"; +const char* const kScanKvsMetricName = "tera_ts_tablet_scan_kv_count"; +const char* 
const kScanThroughPutMetricName = "tera_ts_tablet_scan_through_put"; +const char* const kReadRowsMetricName = "tera_ts_tablet_read_row_count"; +const char* const kReadKvsMetricName = "tera_ts_tablet_read_kv_count"; +const char* const kReadThroughPutMetricName = "tera_ts_tablet_read_through_put"; +const char* const kWriteRowsMetricName = "tera_ts_tablet_write_row_count"; +const char* const kWriteKvsMetricName = "tera_ts_tablet_write_kv_count"; +const char* const kWriteThroughPutMetricName = "tera_ts_tablet_write_through_put"; +const char* const kWriteRejectRowsMetricName = "tera_ts_tablet_write_reject_row_count"; + namespace io { class TabletWriter; @@ -36,6 +51,8 @@ struct ScanOptions; struct ScanContext; class ScanContextManager; +std::string MetricLabelToString(const std::string& tablet_path); + class TabletIO { public: enum CompactionType { @@ -54,16 +71,32 @@ class TabletIO { }; struct StatCounter { - tera::Counter low_read_cell; - tera::Counter scan_rows; - tera::Counter scan_kvs; - tera::Counter scan_size; - tera::Counter read_rows; - tera::Counter read_kvs; - tera::Counter read_size; - tera::Counter write_rows; - tera::Counter write_kvs; - tera::Counter write_size; + const std::string label; + tera::MetricCounter low_read_cell; + tera::MetricCounter scan_rows; + tera::MetricCounter scan_kvs; + tera::MetricCounter scan_size; + tera::MetricCounter read_rows; + tera::MetricCounter read_kvs; + tera::MetricCounter read_size; + tera::MetricCounter write_rows; + tera::MetricCounter write_kvs; + tera::MetricCounter write_size; + tera::MetricCounter write_reject_rows; + + StatCounter(const std::string& tablet_path) + : label(MetricLabelToString(tablet_path)), + low_read_cell(tera::kLowReadCellMetricName, label, {SubscriberType::QPS}), + scan_rows(tera::kScanRowsMetricName, label, {SubscriberType::QPS}), + scan_kvs(tera::kScanKvsMetricName, label, {SubscriberType::QPS}), + scan_size(tera::kScanThroughPutMetricName, label, {SubscriberType::THROUGHPUT}), + 
read_rows(tera::kReadRowsMetricName, label, {SubscriberType::QPS}), + read_kvs(tera::kReadKvsMetricName, label, {SubscriberType::QPS}), + read_size(tera::kReadThroughPutMetricName, label, {SubscriberType::THROUGHPUT}), + write_rows(tera::kWriteRowsMetricName, label, {SubscriberType::QPS}), + write_kvs(tera::kWriteKvsMetricName, label, {SubscriberType::QPS}), + write_size(tera::kWriteThroughPutMetricName, label, {SubscriberType::THROUGHPUT}), + write_reject_rows(tera::kWriteRejectRowsMetricName, label, {SubscriberType::QPS}) {} }; typedef std::function*, @@ -83,6 +116,7 @@ class TabletIO { std::string GetTablePath() const; std::string GetStartKey() const; std::string GetEndKey() const; + const std::string& GetMetricLabel() const; virtual CompactStatus GetCompactStatus() const; virtual TableSchema GetSchema() const; RawKey RawKeyType() const; @@ -94,6 +128,7 @@ class TabletIO { virtual bool Load(const TableSchema& schema, const std::string& path, const std::vector& parent_tablets, + const std::set& ignore_err_lgs, std::map snapshots, std::map rollbacks, leveldb::Logger* logger = NULL, @@ -118,7 +153,8 @@ class TabletIO { // read a row virtual bool ReadCells(const RowReaderInfo& row_reader, RowResult* value_list, - uint64_t snapshot_id = 0, StatusCode* status = NULL); + uint64_t snapshot_id = 0, StatusCode* status = NULL, + int64_t timeout_ms = std::numeric_limits::max()); /// scan from leveldb return ture means complete flase means not complete bool LowLevelScan(const std::string& start_tera_key, const std::string& end_row_key, @@ -162,8 +198,6 @@ class TabletIO { void SetStatus(TabletStatus status); TabletStatus GetStatus(); - void GetAndClearCounter(TabletCounter* counter); - int32_t AddRef(); int32_t DecRef(); int32_t GetRef() const; @@ -173,6 +207,10 @@ class TabletIO { void ProcessScan(ScanContext* context); void ApplySchema(const TableSchema& schema); + bool ShouldForceUnloadOnError(); + + bool GetDBStatus(tera::TabletStatus* tablet_status, bool slow_check); + 
private: friend class TabletWriter; friend class ScanConextManager; @@ -180,7 +218,7 @@ class TabletIO { bool sync = false, StatusCode* status = NULL); // int64_t GetDataSizeWithoutLock(StatusCode* status = NULL); - void SetupOptionsForLG(); + void SetupOptionsForLG(const std::set& ignore_err_lgs); void TearDownOptionsForLG(); void IndexingCfToLG(); @@ -245,6 +283,8 @@ class TabletIO { KeyValuePair* next); void SetSchema(const TableSchema& schema); + bool PutIfAbsentCheck(const std::string& row_key, const Mutation& mutation); + bool SingleRowTxnCheck(const std::string& row_key, const SingleRowTxnReadInfo& txn_read_info, StatusCode* status); @@ -263,6 +303,7 @@ class TabletIO { CompactStatus compact_status_; TabletStatus status_; + tera::TabletStatus tablet_status_; // check wether db corruption volatile int32_t ref_count_; volatile int32_t db_ref_count_; leveldb::Options ldb_options_; diff --git a/src/io/tablet_scanner.cc b/src/io/tablet_scanner.cc index d799f3fe9..47f082126 100644 --- a/src/io/tablet_scanner.cc +++ b/src/io/tablet_scanner.cc @@ -134,6 +134,9 @@ bool ScanContextManager::ScheduleScanContext(ScanContext* context) { // complete or io error, return all the rest request to client if (context->complete || (context->ret_code != kTabletNodeOk)) { DeleteScanContext(context); // never use context + if (context->ret_code != kTabletNodeOk) { + return false; + } return true; } if (context->jobs.size() == 0) { @@ -148,6 +151,7 @@ bool ScanContextManager::ScheduleScanContext(ScanContext* context) { MutexLock l(&lock_); if (context->ret_code != kTabletNodeOk) { DeleteScanContext(context); // never use context + return false; } } return true; diff --git a/src/io/tablet_scanner.h b/src/io/tablet_scanner.h index e816e1b11..d468bdb6f 100644 --- a/src/io/tablet_scanner.h +++ b/src/io/tablet_scanner.h @@ -33,12 +33,15 @@ struct ScanOptions { ColumnFamilyMap column_family_list; std::set iter_cf_set; int64_t timeout; + uint64_t max_qualifiers; ScanOptions() : 
max_versions(std::numeric_limits::max()), max_size(std::numeric_limits::max()), number_limit(std::numeric_limits::max()), - ts_start(kOldestTs), ts_end(kLatestTs), snapshot_id(0), timeout(std::numeric_limits::max() / 2) + ts_start(kOldestTs), ts_end(kLatestTs), snapshot_id(0), + timeout(std::numeric_limits::max() / 2), + max_qualifiers(std::numeric_limits::max()) {} }; @@ -55,6 +58,7 @@ struct ScanContext { leveldb::Iterator* it; // init to NULL leveldb::CompactStrategy* compact_strategy; uint32_t version_num; + uint64_t qu_num; std::string last_key; std::string last_col; std::string last_qual; diff --git a/src/io/tablet_writer.cc b/src/io/tablet_writer.cc index 5e8791cda..81954d5bd 100644 --- a/src/io/tablet_writer.cc +++ b/src/io/tablet_writer.cc @@ -5,6 +5,8 @@ #include "io/tablet_writer.h" #include +#include +#include #include #include @@ -16,9 +18,13 @@ #include "leveldb/lg_coding.h" #include "proto/proto_helper.h" #include "tera/table_descriptor.h" -#include "utils/counter.h" +#include "common/counter.h" #include "utils/string_util.h" -#include "utils/timer.h" +#include "common/timer.h" + +#include "tabletnode/tabletnode_metric_name.h" +#include "common/metric/ratio_subscriber.h" +#include "common/metric/prometheus_subscriber.h" DECLARE_int32(tera_asyncwriter_pending_limit); DECLARE_bool(tera_enable_level0_limit); @@ -30,6 +36,20 @@ DECLARE_bool(tera_sync_log); namespace tera { namespace io { +using tera::tabletnode::kRowDelayMetric; +using tera::tabletnode::kRowCountMetric; + +using tera::tabletnode::kApiLabelWrite; +using tera::Subscriber; + +tera::MetricCounter row_write_count(kRowCountMetric, kApiLabelWrite, {SubscriberType::QPS}); +tera::MetricCounter row_write_delay(kRowDelayMetric, kApiLabelWrite, {}); + +tera::AutoSubscriberRegister row_write_delay_per_row(std::unique_ptr(new tera::RatioSubscriber( + MetricId("tera_ts_row_write_delay_us_per_row"), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRowDelayMetric, kApiLabelWrite), 
SubscriberType::SUM)), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRowCountMetric, kApiLabelWrite), SubscriberType::SUM))))); + TabletWriter::TabletWriter(TabletIO* tablet_io) : tablet_(tablet_io), stopped_(true), sync_timestamp_(0), @@ -157,15 +177,21 @@ void TabletWriter::DoWork() { } // 否则 flush VLOG(7) << "write data, sleep_duration: " << sleep_duration; - + sync_timestamp_ = GetTimeStampInMs(); FlushToDiskBatch(sealed_buffer_); sealed_buffer_->clear(); - sync_timestamp_ = GetTimeStampInMs(); } LOG(INFO) << "AsyncWriter::DoWork done"; worker_done_event_.Set(); } +bool TabletWriter::IsBusy() { + const uint64_t MAX_PENDING_SIZE = FLAGS_tera_asyncwriter_pending_limit * 1024UL; + + MutexLock lock(&task_mutex_); + return active_buffer_size_ >= MAX_PENDING_SIZE; +} + bool TabletWriter::SwapActiveBuffer(bool force) { const uint64_t SYNC_SIZE = FLAGS_tera_asyncwriter_sync_size_threshold * 1024UL; if (FLAGS_tera_enable_level0_limit == true) { @@ -200,12 +226,14 @@ void TabletWriter::BatchRequest(WriteTaskBuffer* task_buffer, WriteTask& task = (*task_buffer)[task_idx]; const std::vector& row_mutation_vec = *(task.row_mutation_vec); std::vector* status_vec = task.status_vec; + const std::vector& ignore_row_vec = task.ignore_row_vec; for (uint32_t i = 0; i < row_mutation_vec.size(); ++i) { StatusCode* status = &((*status_vec)[i]); + const IgnoreCellFlags& ignore_cell_flags = ignore_row_vec[i]; const RowMutationSequence& row_mu = *row_mutation_vec[i]; const std::string& row_key = row_mu.row_key(); - int32_t mu_num = row_mu.mutation_sequence().size(); + uint32_t mu_num = row_mu.mutation_sequence().size(); if (*status != kTabletNodeOk) { VLOG(11) << "batch write fail, row " << DebugString(row_key) << ", status " << StatusCodeToString(*status); @@ -235,7 +263,12 @@ void TabletWriter::BatchRequest(WriteTaskBuffer* task_buffer, batch->Delete(tera_key); } } else { - for (int32_t t = 0; t < mu_num; ++t) { + for (uint32_t t = 0; t < mu_num; ++t) { + if (t < 
ignore_cell_flags.size() && ignore_cell_flags[t]) { + VLOG(11) << "batch write ignore cell @ " << DebugString(row_key) + << "[" << task_idx << "," << i << "," << t << "]"; + continue; + } const Mutation& mu = row_mu.mutation_sequence().Get(t); std::string tera_key; leveldb::TeraKeyType type = leveldb::TKT_VALUE; @@ -258,9 +291,11 @@ void TabletWriter::BatchRequest(WriteTaskBuffer* task_buffer, case kAddInt64: type = leveldb::TKT_ADDINT64; break; + /* case kPutIfAbsent: type = leveldb::TKT_PUT_IFABSENT; break; + */ case kAppend: type = leveldb::TKT_APPEND; break; @@ -317,6 +352,8 @@ void TabletWriter::FinishTask(WriteTaskBuffer* task_buffer, StatusCode status) { for (uint32_t task_idx = 0; task_idx < task_buffer->size(); ++task_idx) { WriteTask& task = (*task_buffer)[task_idx]; tablet_->GetCounter().write_rows.Add(task.row_mutation_vec->size()); + row_write_count.Add(task.row_mutation_vec->size()); + row_write_delay.Add(get_micros() - task.start_time); for (uint32_t i = 0; i < task.row_mutation_vec->size(); i++) { tablet_->GetCounter().write_kvs.Add((*task.row_mutation_vec)[i]->mutation_sequence_size()); // set batch_write status for row_mu @@ -329,7 +366,7 @@ void TabletWriter::FinishTask(WriteTaskBuffer* task_buffer, StatusCode status) { return; } -// set status to kTxnFail, if transaction conflicts. 
+// set status to kTxnFail, if single row transaction or putifabsent conflicts bool TabletWriter::CheckSingleRowTxnConflict(const RowMutationSequence& row_mu, std::set* commit_row_key_set, StatusCode* status) { @@ -359,6 +396,36 @@ bool TabletWriter::CheckSingleRowTxnConflict(const RowMutationSequence& row_mu, return false; } +void TabletWriter::MarkPutIfAbsentConflict(const RowMutationSequence& row_mu, + IgnoreCellFlags* ignore_cell_flags, + std::unordered_set* not_exist_cell_set) { + const std::string& row_key = row_mu.row_key(); + // check every mutate item if mutation type is PutIfAbsent + for (int32_t i = 0; i < row_mu.mutation_sequence_size(); ++i) { + const Mutation& mutation = row_mu.mutation_sequence(i); + if (mutation.type() != kPutIfAbsent) { + continue; + } + std::string cell_key; + tablet_->GetRawKeyOperator()->EncodeTeraKey(row_key, + mutation.family(), mutation.qualifier(), kLatestTs, + leveldb::TKT_FORSEEK, &cell_key); + if (not_exist_cell_set->find(cell_key) != not_exist_cell_set->end()) { + VLOG(9) << "txn of row (PutIfAbsent) " << DebugString(row_key) + << ":" << DebugString(mutation.family()) + << ":" << DebugString(mutation.qualifier()); + (*ignore_cell_flags)[i] = true; + } + if (!tablet_->PutIfAbsentCheck(row_key, mutation)) { + VLOG(9) << "txn of row (PutIfAbsent) " << DebugString(row_key) + << ":" << DebugString(mutation.family()) + << ":" << DebugString(mutation.qualifier()); + (*ignore_cell_flags)[i] = true; + } + not_exist_cell_set->insert(cell_key); + } +} + bool TabletWriter::CheckIllegalRowArg(const RowMutationSequence& row_mu, const std::set& cf_set, StatusCode* status) { @@ -401,6 +468,8 @@ void TabletWriter::CheckRows(WriteTaskBuffer* task_buffer) { } std::set commit_row_key_set; + // for PutIfAbsent, make sure only one PutIfAbsent operation in a cell + std::unordered_set not_exist_cell_set; for (uint32_t task_idx = 0; task_idx < task_buffer->size(); ++task_idx) { WriteTask& task = (*task_buffer)[task_idx]; std::vector& 
row_mutation_vec = *task.row_mutation_vec; @@ -408,9 +477,15 @@ void TabletWriter::CheckRows(WriteTaskBuffer* task_buffer) { for (uint32_t row_idx = 0; row_idx < row_mutation_vec.size(); ++row_idx) { const RowMutationSequence* row_mu = row_mutation_vec[row_idx]; + IgnoreCellFlags ignore_cell_flags; + // init all cell not ignored + ignore_cell_flags.assign(row_mu->mutation_sequence_size(), false); + task.ignore_row_vec.push_back(ignore_cell_flags); + if(CheckSingleRowTxnConflict(*row_mu, &commit_row_key_set, &status_vec[row_idx])) { continue; } + MarkPutIfAbsentConflict(*row_mu, &(task.ignore_row_vec.back()), ¬_exist_cell_set); if (CheckIllegalRowArg(*row_mu, cf_set, &status_vec[row_idx])) { continue; } @@ -421,18 +496,28 @@ void TabletWriter::CheckRows(WriteTaskBuffer* task_buffer) { } StatusCode TabletWriter::FlushToDiskBatch(WriteTaskBuffer* task_buffer) { - int64_t ts = get_micros(); + int64_t start_ts, check_cost, batch_cost, write_cost, finish_cost; + + start_ts = get_micros(); CheckRows(task_buffer); + check_cost = get_micros(); leveldb::WriteBatch batch; BatchRequest(task_buffer, &batch); + batch_cost = get_micros(); StatusCode status = kTabletNodeOk; const bool disable_wal = false; tablet_->WriteBatch(&batch, disable_wal, FLAGS_tera_sync_log, &status); batch.Clear(); + write_cost = get_micros(); FinishTask(task_buffer, status); - VLOG(7) << "finish a batch: " << task_buffer->size() << ", use " << get_micros() - ts; + finish_cost = get_micros(); + VLOG(7) << "finish a batch: " << task_buffer->size() << ", cost(check/batch/write/finish): " + << check_cost - start_ts << "/" + << batch_cost - check_cost << "/" + << write_cost - batch_cost << "/" + << finish_cost - write_cost; return status; } diff --git a/src/io/tablet_writer.h b/src/io/tablet_writer.h index 561db7b1d..b0019ec8b 100644 --- a/src/io/tablet_writer.h +++ b/src/io/tablet_writer.h @@ -6,6 +6,8 @@ #define TERA_TABLETNODE_TABLET_WRITER_H_ #include +#include +#include #include "common/event.h" #include 
"common/mutex.h" @@ -27,11 +29,16 @@ class TabletWriter { public: typedef std::function*, \ std::vector*)> WriteCallback; + + typedef std::vector IgnoreCellFlags; struct WriteTask { + WriteTask():start_time(get_micros()) {} std::vector* row_mutation_vec; std::vector* status_vec; + std::vector ignore_row_vec; WriteCallback callback; + int64_t start_time; }; typedef std::vector WriteTaskBuffer; @@ -47,6 +54,7 @@ class TabletWriter { bool kv_only); void Start(); void Stop(); + bool IsBusy(); private: void DoWork(); @@ -57,6 +65,11 @@ class TabletWriter { bool CheckSingleRowTxnConflict(const RowMutationSequence& row_mu, std::set* commit_row_key_set, StatusCode* status); + // mark conflict of PutIfAbsent + void MarkPutIfAbsentConflict(const RowMutationSequence& row_mu, + IgnoreCellFlags* ignore_cell_flags, + std::unordered_set* not_exist_cell_set); + bool CheckIllegalRowArg(const RowMutationSequence& row_mu, const std::set& cf_set, StatusCode* status); diff --git a/src/io/test/load_test.cc b/src/io/test/load_test.cc index 714758a5f..7351488ea 100644 --- a/src/io/test/load_test.cc +++ b/src/io/test/load_test.cc @@ -24,7 +24,7 @@ #include "leveldb/table_utils.h" #include "proto/proto_helper.h" #include "proto/status_code.pb.h" -#include "utils/timer.h" +#include "common/timer.h" #include "utils/utils_cmd.h" DECLARE_int32(tera_io_retry_max_times); @@ -104,7 +104,8 @@ TEST_F(TabletIOTest, General) { leveldb::Status s = leveldb::Env::Default()->NewLogger("./log/leveldblog", &ldb_logger); assert(s.ok()); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); std::string key = "555"; std::string value = "value of 555"; @@ -147,7 +148,8 @@ TEST_F(TabletIOTest, CurrentLost) { assert(s.ok()); ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, 
NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); } @@ -178,7 +180,8 @@ TEST_F(TabletIOTest, CurrentReadFailed) { assert(s.ok()); ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); } @@ -216,7 +219,8 @@ TEST_F(TabletIOTest, CurrentCorrupted) { assert(s.ok()); ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); } @@ -254,7 +258,8 @@ TEST_F(TabletIOTest, ManifestLost) { assert(s.ok()); ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); } @@ -284,7 +289,8 @@ TEST_F(TabletIOTest, ManifestReadFailed) { assert(s.ok()); ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); } @@ -322,7 +328,8 @@ TEST_F(TabletIOTest, ManifestCorrupted) { assert(s.ok()); ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); } @@ -353,7 +360,8 @@ TEST_F(TabletIOTest, SstLost) { assert(s.ok()); ASSERT_FALSE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, 
ldb_logger, NULL, NULL, &status)); env->ResetMock(); } @@ -367,26 +375,30 @@ TEST_F(TabletIOTest, SstLostButIgnore) { TabletIO tablet(key_start, key_end, tablet_path); leveldb::MockEnv* env = (leveldb::MockEnv*)LeveldbMockEnv(); - std::string fname = mock_env_prefix + tablet_path + "/0/__oops"; - int fd = open(fname.c_str(), O_RDWR | O_CREAT); - if (fd == -1) { - std::cout << strerror(errno) << fname << std::endl; - abort(); - } env->SetPrefix(mock_env_prefix); - env->SetGetChildrenCallback(DropSst); tablet.SetMockEnv(env); leveldb::Logger* ldb_logger; leveldb::Status s = leveldb::Env::Default()->NewLogger("./log/leveldblog", &ldb_logger); assert(s.ok()); + std::set ignore_err_lgs; + ignore_err_lgs.insert("lg0"); + TableSchema schema = TableSchema(); + + LocalityGroupSchema* lg = schema.add_locality_groups(); + lg->set_name("lg0"); + + ColumnFamilySchema* cf = schema.add_column_families(); + cf->set_name("column"); + cf->set_locality_group("lg0"); + cf->set_max_versions(3); - ASSERT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, ldb_logger, NULL, NULL, &status)); + ASSERT_TRUE(tablet.Load(schema, tablet_path, std::vector(), + ignore_err_lgs, empty_snaphsots_, + empty_rollback_, ldb_logger, NULL, NULL, &status)); env->ResetMock(); - close(fd); } //#endif diff --git a/src/io/test/tablet_io_test.cc b/src/io/test/tablet_io_test.cc index 90da431f9..5aa7f12fa 100644 --- a/src/io/test/tablet_io_test.cc +++ b/src/io/test/tablet_io_test.cc @@ -18,7 +18,7 @@ #include "leveldb/table_utils.h" #include "proto/proto_helper.h" #include "proto/status_code.pb.h" -#include "utils/timer.h" +#include "common/timer.h" #include "utils/utils_cmd.h" #include "utils/string_util.h" #include "io/tablet_scanner.h" @@ -93,7 +93,8 @@ TEST_F(TabletIOTest, General) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); 
+ std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); std::string key = "555"; std::string value = "value of 555"; @@ -118,7 +119,8 @@ TEST_F(TabletIOTest, Split) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); // prepare test data EXPECT_TRUE(PrepareTestData(&tablet, N)); @@ -139,7 +141,8 @@ TEST_F(TabletIOTest, Split) { key_end = "8000"; TabletIO other_tablet(key_start, key_end, tablet_path); EXPECT_TRUE(other_tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); other_tablet.GetDataSize(&size, NULL, &status); LOG(INFO) << "table[" << key_start << ", " << key_end << "]: size = " << size; @@ -155,7 +158,8 @@ TEST_F(TabletIOTest, Split) { key_end = "5000"; TabletIO l_tablet(key_start, key_end, tablet_path); EXPECT_TRUE(l_tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); l_tablet.GetDataSize(&size, NULL, &status); LOG(INFO) << "table[" << key_start << ", " << key_end << "]: size = " << size; @@ -165,7 +169,8 @@ TEST_F(TabletIOTest, Split) { key_end = ""; TabletIO r_tablet(key_start, key_end, tablet_path); EXPECT_TRUE(r_tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); r_tablet.GetDataSize(&size, NULL, &status); LOG(INFO) << "table[" << key_start << ", " << key_end << "]: size = " << size; @@ -182,7 +187,8 @@ TEST_F(TabletIOTest, SplitAndCheckSize) { TabletIO tablet(key_start, key_end, 
tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); // prepare test data EXPECT_TRUE(PrepareTestData(&tablet, N)); @@ -202,7 +208,8 @@ TEST_F(TabletIOTest, SplitAndCheckSize) { // open from split key to check scope size TabletIO l_tablet(key_start, split_key, tablet_path); EXPECT_TRUE(l_tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); l_tablet.GetDataSize(&size, NULL, &status); LOG(INFO) << "table[" << key_start << ", " << split_key << "]: size = " << size; @@ -210,7 +217,8 @@ TEST_F(TabletIOTest, SplitAndCheckSize) { TabletIO r_tablet(split_key, key_end, tablet_path); EXPECT_TRUE(r_tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); r_tablet.GetDataSize(&size, NULL, &status); LOG(INFO) << "table[" << split_key << ", " << key_end << "]: size = " << size; @@ -227,7 +235,8 @@ TEST_F(TabletIOTest, OverWrite) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); std::string key = "555"; std::string value = "value of 555"; @@ -253,7 +262,8 @@ TEST_F(TabletIOTest, Compact) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); // prepare test data EXPECT_TRUE(PrepareTestData(&tablet, 100)); 
@@ -269,7 +279,8 @@ TEST_F(TabletIOTest, Compact) { std::string new_key_end = StringFormat("%011llu", 50); // NumberToString(800); TabletIO new_tablet(new_key_start, new_key_end, tablet_path); EXPECT_TRUE(new_tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); EXPECT_TRUE(new_tablet.Compact(0, &status)); uint64_t new_table_size = 0; @@ -291,6 +302,110 @@ TEST_F(TabletIOTest, Compact) { EXPECT_TRUE(new_tablet.Unload()); } +TEST_F(TabletIOTest, LowLevelSeek) { + std::string tablet_path = working_dir + "llseek_tablet"; + std::string key_start = ""; + std::string key_end = ""; + StatusCode status; + + TabletIO tablet(key_start, key_end, tablet_path); + EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); + + // init scan + ScanOptions scan_options; + ColumnFamilyMap cf_map; + std::set qu_set; + qu_set.insert("qualifer"); + qu_set.insert("2a"); + qu_set.insert("1a"); + cf_map["column"] = qu_set; + scan_options.column_family_list = cf_map; + scan_options.iter_cf_set.insert("column"); + + std::string tkey1; + // delete this key + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1); + tablet.WriteOne(tkey1, "" , false, NULL); + tablet.GetRawKeyOperator()->EncodeTeraKey("row1", "", "", get_micros(), leveldb::TKT_DEL, &tkey1); + tablet.WriteOne(tkey1, "" , false, NULL); + + // write cell + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "qualifer", get_micros(), leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala" , false, NULL); + RowResult value_list; + + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + EXPECT_EQ(value_list.key_values_size(), 1); + + // delete cell + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), 
leveldb::TKT_DEL, &tkey1); + tablet.WriteOne(tkey1, "" , false, NULL); + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + EXPECT_EQ(value_list.key_values_size(), 0); + + // write cell again + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "2a", get_micros(), leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala" , false, NULL); + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + EXPECT_EQ(value_list.key_values_size(), 1); + + // clean + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1); + tablet.WriteOne(tkey1, "", false, NULL); + tablet.GetRawKeyOperator()->EncodeTeraKey("row1", "", "", get_micros(), leveldb::TKT_DEL, &tkey1); + tablet.WriteOne(tkey1, "", false, NULL); + + // write 5 versions + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala1", false, NULL); + int64_t start_ts = get_micros(); + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", start_ts, leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala2", false, NULL); + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala3", false, NULL); + int64_t end_ts = get_micros(); + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", end_ts, leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala4", false, NULL); + tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala5", false, NULL); + tablet.GetRawKeyOperator()->EncodeTeraKey("row1", "column", "1a", get_micros(), leveldb::TKT_VALUE, &tkey1); + tablet.WriteOne(tkey1, "lala5", false, NULL); + + // read all versions ( write 5 versions, but schema set max_versions = 3 ) + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + 
EXPECT_EQ(value_list.key_values_size(), 3); + + // for max_versions + // read 2 versions + scan_options.max_versions = 2; + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + EXPECT_EQ(value_list.key_values_size(), 2); + + // for timerange and max_versions + // read 2 versions ( write 5 versions, but schema set max_versions = 3) + scan_options.max_versions = 4; + scan_options.ts_start = start_ts; + scan_options.ts_end = end_ts; + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + EXPECT_EQ(value_list.key_values_size(), 2); + + // start_ts not in top 3 versions + scan_options.ts_start = start_ts; + scan_options.ts_end = start_ts; + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + EXPECT_EQ(value_list.key_values_size(), 0); + + // end_ts in top 3 versions + scan_options.ts_start = end_ts; + scan_options.ts_end = end_ts; + EXPECT_TRUE(tablet.LowLevelSeek("row", scan_options, &value_list, &status)); + EXPECT_EQ(value_list.key_values_size(), 1); + + EXPECT_TRUE(tablet.Unload()); +} + TEST_F(TabletIOTest, LowLevelScan) { std::string tablet_path = working_dir + "llscan_tablet"; std::string key_start = ""; @@ -299,7 +414,8 @@ TEST_F(TabletIOTest, LowLevelScan) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); std::string tkey1; @@ -322,19 +438,19 @@ TEST_F(TabletIOTest, LowLevelScan) { uint32_t read_bytes = 0; bool is_complete = false; EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, "", ScanOptions(), - &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL)); + &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status)); EXPECT_EQ(value_list.key_values_size(), 1); tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", 
get_micros(), leveldb::TKT_DEL, &tkey1); tablet.WriteOne(tkey1, "lala" , false, NULL); EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, "", ScanOptions(), - &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL)); + &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status)); EXPECT_EQ(value_list.key_values_size(), 0); tablet.GetRawKeyOperator()->EncodeTeraKey("row", "column", "2a", get_micros(), leveldb::TKT_VALUE, &tkey1); tablet.WriteOne(tkey1, "lala" , false, NULL); EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, "", ScanOptions(), - &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL)); + &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status)); EXPECT_EQ(value_list.key_values_size(), 1); tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", get_micros(), leveldb::TKT_DEL, &tkey1); @@ -357,17 +473,17 @@ TEST_F(TabletIOTest, LowLevelScan) { end_row_key = std::string("row1\0", 5); ScanOptions scan_options; EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, scan_options, - &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL)); + &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status)); EXPECT_EQ(value_list.key_values_size(), 5); tablet.GetRawKeyOperator()->EncodeTeraKey("row", "", "", 0, leveldb::TKT_FORSEEK, &start_tera_key); end_row_key = std::string("row\0", 5); scan_options.column_family_list["column"].insert("1a"); EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, scan_options, - &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, NULL)); + &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status)); EXPECT_EQ(value_list.key_values_size(), 3); scan_options.max_versions = 2; EXPECT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, scan_options, - &value_list, &next_start_point, &read_row_count, &read_bytes, 
&is_complete, NULL)); + &value_list, &next_start_point, &read_row_count, &read_bytes, &is_complete, &status)); EXPECT_EQ(value_list.key_values_size(), 2); EXPECT_TRUE(tablet.Unload()); } @@ -382,7 +498,8 @@ TEST_F(TabletIOTest, SplitToSubTable) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); // prepare test data EXPECT_TRUE(PrepareTestData(&tablet, N / 2, 0)); @@ -391,7 +508,8 @@ TEST_F(TabletIOTest, SplitToSubTable) { // make sure all data are dumped into sst EXPECT_TRUE(tablet.Unload()); EXPECT_TRUE(tablet.Load(TableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); // for first tablet tablet.GetDataSize(&size, NULL, &status); @@ -418,7 +536,8 @@ TEST_F(TabletIOTest, SplitToSubTable) { // 1. load sub-table 1 TabletIO l_tablet(key_start, split_key, split_path_1); EXPECT_TRUE(l_tablet.Load(TableSchema(), split_path_1, parent_tablet, - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); l_tablet.GetDataSize(&size, NULL, &status); LOG(INFO) << "table[" << key_start << ", " << split_key << "]: size = " << size; @@ -436,7 +555,8 @@ TEST_F(TabletIOTest, SplitToSubTable) { // 2. 
load sub-table 2 TabletIO r_tablet(split_key, key_end, split_path_2); EXPECT_TRUE(r_tablet.Load(TableSchema(), split_path_2, parent_tablet, - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); r_tablet.GetDataSize(&size, NULL, &status); LOG(INFO) << "table[" << split_key << ", " << key_end << "]: size = " << size; @@ -554,7 +674,8 @@ TEST_F(TabletIOTest, RowBloomFilter) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); // prepare data leveldb::WriteBatch batch; @@ -594,7 +715,7 @@ TEST_F(TabletIOTest, RowBloomFilter) { bool is_complete = false; ASSERT_TRUE(tablet.LowLevelScan(start_tera_key, end_row_key, ScanOptions(), &value_list, &next_start_point, &read_row_count, &read_bytes, - &is_complete, NULL)); + &is_complete, &status)); ASSERT_EQ(value_list.key_values_size(), CR); for (int32_t j = 0; j < CR; j++) { char buf[16]; diff --git a/src/io/test/tablet_scanner_test.cc b/src/io/test/tablet_scanner_test.cc index a53f2d52a..915ad6a92 100644 --- a/src/io/test/tablet_scanner_test.cc +++ b/src/io/test/tablet_scanner_test.cc @@ -20,7 +20,7 @@ #include "leveldb/table_utils.h" #include "proto/proto_helper.h" #include "proto/status_code.pb.h" -#include "utils/timer.h" +#include "common/timer.h" #include "utils/utils_cmd.h" DECLARE_string(tera_tabletnode_path_prefix); @@ -225,7 +225,8 @@ TEST_F(TabletScannerTest, General) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); PrepareData(&tablet, 1000000); uint64_t nr = 400; @@ -246,7 +247,8 @@ 
TEST_F(TabletScannerTest, CacheEvict) { TabletIO tablet(key_start, key_end, tablet_path); EXPECT_TRUE(tablet.Load(GetTableSchema(), tablet_path, std::vector(), - empty_snaphsots_, empty_rollback_, NULL, NULL, NULL, &status)); + std::set(), empty_snaphsots_, + empty_rollback_, NULL, NULL, NULL, &status)); PrepareData(&tablet, 1000000); diff --git a/src/io/utils_leveldb.cc b/src/io/utils_leveldb.cc index 033ad4d37..48a6fa2c8 100644 --- a/src/io/utils_leveldb.cc +++ b/src/io/utils_leveldb.cc @@ -14,6 +14,8 @@ #include "common/base/string_number.h" #include "common/file/file_path.h" #include "common/mutex.h" +#include "common/timer.h" +#include "db/filename.h" #include "io/timekey_comparator.h" #include "leveldb/comparator.h" #include "leveldb/env_dfs.h" @@ -21,7 +23,7 @@ #include "leveldb/env_inmem.h" #include "leveldb/env_mock.h" #include "leveldb/table_utils.h" -#include "utils/timer.h" +#include "common/timer.h" DECLARE_string(tera_leveldb_env_type); DECLARE_string(tera_leveldb_env_dfs_type); @@ -31,6 +33,7 @@ DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list); DECLARE_string(tera_tabletnode_path_prefix); DECLARE_string(tera_dfs_so_path); DECLARE_string(tera_dfs_conf); +DECLARE_int64(tera_master_gc_trash_expire_time_s); namespace tera { namespace io { @@ -99,6 +102,11 @@ std::string GetTrashDir() { return FLAGS_tera_tabletnode_path_prefix + "/" + trash; } +std::string GetTrackableGcTrashDir() { + const std::string trash("#trackable_gc_trash"); + return FLAGS_tera_tabletnode_path_prefix + "/" + trash; +} + bool MoveEnvDirToTrash(const std::string& tablename) { leveldb::Env* env = LeveldbBaseEnv(); std::string src_dir = FLAGS_tera_tabletnode_path_prefix + "/" + tablename; @@ -140,6 +148,75 @@ bool MoveEnvDirToTrash(const std::string& tablename) { return true; } +leveldb::Status MoveSstToTrackableGcTrash(const std::string& table_name, + uint64_t tablet_id, + uint32_t lg_id, + uint64_t file_id) { + leveldb::Status s; + leveldb::Env* env = LeveldbBaseEnv(); + 
std::string table_path = FLAGS_tera_tabletnode_path_prefix + table_name; + std::string src_path = leveldb::BuildTableFilePath(table_path, tablet_id, lg_id, file_id); + + s = env->FileExists(src_path); + if(s.IsNotFound()) { + // not found, so no need to move + return leveldb::Status::OK(); + } else if (!s.ok()) { + // unknown status + return s; + } + + std::string trash_dir = GetTrackableGcTrashDir(); + s = env->FileExists(trash_dir); + if (s.IsNotFound()) { + if (!env->CreateDir(trash_dir).ok()) { + LOG(ERROR) << "[gc] fail to create trackable gc trash dir: " << trash_dir; + return leveldb::Status::IOError("fail to create trackable gc trash dir"); + } else { + LOG(INFO) << "[gc] succeed in creating trackable gc trash dir: " << trash_dir; + } + } else if (!s.ok()) { + // unknown status + return s; + } + + std::string time = get_curtime_str(); + std::replace(time.begin(), time.end(), ':', '-'); + std::string dest_path = leveldb::BuildTrashTableFilePath( + trash_dir + "/" + table_name, tablet_id, lg_id, file_id, time); + + size_t dir_pos = dest_path.rfind("/"); + if (dir_pos == std::string::npos) { + LOG(ERROR) << "[gc] invalid dest path: " << dest_path; + return leveldb::Status::IOError("invalid dest path"); + } + std::string lg_path = dest_path.substr(0, dir_pos); + s = env->FileExists(lg_path); + if(s.IsNotFound()) { + // not found, so no need to mkdir + s = env->CreateDir(lg_path); + if (!s.ok()) { + LOG(ERROR) << "[gc] create lg dir in trash: " << lg_path + << " failed: " << s.ToString(); + return s; + } + } else if (!s.ok()) { + // unknown status + return s; + } + + s = env->RenameFile(src_path, dest_path); + if (!s.ok()) { + LOG(ERROR) << "[gc] fail to move file to trackable gc trash, src_path: " << src_path + << ", dest_path: " << dest_path << ", status: " << s.ToString(); + return s; + } + LOG(INFO) << "[gc] move file to trackable gc trash, src_path: " << src_path + << ", dest_path: " << dest_path; + + return leveldb::Status::OK(); +} + void CleanTrashDir() 
{ leveldb::Env* env = LeveldbBaseEnv(); std::string trash_dir = GetTrashDir(); @@ -156,6 +233,136 @@ void CleanTrashDir() { return; } +bool TryDeleteEmptyDir(const std::string& dir_path, + size_t total_children_size, + size_t deleted_children_size) { + bool deleted = false; + + if (deleted_children_size == total_children_size) { + leveldb::Status s; + leveldb::Env* env = LeveldbBaseEnv(); + s = env->DeleteDir(dir_path); + if (s.ok()) { + LOG(INFO) << "[gc] delete empty dir: " << dir_path; + deleted = true; + } else { + LOG(WARNING) << "[gc] fail to delete empty dir: " + << dir_path <<" status: " << s.ToString(); + deleted = false; + } + } + + return deleted; +} + +leveldb::Status DeleteTrashFileIfExpired(const std::string& file_path) { + leveldb::Status s; + leveldb::Env* env = LeveldbBaseEnv(); + + std::string file_time_str = leveldb::GetTimeStrFromTrashFile(file_path); + if (file_time_str.empty()) { + LOG(ERROR) << "[gc] skip invalid trash file path: " << file_path; + return leveldb::Status::Corruption("invalid trash file path"); + } + + // change time format + // eg.: change "20170801-15-54-23" to "20170801-15:54:23" + file_time_str = file_time_str.replace(file_time_str.rfind("-"), 1, ":"); + file_time_str = file_time_str.replace(file_time_str.rfind("-"), 1, ":"); + + int64_t file_time = get_timestamp_from_str(file_time_str); + int64_t current_time = time(nullptr); + if (current_time - file_time > FLAGS_tera_master_gc_trash_expire_time_s) { + s = env->DeleteFile(file_path); + if (s.ok()) { + LOG(INFO) << "[gc] delete expired trash file: " << file_path + << ", file added to trash time: " << get_time_str(file_time) + << ", current time: " << get_time_str(current_time); + } else { + LOG(ERROR) << "[gc] fail to delete expired trash file: " << file_path + <<" status: " << s.ToString(); + return s; + } + } else { + return leveldb::Status::Corruption("file not expired"); + } + + return s; +} + +void CleanTrackableGcTrash() { + leveldb::Status s; + leveldb::Env* env = 
LeveldbBaseEnv(); + std::string trash_dir = GetTrackableGcTrashDir(); + + s = env->FileExists(trash_dir); + if (s.IsNotFound()) { + LOG(INFO) << "[gc] skip empty trash dir: " << trash_dir + <<" status: " << s.ToString(); + return; + } + + std::vector tables; + s = env->GetChildren(trash_dir, &tables); + if (!s.ok()) { + LOG(ERROR) << "[gc] fail to list trash dir: " << trash_dir + <<" status: " << s.ToString(); + return; + } + + for (const auto& table : tables) { + std::string table_path = trash_dir + "/" + table; + std::vector tablets; + s = env->GetChildren(table_path, &tablets); + if (!s.ok()) { + LOG(ERROR) << "[gc] skip due to fail to list table dir: " << table_path + <<" status: " << s.ToString(); + continue; + } + + size_t deleted_empty_tablet_num = 0; + for (const auto& tablet : tablets) { + std::string tablet_path = table_path + "/" + tablet; + std::vector lgs; + s = env->GetChildren(tablet_path, &lgs); + if (!s.ok()) { + LOG(ERROR) << "[gc] skip due to fail to list tablet dir: " << tablet_path + <<" status: " << s.ToString(); + continue; + } + + size_t deleted_empty_lg_num = 0; + for (const auto& lg : lgs) { + std::string lg_path = tablet_path + "/" + lg; + std::vector files; + s = env->GetChildren(lg_path, &files); + if (!s.ok()) { + LOG(ERROR) << "[gc] skip due to fail to list lg dir: " << lg_path + <<" status: " << s.ToString(); + continue; + } + + size_t deleted_file_num = 0; + for (const auto& file : files) { + std::string file_path = lg_path + "/" + file; + if (DeleteTrashFileIfExpired(file_path).ok()) { + ++deleted_file_num; + } + } + if (TryDeleteEmptyDir(lg_path, files.size(), deleted_file_num)) { + ++ deleted_empty_lg_num; + } + } + if (TryDeleteEmptyDir(tablet_path, lgs.size(), deleted_empty_lg_num)) { + ++ deleted_empty_tablet_num; + } + } + TryDeleteEmptyDir(table_path, tablets.size(), deleted_empty_tablet_num); + } + + return; +} + leveldb::Status DeleteEnvDir(const std::string& dir) { leveldb::Status s; static bool is_support_rmdir = true; 
diff --git a/src/io/utils_leveldb.h b/src/io/utils_leveldb.h index 307c270aa..9654ce5b9 100644 --- a/src/io/utils_leveldb.h +++ b/src/io/utils_leveldb.h @@ -30,10 +30,25 @@ leveldb::Env* LeveldbMockEnv(); std::string GetTrashDir(); +std::string GetTrackableGcTrashDir(); + bool MoveEnvDirToTrash(const std::string& subdir); +leveldb::Status MoveSstToTrackableGcTrash(const std::string& table_name, + uint64_t tablet_id, + uint32_t lg_id, + uint64_t file_id); + void CleanTrashDir(); +bool TryDeleteEmptyDir(const std::string& dir_path, + size_t total_children_size, + size_t deleted_children_size); + +leveldb::Status DeleteTrashFileIfExpired(const std::string& file_path); + +void CleanTrackableGcTrash(); + leveldb::Status DeleteEnvDir(const std::string& subdir); } // namespace io diff --git a/src/lbcli_main.cc b/src/lbcli_main.cc new file mode 100644 index 000000000..c1149812c --- /dev/null +++ b/src/lbcli_main.cc @@ -0,0 +1,314 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "common/thread_pool.h" +#include "common/base/string_ext.h" +#include "common/base/string_number.h" +#include "common/console/progress_bar.h" +#include "common/file/file_path.h" +#include "io/coding.h" +#include "proto/kv_helper.h" +#include "proto/lb_client.h" +#include "proto/load_balancer_rpc.pb.h" +#include "proto/proto_helper.h" +#include "proto/tabletnode.pb.h" +#include "proto/tabletnode_client.h" +#include "sdk/client_impl.h" +#include "sdk/cookie.h" +#include "sdk/sdk_utils.h" +#include "sdk/sdk_zk.h" +#include "sdk/table_impl.h" +#include "tera.h" +#include "types.h" +#include "utils/crypt.h" +#include "utils/string_util.h" +#include "utils/tprinter.h" +#include "utils/utils_cmd.h" +#include "version.h" + +DECLARE_string(flagfile); + +// using FLAGS instead of isatty() for compatibility +DEFINE_bool(stdout_is_tty, true, "is stdout connected to a tty"); +DEFINE_bool(reorder_tablets, false, "reorder tablets by ts list"); +DEFINE_bool(readable, true, "readable input"); + +DECLARE_string(tera_lb_server_addr); +DECLARE_string(tera_lb_server_port); + +tera::TPrinter::PrintOpt g_printer_opt; + +using namespace tera; + +typedef std::shared_ptr TablePtr; +typedef std::shared_ptr TableImplPtr; +typedef std::map CommandTable; + +static CommandTable& GetCommandTable() { + static CommandTable command_table; + return command_table; +} + +static std::string GetServerAddr() { + return FLAGS_tera_lb_server_addr + ":" + FLAGS_tera_lb_server_port; +} + +const char* builtin_cmd_list[] = { + "safemode", + "safemode [enter | leave | get]", + + "help", + "help [cmd] \n\ + show manual for a or all cmd(s)", + + "version", + "version \n\ + show version info", +}; + +static void PrintCmdHelpInfo(const char* msg) { + if (msg == NULL) { + return; + } + int count = sizeof(builtin_cmd_list)/sizeof(char*); + for (int i = 0; i < 
count; i+=2) { + if(strncmp(msg, builtin_cmd_list[i], 32) == 0) { + std::cout << builtin_cmd_list[i + 1] << std::endl; + return; + } + } +} + +static void PrintCmdHelpInfo(const std::string& msg) { + PrintCmdHelpInfo(msg.c_str()); +} + +static void PrintAllCmd() { + std::cout << "there is cmd list:" << std::endl; + int count = sizeof(builtin_cmd_list)/sizeof(char*); + bool newline = false; + for (int i = 0; i < count; i+=2) { + std::cout << std::setiosflags(std::ios::left) << std::setw(20) << builtin_cmd_list[i]; + if (newline) { + std::cout << std::endl; + newline = false; + } else { + newline = true; + } + } + + std::cout << std::endl << "help [cmd] for details." << std::endl; +} + +// return false if similar command(s) not found +static bool PromptSimilarCmd(const char* msg) { + if (msg == NULL) { + return false; + } + bool found = false; + int64_t len = strlen(msg); + int64_t threshold = int64_t((len * 0.3 < 3) ? 3 : len * 0.3); + int count = sizeof(builtin_cmd_list)/sizeof(char*); + for (int i = 0; i < count; i+=2) { + if (EditDistance(msg, builtin_cmd_list[i]) <= threshold) { + if (!found) { + std::cout << "Did you mean:" << std::endl; + found = true; + } + std::cout << " " << builtin_cmd_list[i] << std::endl; + } + } + return found; +} + +static void PrintUnknownCmdHelpInfo(const char* msg) { + if (msg != NULL) { + std::cout << "'" << msg << "' is not a valid command." 
<< std::endl << std::endl; + } + if ((msg != NULL) + && PromptSimilarCmd(msg)) { + return; + } + PrintAllCmd(); +} + +int32_t SafemodeOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { + if (argc < 3) { + PrintCmdHelpInfo(argv[1]); + return -1; + } + + std::string op = argv[2]; + if (op != "get" && op != "leave" && op != "enter") { + PrintCmdHelpInfo(argv[1]); + return -1; + } + + load_balancer::LBClient lb_client(GetServerAddr()); + CmdCtrlRequest request; + CmdCtrlResponse response; + + request.set_sequence_id(0); + request.set_command("safemode"); + request.add_arg_list(op); + + string reason; + if (lb_client.CmdCtrl(&request, &response)) { + if (response.status() != tera::kLoadBalancerOk) { + reason = StatusCodeToString(response.status()); + LOG(ERROR) << reason; + std::cout << reason << std::endl; + err->SetFailed(ErrorCode::kSystem, reason); + return -1; + } + if (op == "get") { + if (response.bool_result()) { + std::cout << "true" << std::endl; + } else { + std::cout << "false" << std::endl; + } + } + return 0; + } else { + reason = "fail to CmdCtrl"; + LOG(ERROR) << reason; + std::cout << reason << std::endl; + err->SetFailed(ErrorCode::kSystem, reason); + return -1; + } +} + +int32_t HelpOp(Client*, int32_t argc, std::string* argv, ErrorCode*) { + if (argc == 2) { + PrintAllCmd(); + } else if (argc == 3) { + PrintCmdHelpInfo(argv[2]); + } else { + PrintCmdHelpInfo("help"); + } + return 0; +} + +int32_t HelpOp(int32_t argc, char** argv) { + std::vector argv_svec(argv, argv + argc); + return HelpOp(NULL, argc, &argv_svec[0], NULL); +} + +bool ParseCommand(int argc, char** arg_list, std::vector* parsed_arg_list) { + for (int i = 0; i < argc; i++) { + std::string parsed_arg = arg_list[i]; + if (FLAGS_readable && !ParseDebugString(arg_list[i], &parsed_arg)) { + std::cout << "invalid debug format of argument: " << arg_list[i] << std::endl; + return false; + } + parsed_arg_list->push_back(parsed_arg); + } + return true; +} + +static void 
InitializeCommandTable(){ + CommandTable& command_table = GetCommandTable(); + command_table["safemode"] = SafemodeOp; + command_table["help"] = HelpOp; +} + +int ExecuteCommand(Client* client, int argc, char** arg_list) { + int ret = 0; + ErrorCode error_code; + + std::vector parsed_arg_list; + if (!ParseCommand(argc, arg_list, &parsed_arg_list)) { + return 1; + } + std::string* argv = &parsed_arg_list[0]; + + CommandTable& command_table = GetCommandTable(); + std::string cmd = argv[1]; + if (cmd == "version") { + PrintSystemVersion(); + } else if (command_table.find(cmd) != command_table.end()) { + ret = command_table[cmd](client, argc, argv, &error_code); + } else { + PrintUnknownCmdHelpInfo(argv[1].c_str()); + ret = 1; + } + + if (error_code.GetType() != ErrorCode::kOK) { + LOG(ERROR) << "fail reason: " << error_code.ToString(); + } + return ret; +} + +int main(int argc, char* argv[]) { + FLAGS_minloglevel = 2; + ::google::ParseCommandLineFlags(&argc, &argv, true); + + if (argc > 1 && std::string(argv[1]) == "version") { + PrintSystemVersion(); + return 0; + } else if (argc > 1 && std::string(argv[1]) == "help") { + HelpOp(argc, argv); + return 0; + } + + Client* client = Client::NewClient(FLAGS_flagfile, NULL); + if (client == NULL) { + LOG(ERROR) << "client instance not exist"; + return -1; + } + g_printer_opt.print_head = FLAGS_stdout_is_tty; + + InitializeCommandTable(); + + int ret = 0; + if (argc == 1) { + char* line = NULL; + while ((line = readline("lb> ")) != NULL) { + char* line_copy = strdup(line); + std::vector arg_list; + arg_list.push_back(argv[0]); + char* tmp = NULL; + char* token = strtok_r(line, " \t", &tmp); + while (token != NULL) { + arg_list.push_back(token); + token = strtok_r(NULL, " \t", &tmp); + } + if (arg_list.size() == 2 && + (strcmp(arg_list[1], "quit") == 0 || strcmp(arg_list[1], "exit") == 0)) { + free(line_copy); + free(line); + break; + } + if (arg_list.size() > 1) { + add_history(line_copy); + ret = ExecuteCommand(client, 
arg_list.size(), &arg_list[0]); + } + free(line_copy); + free(line); + } + } else { + ret = ExecuteCommand(client, argc, argv); + } + + delete client; + return ret; +} diff --git a/src/leveldb/Makefile b/src/leveldb/Makefile index c9162d2eb..175c916dc 100644 --- a/src/leveldb/Makefile +++ b/src/leveldb/Makefile @@ -19,7 +19,7 @@ include ../../depends.mk include build_config.mk CFLAGS += -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) -CXXFLAGS += -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) +CXXFLAGS += -std=c++11 -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) LDFLAGS += $(PLATFORM_LDFLAGS) -L$(SNAPPY_LIBDIR) -lrt -ldl -lsnappy LIBS += $(PLATFORM_LIBS) diff --git a/src/leveldb/build_detect_platform b/src/leveldb/build_detect_platform index 8e230d950..325dfaf01 100755 --- a/src/leveldb/build_detect_platform +++ b/src/leveldb/build_detect_platform @@ -22,7 +22,6 @@ # # -DLEVELDB_CSTDATOMIC_PRESENT if is present # -DLEVELDB_PLATFORM_POSIX for Posix-based platforms -# -DSNAPPY if the Snappy library is present # OUTPUT=$1 @@ -176,15 +175,6 @@ EOF COMMON_FLAGS="$COMMON_FLAGS -DLEVELDB_PLATFORM_POSIX" fi - # Test whether Snappy library is installed - # http://code.google.com/p/snappy/ - $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT 2>/dev/null < - int main() {} -EOF - COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY" - PLATFORM_LIBS="$PLATFORM_LIBS" - # Test whether tcmalloc is available $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -ltcmalloc_minimal 2>/dev/null <NumEntries()) { meta->file_size = builder->FileSize(); + meta->data_size = meta->file_size; assert(meta->file_size > 0); *saved_size = builder->SavedSize(); @@ -164,7 +165,7 @@ Status BuildTable(const std::string& dbname, if (s.ok() && meta->file_size > 0) { // Keep it - } else { + } else if (!s.IsIOPermissionDenied()) { env->DeleteFile(fname); } return s; diff --git a/src/leveldb/db/db_impl.cc b/src/leveldb/db/db_impl.cc index c076008de..e8758a8ce 100644 --- a/src/leveldb/db/db_impl.cc +++ b/src/leveldb/db/db_impl.cc @@ -9,13 +9,14 @@ #include 
"db/db_impl.h" #include - #include #include #include #include #include #include +#include + #include "db/builder.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -44,11 +45,17 @@ namespace leveldb { +extern Status WriteStringToFileSync(Env* env, const Slice& data, + const std::string& fname); + const int kNumNonTableCacheFiles = 10; // if this file exists, ignore error in db-opening const static std::string mark_file_name = "/__oops"; +// if this file exists, +const static std::string init_load_filelock = "/__init_load_filelock"; + // Information kept for every waiting writer struct DBImpl::Writer { WriteBatch* batch; @@ -87,6 +94,7 @@ struct DBImpl::CompactionState { TableBuilder* builder; uint64_t total_bytes; + Status status; Output* current_output() { return &outputs[outputs.size()-1]; } @@ -129,8 +137,10 @@ Options SanitizeOptions(const std::string& dbname, result.block_cache = NewLRUCache(8 << 20); } + if (result.ignore_corruption_in_open) { + Log(result.info_log, "[%s] caution: open with ignore_corruption_in_open", dbname.c_str()); + } { - // Maybe mark error flag in option std::string oops = dbname + mark_file_name; Status s = src.env->FileExists(oops); if (s.ok()) { @@ -139,7 +149,6 @@ Options SanitizeOptions(const std::string& dbname, } // Ignore error from FileExists since there is no harm } - return result; } @@ -166,10 +175,6 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) logfile_number_(0), log_(NULL), bound_log_size_(0), - bg_compaction_scheduled_(false), - bg_compaction_score_(0), - bg_compaction_timeout_(0), - bg_schedule_id_(0), manual_compaction_(NULL), consecutive_compaction_errors_(0), flush_on_destroy_(false) { @@ -188,6 +193,11 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) &internal_comparator_); } +bool DBImpl::ShouldForceUnloadOnError() { + MutexLock l(&mutex_); + return bg_error_.IsIOPermissionDenied(); +} + Status DBImpl::Shutdown1() { assert(state_ == kOpened); state_ = kShutdown1; @@ 
-196,12 +206,17 @@ Status DBImpl::Shutdown1() { shutting_down_.Release_Store(this); // Any non-NULL value is ok Log(options_.info_log, "[%s] wait bg compact finish", dbname_.c_str()); - if (bg_compaction_scheduled_) { - env_->ReSchedule(bg_schedule_id_, kDumpMemTableUrgentScore, 0); + std::vector::iterator it = bg_compaction_tasks_.begin(); + for (; it != bg_compaction_tasks_.end(); ++it) { + env_->ReSchedule((*it)->id, kDumpMemTableUrgentScore, 0); } - while (bg_compaction_scheduled_) { + while (bg_compaction_tasks_.size() > 0) { bg_cv_.Wait(); } + // has enconutered IOPermission Denied error, return immediately and do not try to compact memory table aynmore + if (bg_error_.IsIOPermissionDenied()) { + return bg_error_; + } Status s; if (!options_.dump_mem_on_shutdown) { @@ -231,6 +246,9 @@ Status DBImpl::Shutdown2() { state_ = kShutdown2; MutexLock l(&mutex_); + if(bg_error_.IsIOPermissionDenied()) { + return bg_error_; + } Status s; if (!options_.dump_mem_on_shutdown) { return s; @@ -315,26 +333,30 @@ void DBImpl::MaybeIgnoreError(Status* s) const { } void DBImpl::DeleteObsoleteFiles() { + mutex_.AssertHeld(); if (!bg_error_.ok()) { // After a background error, we don't know whether a new version may // or may not have been committed, so we cannot safely garbage collect. 
return; } + // check filesystem, and then check pending_outputs_ + std::vector filenames; + mutex_.Unlock(); + env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose + mutex_.Lock(); + // Make a set of all of the live files std::set live = pending_outputs_; versions_->AddLiveFiles(&live); // manifest file set, keep latest 3 manifest files for backup - std::set manifest_set; + //std::set manifest_set; - Log(options_.info_log, "[%s] try DeleteObsoleteFiles, total live file num: %llu\n", - dbname_.c_str(), static_cast(live.size())); - - std::vector filenames; - mutex_.Unlock(); - env_->GetChildren(dbname_, &filenames); // Ignoring errors on purpose - mutex_.Lock(); + Log(options_.info_log, "[%s] try DeleteObsoleteFiles, total live file num: %llu," + " pending_outputs %lu, children_nr %lu\n", + dbname_.c_str(), static_cast(live.size()), + pending_outputs_.size(), filenames.size()); uint64_t number; FileType type; for (size_t i = 0; i < filenames.size(); i++) { @@ -345,28 +367,28 @@ void DBImpl::DeleteObsoleteFiles() { keep = ((number >= versions_->LogNumber()) || (number == versions_->PrevLogNumber())); break; - case kDescriptorFile: - manifest_set.insert(filenames[i]); - if (manifest_set.size() > 3) { - std::set::iterator it = manifest_set.begin(); - ParseFileName(*it, &number, &type); - if (number < versions_->ManifestFileNumber()) { - // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) - filenames[i] = *it; - keep = false; - manifest_set.erase(it); - } - } - break; + //case kDescriptorFile: + // manifest_set.insert(filenames[i]); + // if (manifest_set.size() > 3) { + // std::set::iterator it = manifest_set.begin(); + // ParseFileName(*it, &number, &type); + // if (number < versions_->ManifestFileNumber()) { + // // Keep my manifest file, and any newer incarnations' + // // (in case there is a race that allows other incarnations) + // filenames[i] = *it; + // keep = false; + // 
manifest_set.erase(it); + // } + // } + // break; case kTableFile: keep = (live.find(BuildFullFileNumber(dbname_, number)) != live.end()); break; - case kTempFile: - // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (live.find(number) != live.end()); - break; + //case kTempFile: + // // Any temp files that are currently being written to must + // // be recorded in pending_outputs_, which is inserted into "live" + // keep = (live.find(number) != live.end()); + // break; case kCurrentFile: case kDBLockFile: case kInfoLogFile: @@ -381,9 +403,9 @@ void DBImpl::DeleteObsoleteFiles() { if (type == kTableFile) { table_cache_->Evict(dbname_, BuildFullFileNumber(dbname_, number)); } - Log(options_.info_log, "[%s] Delete type=%s #%lld\n", + Log(options_.info_log, "[%s] Delete type=%s #%lld, fname %s\n", dbname_.c_str(), FileTypeToString(type), - static_cast(number)); + static_cast(number), filenames[i].c_str()); mutex_.Unlock(); env_->DeleteFile(dbname_ + "/" + filenames[i]); mutex_.Lock(); @@ -548,15 +570,7 @@ Status DBImpl::DbExists(bool* exists) { Status DBImpl::Recover(VersionEdit* edit) { mutex_.AssertHeld(); - if (options_.ignore_corruption_in_open) { - Status s = env_->DeleteFile(dbname_ + mark_file_name); - if (!s.ok()) { - // legacy mark-file is dangerous - Log(options_.info_log, "[%s] delete mark-file failed for %s", - dbname_.c_str(), s.ToString().c_str()); - return Status::IOError("delete mark-file failed"); - } - } + bool need_newdb_txn = false; { Status s = env_->FileExists(dbname_); @@ -567,14 +581,24 @@ Status DBImpl::Recover(VersionEdit* edit) { dbname_.c_str(), s.ToString().c_str()); return s; } + need_newdb_txn = true; } else if (s.ok()) { - // Directory exists, do nothing + // lg directory exists and not ignore curruption in open + if (!options_.ignore_corruption_in_open) { + s = env_->FileExists(dbname_ + init_load_filelock); + if (s.ok()) { + need_newdb_txn = true; + } 
else if (!s.IsNotFound()) { + // Unknown status + return s; + } + } } else { // Unknown status return s; } } - + if (options_.use_file_lock) { Status s = env_->LockFile(LockFileName(dbname_), &db_lock_); if (!s.ok()) { @@ -582,6 +606,36 @@ Status DBImpl::Recover(VersionEdit* edit) { } } + if (options_.ignore_corruption_in_open) { + Status s = env_->FileExists(dbname_ + init_load_filelock); + if (s.ok()) { + s = env_->DeleteFile(dbname_ + init_load_filelock); + if (!s.ok()) { + // legacy initlock-file is dangerous + Log(options_.info_log, "[%s] delete initlock-file failed for %s", + dbname_.c_str(), s.ToString().c_str()); + return Status::IOError("delete initlock-file failed"); + } + } + s = env_->FileExists(dbname_ + mark_file_name); + if (s.ok()) { + s = env_->DeleteFile(dbname_ + mark_file_name); + if (!s.ok()) { + // legacy mark-file is dangerous + Log(options_.info_log, "[%s] delete mark-file failed for %s", + dbname_.c_str(), s.ToString().c_str()); + return Status::IOError("delete mark-file failed"); + } + } + } + + if (need_newdb_txn) { + Status s = BeginNewDbTransaction(); + if (!s.ok()) { + return s; + } + } + bool db_exists; Status s = DbExists(&db_exists); if (!s.ok()) { @@ -664,7 +718,12 @@ Status DBImpl::Recover(VersionEdit* edit) { } } } - + if (need_newdb_txn) { + Status s = CommitNewDbTransaction(); + if (!s.ok()) { + return s; + } + } if (s.ok()) { state_ = kOpened; } @@ -672,11 +731,14 @@ Status DBImpl::Recover(VersionEdit* edit) { } Status DBImpl::WriteLevel0Table(MemTable* mem, VersionEdit* edit, - Version* base) { + Version* base, uint64_t* number) { mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; meta.number = BuildFullFileNumber(dbname_, versions_->NewFileNumber()); + if (number) { + *number = meta.number; + } pending_outputs_.insert(meta.number); Iterator* iter = mem->NewIterator(); Log(options_.info_log, "[%s] Level-0 table #%u: started", @@ -724,15 +786,39 @@ Status DBImpl::WriteLevel0Table(MemTable* 
mem, VersionEdit* edit, return s; } -Status DBImpl::CompactMemTable() { +// multithread safe +Status DBImpl::CompactMemTable(bool* sched_idle) { mutex_.AssertHeld(); assert(imm_ != NULL); + Status s; + if (sched_idle) { + *sched_idle = true; + } + if (imm_->BeingFlushed()) { + //Log(options_.info_log, "[%s] CompactMemTable conflict, seq %lu", + // dbname_.c_str(), GetLastSequence(false)); + return s; + } + imm_->SetBeingFlushed(true); + + if (imm_->ApproximateMemoryUsage() <= 0) { // imm is empty, do nothing + Log(options_.info_log, "[%s] CompactMemTable empty memtable %lu", + dbname_.c_str(), GetLastSequence(false)); + imm_->Unref(); + imm_ = NULL; + has_imm_.Release_Store(NULL); + return s; + } + if (sched_idle) { + *sched_idle = false; + } // Save the contents of the memtable as a new Table VersionEdit edit; + uint64_t number; Version* base = versions_->current(); base->Ref(); - Status s = WriteLevel0Table(imm_, &edit, base); + s = WriteLevel0Table(imm_, &edit, base, &number); base->Unref(); if (s.ok() && shutting_down_.Acquire_Load()) { @@ -741,6 +827,7 @@ Status DBImpl::CompactMemTable() { // Replace immutable memtable with the generated Table if (s.ok()) { + pending_outputs_.insert(number); // LogAndApply donot holds lock, so use pending_outputs_ to make sure new file will not be deleted edit.SetPrevLogNumber(0); edit.SetLogNumber(logfile_number_); // Earlier logs no longer needed if (imm_->GetLastSequence()) { @@ -749,6 +836,7 @@ Status DBImpl::CompactMemTable() { Log(options_.info_log, "[%s] CompactMemTable SetLastSequence %lu", dbname_.c_str(), edit.GetLastSequence()); s = versions_->LogAndApply(&edit, &mutex_); + pending_outputs_.erase(number); } if (s.ok()) { @@ -756,6 +844,9 @@ Status DBImpl::CompactMemTable() { imm_->Unref(); imm_ = NULL; has_imm_.Release_Store(NULL); + } else { + // imm dump fail, reset being flush flag + imm_->SetBeingFlushed(false); } return s; @@ -787,6 +878,8 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const 
Slice* end) { ManualCompaction manual; manual.level = level; manual.done = false; + manual.being_sched = false; + manual.compaction_conflict = kManualCompactIdle; if (begin == NULL) { manual.begin = NULL; } else { @@ -805,6 +898,9 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { if (manual_compaction_ == NULL) { // Idle manual_compaction_ = &manual; MaybeScheduleCompaction(); + } else if (manual_compaction_->compaction_conflict == kManualCompactWakeup) { + manual_compaction_->compaction_conflict = kManualCompactIdle; + MaybeScheduleCompaction(); } else { // Running either my compaction or another compaction. bg_cv_.Wait(); } @@ -877,117 +973,144 @@ void DBImpl::AddInheritedLiveFiles(std::vector >* live) { } Status DBImpl::RecoverInsertMem(WriteBatch* batch, VersionEdit* edit) { - MutexLock lock(&mutex_); + MutexLock lock(&mutex_); - if (recover_mem_ == NULL) { - recover_mem_ = NewMemTable(); - recover_mem_->Ref(); - } - uint64_t log_sequence = WriteBatchInternal::Sequence(batch); - uint64_t last_sequence = log_sequence + WriteBatchInternal::Count(batch) - 1; + if (recover_mem_ == NULL) { + recover_mem_ = NewMemTable(); + recover_mem_->Ref(); + } + uint64_t log_sequence = WriteBatchInternal::Sequence(batch); + uint64_t last_sequence = log_sequence + WriteBatchInternal::Count(batch) - 1; - // if duplicate record, ignore - if (log_sequence <= recover_mem_->GetLastSequence()) { - assert (last_sequence <= recover_mem_->GetLastSequence()); - Log(options_.info_log, "[%s] duplicate record, ignore %lu ~ %lu", - dbname_.c_str(), log_sequence, last_sequence); - return Status::OK(); - } + // if duplicate record, ignore + if (log_sequence <= recover_mem_->GetLastSequence()) { + assert (last_sequence <= recover_mem_->GetLastSequence()); + Log(options_.info_log, "[%s] duplicate record, ignore %lu ~ %lu", + dbname_.c_str(), log_sequence, last_sequence); + return Status::OK(); + } - Status status = WriteBatchInternal::InsertInto(batch, 
recover_mem_); - MaybeIgnoreError(&status); + Status status = WriteBatchInternal::InsertInto(batch, recover_mem_); + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } + if (recover_mem_->ApproximateMemoryUsage() > options_.write_buffer_size) { + edit->SetLastSequence(recover_mem_->GetLastSequence()); + status = WriteLevel0Table(recover_mem_, edit, NULL); if (!status.ok()) { - return status; - } - if (recover_mem_->ApproximateMemoryUsage() > options_.write_buffer_size) { - edit->SetLastSequence(recover_mem_->GetLastSequence()); - status = WriteLevel0Table(recover_mem_, edit, NULL); - if (!status.ok()) { - // Reflect errors immediately so that conditions like full - // file-systems cause the DB::Open() to fail. - return status; - } - recover_mem_->Unref(); - recover_mem_ = NULL; + // Reflect errors immediately so that conditions like full + // file-systems cause the DB::Open() to fail. + return status; } - return status; + recover_mem_->Unref(); + recover_mem_ = NULL; + } + return status; } Status DBImpl::RecoverLastDumpToLevel0(VersionEdit* edit) { - MutexLock lock(&mutex_); - Status status; - if (recover_mem_ == NULL) { - return status; - } + MutexLock lock(&mutex_); + Status s; + if (recover_mem_ != NULL) { if (recover_mem_->GetLastSequence() > 0) { - edit->SetLastSequence(recover_mem_->GetLastSequence()); - status = WriteLevel0Table(recover_mem_, edit, NULL); + edit->SetLastSequence(recover_mem_->GetLastSequence()); + s = WriteLevel0Table(recover_mem_, edit, NULL); } recover_mem_->Unref(); recover_mem_ = NULL; - return status; -} + } + assert(recover_mem_ == NULL); + // LogAndApply to lg's manifest + if (s.ok()) { + s = versions_->LogAndApply(edit, &mutex_); + if (s.ok()) { + DeleteObsoleteFiles(); + MaybeScheduleCompaction(); + } else { + Log(options_.info_log, "[%s] Fail to modify manifest", + dbname_.c_str()); + } + } else { + Log(options_.info_log, "[%s] Fail to dump log to level 0", dbname_.c_str()); + } + return s; +} // end of 
tera-specific +bool ScoreSortGreater(std::pair i, std::pair j) { + if (i.second != j.second) { + return i.second < j.second; + } else { + return i.first > j.first; + } +} void DBImpl::MaybeScheduleCompaction() { mutex_.AssertHeld(); if (shutting_down_.Acquire_Load()) { // DB is being deleted; no more background compactions + } else if (bg_error_.IsIOPermissionDenied()) { + // We have met an PermissionDenied error, not try to do compaction anymore, the tablet will be unloaded soon } else { - uint64_t timeout = 0; - double score = versions_->CompactionScore(&timeout); - if (manual_compaction_ != NULL) { - score = kManualCompactScore; - timeout = 0; - } - if (imm_ != NULL) { - score = kDumpMemTableScore; - timeout = 0; - } - if (score > 0) { - if (!bg_compaction_scheduled_) { - bg_schedule_id_ = env_->Schedule(&DBImpl::BGWork, this, score, timeout); - Log(options_.info_log, "[%s] Schedule Compact[%ld] score= %.2f, timeout=%lu", - dbname_.c_str(), bg_schedule_id_, score, timeout); - bg_compaction_score_ = score; - bg_compaction_timeout_ = timeout; - bg_compaction_scheduled_ = true; - assert(score <= 1 || timeout == 0); // if score > 1, then timeout MUST be 0 - } else { - // use the same way to compute priority score, like util/thread_pool.h - bool need_resched = false; - if (timeout != bg_compaction_timeout_) { - need_resched = timeout < bg_compaction_timeout_; - } else if (score != bg_compaction_score_) { - need_resched = score > bg_compaction_score_; - } - - if (need_resched) { - env_->ReSchedule(bg_schedule_id_, score, timeout); - Log(options_.info_log, "[%s] ReSchedule Compact[%ld] score= %.2f, timeout=%lu", - dbname_.c_str(), bg_schedule_id_, score, timeout); - bg_compaction_score_ = score; - bg_compaction_timeout_ = timeout; - assert(score <= 1 || timeout == 0); // if score > 1, then timeout MUST be 0 + std::vector > scores; + if (imm_ && !imm_->BeingFlushed()) { + scores.push_back(std::pair(kDumpMemTableScore, 0)); + } + if (manual_compaction_ && 
!manual_compaction_->being_sched && + (manual_compaction_->compaction_conflict != kManualCompactConflict)) { + scores.push_back(std::pair(kManualCompactScore, 0)); + } + versions_->CompactionScore(&scores); + + size_t qlen = scores.size() > bg_compaction_tasks_.size() ? scores.size(): bg_compaction_tasks_.size(); + for (size_t i = 0; i < bg_compaction_tasks_.size(); i++) { + CompactionTask* task = bg_compaction_tasks_[i]; + scores.push_back(std::pair(task->score, task->timeout)); + } + std::sort(scores.begin(), scores.end(), ScoreSortGreater); + + for (size_t i = 0; i < qlen; i++) { + if (bg_compaction_tasks_.size() < options_.max_background_compactions) { + if (i < bg_compaction_tasks_.size()) { // try reschedule + CompactionTask* task = bg_compaction_tasks_[i]; + if (ScoreSortGreater(scores[i], std::pair(task->score, task->timeout))) { // resched + task->score = scores[i].first; + task->timeout = scores[i].second; + env_->ReSchedule(task->id, task->score, task->timeout); + Log(options_.info_log, "[%s] ReSchedule Compact[%ld] score= %.2f, timeout=%lu, currency %d", + dbname_.c_str(), task->id, task->score, task->timeout, (int)bg_compaction_tasks_.size()); + assert(scores[i].first <= 1 || scores[i].second == 0); // if score > 1, then timeout MUST be 0 + } + } else { // new compact task + CompactionTask* task = new CompactionTask; + task->db = this; + task->score = scores[i].first; + task->timeout = scores[i].second; + bg_compaction_tasks_.push_back(task); + task->id = env_->Schedule(&DBImpl::BGWork, task, task->score, task->timeout); + Log(options_.info_log, "[%s] Schedule Compact[%ld] score= %.2f, timeout=%lu, currency %d", + dbname_.c_str(), task->id, task->score, task->timeout, (int)bg_compaction_tasks_.size()); + assert(scores[i].first <= 1 || scores[i].second == 0); // if score > 1, then timeout MUST be 0 } } - } else { - // No work to be done } } + return; } -void DBImpl::BGWork(void* db) { - reinterpret_cast(db)->BackgroundCall(); +void DBImpl::BGWork(void* 
task) { + CompactionTask* ctask = reinterpret_cast(task); + reinterpret_cast(ctask->db)->BackgroundCall(ctask); } -void DBImpl::BackgroundCall() { - Log(options_.info_log, "[%s] BackgroundCall", dbname_.c_str()); +void DBImpl::BackgroundCall(CompactionTask* task) { MutexLock l(&mutex_); - assert(bg_compaction_scheduled_); + Log(options_.info_log, "[%s] BackgroundCompact[%ld] score= %.2f currency %d", + dbname_.c_str(), task->id, task->score, (int)bg_compaction_tasks_.size()); + bool sched_idle = false; if (!shutting_down_.Acquire_Load()) { - Status s = BackgroundCompaction(); + Status s = BackgroundCompaction(&sched_idle); if (s.ok()) { // Success consecutive_compaction_errors_ = 0; @@ -1001,12 +1124,12 @@ void DBImpl::BackgroundCall() { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error Log(options_.info_log, "[%s] Waiting after background compaction error: %s, retry: %d", dbname_.c_str(), s.ToString().c_str(), consecutive_compaction_errors_); - mutex_.Unlock(); ++consecutive_compaction_errors_; - if (consecutive_compaction_errors_ > 100000) { + if (s.IsIOPermissionDenied() || consecutive_compaction_errors_ > 100000) { bg_error_ = s; consecutive_compaction_errors_ = 0; } + mutex_.Unlock(); int seconds_to_sleep = 1; for (int i = 0; i < 3 && i < consecutive_compaction_errors_ - 1; ++i) { seconds_to_sleep *= 2; @@ -1014,36 +1137,53 @@ void DBImpl::BackgroundCall() { env_->SleepForMicroseconds(seconds_to_sleep * 1000000); mutex_.Lock(); } + } else { + sched_idle = true; } - bg_compaction_scheduled_ = false; + std::vector::iterator task_id = std::find(bg_compaction_tasks_.begin(), + bg_compaction_tasks_.end(), + task); + assert(task_id != bg_compaction_tasks_.end()); + bg_compaction_tasks_.erase(task_id); + delete task; // Previous compaction may have produced too many files in a level, // so reschedule another compaction if needed. 
- MaybeScheduleCompaction(); + if (!sched_idle) { + MaybeScheduleCompaction(); + } bg_cv_.SignalAll(); } -Status DBImpl::BackgroundCompaction() { +Status DBImpl::BackgroundCompaction(bool* sched_idle) { mutex_.AssertHeld(); - if (imm_ != NULL) { - return CompactMemTable(); + *sched_idle = false; + if (imm_ && !imm_->BeingFlushed()) { + return CompactMemTable(sched_idle); } - Compaction* c; + Status status; + Compaction* c = NULL; bool is_manual = (manual_compaction_ != NULL); InternalKey manual_end; if (is_manual) { ManualCompaction* m = manual_compaction_; - c = versions_->CompactRange(m->level, m->begin, m->end); - m->done = (c == NULL); + if (m->being_sched) { // other thread doing manual compaction or range being compacted + return status; + } + m->being_sched = true; + bool conflict = false; + c = versions_->CompactRange(m->level, m->begin, m->end, &conflict); + m->compaction_conflict = conflict? kManualCompactConflict : kManualCompactIdle; + m->done = (c == NULL && !conflict); if (c != NULL) { manual_end = c->input(0, c->num_input_files(0) - 1)->largest; } Log(options_.info_log, - "[%s] Manual compaction at level-%d from %s .. %s; will stop at %s\n", - dbname_.c_str(), m->level, + "[%s] Manual compaction, conflit %u, at level-%d from %s .. %s; will stop at %s\n", + dbname_.c_str(), conflict, m->level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), (m->done ? 
"(end)" : manual_end.DebugString().c_str())); @@ -1051,9 +1191,9 @@ Status DBImpl::BackgroundCompaction() { c = versions_->PickCompaction(); } - Status status; if (c == NULL) { // Nothing to do + *sched_idle = true; } else if (!is_manual && c->IsTrivialMove()) { // Move file to next level assert(c->num_input_files(0) == 1); @@ -1070,12 +1210,9 @@ Status DBImpl::BackgroundCompaction() { static_cast(f->file_size), status.ToString().c_str(), versions_->LevelSummary(&tmp)); + versions_->ReleaseCompaction(c, status); } else { - CompactionState* compact = new CompactionState(c); - status = DoCompactionWork(compact); - CleanupCompaction(compact); - c->ReleaseInputs(); - DeleteObsoleteFiles(); + status = ParallelCompaction(c); } delete c; @@ -1094,16 +1231,127 @@ Status DBImpl::BackgroundCompaction() { if (is_manual) { ManualCompaction* m = manual_compaction_; - if (!status.ok()) { - m->done = true; + m->being_sched = false; + if (m->compaction_conflict != kManualCompactConflict) { // PickRange success + if (!status.ok()) { + m->done = true; + } + if (!m->done) { + // We only compacted part of the requested range. Update *m + // to the range that is left to be compacted. + m->tmp_storage = manual_end; + m->begin = &m->tmp_storage; + } + manual_compaction_ = NULL; } - if (!m->done) { - // We only compacted part of the requested range. Update *m - // to the range that is left to be compacted. - m->tmp_storage = manual_end; - m->begin = &m->tmp_storage; + } else if (manual_compaction_ != NULL) { // non manual compact + ManualCompaction* m = manual_compaction_; + m->compaction_conflict = kManualCompactWakeup;// Wakeup here, ManualCompact thread check it + Log(options_.info_log, + "[%s] Wakeup Manual compaction at level-%d from %s .. %s", + dbname_.c_str(), m->level, + (m->begin ? m->begin->DebugString().c_str() : "(begin)"), + (m->end ? 
m->end->DebugString().c_str() : "(end)")); + } + return status; +} + +Status DBImpl::ParallelCompaction(Compaction* c) { + const uint64_t start_micros = env_->NowMicros(); + std::vector compaction_vec; + std::vector compaction_state_vec; + std::vector compact_stragety_vec; + assert(versions_->NumLevelFiles(c->level()) > 0); + SequenceNumber smallest_snapshot = snapshots_.empty() ? kMaxSequenceNumber : *(snapshots_.begin()); + versions_->GenerateSubCompaction(c, &compaction_vec, &mutex_); + mutex_.Unlock(); + + // handle compaction without Lock + std::vector thread_pool; + thread_pool.reserve(compaction_vec.size() - 1); + Log(options_.info_log, "[%s] parallel compacting %d@%d + %d@%d files, " + "sub_compact %lu, snapshot %lu\n", + dbname_.c_str(), + c->num_input_files(0), + c->level(), + c->num_input_files(1), + c->output_level(), + compaction_vec.size(), + smallest_snapshot); + for (size_t i = 0; i < compaction_vec.size(); i++) { + CompactionState* compaction = new CompactionState(compaction_vec[i]); + assert(compaction->builder == NULL); + assert(compaction->outfile == NULL); + compaction->smallest_snapshot = smallest_snapshot; + compaction_state_vec.push_back(compaction); + + CompactStrategy* compact_strategy = NewCompactStrategy(compaction); + compact_stragety_vec.push_back(compact_strategy); + if (i == 0) { + Log(options_.info_log, "[%s] compact strategy: %s, snapshot %lu\n", + dbname_.c_str(), + compact_strategy->Name(), + compaction->smallest_snapshot); } - manual_compaction_ = NULL; + + if (i < compaction_vec.size() - 1) { + thread_pool.emplace_back(&DBImpl::HandleCompactionWork, this, + compaction, compact_strategy); + } else { + HandleCompactionWork(compaction, compact_strategy); + } + } + for (auto& t : thread_pool) { + t.join(); + } + + CompactionStats stats; + CompactionState* compact = new CompactionState(c); + compact->smallest_snapshot = smallest_snapshot; + for (size_t i = 0; i < compaction_vec.size(); i++) { + CompactionState* compaction = 
compaction_state_vec[i]; + for (auto & out : compaction->outputs) { + compact->outputs.push_back(out); + stats.bytes_written += out.file_size; + } + compact->total_bytes += compaction->total_bytes; + if (compact->status.ok()) { + compact->status = compaction->status; + } + + CompactStrategy* compact_stragety = compact_stragety_vec[i]; + delete compact_stragety; + } + for (int which = 0; which < 2; which++) { + for (int i = 0; i < compact->compaction->num_input_files(which); i++) { + stats.bytes_read += compact->compaction->input(which, i)->file_size; + } + } + + mutex_.Lock(); + Status status = compact->status; + if (status.ok()) { + status = InstallCompactionResults(compact); + } + VersionSet::LevelSummaryStorage tmp; + Log(options_.info_log, "[%s] compacted to: %s, compacte stat %s", + dbname_.c_str(), versions_->LevelSummary(&tmp), status.ToString().c_str()); + stats.micros = env_->NowMicros() - start_micros; + stats_[compact->compaction->output_level()].Add(stats); + + for (size_t i = 0; i < compaction_vec.size(); i++) { + CompactionState* compaction = compaction_state_vec[i]; + CleanupCompaction(compaction); // pop pedning output, which can be deleted in DeleteObSoleteFiles() + delete compaction_vec[i]; + } + assert(compact->builder == NULL); + assert(compact->outfile == NULL); + CleanupCompaction(compact); + + versions_->ReleaseCompaction(c, status); // current_version has reference to c->inputs_[0,1] + c->ReleaseInputs(); + if (!status.IsIOPermissionDenied()) { + DeleteObsoleteFiles(); } return status; } @@ -1120,7 +1368,9 @@ void DBImpl::CleanupCompaction(CompactionState* compact) { delete compact->outfile; for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; - pending_outputs_.erase(BuildFullFileNumber(dbname_, out.number)); + if (pending_outputs_.erase(BuildFullFileNumber(dbname_, out.number)) > 0) { + Log(options_.info_log, "[%s] erase pending_output #%lu", dbname_.c_str(), out.number); + } } 
delete compact; } @@ -1138,6 +1388,8 @@ Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { out.smallest.Clear(); out.largest.Clear(); compact->outputs.push_back(out); + + Log(options_.info_log, "[%s] insert pending_output #%lu", dbname_.c_str(), file_number); mutex_.Unlock(); } @@ -1253,64 +1505,63 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } -Status DBImpl::DoCompactionWork(CompactionState* compact) { - const uint64_t start_micros = env_->NowMicros(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - - Log(options_.info_log, "[%s] Compacting %d@%d + %d@%d files", - dbname_.c_str(), - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->output_level()); - - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); - assert(compact->builder == NULL); - assert(compact->outfile == NULL); - if (snapshots_.empty()) { - compact->smallest_snapshot = kMaxSequenceNumber; - } else { - compact->smallest_snapshot = *(snapshots_.begin()); - } - +CompactStrategy* DBImpl::NewCompactStrategy(CompactionState* compact) { CompactStrategy* compact_strategy = NULL; if (options_.compact_strategy_factory) { compact_strategy = options_.compact_strategy_factory->NewInstance(); - if (snapshots_.empty()) { - compact_strategy->SetSnapshot(kMaxSequenceNumber); - } else { - compact_strategy->SetSnapshot(*(snapshots_.begin())); - } - Log(options_.info_log, "[%s] Compact strategy: %s", - dbname_.c_str(), - compact_strategy->Name()); + compact_strategy->SetSnapshot(compact->smallest_snapshot); } + return compact_strategy; +} - // Release mutex while we're actually doing the compaction work - mutex_.Unlock(); +// ** Handle sub compaction without LOCK ** +void DBImpl::HandleCompactionWork(CompactionState* compact, + CompactStrategy* compact_strategy) { + Compaction* c = 
compact->compaction; + Status& status = compact->status; + Iterator* input = versions_->MakeInputIterator(c); + if (c->sub_compact_start_ == "") { + input->SeekToFirst(); + } else { + input->Seek(c->sub_compact_start_); + } + Slice end_key(c->sub_compact_end_); + Log(options_.info_log, "[%s] handle %d@%d + %d@%d compact, range [%s, %s)\n", + dbname_.c_str(), + c->num_input_files(0), + c->level(), + c->num_input_files(1), + c->output_level(), + c->sub_compact_start_.c_str(), + c->sub_compact_end_.c_str()); - Iterator* input = versions_->MakeInputIterator(compact->compaction); - input->SeekToFirst(); - Status status; ParsedInternalKey ikey; std::string current_user_key; bool has_current_user_key = false; SequenceNumber last_sequence_for_key = kMaxSequenceNumber; - for (; input->Valid() && !shutting_down_.Acquire_Load(); ) { // Prioritize immutable compaction work if (has_imm_.NoBarrier_Load() != NULL) { - const uint64_t imm_start = env_->NowMicros(); mutex_.Lock(); - if (imm_ != NULL) { - CompactMemTable(); + if (imm_ && !imm_->BeingFlushed()) { + CompactMemTable(); // no need check failure, because imm_ not null if dump fail. 
bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary } mutex_.Unlock(); - imm_micros += (env_->NowMicros() - imm_start); } Slice key = input->key(); + if (end_key.size() > 0 && internal_comparator_.InternalKeyComparator::Compare(input->key(), end_key) >= 0) { + Log(options_.info_log, "[%s] handle %d@%d + %d@%d compact, stop at %s\n", + dbname_.c_str(), + c->num_input_files(0), + c->level(), + c->num_input_files(1), + c->output_level(), + end_key.data()); + break; // reach end_key, stop this sub compaction + } + if (compact->compaction->ShouldStopBefore(key) && compact->builder != NULL) { // should not overlap level() + 2 too much status = FinishCompactionOutputFile(compact, input); @@ -1431,10 +1682,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } } - if (compact_strategy) { - delete compact_strategy; - } - if (status.ok() && shutting_down_.Acquire_Load()) { status = Status::IOError("Deleting DB during compaction"); } @@ -1451,28 +1698,6 @@ Status DBImpl::DoCompactionWork(CompactionState* compact) { } delete input; input = NULL; - - CompactionStats stats; - stats.micros = env_->NowMicros() - start_micros - imm_micros; - for (int which = 0; which < 2; which++) { - for (int i = 0; i < compact->compaction->num_input_files(which); i++) { - stats.bytes_read += compact->compaction->input(which, i)->file_size; - } - } - for (size_t i = 0; i < compact->outputs.size(); i++) { - stats.bytes_written += compact->outputs[i].file_size; - } - - mutex_.Lock(); - stats_[compact->compaction->output_level()].Add(stats); - - if (status.ok()) { - status = InstallCompactionResults(compact); - } - VersionSet::LevelSummaryStorage tmp; - Log(options_.info_log, - "[%s] compacted to: %s", dbname_.c_str(), versions_->LevelSummary(&tmp)); - return status; } struct IterState { @@ -1652,8 +1877,9 @@ bool DBImpl::BusyWrite() { void DBImpl::Workload(double* write_workload) { MutexLock l(&mutex_); - uint64_t timeout = 0; - double wwl = 
versions_->CompactionScore(&timeout); + std::vector > scores; + versions_->CompactionScore(&scores); + double wwl = scores.size() > 0? scores[0].first: 0; if (wwl >= 0) { *write_workload = wwl; } else { @@ -1837,6 +2063,71 @@ bool DBImpl::GetProperty(const Slice& property, std::string* value) { } else if (in == "sstables") { *value = versions_->current()->DebugString(); return true; + } else if (in == "verify-db-integrity") { + std::map check_file_list; + versions_->AddLiveFilesWithSize(&check_file_list); + mutex_.Unlock(); + + std::set tablet_num; + std::map::iterator it = check_file_list.begin(); + for (; it != check_file_list.end(); ++it) { + uint64_t tablet; + ParseFullFileNumber(it->first, &tablet, NULL); + tablet_num.insert(tablet); + } + + Status s; + std::set::iterator it_tablet = tablet_num.begin(); + for (; s.ok() && it_tablet != tablet_num.end(); ++it_tablet) { + std::vector filenames; + std::string tablet_path = RealDbName(dbname_, *it_tablet); + s = env_->GetChildren(tablet_path, &filenames); + //Log(options_.info_log, "[%s] verify db(slow), GetChildren %s, files_nr %lu, status %s", + // dbname_.c_str(), tablet_path.c_str(), filenames.size(), s.ToString().c_str()); + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && (type == kTableFile)) { + uint64_t tablet_no = BuildFullFileNumber(tablet_path, number); + if (check_file_list.find(tablet_no) == check_file_list.end()) { + continue; + } + + uint64_t fsize = 0; + Status s1 = env_->GetFileSize(tablet_path + "/" + filenames[i], &fsize); + if (!s1.ok() || check_file_list[tablet_no] == fsize) { + check_file_list.erase(tablet_no); + } else { + Log(options_.info_log, "[%s] verify db, size mismatch, " + "path %s, tablet %s, size(in meta) %lu, size(in fs) %lu", + dbname_.c_str(), tablet_path.c_str(), filenames[i].c_str(), check_file_list[tablet_no], fsize); + } + } + } + } + + mutex_.Lock(); + std::map live; + 
versions_->AddLiveFilesWithSize(&live); + + it = check_file_list.begin(); + while (it != check_file_list.end()) { + if (live.find(it->first) == live.end()) { + it = check_file_list.erase(it); + } else { + ++it; + } + } + + if (s.ok() && check_file_list.empty()) { // verify success + value->append("verify_success"); + } else if (s.ok()) { //sst file lost + value->append("verify_fail"); + Log(options_.info_log, "[%s] db_corruption, lost %lu", + dbname_.c_str(), check_file_list.size()); + } + return s.ok(); } return false; @@ -1894,29 +2185,29 @@ uint64_t DBImpl::GetLastSequence(bool is_locked) { retval = versions_->LastSequence(); } if (is_locked) { - mutex_.Unlock(); + mutex_.Unlock(); } return retval; } MemTable* DBImpl::NewMemTable() const { - if (!options_.use_memtable_on_leveldb) { - return new MemTable(internal_comparator_, - options_.enable_strategy_when_get ? options_.compact_strategy_factory : NULL); - } else { - Logger* info_log = NULL; - //Logger* info_log = options_.info_log; - MemTableOnLevelDB* new_mem = new MemTableOnLevelDB(dbname_, internal_comparator_, - options_.compact_strategy_factory, - options_.memtable_ldb_write_buffer_size, - options_.memtable_ldb_block_size, - info_log); - std::multiset::iterator i = snapshots_.begin(); - for (; i != snapshots_.end(); ++i) { - new_mem->GetSnapshot(*i); - } - return new_mem; - } + if (!options_.use_memtable_on_leveldb) { + return new MemTable(internal_comparator_, + options_.enable_strategy_when_get ? 
options_.compact_strategy_factory : NULL); + } else { + Logger* info_log = NULL; + //Logger* info_log = options_.info_log; + MemTableOnLevelDB* new_mem = new MemTableOnLevelDB(dbname_, internal_comparator_, + options_.compact_strategy_factory, + options_.memtable_ldb_write_buffer_size, + options_.memtable_ldb_block_size, + info_log); + std::multiset::iterator i = snapshots_.begin(); + for (; i != snapshots_.end(); ++i) { + new_mem->GetSnapshot(*i); + } + return new_mem; + } } uint64_t DBImpl::GetLastVerSequence() { @@ -1929,4 +2220,70 @@ Iterator* DBImpl::NewInternalIterator() { return NewInternalIterator(ReadOptions(), &ignored); } +Status DBImpl::BeginNewDbTransaction() { + std::string lock_file_name = dbname_ + init_load_filelock; + Status s = env_->FileExists(lock_file_name); + if (s.IsNotFound()) { + // first new by split or merge add __lock file for first create lg + s = WriteStringToFileSync(env_, "\n", lock_file_name); + if (!s.ok()) { + Log(options_.info_log, "[%s] fail to start new db transaction: %s", + dbname_.c_str(), s.ToString().c_str()); + return s; + } + } else if (s.ok()) { + // have failed before this time to open + // && ignore corruption option not opened + // && don't have sst files + // need to delete all files in this db except __init_load_filelock file + Log(options_.info_log, "[%s] begin to re-new db: %s", + dbname_.c_str(), s.ToString().c_str()); + std::vector files; + s = env_->GetChildren(dbname_, &files); + if (!s.ok()) { + Log(options_.info_log, "[%s] fail to re-new db: %s", + dbname_.c_str(), s.ToString().c_str()); + return s; + } + uint64_t number; + FileType type; + for (size_t f = 0; f < files.size(); ++f) { + if (ParseFileName(files[f], &number, &type) && kTableFile == type) { + return s; + } + } + for (size_t f = 0; f < files.size(); ++f) { + std::string old_file_name = dbname_ + "/" + files[f]; + if ("/" + files[f] != init_load_filelock) { + s = env_->DeleteFile(old_file_name); + if (!s.ok()) { + Log(options_.info_log, "[%s] 
fail to re-new db: %s", + dbname_.c_str(), s.ToString().c_str()); + return s; + } + } + } + } + return s; +} + +Status DBImpl::CommitNewDbTransaction() { + std::string lock_file_name = dbname_ + init_load_filelock; + Status s = env_->FileExists(lock_file_name); + if (s.IsNotFound()) { + // lost lock file during this new db + Log(options_.info_log, "[%s] find transaction lock file fail: %s", + dbname_.c_str(), s.ToString().c_str()); + return Status::Corruption("newdb transaction lock disappeared"); + } else if (s.ok()) { + s = env_->DeleteFile(lock_file_name); + if (!s.ok()) { + Log(options_.info_log, "[%s] delete transaction lock file fail: %s", + dbname_.c_str(), s.ToString().c_str()); + return Status::Corruption("newdb transaction clean lock faild"); + } + } + return s; +} + } // namespace leveldb diff --git a/src/leveldb/db/db_impl.h b/src/leveldb/db/db_impl.h index 05b1ae623..8f23fb1c2 100644 --- a/src/leveldb/db/db_impl.h +++ b/src/leveldb/db/db_impl.h @@ -14,6 +14,8 @@ #include "db/db_table.h" #include "db/dbformat.h" #include "db/log_writer.h" +#include "db/version_set.h" +#include "leveldb/compact_strategy.h" #include "leveldb/db.h" #include "leveldb/env.h" #include "port/port.h" @@ -52,6 +54,8 @@ class DBImpl : public DB { virtual void GetApproximateSizes(uint64_t* size, std::vector* lgsize = NULL); virtual void CompactRange(const Slice* begin, const Slice* end, int lg_no = -1); + virtual bool ShouldForceUnloadOnError(); + void AddBoundLogSize(uint64_t size); // tera-specific @@ -96,6 +100,12 @@ class DBImpl : public DB { friend class DBTable; struct CompactionState; struct Writer; + struct CompactionTask { + int64_t id; // compaction thread id + double score; // compaction score + uint64_t timeout; // compaction task delay time + DBImpl* db; + }; Iterator* NewInternalIterator(const ReadOptions&, SequenceNumber* latest_snapshot); @@ -105,15 +115,23 @@ class DBImpl : public DB { void MaybeIgnoreError(Status* s) const; + // parallel compaction + Status 
ParallelCompaction(Compaction* c); + + CompactStrategy* NewCompactStrategy(CompactionState* compact); + + void HandleCompactionWork(CompactionState* compact, + CompactStrategy* compact_strategy); + // Delete any unneeded files and stale in-memory entries. void DeleteObsoleteFiles(); // Compact the in-memory write buffer to disk. Switches to a new // log-file/memtable and writes a new descriptor iff successful. - Status CompactMemTable() + Status CompactMemTable(bool* sched_idle = NULL) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base) + Status WriteLevel0Table(MemTable* mem, VersionEdit* edit, Version* base, uint64_t* number = NULL) EXCLUSIVE_LOCKS_REQUIRED(mutex_); Status MakeRoomForWrite(bool force /* compact even if there is room? */) @@ -121,12 +139,10 @@ class DBImpl : public DB { void MaybeScheduleCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); static void BGWork(void* db); - void BackgroundCall(); - Status BackgroundCompaction() EXCLUSIVE_LOCKS_REQUIRED(mutex_); + void BackgroundCall(CompactionTask* task); + Status BackgroundCompaction(bool* sched_idle) EXCLUSIVE_LOCKS_REQUIRED(mutex_); void CleanupCompaction(CompactionState* compact) EXCLUSIVE_LOCKS_REQUIRED(mutex_); - Status DoCompactionWork(CompactionState* compact) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); Status OpenCompactionOutputFile(CompactionState* compact); Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); @@ -154,6 +170,10 @@ class DBImpl : public DB { bool CheckMemTableCompaction(uint64_t last_sequence); MemTable* NewMemTable() const; + // new db transaction process + Status BeginNewDbTransaction(); + Status CommitNewDbTransaction(); + // Constant after construction Env* const env_; const InternalKeyComparator internal_comparator_; @@ -196,18 +216,24 @@ class DBImpl : public DB { std::set pending_outputs_; // Has a background compaction been scheduled or is running? 
- bool bg_compaction_scheduled_; - double bg_compaction_score_; - uint64_t bg_compaction_timeout_; - int64_t bg_schedule_id_; + std::vector bg_compaction_tasks_; + std::vector bg_compaction_score_; + std::vector bg_schedule_id_; // Information for a manual compaction + enum ManualCompactState { + kManualCompactIdle, // manual compact inited + kManualCompactConflict, // manual compact run simultaneously + kManualCompactWakeup, // restart delay compact task + }; struct ManualCompaction { int level; bool done; + bool being_sched; const InternalKey* begin; // NULL means beginning of key range const InternalKey* end; // NULL means end of key range InternalKey tmp_storage; // Used to keep track of compaction progress + ManualCompactState compaction_conflict; // 0 == idle, 1 == conflict, 2 == wake }; ManualCompaction* manual_compaction_; diff --git a/src/leveldb/db/db_table.cc b/src/leveldb/db/db_table.cc index d9a4a725c..89ea76a1e 100644 --- a/src/leveldb/db/db_table.cc +++ b/src/leveldb/db/db_table.cc @@ -98,6 +98,10 @@ Options InitOptionsLG(const Options& options, uint32_t lg_id) { opt.sst_size = lg_info->sst_size; opt.write_buffer_size = lg_info->write_buffer_size; opt.seek_latency = lg_info->seek_latency; + if (options.ignore_corruption_in_open_lg_list.find(lg_id) + != options.ignore_corruption_in_open_lg_list.end()) { + opt.ignore_corruption_in_open = true; + } return opt; } @@ -311,22 +315,6 @@ Status DBTable::Init() { uint32_t i = *it; DBImpl* impl = lg_list_[i]; s = impl->RecoverLastDumpToLevel0(lg_edits[i]); - - // LogAndApply to lg's manifest - if (s.ok()) { - MutexLock lock(&impl->mutex_); - s = impl->versions_->LogAndApply(lg_edits[i], &impl->mutex_); - if (s.ok()) { - impl->DeleteObsoleteFiles(); - impl->MaybeScheduleCompaction(); - } else { - Log(options_.info_log, "[%s] Fail to modify manifest of lg %d", - dbname_.c_str(), - i); - } - } else { - Log(options_.info_log, "[%s] Fail to dump log to level 0", dbname_.c_str()); - } delete lg_edits[i]; } @@ -497,6 
+485,9 @@ Status DBTable::Write(const WriteOptions& options, WriteBatch* my_batch) { break; } mutex_.Lock(); + if (s.IsIOPermissionDenied()) { + fatal_error_ = s; + } } if (s.ok()) { std::vector lg_updates; @@ -525,7 +516,6 @@ Status DBTable::Write(const WriteOptions& options, WriteBatch* my_batch) { Log(options_.info_log, "[%s] [Fatal] Write to lg%u fail", dbname_.c_str(), i); s = lg_s; - fatal_error_ = lg_s; break; } } @@ -534,7 +524,10 @@ Status DBTable::Write(const WriteOptions& options, WriteBatch* my_batch) { for (uint32_t i = 0; i < lg_list_.size(); ++i) { lg_list_[i]->AddBoundLogSize(updates->DataSize()); } + } else { + fatal_error_ = s; } + // Commit updates if (s.ok() && lg_list_.size() > 1) { for (uint32_t i = 0; i < lg_list_.size(); ++i) { @@ -696,6 +689,19 @@ void DBTable::ReleaseSnapshot(uint64_t sequence_number) { } } +bool DBTable::ShouldForceUnloadOnError() { + MutexLock l(&mutex_); + bool permission_error = fatal_error_.IsIOPermissionDenied(); + if (permission_error) { //return early + return permission_error; + } + std::set::iterator it = options_.exist_lg_list->begin(); + for (; it != options_.exist_lg_list->end(); ++it) { + permission_error |= lg_list_[*it]->ShouldForceUnloadOnError(); + } + return permission_error; +} + const uint64_t DBTable::Rollback(uint64_t snapshot_seq, uint64_t rollback_point) { std::set::iterator it = options_.exist_lg_list->begin(); uint64_t rollback_seq = rollback_point == kMaxSequenceNumber ? 
last_sequence_ : rollback_point;; @@ -708,21 +714,28 @@ const uint64_t DBTable::Rollback(uint64_t snapshot_seq, uint64_t rollback_point) bool DBTable::GetProperty(const Slice& property, std::string* value) { bool ret = true; std::string ret_string; + std::set::iterator it = options_.exist_lg_list->begin(); for (; it != options_.exist_lg_list->end(); ++it) { std::string lg_value; bool lg_ret = lg_list_[*it]->GetProperty(property, &lg_value); if (lg_ret) { if (options_.exist_lg_list->size() > 1) { - ret_string.append(Uint64ToString(*it) + ": {\n"); + ret_string.append("LG:" + Uint64ToString(*it) + ":"); } ret_string.append(lg_value); if (options_.exist_lg_list->size() > 1) { - ret_string.append("\n}\n"); + ret_string.append(" "); } + } else { + ret = false; + break; } } - *value = ret_string; + + if (ret) { + *value = ret_string; + } return ret; } @@ -936,7 +949,6 @@ Status DBTable::RecoverLogFile(uint64_t log_number, uint64_t recover_limit, } } delete file; - return status; } @@ -1131,6 +1143,14 @@ int64_t DBTable::TEST_MaxNextLevelOverlappingBytes() { } int DBTable::SwitchLog(bool blocked_switch) { + { + MutexLock l(&mutex_); + if (fatal_error_.IsIOPermissionDenied()) { + Log(options_.info_log, "[%s] can not switch log becasue %s", + dbname_.c_str(), fatal_error_.ToString().c_str()); + return 2; + } + } if (!blocked_switch || log::AsyncWriter::BlockLogNum() < options_.max_block_log_number) { if (current_log_size_ == 0) { @@ -1156,6 +1176,10 @@ int DBTable::SwitchLog(bool blocked_switch) { Log(options_.info_log, "[%s] SwitchLog", dbname_.c_str()); } return 0; // success + } else if (s.IsIOPermissionDenied()) { + MutexLock l(&mutex_); + fatal_error_ = s; + return 2; // posix error EACCES = 13 } else { Log(options_.info_log, "[%s] fail to open logfile %s. 
SwitchLog failed", dbname_.c_str(), log_file_name.c_str()); diff --git a/src/leveldb/db/db_table.h b/src/leveldb/db/db_table.h index 4fa0a11c4..4ff14f46a 100644 --- a/src/leveldb/db/db_table.h +++ b/src/leveldb/db/db_table.h @@ -88,6 +88,8 @@ class DBTable : public DB { virtual const uint64_t Rollback(uint64_t snapshot_seq, uint64_t rollback_point = kMaxSequenceNumber); + virtual bool ShouldForceUnloadOnError(); + // DB implementations can export properties about their state // via this method. If "property" is a valid property understood by this // DB implementation, fills "*value" with its current value and returns diff --git a/src/leveldb/db/db_test.cc b/src/leveldb/db/db_test.cc index 7c25f2de6..a12a0536a 100644 --- a/src/leveldb/db/db_test.cc +++ b/src/leveldb/db/db_test.cc @@ -98,6 +98,23 @@ class SpecialEnv : public EnvWrapper { } Status NewWritableFile(const std::string& f, WritableFile** r) { + class InitLoadLockFile : public WritableFile { + private: + SpecialEnv* env_; + WritableFile* base_; + + public: + InitLoadLockFile(SpecialEnv* env, WritableFile* base) + : env_(env), + base_(base) { + } + ~InitLoadLockFile() { delete base_; } + Status Append(const Slice& data) { return base_->Append(data); } + Status Close() { return base_->Close(); } + Status Flush() { return base_->Flush(); } + Status Sync() { return base_->Sync(); } + }; + class SSTableFile : public WritableFile { private: SpecialEnv* env_; @@ -165,6 +182,8 @@ class SpecialEnv : public EnvWrapper { *r = new SSTableFile(this, *r); } else if (strstr(f.c_str(), "MANIFEST") != NULL) { *r = new ManifestFile(this, *r); + } else if (strstr(f.c_str(), "__init_load_filelock") != NULL) { + *r = new InitLoadLockFile(this, *r); } } return s; @@ -872,6 +891,40 @@ TEST(DBTest, Recover) { } while (ChangeOptions()); } +TEST(DBTest, RecoverWithLostCurrent) { + // before write anything delete current file + ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_ + "/0"))); + leveldb::WritableFile* lock_file; + 
ASSERT_OK(env_->NewWritableFile(dbname_ + "/0/__init_load_filelock", &lock_file)); + ASSERT_OK(lock_file->Append("\n")); + ASSERT_OK(lock_file->Sync()); + ASSERT_OK(lock_file->Close()); + delete lock_file; + do { + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); + } while (ChangeOptions()); +} + +TEST(DBTest, RecoverWithLostManifest) { + // before write anything delete current file + ASSERT_OK(env_->DeleteFile(DescriptorFileName(dbname_ + "/0", 1))); + leveldb::WritableFile* lock_file; + ASSERT_OK(env_->NewWritableFile(dbname_ + "/0/__init_load_filelock", &lock_file)); + ASSERT_OK(lock_file->Append("\n")); + ASSERT_OK(lock_file->Sync()); + ASSERT_OK(lock_file->Close()); + delete lock_file; + do { + Reopen(); + ASSERT_OK(Put("foo", "v3")); + Reopen(); + ASSERT_EQ("v3", Get("foo")); + } while (ChangeOptions()); +} + TEST(DBTest, RecoveryWithEmptyLog) { do { ASSERT_OK(Put("foo", "v1")); diff --git a/src/leveldb/db/filename.cc b/src/leveldb/db/filename.cc index d56ea2ff7..4ac4a3864 100644 --- a/src/leveldb/db/filename.cc +++ b/src/leveldb/db/filename.cc @@ -129,7 +129,7 @@ bool ParseFileName(const std::string& fname, if (rest == "CURRENT") { *number = 0; *type = kCurrentFile; - } else if (rest == "LOCK") { + } else if (rest == "LOCK" || rest == "__init_load_filelock") { *number = 0; *type = kDBLockFile; } else if (rest == "LOG" || rest == "LOG.old") { @@ -242,6 +242,15 @@ std::string BuildTabletPath(const std::string& prefix, uint64_t tablet) { return dbname; } +std::string BuildTabletLgPath(const std::string& prefix, uint64_t tablet, uint64_t lg) { + char buf[100]; + snprintf(buf, sizeof(buf), "/tablet%08llu/%llu", + static_cast(tablet), + static_cast(lg)); + std::string lg_path = prefix + buf; + return lg_path; +} + std::string BuildTableFilePath(const std::string& prefix, uint64_t tablet, uint64_t lg, uint64_t number) { char buf[100]; @@ -252,6 +261,35 @@ std::string BuildTableFilePath(const std::string& prefix, uint64_t tablet, 
return MakeFileName(dbname, number & 0xffffffff, "sst"); } +std::string BuildTrashTableFilePath(const std::string& prefix, uint64_t tablet, + uint32_t lg_id, uint64_t number, + const std::string& time) { + char buf[100]; + snprintf(buf, sizeof(buf), "/tablet%08llu/%lu/%08llu.sst.%s", + static_cast(tablet), + static_cast(lg_id), + static_cast(number), + time.c_str()); + + return prefix + buf; +} + +std::string GetTimeStrFromTrashFile(const std::string& path) { + size_t dir_pos = path.rfind("/"); + if (dir_pos == std::string::npos || dir_pos == path.length() - 1) { + return ""; + } + std::string file = path.substr(dir_pos + 1, path.length() - dir_pos - 1); + + size_t time_pos = file.rfind("."); + if (time_pos == std::string::npos) { + return ""; + } + std::string time_str = file.substr(time_pos + 1, file.length() - time_pos - 1); + + return time_str; +} + std::string BuildTableFilePath(const std::string& prefix, uint64_t lg, uint64_t full_number) { uint64_t tablet, number; ParseFullFileNumber(full_number, &tablet, &number); diff --git a/src/leveldb/db/filename.h b/src/leveldb/db/filename.h index ede91c51a..b151c165b 100644 --- a/src/leveldb/db/filename.h +++ b/src/leveldb/db/filename.h @@ -97,12 +97,27 @@ extern uint64_t BuildFullFileNumber(const std::string& dbname, // from (/table1, 3) std::string BuildTabletPath(const std::string& prefix, uint64_t tablet); +std::string BuildTabletLgPath(const std::string& prfix, uint64_t tablet, uint64_t lg); + // Build file path from tablet_num & lg_num & file number // E.g. construct "/table1/tablet000003/0/00000001.sst" // from (/table1, 3, 0, 1) std::string BuildTableFilePath(const std::string& prefix, uint64_t tablet, uint64_t lg, uint64_t number); +// Build trash file path from tablet_num & lg & file number & time +// E.g. 
construct "/table1/tablet000003/0/00000001.sst.20170718-17-08-30" +// from (/table1, 3, 0, 1, 20170718-17-08-30) +std::string BuildTrashTableFilePath(const std::string& prefix, uint64_t tablet, + uint32_t lg_id, uint64_t number, + const std::string& time); + +// get time string from trash file path +// E.g. get "20170718-17-08-30" +// from "/table1/tablet000003/0/00000001.sst.20170718-17-08-30" +// if path is invalid, return "" +std::string GetTimeStrFromTrashFile(const std::string& path); + // Build file path from lg_num & full file number // E.g. construct "/table1/tablet000003/0/00000001.sst" // from (/table1, 0, 0x8000000300000001) diff --git a/src/leveldb/db/memtable.cc b/src/leveldb/db/memtable.cc index c9f284110..ddee41b1d 100644 --- a/src/leveldb/db/memtable.cc +++ b/src/leveldb/db/memtable.cc @@ -26,6 +26,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, CompactStrategyFactory* com : last_seq_(0), comparator_(cmp), refs_(0), + being_flushed_(false), table_(comparator_, &arena_), empty_(true), compact_strategy_factory_(compact_strategy_factory) { diff --git a/src/leveldb/db/memtable.h b/src/leveldb/db/memtable.h index ba608550e..a2a1a073a 100644 --- a/src/leveldb/db/memtable.h +++ b/src/leveldb/db/memtable.h @@ -79,6 +79,13 @@ class MemTable { empty_ = false; } + bool BeingFlushed() { return being_flushed_;} + void SetBeingFlushed(bool flag) { + assert(flag ? !being_flushed_ + : being_flushed_); + being_flushed_ = flag; + } + virtual ~MemTable(); protected: @@ -97,6 +104,7 @@ class MemTable { KeyComparator comparator_; int refs_; + bool being_flushed_; Arena arena_; Table table_; diff --git a/src/leveldb/db/version_edit.cc b/src/leveldb/db/version_edit.cc index fc95284e6..244733915 100644 --- a/src/leveldb/db/version_edit.cc +++ b/src/leveldb/db/version_edit.cc @@ -11,7 +11,6 @@ #include "db/filename.h" #include "db/version_set.h" #include "util/coding.h" - namespace leveldb { // Tag numbers for serialized VersionEdit. 
These numbers are written to @@ -29,6 +28,7 @@ enum Tag { kNewFile = 10, kDeletedFile = 11, kNewFileInfo = 12, + kSstFileDataSize = 13, // no more than 1<<20 kMaxTag = 1 << 20, @@ -147,6 +147,13 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, str.size() + kMaxTag); PutVarint32(dst, kNewFileInfo); dst->append(str.data(), str.size()); + + // record sst FileData + str.clear(); + PutVarint64(&str, f.data_size); + PutVarint32(dst, str.size() + kMaxTag); + PutVarint32(dst, kSstFileDataSize); + dst->append(str.data(), str.size()); } } @@ -171,6 +178,43 @@ static bool GetLevel(Slice* input, int* level) { } } +Status VersionEdit::DecodeNewFileInfo(Slice* input, FileMetaData* f) { + bool decode_continue = true; + + while (decode_continue && input->size() > 0) { + uint32_t len = 0; + uint32_t tag = 0; + Slice file_input = *input; + GetVarint32(&file_input, &len); + if (len <= kMaxTag) { + break; + } + + GetVarint32(&file_input, &tag); + switch (tag) { + case kNewFileInfo: + GetVarint32(input, &len);// ignore len + GetVarint32(input, &tag);// ignore tag + GetVarint64(input, &f->del_percentage); + GetVarint64(input, &f->ttl_percentage); + GetVarint64(input, &f->check_ttl_ts); + break; + case kSstFileDataSize: + GetVarint32(input, &len); + GetVarint32(input, &tag); + GetVarint64(input, &f->data_size); + break; + default: + fprintf(stderr, "NewFile %lu without info, skip tag %d, len %d\n", + f->number & 0xffffffff, + tag, len); + decode_continue = false; + break; + } + } + return Status::OK(); +} + Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); Slice input = src; @@ -285,29 +329,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { f.largest_fake = true; } - // new file format parser - Slice file_ptr = input; - uint32_t file_tag; - GetVarint32(&file_ptr, &file_tag); - if (file_tag > kMaxTag) { - // file_tag - kMaxTag; - GetVarint32(&file_ptr, &tag); - } - switch (tag) { - case kNewFileInfo: - GetVarint32(&input, &tag);// ignore len - 
GetVarint32(&input, &tag);// ignore tag - GetVarint64(&input, &f.del_percentage); - GetVarint64(&input, &f.ttl_percentage); - GetVarint64(&input, &f.check_ttl_ts); - break; - - default: - fprintf(stderr, "NewFile %lu without info, skip tag %d, len %d\n", - f.number & 0xffffffff, - tag, file_tag); - break; - } + DecodeNewFileInfo(&input, &f); new_files_.push_back(std::make_pair(level, f)); } else { msg = "new-file entry 1"; @@ -400,6 +422,8 @@ std::string VersionEdit::DebugString() const { AppendNumberTo(&r, file_number); r.append(" size "); AppendNumberTo(&r, f.file_size); + r.append(" data_size "); + AppendNumberTo(&r, f.data_size); r.append(" "); r.append(f.smallest.DebugString()); r.append(" .. "); diff --git a/src/leveldb/db/version_edit.h b/src/leveldb/db/version_edit.h index 0c64728d0..17b9bfc36 100644 --- a/src/leveldb/db/version_edit.h +++ b/src/leveldb/db/version_edit.h @@ -33,6 +33,7 @@ struct FileMetaData { InternalKey largest; // Largest internal key served by table bool smallest_fake; // smallest is not real, have out-of-range keys bool largest_fake; // largest is not real, have out-of-range keys + bool being_compacted; // Is this file undergoing compaction? 
FileMetaData() : refs(0), @@ -44,7 +45,8 @@ struct FileMetaData { file_size(0), data_size(0), smallest_fake(false), - largest_fake(false) { } + largest_fake(false), + being_compacted(false) { } }; class VersionEdit { @@ -157,6 +159,7 @@ class VersionEdit { FileMetaData f; f.number = file; f.file_size = file_size; + f.data_size = f.file_size; f.smallest = smallest; f.largest = largest; f.del_percentage = del_percentage; @@ -185,6 +188,7 @@ class VersionEdit { void EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); + Status DecodeNewFileInfo(Slice* input, FileMetaData* f); std::string DebugString() const; diff --git a/src/leveldb/db/version_edit_test.cc b/src/leveldb/db/version_edit_test.cc index c728af4cc..44a5d308f 100644 --- a/src/leveldb/db/version_edit_test.cc +++ b/src/leveldb/db/version_edit_test.cc @@ -26,6 +26,7 @@ enum Tag { kNewFile = 10, kDeletedFile = 11, kNewFileInfo = 12, + kSstFileDataSize = 13, // no more than 1<<20 kMaxTag = 1 << 20, @@ -53,7 +54,7 @@ class VersionEditTest: public VersionEdit { dst->append(str.data(), str.size()); } } - void EncodeToOld(std::string* dst) { + void EncodeToOld(std::string* dst, bool with_sst, bool with_data_size) { DumpToOldFormat(); if (has_comparator_) { PutVarint32(dst, kComparator); @@ -71,7 +72,43 @@ class VersionEditTest: public VersionEdit { PutVarint32(dst, kLastSequence); PutVarint64(dst, last_sequence_); } + if (!with_sst) { + return; + } + for (uint32_t i = 0; i < 5; i++) { + FileMetaData f; + f.number = 100 + i; + f.file_size = 200 + i; + f.data_size = f.file_size; + f.smallest = InternalKey("apple", 300 + i, kTypeValue); + f.largest = InternalKey("zookeeper", 400 + i, kTypeDeletion); + EncodeSstFile(i, f, dst, with_data_size); + } + } + + void EncodeSstFile(uint32_t level, const FileMetaData& f, std::string* dst, bool with_data_size) { + std::string str; + PutVarint32(&str,level); // level + PutVarint64(&str, f.number); + PutVarint64(&str, f.file_size); + PutLengthPrefixedSlice(&str, 
f.smallest.Encode()); + PutLengthPrefixedSlice(&str, f.largest.Encode()); + PutVarint32(&str, 0); // put f.smallest_fake + PutVarint32(&str, 0); // put f.largest_fake + + PutVarint32(dst, str.size() + kMaxTag); + PutVarint32(dst, kNewFile); + dst->append(str.data(), str.size()); + // record sst FileData + if (with_data_size) { + str.clear(); + PutVarint64(&str, f.data_size); + PutVarint32(dst, str.size() + kMaxTag); + PutVarint32(dst, kSstFileDataSize); + dst->append(str.data(), str.size()); + } } + void DumpToOldFormat() { has_comparator_ = HasComparator(); comparator_ = GetComparatorName(); @@ -126,22 +163,28 @@ static void CreateEditContent(VersionEditTest* edit) { edit->SetLastSequence(900); TestEncodeDecode(*edit); } -static void CreateEditContentV2(VersionEditTest* edit) { +static void CreateOldEncodedContent(VersionEditTest* edit, std::string* dst, + bool with_sst, bool with_data_size) { edit->SetComparatorName("test_nil_cmp"); edit->SetLogNumber(700); edit->SetNextFile(800); edit->SetLastSequence(900); TestEncodeDecode(*edit); + edit->EncodeToOld(dst, with_sst, with_data_size); } -static void CreateEditWithTtlInfo(VersionEditTest* edit) { +static void CreateEditWithSstDetail(VersionEditTest* edit) { for (int i = 0; i < 5; i++) { TestEncodeDecode(*edit); - edit->AddFile(i, 100 + i, 200 + i, - InternalKey("apple", 300 + i, kTypeValue), - InternalKey("zookeeper", 400 + i, kTypeDeletion), - 20 + i/* del percentage */, - 1000000000 + i/* timeout */, - 50 + i/* del percentage */); + FileMetaData f; + f.number = 100 + i; + f.file_size = 200 + i; + f.data_size = f.file_size; + f.smallest = InternalKey("apple", 300 + i, kTypeValue); + f.largest = InternalKey("zookeeper", 400 + i, kTypeDeletion); + f.del_percentage = 20 + i; + f.ttl_percentage = 50 + i; + f.check_ttl_ts = 1000000000 + i; + edit->AddFile(i, f); edit->DeleteFile(i, 500 + i); edit->SetCompactPointer(i, InternalKey("x00", 600 + i, kTypeValue)); } @@ -154,14 +197,13 @@ static void 
CreateEditWithTtlInfo(VersionEditTest* edit) { } TEST(VersionEditTest, EncodeFileInfoTag) { VersionEditTest edit; - CreateEditWithTtlInfo(&edit); + CreateEditWithSstDetail(&edit); fprintf(stderr, "%s\n", edit.DebugString().c_str()); } TEST(VersionEditTest, OldFormatRead) { VersionEditTest edit; - CreateEditContentV2(&edit); std::string c1, c3; - edit.EncodeToOld(&c1); // dump into old format + CreateOldEncodedContent(&edit, &c1, false, false); edit.EncodeTo(&c3); // dump into new format VersionEditTest parsed; @@ -174,6 +216,23 @@ TEST(VersionEditTest, OldFormatRead) { fprintf(stderr, "%s\n", parsed.DebugString().c_str()); } +TEST(VersionEditTest, DecodeFormatWithoutSstFileDataSize) { + VersionEditTest edit; + std::string c1, c3; + CreateOldEncodedContent(&edit, &c1, true, false); + edit.EncodeTo(&c3); // dump into new format + + VersionEditTest parsed; + Status s = parsed.DecodeFrom(c1); // use new Decode to parse old format + ASSERT_TRUE(s.ok()) << s.ToString(); + std::string c2; + parsed.EncodeTo(&c2); + + ASSERT_NE(c2, c3); + fprintf(stderr, "%s\n", parsed.DebugString().c_str()); + +} + TEST(VersionEditTest, EncodeUnknowTag) { VersionEditTest edit; CreateEditContent(&edit); diff --git a/src/leveldb/db/version_set.cc b/src/leveldb/db/version_set.cc index 088acd090..4c5d328aa 100644 --- a/src/leveldb/db/version_set.cc +++ b/src/leveldb/db/version_set.cc @@ -70,6 +70,15 @@ static int64_t TotalFileSize(const std::vector& files) { } return sum; } +static int64_t TotalFileSizeNotBeingCompacted(const std::vector& files) { + int64_t sum = 0; + for (size_t i = 0; i < files.size(); i++) { + if (!files[i]->being_compacted) { + sum += files[i]->file_size; + } + } + return sum; +} Version::~Version() { assert(refs_ == 0); @@ -129,11 +138,11 @@ static bool BeforeFile(const Comparator* ucmp, bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, + const Comparator* ucmp, bool disjoint_sorted_files, const std::vector& files, const Slice* smallest_user_key, const Slice* 
largest_user_key) { - const Comparator* ucmp = icmp.user_comparator(); if (!disjoint_sorted_files) { // Need to check against all files for (size_t i = 0; i < files.size(); i++) { @@ -473,8 +482,17 @@ void Version::Unref() { bool Version::OverlapInLevel(int level, const Slice* smallest_user_key, const Slice* largest_user_key) { - return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], - smallest_user_key, largest_user_key); + + // use row key comparator + CompactStrategy* strategy = vset_->options_->compact_strategy_factory->NewInstance(); + const Comparator* ucmp = strategy->RowKeyComparator(); + if (ucmp == NULL) { + ucmp = vset_->icmp_.user_comparator(); + } + bool overlap = SomeFileOverlapsRange(vset_->icmp_, ucmp, (level > 0), files_[level], + smallest_user_key, largest_user_key); + delete strategy; + return overlap; } int Version::PickLevelForMemTableOutput( @@ -516,12 +534,10 @@ void Version::GetOverlappingInputs( if (end != NULL) { user_end = end->user_key(); } - const Comparator* user_cmp = NULL; - CompactStrategy* strategy = NULL; - if (!vset_->options_->drop_base_level_del_in_compaction) { // use row key comparator - strategy = vset_->options_->compact_strategy_factory->NewInstance(); - user_cmp = strategy->RowKeyComparator(); - } + + // use row key comparator + CompactStrategy* strategy = vset_->options_->compact_strategy_factory->NewInstance(); + const Comparator* user_cmp = strategy->RowKeyComparator(); if (user_cmp == NULL) { user_cmp = vset_->icmp_.user_comparator(); } @@ -818,11 +834,7 @@ class VersionSetBuilder { FileMetaData* f = new FileMetaData(f_new); f->refs = 1; - - if (f->data_size == 0 && !f->smallest_fake && !f->largest_fake) { - // Make sure this is a new file generated by compaction. - f->data_size = f->file_size; - } + f->being_compacted = false; // We arrange to automatically compact this file after // a certain number of seeks. 
Let's assume: @@ -927,6 +939,7 @@ class VersionSetBuilder { vset_->db_key_start_.DebugString().c_str()); f->smallest = vset_->db_key_start_; f->smallest_fake = true; + f->data_size = 0; } else { // file out of tablet range, skip it; return false; @@ -943,6 +956,7 @@ class VersionSetBuilder { vset_->db_key_end_.DebugString().c_str()); f->largest = vset_->db_key_end_; f->largest_fake = true; + f->data_size = 0; } else { // file out of tablet range, skip it; return false; @@ -1014,7 +1028,18 @@ void VersionSet::AppendVersion(Version* v) { v->next_->prev_ = v; } -Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { +// multi thread safe +// Information kept for every waiting manifest writer +struct VersionSet::ManifestWriter { + Status status; + VersionEdit* edit; + bool done; + port::CondVar cv; + + explicit ManifestWriter(port::Mutex* mu) : done(false), cv(mu) { } +}; +void VersionSet::LogAndApplyHelper(VersionSetBuilder* builder, + VersionEdit* edit) { if (edit->has_log_number_) { assert(edit->log_number_ >= log_number_); assert(edit->log_number_ < next_file_number_); @@ -1036,13 +1061,28 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { edit->SetLastSequence(last_sequence_); } + builder->Apply(edit); +} + +Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { + mu->AssertHeld(); + // multi write control, do not batch edit write, but multi thread safety + ManifestWriter w(mu); + w.edit = edit; + manifest_writers_.push_back(&w); + while (!w.done && &w != manifest_writers_.front()) { + w.cv.Wait(); + } + assert(manifest_writers_.front() == &w); + + // first manifest writer, batch edit Version* v = new Version(this); { VersionSetBuilder builder(this, current_); - builder.Apply(edit); + LogAndApplyHelper(&builder, w.edit); builder.SaveTo(v); } - Finalize(v); + Finalize(v); // recalculate new version score const uint64_t switch_interval = options_->manifest_switch_interval * 1000000UL; if (descriptor_log_ != NULL && @@ 
-1050,6 +1090,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { force_switch_manifest_ = true; } + uint64_t manifest_file_num = manifest_file_number_; int retry_count = 0; Status s; // Unlock during expensive MANIFEST log write @@ -1063,13 +1104,14 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { } mu->Unlock(); + // close current manifest if (force_switch_manifest_) { delete descriptor_log_; delete descriptor_file_; descriptor_log_ = NULL; descriptor_file_ = NULL; - Log(options_->info_log, "[%s] force switch MANIFEST to %lu", - dbname_.c_str(), manifest_file_number_); + Log(options_->info_log, "[%s] force switch MANIFEST #%lu to #%lu", + dbname_.c_str(), manifest_file_num, manifest_file_number_); force_switch_manifest_ = false; } @@ -1113,15 +1155,65 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { if (s.ok() && !new_manifest_file.empty()) { s = SetCurrentFile(env_, dbname_, manifest_file_number_); if (s.ok()) { - Log(options_->info_log, "[%s] set CURRENT to %llu\n", - dbname_.c_str(), static_cast(manifest_file_number_)); + Log(options_->info_log, "[%s] set CURRENT #%lu to #%llu success\n", + dbname_.c_str(),manifest_file_num, + static_cast(manifest_file_number_)); + manifest_file_num = manifest_file_number_; } else { - Log(options_->info_log, "[%s][dfs error] set CURRENT error: %s\n", - dbname_.c_str(), s.ToString().c_str()); + Log(options_->info_log, "[%s][dfs error] set CURRENT #%lu to #%lu error: %s\n", + dbname_.c_str(), manifest_file_num, manifest_file_number_, + s.ToString().c_str()); } } - if (!s.ok()) { + // switch manifest success, try delete obsolete file + if (!new_manifest_file.empty() && s.ok()) { + // manifest file set, keep latest 3 manifest files for backup + std::set manifest_set; + std::vector filenames; + env_->GetChildren(dbname_, &filenames); + + uint64_t number; + FileType type; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, 
&type)) { + bool keep = true; + switch (type) { + case kDescriptorFile: + manifest_set.insert(filenames[i]); + if (manifest_set.size() > 3) { + std::set::iterator it = manifest_set.begin(); + ParseFileName(*it, &number, &type); + if (number < manifest_file_number_) { + // Keep my manifest file, and any newer incarnations' + // (in case there is a race that allows other incarnations) + filenames[i] = *it; + keep = false; + manifest_set.erase(it); + } + } + break; + case kTempFile: + // Any temp files that are currently being written to must + // be recorded in pending_outputs_, which is inserted into "live" + keep = false; + break; + default: + break; + } + + if (!keep) { + Log(options_->info_log, "[%s] version_set Delete type=%s #%lld, fname %s\n", + dbname_.c_str(), FileTypeToString(type), + static_cast(number), filenames[i].c_str()); + env_->DeleteFile(dbname_ + "/" + filenames[i]); + } + } + } + } + // if MANIFEST or CURRENT file write error because of losting directory lock, + // do not try to switch manifest anymore + if (!s.ok() && !s.IsIOPermissionDenied()) { force_switch_manifest_ = true; if (!new_manifest_file.empty()) { env_->DeleteFile(new_manifest_file); @@ -1141,7 +1233,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { } mu->Lock(); - } while (force_switch_manifest_); + } while (force_switch_manifest_); // bugfix issue=tera-10, dfs sync fail, but eventually success, cause reload fail // Install the new version if (s.ok()) { @@ -1155,6 +1247,10 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { Log(options_->info_log, "[%s][dfs error] set force_switch_manifest", dbname_.c_str()); } + manifest_writers_.pop_front(); + if (!manifest_writers_.empty()) { + manifest_writers_.front()->cv.Signal(); + } return s; } @@ -1462,9 +1558,11 @@ Status VersionSet::Recover() { FileMetaData* f = files[i]; ModifyFileSize(f); // Debug - Log(options_->info_log, "[%s] recover: %s, level: %d, del_p: %lu, check_ttl_ts %lu, ttl_p %lu, 
s: %d %s, l: %d %s\n", + Log(options_->info_log, "[%s] recover: %s, level: %d, file_size %lu, data_size %lu, " + "del_p: %lu, check_ttl_ts %lu, ttl_p %lu, s: %d %s, l: %d %s\n", dbname_.c_str(), FileNumberDebugString(f->number).c_str(), level, + f->file_size, f->data_size, f->del_percentage, f->check_ttl_ts, f->ttl_percentage, @@ -1480,6 +1578,9 @@ Status VersionSet::Recover() { // Modify data_size of file meta bool VersionSet::ModifyFileSize(FileMetaData* f) { + if (f->data_size != 0) { + return true; + } // Try modify data_size in file meta // data_size = largest_key_offset - smallest_key_offset if (f->largest_fake || f->smallest_fake) { @@ -1508,8 +1609,7 @@ bool VersionSet::ModifyFileSize(FileMetaData* f) { static_cast(f->file_size), static_cast(f->data_size)); delete iter; - } else { - // do not need modify + } else { // for compatibility, we have not decoded f->data_size from MANIFEST f->data_size = f->file_size; } return true; @@ -1523,8 +1623,6 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { void VersionSet::Finalize(Version* v) { // Precomputed best level for next compaction - int best_level = -1; - double best_score = -1; int best_del_level = -1; int best_del_idx = -1; int best_ttl_level = -1; @@ -1532,8 +1630,8 @@ void VersionSet::Finalize(Version* v) { int base_level = -1; for (int level = config::kNumLevels - 1; level >= 0; level--) { - double score; - if (level == 0) { + double score = 0; + if (level == 0 && level0_compactions_in_progress_.empty()) { // We treat level-0 specially by bounding the number of files // instead of number of bytes for two reasons: // @@ -1548,11 +1646,16 @@ void VersionSet::Finalize(Version* v) { // // (3) More level0 files means write hotspot. // We give lower score to avoid too much level0 compaction. 
- score = sqrt(v->files_[level].size() / - static_cast(config::kL0_CompactionTrigger)); - } else { + if (v->files_[level].size() <= (size_t)options_->slow_down_level0_score_limit) { + score = v->files_[level].size() / + static_cast(config::kL0_CompactionTrigger); + } else { + score = sqrt(v->files_[level].size() / + static_cast(config::kL0_CompactionTrigger)); + } + } else if (level > 0) { // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]); + const uint64_t level_bytes = TotalFileSizeNotBeingCompacted(v->files_[level]); score = static_cast(level_bytes) / MaxBytesForLevel(level, options_->sst_size); } @@ -1562,16 +1665,15 @@ void VersionSet::Finalize(Version* v) { base_level = level; } - // size compaction does not allow trigger by base level - if ((score > best_score) && (level < config::kNumLevels - 1)) { - best_level = level; - best_score = score; + if (level < config::kNumLevels - 1) { + v->compaction_level_[level] = level; + v->compaction_score_[level] = (score < 1.0) ? 0: score; } for (size_t i = 0; i < v->files_[level].size(); i++) { FileMetaData* f = v->files_[level][i]; // del compaction does not allow trigger by base level - if ((level > 0) && (level < base_level) && + if ((!f->being_compacted) && (level > 0) && (level < base_level) && (f->del_percentage > options_->del_percentage) && (best_del_level < 0 || v->files_[best_del_level][best_del_idx]->del_percentage < f->del_percentage)) { @@ -1580,7 +1682,7 @@ void VersionSet::Finalize(Version* v) { } // ttl compaction can trigger in base level - if ((f->check_ttl_ts > 0) && + if ((!f->being_compacted) && (f->check_ttl_ts > 0) && (best_ttl_level < 0 || v->files_[best_ttl_level][best_ttl_idx]->check_ttl_ts > f->check_ttl_ts)) { best_ttl_level = level; @@ -1589,30 +1691,44 @@ void VersionSet::Finalize(Version* v) { } } - v->compaction_level_ = best_level; - v->compaction_score_ = best_score; + // sort all the levels based on their score. 
Higher scores get listed + // first. Use bubble sort because the number of entries are small. + for (int i = 0; i < config::kNumLevels - 2; i++) { + for (int j = i + 1; j < config::kNumLevels - 1; j++) { + if (v->compaction_score_[i] < v->compaction_score_[j]) { + int level = v->compaction_level_[i]; + double score = v->compaction_score_[i]; + v->compaction_level_[i] = v->compaction_level_[j]; + v->compaction_score_[i] = v->compaction_score_[j]; + v->compaction_level_[j] = level; + v->compaction_score_[j] = score; + } + } + } + if (best_del_level >= 0) { v->del_trigger_compact_ = v->files_[best_del_level][best_del_idx]; v->del_trigger_compact_level_ = best_del_level; Log(options_->info_log, - "[%s] del_strategy(current), level %d, num #%lu, file_size %lu, del_p %lu\n", - dbname_.c_str(), - v->del_trigger_compact_level_, - (v->del_trigger_compact_->number) & 0xffffffff, - v->del_trigger_compact_->file_size, - v->del_trigger_compact_->del_percentage); + "[%s] del_strategy(current), level %d, num #%lu, file_size %lu, del_p %lu\n", + dbname_.c_str(), + v->del_trigger_compact_level_, + (v->del_trigger_compact_->number) & 0xffffffff, + v->del_trigger_compact_->file_size, + v->del_trigger_compact_->del_percentage); } + if (best_ttl_level >= 0) { v->ttl_trigger_compact_ = v->files_[best_ttl_level][best_ttl_idx]; v->ttl_trigger_compact_level_ = best_ttl_level; Log(options_->info_log, - "[%s] ttl_strategy(current), level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n", - dbname_.c_str(), - v->ttl_trigger_compact_level_, - (v->ttl_trigger_compact_->number) & 0xffffffff, - v->ttl_trigger_compact_->file_size, - v->ttl_trigger_compact_->ttl_percentage, - v->ttl_trigger_compact_->check_ttl_ts); + "[%s] ttl_strategy(current), level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n", + dbname_.c_str(), + v->ttl_trigger_compact_level_, + (v->ttl_trigger_compact_->number) & 0xffffffff, + v->ttl_trigger_compact_->file_size, + v->ttl_trigger_compact_->ttl_percentage, + 
v->ttl_trigger_compact_->check_ttl_ts); } } @@ -1757,6 +1873,19 @@ void VersionSet::AddLiveFiles(std::map* live) { } } +void VersionSet::AddLiveFilesWithSize(std::map* live) { + for (Version* v = dummy_versions_.next_; + v != &dummy_versions_; + v = v->next_) { + for (int level = 0; level < config::kNumLevels; level++) { + const std::vector& files = v->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + (*live)[files[i]->number] = files[i]->file_size; + } + } + } +} + int64_t VersionSet::NumLevelBytes(int level) const { assert(level >= 0); assert(level < config::kNumLevels); @@ -1854,97 +1983,472 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { return result; } +void VersionSet::PrintFilesInCompaction(const std::vector& inputs) { + char buf[30]; + std::string fstr = "file: "; + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (f->being_compacted) { + snprintf(buf, sizeof(buf), "%lu ", f->number); + fstr.append(buf); + break; + } + } + Log(options_->info_log, "[%s] test mark level [%s] bening compact.", dbname_.c_str(), + fstr.c_str()); + return; +} + +bool VersionSet::FilesInCompaction(const std::vector& inputs) { + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (f->being_compacted) { + return true; + } + } + return false; +} + +void VersionSet::PrintRangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level) { + std::vector inputs; + assert(level < config::kNumLevels); + current_->GetOverlappingInputs(level, smallest, largest, &inputs); + PrintFilesInCompaction(inputs); + return; +} + +bool VersionSet::RangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level) { + std::vector inputs; + assert(level < config::kNumLevels); + current_->GetOverlappingInputs(level, smallest, largest, &inputs); + return FilesInCompaction(inputs); +} + +bool VersionSet::PickFutureCompaction(int level, std::vector* inputs) { + inputs->clear(); + 
std::vector candidate; + double low_level_score = 0; + double high_level_score = 0; + for (size_t li = 0; li < current_->compaction_score_.size(); li++) { + if (current_->compaction_level_[li] == level) { + low_level_score = current_->compaction_score_[li]; + } else if (current_->compaction_level_[li] == level + 1) { + high_level_score = current_->compaction_score_[li]; + } + } + if (low_level_score < 1.0 || + low_level_score <= high_level_score) { + return false; + } + + // file in level need compaction, pick file in next compaction + for (size_t i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (f->being_compacted) { + continue; + } + + if (!compact_pointer_[level].empty() && + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) <= 0) { + candidate.push_back(f); + continue; + } + + inputs->push_back(f); + break; + } + + if (inputs->empty()) { + FileMetaData* f = current_->files_[level][0]; + if (!f->being_compacted) { + inputs->push_back(f); + } + } + if (inputs->empty() && candidate.size() > 0) { + inputs->push_back(candidate[candidate.size() - 1]); + } + return !inputs->empty(); +} + +bool VersionSet::IsOverlapInFileRange(FileMetaData* lf, FileMetaData* f) { + if (lf == NULL || f == NULL) { + return false; + } + if (icmp_.Compare(lf->largest.Encode(), f->smallest.Encode()) < 0 || + icmp_.Compare(f->largest.Encode(), lf->smallest.Encode()) < 0) { + return false; + } + //Log(options_->info_log, "[%s] file range overlap, lfile #%d, [%s, %s] being_compact %d, " + // "file #%d, [%s, %s] being_compact %d\n", + // dbname_.c_str(), + // static_cast(lf->number & 0xffffffff), + // lf->smallest.Encode().ToString().c_str(), + // lf->largest.Encode().ToString().c_str(), + // lf->being_compacted, + // static_cast(f->number & 0xffffffff), + // f->smallest.Encode().ToString().c_str(), + // f->largest.Encode().ToString().c_str(), + // f->being_compacted); + return true; +} + +// Note: +// 1) if f in level1 being 
compacted, level0 may be blocked; +// 2) compacting pointer may cause other f in the same level to be blocked. +bool VersionSet::PickCompactionBySize(int level, std::vector* inputs) { + // Pick low level file, which will be compact next time + std::vector low_level_inputs; + PickFutureCompaction(level - 1, &low_level_inputs); + FileMetaData* low_level_file = NULL; + if (low_level_inputs.size() > 0) { + low_level_file = low_level_inputs[0]; + //Log(options_->info_log, "[%s] PickCompactionBySize, low_level %d, f[%s, %s] being_compact %d\n", + // dbname_.c_str(), level - 1, + // low_level_file->smallest.Encode().ToString().c_str(), + // low_level_file->largest.Encode().ToString().c_str(), + // low_level_file->being_compacted); + } + + inputs->clear(); + std::vector candidate; + // Pick the first file that comes after compact_pointer_[level] + for (size_t i = 0; i < current_->files_[level].size(); i++) { + FileMetaData* f = current_->files_[level][i]; + if (f->being_compacted) { + //Log(options_->info_log, "[%s] PickCompactionBySize, level %d, f[%s, %s] being_compact %d\n", + // dbname_.c_str(), level, + // f->smallest.Encode().ToString().c_str(), f->largest.Encode().ToString().c_str(), + // f->being_compacted); + continue; + } + if (!compact_pointer_[level].empty() && + icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) <= 0) { + //Log(options_->info_log, "[%s] PickCompactionBySize, skip by compact_pointer_[%d] %s, f[%s, %s] being_compacted %d\n", + // dbname_.c_str(), level, compact_pointer_[level].c_str(), + // f->smallest.Encode().ToString().c_str(), f->largest.Encode().ToString().c_str(), + // f->being_compacted); + if (!RangeInCompaction(&f->smallest, &f->largest, level + 1) && + !IsOverlapInFileRange(low_level_file, f)) { + candidate.push_back(f); + } + continue; + } + if (RangeInCompaction(&f->smallest, &f->largest, level + 1) || + IsOverlapInFileRange(low_level_file, f)) { + //PrintRangeInCompaction(&f->smallest, &f->largest, level + 1); + continue; + 
} + inputs->push_back(f); + break; + } + if (inputs->empty()) { + // Wrap-around to the beginning of the key space + FileMetaData* f = current_->files_[level][0]; + if (!f->being_compacted && !RangeInCompaction(&f->smallest, &f->largest, level + 1) && + !IsOverlapInFileRange(low_level_file, f)) { + inputs->push_back(f); + } + //Log(options_->info_log, "[%s] PickCompactBySize, wrap-arroud level %d, f[%s, %s] being_compacted %d\n", + // dbname_.c_str(), level, + // f->smallest.Encode().ToString().c_str(), f->largest.Encode().ToString().c_str(), + // f->being_compacted); + //PrintRangeInCompaction(&f->smallest, &f->largest, level + 1); + } + if (inputs->empty() && candidate.size() > 0) { + inputs->push_back(candidate[candidate.size() - 1]); + } + return !inputs->empty(); +} + // timeout for micro_second -double VersionSet::CompactionScore(uint64_t* timeout) const { - *timeout = 0; +void VersionSet::CompactionScore(std::vector >* scores) { uint64_t ts = env_->NowMicros(); Version* v = current_; - if (v->compaction_score_ >= 1) { - return v->compaction_score_; - } else if (v->del_trigger_compact_ != NULL && - v->del_trigger_compact_->del_percentage > options_->del_percentage) { - return (double)(v->del_trigger_compact_->del_percentage / 100.0); - } else if (v->ttl_trigger_compact_ != NULL && - ts >= v->ttl_trigger_compact_->check_ttl_ts) { - return (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0); - } else if (v->file_to_compact_ != NULL) { - return 0.1f; + for (size_t i = 0; i < v->compaction_score_.size(); i++) { + if (v->compaction_score_[i] >= 1) { + scores->push_back(std::pair(v->compaction_score_[i], 0)); + } + } + if (v->del_trigger_compact_ != NULL && + !v->del_trigger_compact_->being_compacted && + v->del_trigger_compact_->del_percentage > options_->del_percentage) { + scores->push_back(std::pair( + (double)(v->del_trigger_compact_->del_percentage / 100.0), 0)); + } + if (v->ttl_trigger_compact_ != NULL && + 
!v->ttl_trigger_compact_->being_compacted && + ts >= v->ttl_trigger_compact_->check_ttl_ts) { + scores->push_back(std::pair( + (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0), 0)); + } + if (v->file_to_compact_ != NULL && + !v->file_to_compact_->being_compacted) { + scores->push_back(std::pair(0.1, 0)); } // delay task if (v->ttl_trigger_compact_ != NULL && + !v->ttl_trigger_compact_->being_compacted && ts < v->ttl_trigger_compact_->check_ttl_ts) { - *timeout = (v->ttl_trigger_compact_->check_ttl_ts - ts + 1000000) / 1000; - return (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0); + scores->push_back(std::pair( + (double)((v->ttl_trigger_compact_->ttl_percentage + 1) / 100.0), + ((v->ttl_trigger_compact_->check_ttl_ts - ts + 1000000) / 1000))); + } +} + +Compaction* VersionSet::NewSubCompact(Compaction* compact) { + Compaction* c = new Compaction(compact->level_); + c->output_level_ = compact->output_level_; + c->max_output_file_size_ = compact->max_output_file_size_; + c->input_version_ = compact->input_version_; + c->input_version_->Ref(); // make sure compacting version will not delete + + for (size_t i = 0; i < 2; i++) { + for (size_t j = 0; j < compact->inputs_[i].size(); j++) { + c->inputs_[i].push_back((compact->inputs_[i])[j]); + } + } + + for (size_t i = 0; i < compact->grandparents_.size(); i++) { + c->grandparents_.push_back(compact->grandparents_[i]); + } + c->grandparent_index_ = compact->grandparent_index_; + c->seen_key_ = compact->seen_key_; + c->overlapped_bytes_ = compact->overlapped_bytes_; + + c->drop_lower_bound_ = compact->drop_lower_bound_; + c->force_non_trivial_ = compact->force_non_trivial_; + return c; +} + +struct InternalKeyCompare { + InternalKeyCompare(const InternalKeyComparator* cmp) + : icmp(cmp) {} + + InternalKeyCompare(const InternalKeyCompare& key_cmp) + : icmp(key_cmp.icmp) {} + + // retuen true if a < b + bool operator () (const std::string& ikey_a, const std::string& ikey_b) { + InternalKey 
ikey1, ikey2; + ikey1.DecodeFrom(ikey_a); + ikey2.DecodeFrom(ikey_b); + bool res = icmp->InternalKeyComparator::Compare(ikey1.Encode(), ikey2.Encode()) < 0; + return res; + } + + const InternalKeyComparator* icmp; +}; + +uint64_t VersionSet::GetApproximateSizeByLevel(Version* v, int level, const InternalKey& ikey) { + uint64_t result = 0; + const std::vector& files = v->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + if (icmp_.Compare(files[i]->largest, ikey) <= 0) { + // Entire file is before "ikey", so just add the file size + result += files[i]->file_size; + } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) { + // Entire file is after "ikey", so ignore + if (level > 0) { + // Files other than level 0 are sorted by meta->smallest, so + // no further files in this level will contain data for + // "ikey". + break; + } + } else { + // "ikey" falls in the range for this table. Add the + // approximate offset of "ikey" within the table. + Table* tableptr; + Slice smallest = files[i]->smallest_fake ? files[i]->smallest.Encode() : ""; + Slice largest = files[i]->largest_fake ? 
files[i]->largest.Encode() : ""; + Iterator* iter = table_cache_->NewIterator( + ReadOptions(options_), dbname_, files[i]->number, files[i]->file_size, + smallest, largest, &tableptr); + if (tableptr != NULL) { + result += tableptr->ApproximateOffsetOf(ikey.Encode()); + } + delete iter; + } + } + return result; +} + +void VersionSet::GenerateSubCompaction(Compaction* compact, std::vector * compact_vec, + port::Mutex* mu) { + mu->AssertHeld(); + if (options_->max_sub_parallel_compaction <= 1) { + Compaction* c = NewSubCompact(compact); + compact_vec->push_back(c); + return; + } + + // generate candidate sub compaction split key + InternalKeyCompare icmp(&icmp_); + std::set boundary(icmp); + for (int i = compact->level_; i < compact->output_level_; i++ ) { + for (size_t j = 0; j < compact->inputs_[i - compact->level_].size(); j++) { + FileMetaData* f = compact->inputs_[i - compact->level_][j]; + boundary.insert(f->smallest.Encode().ToString()); + boundary.insert(f->largest.Encode().ToString()); + } + } + for (size_t j = 1; j < compact->inputs_[compact->output_level_ - compact->level_].size(); j++) { + FileMetaData* f = compact->inputs_[compact->output_level_ - compact->level_][j]; + boundary.insert(f->smallest.Encode().ToString()); + } + + mu->Unlock(); + // generate sub compaction range by output file size + uint64_t sum = 0, prev_sum = 0; + std::set::iterator it = boundary.begin(); + while (it != boundary.end()) { + sum = 0; + InternalKey ikey; + ikey.DecodeFrom(*it); + for (int i = compact->level_; i <= compact->output_level_; i++ ) { + sum += GetApproximateSizeByLevel(compact->input_version_, i, ikey); + } + + assert(sum >= prev_sum); + if (compact->max_output_file_size_ > sum - prev_sum) { + it = boundary.erase(it); + } else { + ++it; + prev_sum = sum; + } + } + mu->Lock(); + + // limit max sub compaction + assert(options_->max_sub_parallel_compaction > 1); + uint64_t avg_num = (boundary.size() + 1) / options_->max_sub_parallel_compaction + 1; + it = 
boundary.begin(); + uint64_t i = 1; + while (avg_num > 1 && it != boundary.end()) { + if (i % avg_num != 0) { + it = boundary.erase(it); + } else { + ++it; + } + i++; } - // nothing to do - return -1.0; + // construct compaction + if (boundary.size() == 0) { + Compaction* c = NewSubCompact(compact); + compact_vec->push_back(c); + } else { + std::set::iterator it = boundary.begin(); + std::string prev_key; + while (true) { + Compaction* c = NewSubCompact(compact); + c->sub_compact_start_ = prev_key; + c->sub_compact_end_ = *it; + compact_vec->push_back(c); + + ++it; + prev_key = c->sub_compact_end_; + if (it == boundary.end()) { + Compaction* c1 = NewSubCompact(compact); + c1->sub_compact_start_ = prev_key; + compact_vec->push_back(c1); + break; + } + } + } } Compaction* VersionSet::PickCompaction() { - Compaction* c; - int level; + int level = -1; + std::vector inputs; + bool set_non_trivial = false; // We prefer compactions triggered by too much data in a level over // the compactions triggered by seeks. 
- const bool size_compaction = (current_->compaction_score_ >= 1); + const bool size_compaction = (current_->compaction_score_[0] >= 1); const bool seek_compaction = (current_->file_to_compact_ != NULL); const bool del_compaction = (current_->del_trigger_compact_ != NULL); const bool ttl_compaction = (current_->ttl_trigger_compact_ != NULL); - if (size_compaction) { - level = current_->compaction_level_; - assert(level >= 0); - assert(level+1 < config::kNumLevels); - c = new Compaction(level); - // Pick the first file that comes after compact_pointer_[level] - for (size_t i = 0; i < current_->files_[level].size(); i++) { - FileMetaData* f = current_->files_[level][i]; - if (compact_pointer_[level].empty() || - icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { - c->inputs_[0].push_back(f); + // check size compaction + assert(level0_compactions_in_progress_.size() <= 1); + bool skipped_l0 = false; + for (size_t li = 0; size_compaction && li < current_->compaction_score_.size(); li++) { + double score = current_->compaction_score_[li]; + level = current_->compaction_level_[li]; + assert(li == 0 || score <= current_->compaction_score_[li - 1]); + if (score >= 1) { + assert(level >= 0); + assert(level+1 < config::kNumLevels); + if (skipped_l0 && level <= 1) { + // level0 in progress and level 0 will not directly compact to level > 1 + //Log(options_->info_log, "[%s] lock level %d, conflict, score %.2f\n", + // dbname_.c_str(), level, score); + continue; + } + if (level == 0 && !level0_compactions_in_progress_.empty()) { + skipped_l0 = true; + //Log(options_->info_log, "[%s] level %d in progress, conflict, score %.2f\n", + // dbname_.c_str(), level, score); + continue; + } + if (PickCompactionBySize(level, &inputs)) { break; } + //Log(options_->info_log, "[%s] pick level %d, conflict, score %.2f\n", + // dbname_.c_str(), level, score); } - if (c->inputs_[0].empty()) { - // Wrap-around to the beginning of the key space - 
c->inputs_[0].push_back(current_->files_[level][0]); - } - } else if (seek_compaction) { - // compaction trigger by seek percentage - // TODO: multithread should lock it + } + + // check seek compaction + if (inputs.empty() && seek_compaction) { level = current_->file_to_compact_level_; - c = new Compaction(level); - c->inputs_[0].push_back(current_->file_to_compact_); - } else if (del_compaction) { + assert(level >= 0); + assert(level+1 < config::kNumLevels); + FileMetaData* f = current_->file_to_compact_; + if (!f->being_compacted && + (level > 0 || level0_compactions_in_progress_.empty()) && + !RangeInCompaction(&f->smallest, &f->largest, level + 1)) { + inputs.push_back(f); + } + } + + // check del compaction + if (inputs.empty() && del_compaction) { // compaction trigger by delete tags percentage; // TODO: multithread should lock it level = current_->del_trigger_compact_level_; assert(level >= 0); assert(level+1 < config::kNumLevels); - c = new Compaction(level); - c->SetNonTrivial(true); - c->inputs_[0].push_back(current_->del_trigger_compact_); - Log(options_->info_log, + FileMetaData* f = current_->del_trigger_compact_; + if (!f->being_compacted && + (level > 0 || level0_compactions_in_progress_.empty()) && + !RangeInCompaction(&f->smallest, &f->largest, level + 1)) { + inputs.push_back(f); + set_non_trivial = true; + Log(options_->info_log, "[%s] compact trigger by del stragety, level %d, num #%lu, file_size %lu, del_p %lu\n", dbname_.c_str(), current_->del_trigger_compact_level_, (current_->del_trigger_compact_->number) & 0xffffffff, current_->del_trigger_compact_->file_size, current_->del_trigger_compact_->del_percentage); - } else if (ttl_compaction) { + } + } + + // check ttl compaction + if (inputs.empty() && ttl_compaction) { // compaction trigger by ttl tags percentage // TODO: multithread should lock it level = current_->ttl_trigger_compact_level_; assert(level >= 0); - c = new Compaction(level); - c->SetNonTrivial(true); - 
c->inputs_[0].push_back(current_->ttl_trigger_compact_); - if (level == config::kNumLevels - 1) {// level in last level - c->set_output_level(level); - } - Log(options_->info_log, + FileMetaData* f = current_->ttl_trigger_compact_; + if (!f->being_compacted && + (level > 0 || level0_compactions_in_progress_.empty()) && + (level+1 == config::kNumLevels || !RangeInCompaction(&f->smallest, &f->largest, level + 1))) { + inputs.push_back(f); + set_non_trivial = true; + Log(options_->info_log, "[%s] compact trigger by ttl stragety, level %d, num #%lu, file_size %lu, ttl_p %lu, check_ts %lu\n", dbname_.c_str(), current_->ttl_trigger_compact_level_, @@ -1952,32 +2456,57 @@ Compaction* VersionSet::PickCompaction() { current_->ttl_trigger_compact_->file_size, current_->ttl_trigger_compact_->ttl_percentage, current_->ttl_trigger_compact_->check_ttl_ts); - } else { + } + } + if (inputs.empty()) { return NULL; } - c->input_version_ = current_; - c->input_version_->Ref(); - c->max_output_file_size_ = - MaxFileSizeForLevel(c->output_level(), current_->vset_->options_->sst_size); - + assert(inputs.size() == 1); + assert(level >= 0); // Files in level 0 may overlap each other, so pick up all overlapping ones if (level == 0) { + assert(level0_compactions_in_progress_.size() == 0); InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); + GetRange(inputs, &smallest, &largest); // Note that the next call will discard the file we placed in // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. 
- current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); - assert(!c->inputs_[0].empty()); + current_->GetOverlappingInputs(level, &smallest, &largest, &inputs); + GetRange(inputs, &smallest, &largest); + if (RangeInCompaction(&smallest, &largest, level + 1)) { // make sure level1 not in compaction + Log(options_->info_log, "[%s] level1 in compacting, level0 conflict\n", + dbname_.c_str()); + return NULL; + } + assert(!inputs.empty()); + assert(!FilesInCompaction(inputs)); + } + + // expand inputs + Compaction* c = new Compaction(level); + c->SetNonTrivial(set_non_trivial); + c->input_version_ = current_; + c->input_version_->Ref(); // make sure compacting version will not delete + if (level == config::kNumLevels - 1) {// level in last level + c->set_output_level(level); } + c->max_output_file_size_ = + MaxFileSizeForLevel(c->output_level(), current_->vset_->options_->sst_size); + c->inputs_[0] = inputs; SetupOtherInputs(c); // tera-specific: calculate the smallest rowkey which overlap with file not // in this compaction. 
SetupCompactionBoundary(c); + + // mark being compacted + c->MarkBeingCompacted(true); + if (level == 0) { + level0_compactions_in_progress_.push_back(c); + } + Finalize(current_); // reculate level score return c; } - void VersionSet::SetupOtherInputs(Compaction* c) { if (c->level() == c->output_level()) { // self level compaction, should select next level return; @@ -2008,7 +2537,10 @@ void VersionSet::SetupOtherInputs(Compaction* c) { std::vector expanded1; current_->GetOverlappingInputs(c->output_level(), &new_start, &new_limit, &expanded1); - if (expanded1.size() == c->inputs_[1].size()) { + // check expanded file wether in compacting + if ((expanded1.size() == c->inputs_[1].size()) && + !RangeInCompaction(&new_start, &new_limit, level) && + !RangeInCompaction(&new_start, &new_limit, c->output_level())) { Log(options_->info_log, "[%s] Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n", dbname_.c_str(), @@ -2084,11 +2616,18 @@ void VersionSet::SetupCompactionBoundary(Compaction* c) { Compaction* VersionSet::CompactRange( int level, const InternalKey* begin, - const InternalKey* end) { + const InternalKey* end, bool* being_compacted) { + *being_compacted = false; std::vector inputs; current_->GetOverlappingInputs(level, begin, end, &inputs); if (inputs.empty()) { - return NULL; + return NULL; + } + + // check level0 wether in compaction + if (level == 0 && !level0_compactions_in_progress_.empty()) { + *being_compacted = true; + return NULL; } // Avoid compacting too much in one shot in case the range is large. 
@@ -2109,6 +2648,18 @@ Compaction* VersionSet::CompactRange( } } + // check being compacting + InternalKey smallest, largest; + GetRange(inputs, &smallest, &largest); + if (FilesInCompaction(inputs) || RangeInCompaction(&smallest, &largest, level + 1)) { + PrintFilesInCompaction(inputs); + PrintRangeInCompaction(&smallest, &largest, level + 1); + Log(options_->info_log, "[%s] RangeCompaction : %s...%s, level: %d or %d, in compaction", + dbname_.c_str(), smallest.DebugString().c_str(), largest.DebugString().c_str(), level, level + 1); + *being_compacted = true; + return NULL; + } + Compaction* c = new Compaction(level); c->input_version_ = current_; c->input_version_->Ref(); @@ -2119,9 +2670,28 @@ Compaction* VersionSet::CompactRange( // tera-specific: calculate the smallest rowkey which overlap with file not // in this compaction. SetupCompactionBoundary(c); + + // mark being compacted + c->MarkBeingCompacted(true); + if (level == 0) { + level0_compactions_in_progress_.push_back(c); + } + Finalize(current_); // reculate level score return c; } +void VersionSet::ReleaseCompaction(Compaction* c, Status& s) { + c->MarkBeingCompacted(false); + assert(level0_compactions_in_progress_.size() <= 1); + if (c->level() == 0 && level0_compactions_in_progress_[0] == c) { + level0_compactions_in_progress_.resize(0); + } + if (!s.ok()) { + Finalize(current_); + } + return; +} + Compaction::Compaction(int level) : level_(level), output_level_(level + 1), @@ -2209,6 +2779,16 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) { } } +void Compaction::MarkBeingCompacted(bool flag) { + for (size_t i = 0; i < 2; i++) { + for (size_t j = 0; j < inputs_[i].size(); j++) { + assert(flag ? 
!inputs_[i][j]->being_compacted + : inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = flag; + } + } +} + void Compaction::ReleaseInputs() { if (input_version_ != NULL) { input_version_->Unref(); diff --git a/src/leveldb/db/version_set.h b/src/leveldb/db/version_set.h index 5a01d8dba..c933efced 100644 --- a/src/leveldb/db/version_set.h +++ b/src/leveldb/db/version_set.h @@ -19,6 +19,7 @@ #ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ #define STORAGE_LEVELDB_DB_VERSION_SET_H_ +#include #include #include #include @@ -56,6 +57,7 @@ extern int FindFile(const InternalKeyComparator& icmp, // in sorted order. extern bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, + const Comparator* ucmp, bool disjoint_sorted_files, const std::vector& files, const Slice* smallest_user_key, @@ -147,8 +149,8 @@ class Version { // Level that should be compacted next and its compaction score. // Score < 1 means compaction is not strictly needed. These fields // are initialized by Finalize(). - double compaction_score_; - int compaction_level_; + std::vector compaction_score_; + std::vector compaction_level_; explicit Version(VersionSet* vset) : vset_(vset), next_(this), prev_(this), refs_(0), @@ -157,9 +159,13 @@ class Version { ttl_trigger_compact_(NULL), ttl_trigger_compact_level_(-1), del_trigger_compact_(NULL), - del_trigger_compact_level_(-1), - compaction_score_(-1), - compaction_level_(-1) { + del_trigger_compact_level_(-1) { + compaction_score_.resize(config::kNumLevels - 1); + compaction_level_.resize(config::kNumLevels - 1); + for (size_t i = 0; i < config::kNumLevels - 1; i++) { + compaction_score_[i] = -1.0; + compaction_level_[i] = -1; + } } ~Version(); @@ -182,6 +188,8 @@ class VersionSet { // current version. Will release *mu while actually writing to the file. // REQUIRES: *mu is held on entry. 
// REQUIRES: no other thread concurrently calls LogAndApply() + void LogAndApplyHelper(VersionSetBuilder* builder, + VersionEdit* edit); Status LogAndApply(VersionEdit* edit, port::Mutex* mu) EXCLUSIVE_LOCKS_REQUIRED(mu); @@ -231,7 +239,8 @@ class VersionSet { // being compacted, or zero if there is no such log file. uint64_t PrevLogNumber() const { return prev_log_number_; } - double CompactionScore(uint64_t* timeout) const; + // + void CompactionScore(std::vector >* scores); // Pick level and inputs for a new compaction. // Returns NULL if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that @@ -245,7 +254,10 @@ class VersionSet { Compaction* CompactRange( int level, const InternalKey* begin, - const InternalKey* end); + const InternalKey* end, bool* being_compacted); + + // release file's being_compacted flag, and release level0's lock + void ReleaseCompaction(Compaction* c, Status& s); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -259,6 +271,7 @@ class VersionSet { // May also mutate some internal state. void AddLiveFiles(std::set* live); void AddLiveFiles(std::map* live); + void AddLiveFilesWithSize(std::map* live); // Return the approximate offset in the database of the data for // "key" as of version "v". 
@@ -271,10 +284,17 @@ class VersionSet { }; const char* LevelSummary(LevelSummaryStorage* scratch) const; + void GenerateSubCompaction(Compaction* compact, std::vector * compact_vec, + port::Mutex* mu); + private: friend class Compaction; friend class Version; friend class VersionSetBuilder; + struct ManifestWriter; + + Compaction* NewSubCompact(Compaction* compact); + uint64_t GetApproximateSizeByLevel(Version* v, int level, const InternalKey& ikey); void Finalize(Version* v); @@ -301,6 +321,15 @@ class VersionSet { bool ModifyFileSize(FileMetaData* f); + // milti thread compaction relatively + void PrintFilesInCompaction(const std::vector& inputs); + bool FilesInCompaction(const std::vector& inputs); + void PrintRangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level); + bool RangeInCompaction(const InternalKey* smallest, const InternalKey* largest, int level); + bool IsOverlapInFileRange(FileMetaData* lf, FileMetaData* f); + bool PickFutureCompaction(int level, std::vector* inputs); + bool PickCompactionBySize(int level, std::vector* inputs); + Env* const env_; const std::string dbname_; const Options* const options_; @@ -316,6 +345,8 @@ class VersionSet { uint64_t log_number_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted + std::deque manifest_writers_; + // Opened lazily WritableFile* descriptor_file_; log::Writer* descriptor_log_; @@ -325,6 +356,7 @@ class VersionSet { // Per-level key at which the next compaction at that level should start. // Either an empty string, or a valid InternalKey. std::string compact_pointer_[config::kNumLevels]; + std::vector level0_compactions_in_progress_; // No copying allowed VersionSet(const VersionSet&); @@ -372,6 +404,8 @@ class Compaction { // before processing "internal_key". bool ShouldStopBefore(const Slice& internal_key); + void MarkBeingCompacted(bool flag); + // Release the input version for the compaction, once the compaction // is successful. 
void ReleaseInputs(); @@ -384,6 +418,7 @@ class Compaction { private: friend class Version; friend class VersionSet; + friend class DBImpl; explicit Compaction(int level); @@ -420,6 +455,10 @@ class Compaction { // support self compaction bool force_non_trivial_; + + // support parallel compaction + std::string sub_compact_start_; // own by child + std::string sub_compact_end_; // own by child }; } // namespace leveldb diff --git a/src/leveldb/db/version_set_test.cc b/src/leveldb/db/version_set_test.cc index f4ad56367..4292ab0e7 100644 --- a/src/leveldb/db/version_set_test.cc +++ b/src/leveldb/db/version_set_test.cc @@ -6,10 +6,15 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#define private public #include "db/version_set.h" +#undef private + +#include "db/dbformat.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" +#include "leveldb/compact_strategy.h" namespace leveldb { @@ -46,7 +51,7 @@ class FindFileTest { InternalKeyComparator cmp(BytewiseComparator()); Slice s(smallest != NULL ? smallest : ""); Slice l(largest != NULL ? largest : ""); - return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, files_, + return SomeFileOverlapsRange(cmp, cmp.user_comparator(), disjoint_sorted_files_, files_, (smallest != NULL ? &s : NULL), (largest != NULL ? 
&l : NULL)); } @@ -90,7 +95,6 @@ TEST(FindFileTest, Single) { ASSERT_TRUE(Overlaps(NULL, NULL)); } - TEST(FindFileTest, Multiple) { Add("150", "200"); Add("200", "250"); @@ -176,6 +180,57 @@ TEST(FindFileTest, OverlappingFiles) { ASSERT_TRUE(Overlaps("600", "700")); } +class VersionSetTest { +public: + VersionSetTest () + : icmp(opt.comparator), + t_log_number(10), + t_next_file(20), + t_last_seq(100) { + opt.compact_strategy_factory = new DummyCompactStrategyFactory(); + opt.env->DeleteDirRecursive("/tmp/db/test"); + opt.env->CreateDir("/tmp/db/test"); + t_vset = new VersionSet(std::string("/tmp/db/test"), &opt, NULL, &icmp); + t_vset->manifest_file_number_ = 100; + } + +public: + Options opt; + const InternalKeyComparator icmp; + VersionSet* t_vset; + uint64_t t_log_number; + uint64_t t_next_file; + uint64_t t_last_seq; + port::Mutex t_mu; +}; + +TEST(VersionSetTest, PickCompactionTest) { + VersionEdit edit; + + edit.AddFile(0, t_vset->NewFileNumber(), 200, + InternalKey("a0001", 1, kTypeValue), + InternalKey("a0002", 1, kTypeDeletion)); + edit.AddFile(0, t_vset->NewFileNumber(), 200, + InternalKey("a0003", 1, kTypeValue), + InternalKey("a0004", 1, kTypeValue)); + edit.SetComparatorName(leveldb::BytewiseComparator()->Name()); + t_mu.Lock(); + t_vset->LogAndApply(&edit, &t_mu); + t_mu.Unlock(); + Compaction* c = t_vset->PickCompaction(); + ASSERT_TRUE((uint64_t)t_vset->level0_compactions_in_progress_[0] == (uint64_t)c); + + VersionEdit edit1; + edit1.AddFile(0, t_vset->NewFileNumber(), 200, + InternalKey("a0005", 1, kTypeValue), + InternalKey("a0006", 1, kTypeValue)); + edit1.SetComparatorName(leveldb::BytewiseComparator()->Name()); + t_mu.Lock(); + t_vset->LogAndApply(&edit1, &t_mu); + t_mu.Unlock(); + ASSERT_TRUE(t_vset->PickCompaction() == NULL); +} + } // namespace leveldb int main(int argc, char** argv) { diff --git a/src/leveldb/include/leveldb/db.h b/src/leveldb/include/leveldb/db.h index 1d235801a..1b93fe8df 100644 --- a/src/leveldb/include/leveldb/db.h 
+++ b/src/leveldb/include/leveldb/db.h @@ -174,6 +174,8 @@ class DB { // Add all sst files inherited from other tablets virtual void AddInheritedLiveFiles(std::vector >* live) = 0; + virtual bool ShouldForceUnloadOnError() { return false; } + private: // No copying allowed DB(const DB&); diff --git a/src/leveldb/include/leveldb/dfs.h b/src/leveldb/include/leveldb/dfs.h index b5874848d..b5df4b0b2 100644 --- a/src/leveldb/include/leveldb/dfs.h +++ b/src/leveldb/include/leveldb/dfs.h @@ -7,6 +7,7 @@ #include #include +#include #include namespace leveldb { @@ -70,8 +71,12 @@ class Dfs { static Dfs* NewDfs(const std::string& so_path, const std::string& conf); /// Returns 0 on success. virtual int32_t UnlockDirectory(const std::string& path) = 0; + + virtual int32_t ClearDirOwner(const std::string& path) = 0; /// Returns DfsFile handler on success, NULL on error.WithTime virtual DfsFile* OpenFile(const std::string& filename, int32_t flags) = 0; + + virtual int32_t Stat(const std::string& filename, struct stat* fstat) = 0; private: Dfs(const Dfs&); void operator=(const Dfs&); diff --git a/src/leveldb/include/leveldb/env_dfs.h b/src/leveldb/include/leveldb/env_dfs.h index d34a2c697..bc0e65d9a 100644 --- a/src/leveldb/include/leveldb/env_dfs.h +++ b/src/leveldb/include/leveldb/env_dfs.h @@ -17,7 +17,7 @@ #include "leveldb/dfs.h" #include "leveldb/env.h" #include "leveldb/status.h" -#include "../../../utils/counter.h" +#include "../../../common/counter.h" namespace leveldb { @@ -60,6 +60,8 @@ class DfsEnv : public EnvWrapper { virtual Status UnlockFile(FileLock* lock); + int32_t ClearDirOwner(const std::string& dir) {return dfs_->ClearDirOwner(dir);} + virtual Env* CacheEnv() { return this; } static uint64_t gettid() { diff --git a/src/leveldb/include/leveldb/options.h b/src/leveldb/include/leveldb/options.h index be78d0d30..6793f0299 100644 --- a/src/leveldb/include/leveldb/options.h +++ b/src/leveldb/include/leveldb/options.h @@ -223,6 +223,8 @@ struct Options { std::set* 
exist_lg_list; std::map* lg_info_list; + std::set ignore_corruption_in_open_lg_list; + // compaction strategy to determine how to // drop the obsoleted kv records bool enable_strategy_when_get; @@ -310,13 +312,24 @@ struct Options { bool ignore_corruption_in_open; // Statistic: By default, if 10% entry timeout, will trigger compaction - // Default: 10 % + // Default: 99 % uint64_t ttl_percentage; // Statistic: delete tag's percentage in sst - // Default: 10 % + // Default: 20 % uint64_t del_percentage; + // Max thread alloc for lg's compaction + // Default: 5 + uint32_t max_background_compactions; + + // if level0's file num >= limit, use sqrt slow down level score + // Default: 30 + int slow_down_level0_score_limit; + + // parallel compaction + int max_sub_parallel_compaction; + // Create an Options object with default values for all fields. Options(); }; diff --git a/src/leveldb/include/leveldb/status.h b/src/leveldb/include/leveldb/status.h index 4bd364cca..0e062e6c1 100644 --- a/src/leveldb/include/leveldb/status.h +++ b/src/leveldb/include/leveldb/status.h @@ -55,6 +55,10 @@ class Status { return Status(kTimeOut, msg, msg2); } + static Status IOPermissionDenied(const Slice& msg, const Slice msg2 = Slice()) { + return Status(kIOPermissionDenied, msg, msg2); + } + // Returns true iff the status indicates success. bool ok() const { return (state_ == NULL); } @@ -69,6 +73,8 @@ class Status { // Returns true iff the status indicates an TimeOut. bool IsTimeOut() const { return code() == kTimeOut; } + + bool IsIOPermissionDenied() const { return code() == kIOPermissionDenied; } // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
std::string ToString() const; @@ -88,7 +94,8 @@ class Status { kNotSupported = 3, kInvalidArgument = 4, kIOError = 5, - kTimeOut = 6 + kTimeOut = 6, + kIOPermissionDenied = 13 }; Code code() const { diff --git a/src/leveldb/port/port_posix.h b/src/leveldb/port/port_posix.h index ed19e222f..65f4274a1 100644 --- a/src/leveldb/port/port_posix.h +++ b/src/leveldb/port/port_posix.h @@ -46,9 +46,7 @@ #endif #include -#ifdef SNAPPY #include -#endif #include #include #include "port/atomic_pointer.h" @@ -124,33 +122,21 @@ extern void InitOnce(OnceType* once, void (*initializer)()); inline bool Snappy_Compress(const char* input, size_t length, ::std::string* output) { -#ifdef SNAPPY output->resize(snappy::MaxCompressedLength(length)); size_t outlen; snappy::RawCompress(input, length, &(*output)[0], &outlen); output->resize(outlen); return true; -#endif - - return false; } inline bool Snappy_GetUncompressedLength(const char* input, size_t length, size_t* result) { -#ifdef SNAPPY return snappy::GetUncompressedLength(input, length, result); -#else - return false; -#endif } inline bool Snappy_Uncompress(const char* input, size_t length, char* output) { -#ifdef SNAPPY return snappy::RawUncompress(input, length, output); -#else - return false; -#endif } /////////// Compression Ext /////////// diff --git a/src/leveldb/table/table_builder.cc b/src/leveldb/table/table_builder.cc index 9d6a7983b..63b70bb63 100644 --- a/src/leveldb/table/table_builder.cc +++ b/src/leveldb/table/table_builder.cc @@ -18,7 +18,7 @@ #include "table/format.h" #include "util/coding.h" #include "util/crc32c.h" -#include "../utils/counter.h" +#include "../common/counter.h" namespace leveldb { diff --git a/src/leveldb/util/env_cache.cc b/src/leveldb/util/env_cache.cc index 9d99fd168..51db78a27 100644 --- a/src/leveldb/util/env_cache.cc +++ b/src/leveldb/util/env_cache.cc @@ -33,6 +33,9 @@ const char* paths[] = {"./cache_dir_1/", "./cache_dir_2/"}; std::vector ThreeLevelCacheEnv::cache_paths_(paths, paths + 2); 
static Status IOError(const std::string& context, int err_number) { + if (err_number == EACCES) { + return Status::IOPermissionDenied(context, strerror(err_number)); + } return Status::IOError(context, strerror(err_number)); } diff --git a/src/leveldb/util/env_dfs.cc b/src/leveldb/util/env_dfs.cc index 53fde1804..f9f260b13 100644 --- a/src/leveldb/util/env_dfs.cc +++ b/src/leveldb/util/env_dfs.cc @@ -22,7 +22,7 @@ #include "leveldb/table_utils.h" #include "nfs.h" #include "util/mutexlock.h" -#include "../utils/counter.h" +#include "../common/counter.h" namespace leveldb { @@ -95,6 +95,9 @@ char* get_time_str(char* p, size_t len) // Log error message static Status IOError(const std::string& context, int err_number) { + if (err_number == EACCES) { + return Status::IOPermissionDenied(context, strerror(err_number)); + } return Status::IOError(context, strerror(err_number)); } diff --git a/src/leveldb/util/env_flash.cc b/src/leveldb/util/env_flash.cc index fd0702388..c6c42a9cc 100644 --- a/src/leveldb/util/env_flash.cc +++ b/src/leveldb/util/env_flash.cc @@ -21,7 +21,7 @@ #include "util/hash.h" #include "util/mutexlock.h" #include "helpers/memenv/memenv.h" -#include "../utils/counter.h" +#include "../common/counter.h" #include "leveldb/env_flash.h" @@ -38,6 +38,9 @@ const int64_t kUpdateFlashRetryIntervalMillis = 60 * 1000; // Log error message static Status IOError(const std::string& context, int err_number) { + if (err_number == EACCES) { + return Status::IOPermissionDenied(context, strerror(err_number)); + } return Status::IOError(context, strerror(err_number)); } @@ -68,7 +71,7 @@ Status CopyToLocal(const std::string& local_fname, Env* env, if (!s.ok()) { Log("[env_flash] create dir: %s failed: %s, exit", local_fname.substr(0, dir_pos).c_str(), s.ToString().c_str()); - exit(-1); + _exit(-1); } } @@ -79,7 +82,7 @@ Status CopyToLocal(const std::string& local_fname, Env* env, if (!vanish_allowed) { Log("[env_flash] create file: %s failed: %s, exit", 
local_fname.c_str(), s.ToString().c_str()); - exit(-1); + _exit(-1); } delete dfs_file; return s; @@ -501,7 +504,7 @@ void FlashEnv::SetFlashPath(const std::string& path, bool vanish_allowed) { && !Env::Default()->CreateDir(flash_paths_.back()).ok()) { Log("[env_flash] cannot access cache dir: %s\n", flash_paths_.back().c_str()); - exit(-1); + _exit(-1); } } } diff --git a/src/leveldb/util/env_inmem.cc b/src/leveldb/util/env_inmem.cc index 4e9855269..a587eacac 100644 --- a/src/leveldb/util/env_inmem.cc +++ b/src/leveldb/util/env_inmem.cc @@ -20,7 +20,7 @@ #include "leveldb/table_utils.h" #include "util/mutexlock.h" #include "helpers/memenv/memenv.h" -#include "../utils/counter.h" +#include "../common/counter.h" #include "leveldb/env_inmem.h" diff --git a/src/leveldb/util/env_mock.cc b/src/leveldb/util/env_mock.cc index 5265e58ea..abf13089e 100644 --- a/src/leveldb/util/env_mock.cc +++ b/src/leveldb/util/env_mock.cc @@ -51,6 +51,9 @@ void MockEnv::SetPrefix(const std::string& p) // Log error message static Status IOError(const std::string& context, int err_number) { + if (err_number == EACCES) { + return Status::IOPermissionDenied(context, strerror(err_number)); + } return Status::IOError(context, strerror(err_number)); } diff --git a/src/leveldb/util/env_posix.cc b/src/leveldb/util/env_posix.cc index fdc1d2ce4..6d495768e 100644 --- a/src/leveldb/util/env_posix.cc +++ b/src/leveldb/util/env_posix.cc @@ -36,7 +36,7 @@ #include "util/posix_logger.h" #include "util/string_ext.h" #include "util/thread_pool.h" -#include "../utils/counter.h" +#include "../common/counter.h" namespace leveldb { @@ -59,6 +59,9 @@ tera::Counter posix_other_counter; namespace { static Status IOError(const std::string& context, int err_number) { + if (err_number == EACCES) { + return Status::IOPermissionDenied(context, strerror(err_number)); + } return Status::IOError(context, strerror(err_number)); } @@ -132,9 +135,13 @@ class PosixRandomAccessFile: public RandomAccessFile { // problems for 
very large databases. class MmapLimiter { public: - // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes. MmapLimiter() { - SetAllowed(sizeof(void*) >= 8 ? 1000 : 0); + //Disable mmap in tera for reducing memory use. + SetAllowed(0); + + // Up to 1000 mmaps for 64-bit binaries; none for smaller pointer sizes. + //SetAllowed(sizeof(void*) >= 8 ? 1000 : 0); + //If you want to enable mmap, uncomment the line above. } // If another mmap slot is available, acquire it and return true. diff --git a/src/leveldb/util/hdfs.cc b/src/leveldb/util/hdfs.cc index b90fea36e..4a9721bc2 100644 --- a/src/leveldb/util/hdfs.cc +++ b/src/leveldb/util/hdfs.cc @@ -6,10 +6,10 @@ #include #include - #include "hdfs.h" #include "include/hdfs.h" -#include "../utils/counter.h" +#include "hdfs_util.h" +#include "../common/counter.h" namespace leveldb { @@ -233,6 +233,21 @@ int32_t Hdfs::UnlockDirectory(const std::string& path) { return -1; } + +int32_t Hdfs::Stat(const std::string& filename, struct stat* fstat) { + hdfsFileInfo* pFileInfo = (*hdfsGetPathInfo)((hdfsFS)fs_, filename.c_str()); + if (pFileInfo != NULL) { + HdfsFileInfo2PosixFileStat(pFileInfo, fstat); + (*hdfsFreeFileInfo)(pFileInfo, 1); + return 0; + } + return -1; +} + +int32_t Hdfs::ClearDirOwner(const std::string& path) { + // hdfs has no dir owner, so we return succ directly + return 0; } +} /* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */ diff --git a/src/leveldb/util/hdfs.h b/src/leveldb/util/hdfs.h index 81ed269ac..ebf464f6b 100644 --- a/src/leveldb/util/hdfs.h +++ b/src/leveldb/util/hdfs.h @@ -48,8 +48,9 @@ class Hdfs : public Dfs { int32_t ListDirectory(const std::string& path, std::vector* result); int32_t LockDirectory(const std::string& path); int32_t UnlockDirectory(const std::string& path); + int32_t ClearDirOwner(const std::string& path); DfsFile* OpenFile(const std::string& filename, int32_t flags); - + int32_t Stat(const std::string& filename, struct stat* fstat); private: void* fs_; @@ -92,8 
+93,10 @@ class Hdfs2 : public Dfs { int32_t ListDirectory(const std::string& path, std::vector* result); int32_t LockDirectory(const std::string& path); int32_t UnlockDirectory(const std::string& path); + int32_t ClearDirOwner(const std::string& path); DfsFile* OpenFile(const std::string& filename, int32_t flags); + int32_t Stat(const std::string& filename, struct stat* fstat); private: void* GetFSHandle(const std::string& path); std::vector fs_list_; diff --git a/src/leveldb/util/hdfs2.cc b/src/leveldb/util/hdfs2.cc index fa3a8902c..0eac0ecea 100644 --- a/src/leveldb/util/hdfs2.cc +++ b/src/leveldb/util/hdfs2.cc @@ -7,8 +7,9 @@ #include "hdfs.h" #include "include/hdfs2.h" +#include "hdfs_util.h" #include "util/hash.h" -#include "../utils/counter.h" +#include "../common/counter.h" namespace leveldb { @@ -257,6 +258,21 @@ int32_t Hdfs2::UnlockDirectory(const std::string& path) { return -1; } +int32_t Hdfs2::ClearDirOwner(const std::string& path) { + // hdfs has no dir owner, so return succ directly + return 0; +} + +int32_t Hdfs2::Stat(const std::string& filepath, struct stat* st) { + hdfsFileInfo* pFileInfo = (*hdfsGetPathInfo)((hdfsFS)GetFSHandle(filepath), filepath.c_str()); + if (pFileInfo != NULL) { + HdfsFileInfo2PosixFileStat(pFileInfo, st); + return 0; + } + return -1; + +} + } // namespace leveldb diff --git a/src/leveldb/util/hdfs_util.h b/src/leveldb/util/hdfs_util.h new file mode 100644 index 000000000..ba2eb720b --- /dev/null +++ b/src/leveldb/util/hdfs_util.h @@ -0,0 +1,64 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// + +#ifndef TERA_LEVELDB_HDFS_UTIL_H +#define TERA_LEVELDB_HDFS_UTIL_H +#include +#include +#include +#include +#include +namespace leveldb { + +static void HdfsFileInfo2PosixFileStat(hdfsFileInfo* info, struct stat* st) { + memset(st, 0, sizeof(struct stat)); + //by default: set to 0 to indicate not support for directory because we can not get this info + st->st_nlink = (info->mKind == kObjectKindDirectory) ? 0 : 1; + uid_t owner_id = 99; // no body, magic number in linux + if (info->mOwner != NULL) { + struct passwd passwd_info; + struct passwd* result = NULL; + ssize_t buf_size = sysconf(_SC_GETPW_R_SIZE_MAX); + buf_size = buf_size == -1 ? 16384 : buf_size; + char* pwbuf = new char[buf_size]; + if (0 == getpwnam_r(info->mOwner, &passwd_info, pwbuf, buf_size, &result)) { + if (result != NULL) { + owner_id = passwd_info.pw_uid; + } + } + delete [] pwbuf; + } + gid_t group_id = 99; // no body, magic number in posix + if (info->mGroup != NULL) { + struct group result; + struct group* resultp; + ssize_t len = sysconf(_SC_GETGR_R_SIZE_MAX); + len = len == -1 ? 16384 : len; + char* group_buf = new char[len]; + if (0 == getgrnam_r(info->mGroup, &result, group_buf, len, &resultp)) { + if (resultp != NULL) { + group_id = result.gr_gid; + } + } + delete [] group_buf; + } + short file_mode = (info->mKind == kObjectKindDirectory) ? (S_IFDIR | 0777) : (S_IFREG | 0666); + if (info->mPermissions > 0) { + file_mode = (info->mKind == kObjectKindDirectory) ? S_IFDIR: S_IFREG; + file_mode |= info->mPermissions; + } + st->st_size = (info->mKind == kObjectKindDirectory) ? 
4096 : info->mSize; + st->st_blksize = 512; // posix default block size + st->st_blocks = (st->st_size + st->st_blksize - 1)/st->st_blksize; + st->st_mode = file_mode; + st->st_uid = owner_id; + st->st_gid = group_id; + st->st_atime = info->mLastAccess; + st->st_ctime = info->mLastMod; + st->st_mtime = info->mLastMod; + return; +} +} +#endif diff --git a/src/leveldb/util/nfs.cc b/src/leveldb/util/nfs.cc index cb07a1797..37f0f0666 100644 --- a/src/leveldb/util/nfs.cc +++ b/src/leveldb/util/nfs.cc @@ -13,7 +13,7 @@ #include "util/mutexlock.h" #include "util/string_ext.h" #include "../common/timer.h" -#include "../utils/counter.h" +#include "../common/counter.h" namespace leveldb { @@ -29,6 +29,7 @@ static struct ::dirent* (*nfsReaddir)(nfs::NFSDIR* dir); static int (*nfsClosedir)(nfs::NFSDIR* dir); static int (*nfsSetDirOwner)(const char* path); static int (*nfsClearDirOwner)(const char* path); +static int (*nfsForceClearDirOwner)(const char* path); static int (*nfsStat)(const char* path, struct ::stat* stat); static int (*nfsUnlink)(const char* path); @@ -90,7 +91,7 @@ void Nfs::LoadSymbol() { } *(void**)(&printVersion) = ResolveSymbol(dl, "PrintNfsVersion"); - fprintf(stderr, "libnfs.so version: \n%s\n\n", (*printVersion)()); + //fprintf(stderr, "libnfs.so version: \n%s\n\n", (*printVersion)()); *(void**)(&nfsInit) = ResolveSymbol(dl, "Init"); *(void**)(&nfsSetComlogLevel) = ResolveSymbol(dl, "SetComlogLevel"); @@ -102,6 +103,7 @@ void Nfs::LoadSymbol() { *(void**)(&nfsClosedir) = ResolveSymbol(dl, "Closedir"); *(void**)(&nfsSetDirOwner) = ResolveSymbol(dl, "SetDirOwner"); *(void**)(&nfsClearDirOwner) = ResolveSymbol(dl, "ClearDirOwner"); + *(void**)(&nfsForceClearDirOwner) = ResolveSymbol(dl, "ForceClearDirOwner"); *(void**)(&nfsStat) = ResolveSymbol(dl, "Stat"); *(void**)(&nfsUnlink) = ResolveSymbol(dl, "Unlink"); *(void**)(&nfsAccess) = ResolveSymbol(dl, "Access"); @@ -256,7 +258,7 @@ int32_t Nfs::CreateDirectory(const std::string& name) { if (0 != 
(*nfsAccess)(path.c_str(), F_OK) && (*nfsGetErrno)() == ENOENT) { if (0 != (*nfsMkdir)(path.c_str()) && (*nfsGetErrno)() != EEXIST) { errno = (*nfsGetErrno)(); - fprintf(stderr, "[%s] Createdir %s fail: %d\n", common::timer::get_curtime_str().c_str(), name.c_str(), errno); + fprintf(stderr, "[%s] Createdir %s fail: %d\n", tera::get_curtime_str().c_str(), name.c_str(), errno); return -1; } } @@ -268,7 +270,7 @@ int32_t Nfs::DeleteDirectory(const std::string& name) { int32_t retval = (*nfsRmdir)(name.c_str()); if (retval != 0) { errno = (*nfsGetErrno)(); - fprintf(stderr, "[%s] DeleteDirectory %s fail: %d\n", common::timer::get_curtime_str().c_str(), name.c_str(), errno); + fprintf(stderr, "[%s] DeleteDirectory %s fail: %d\n", tera::get_curtime_str().c_str(), name.c_str(), errno); } return retval; } @@ -277,7 +279,7 @@ int32_t Nfs::Exists(const std::string& filename) { if (retval != 0) { errno = (*nfsGetErrno)(); int errno_saved = errno; - fprintf(stderr, "[%s] Exists %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno); + fprintf(stderr, "[%s] Exists %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno); errno = errno_saved; } return retval; @@ -286,7 +288,7 @@ int32_t Nfs::Delete(const std::string& filename) { int32_t retval = (*nfsUnlink)(filename.c_str()); if (retval != 0) { errno = (*nfsGetErrno)(); - fprintf(stderr, "[%s] Delete %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno); + fprintf(stderr, "[%s] Delete %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno); } return retval; } @@ -297,7 +299,7 @@ int32_t Nfs::GetFileSize(const std::string& filename, uint64_t* size) { *size = fileinfo.st_size; } else { errno = (*nfsGetErrno)(); - fprintf(stderr, "[%s] Getfilesize %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno); + fprintf(stderr, "[%s] Getfilesize %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno); } 
return retval; } @@ -305,7 +307,7 @@ int32_t Nfs::Rename(const std::string& from, const std::string& to) { int32_t retval = (*nfsRename)(from.c_str(), to.c_str()); if (retval != 0) { errno = (*nfsGetErrno)(); - fprintf(stderr, "[%s] Rename %s to %s fail: %d\n", common::timer::get_curtime_str().c_str(), from.c_str(), to.c_str(), errno); + fprintf(stderr, "[%s] Rename %s to %s fail: %d\n", tera::get_curtime_str().c_str(), from.c_str(), to.c_str(), errno); } return retval; } @@ -322,10 +324,19 @@ DfsFile* Nfs::OpenFile(const std::string& filename, int32_t flags) { return new NFile(file, filename); } errno = (*nfsGetErrno)(); - fprintf(stderr, "[%s] Openfile %s fail: %d\n", common::timer::get_curtime_str().c_str(), filename.c_str(), errno); + fprintf(stderr, "[%s] Openfile %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno); return NULL; } +int32_t Nfs::Stat(const std::string& filename, struct stat* fstat) { + int32_t retval = (*nfsStat)(filename.c_str(), fstat); + if (retval != 0) { + errno = (*nfsGetErrno)(); + //fprintf(stderr, "[%s] Stat %s fail: %d\n", tera::get_curtime_str().c_str(), filename.c_str(), errno); + } + return retval; +} + int32_t Nfs::Copy(const std::string& from, const std::string& to) { // not support return -1; @@ -336,7 +347,7 @@ int32_t Nfs::ListDirectory(const std::string& path, if (NULL == dir) { errno = (*nfsGetErrno)(); int errno_saved = errno; - fprintf(stderr, "[%s] Opendir %s fail: %d\n", common::timer::get_curtime_str().c_str(), path.c_str(), errno); + fprintf(stderr, "[%s] Opendir %s fail: %d\n", tera::get_curtime_str().c_str(), path.c_str(), errno); errno = errno_saved; return -1; } @@ -350,7 +361,7 @@ int32_t Nfs::ListDirectory(const std::string& path, errno = (*nfsGetErrno)(); int errno_saved = errno; if (0 != errno) { - fprintf(stderr, "[%s] List %s error: %d\n", common::timer::get_curtime_str().c_str(), path.c_str(), errno); + fprintf(stderr, "[%s] List %s error: %d\n", tera::get_curtime_str().c_str(), 
path.c_str(), errno); (*nfsClosedir)(dir); errno = errno_saved; return -1; @@ -394,5 +405,9 @@ int32_t Nfs::UnlockDirectory(const std::string& path) { return (*nfsClearDirOwner)(path.c_str()); } +int32_t Nfs::ClearDirOwner(const std::string& path) { + return (*nfsForceClearDirOwner)(path.c_str()); +} + } /* vim: set expandtab ts=2 sw=2 sts=2 tw=100: */ diff --git a/src/leveldb/util/nfs.h b/src/leveldb/util/nfs.h index b80dd0316..ab286d82b 100644 --- a/src/leveldb/util/nfs.h +++ b/src/leveldb/util/nfs.h @@ -50,7 +50,10 @@ class Nfs : public Dfs { int32_t ListDirectory(const std::string& path, std::vector* result); int32_t LockDirectory(const std::string& path); int32_t UnlockDirectory(const std::string& path); + int32_t ClearDirOwner(const std::string& path); + DfsFile* OpenFile(const std::string& filename, int32_t flags); + int32_t Stat(const std::string& filename, struct stat* fstat); private: Nfs(); static port::Mutex mu_; diff --git a/src/leveldb/util/options.cc b/src/leveldb/util/options.cc index ecd11b57e..e64512908 100644 --- a/src/leveldb/util/options.cc +++ b/src/leveldb/util/options.cc @@ -53,7 +53,10 @@ Options::Options() disable_wal(false), ignore_corruption_in_open(false), ttl_percentage(99), - del_percentage(20) { + del_percentage(20), + max_background_compactions(5), + slow_down_level0_score_limit(30), + max_sub_parallel_compaction(10) { } } // namespace leveldb diff --git a/src/leveldb/util/raw_key_operator.cc b/src/leveldb/util/raw_key_operator.cc index 9d5b5d3dc..8ce699c5b 100644 --- a/src/leveldb/util/raw_key_operator.cc +++ b/src/leveldb/util/raw_key_operator.cc @@ -7,7 +7,7 @@ #include #include "coding.h" -#include "../utils/counter.h" +#include "../common/counter.h" namespace leveldb { diff --git a/src/leveldb/util/status.cc b/src/leveldb/util/status.cc index 871a34872..14b22f82e 100644 --- a/src/leveldb/util/status.cc +++ b/src/leveldb/util/status.cc @@ -65,6 +65,9 @@ std::string Status::ToString() const { case kTimeOut: type = "Timeout error: 
"; break; + case kIOPermissionDenied: + type = "IO Permission Denied: "; + break; default: snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast(code())); diff --git a/src/load_balancer/action.h b/src/load_balancer/action.h new file mode 100644 index 000000000..754382916 --- /dev/null +++ b/src/load_balancer/action.h @@ -0,0 +1,45 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_LOAD_BALANCER_ACTION_H_ +#define TERA_LOAD_BALANCER_ACTION_H_ + +#include +#include + +namespace tera { +namespace load_balancer { + +class Action { +public: + enum class Type { + ASSIGN, + MOVE, + SWAP, + EMPTY, + }; + + Type GetType() const { + return type_; + } + +public: + Action(Type t) { + type_ = t; + } + + virtual ~Action() {} + + virtual Action* UndoAction() = 0; + + virtual std::string ToString() const = 0; + +private: + Type type_; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_ACTION_H_ diff --git a/src/load_balancer/action_generator.h b/src/load_balancer/action_generator.h new file mode 100644 index 000000000..77403bfe1 --- /dev/null +++ b/src/load_balancer/action_generator.h @@ -0,0 +1,67 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_LOAD_BALANCER_ACTION_GENERATOR_H_ +#define TERA_LOAD_BALANCER_ACTION_GENERATOR_H_ + +#include +#include +#include + +#include "load_balancer/action.h" +#include "load_balancer/cluster.h" +#include "load_balancer/random.h" + +namespace tera { +namespace load_balancer { + +const uint32_t kInvalidNodeIndex = std::numeric_limits::max(); +const uint32_t kInvalidTabletIndex = std::numeric_limits::max(); + +class ActionGenerator { +public: + virtual ~ActionGenerator() {} + + virtual Action* Generate(const std::shared_ptr& cluster) = 0; + + virtual std::string Name() = 0; + + virtual uint32_t PickRandomNode(const std::shared_ptr& cluster) { + if (cluster->tablet_node_num_ > 0) { + return Random::Rand(0, cluster->tablet_node_num_); + } else { + return kInvalidNodeIndex; + } + } + + // pick a different node with the picked_index + virtual uint32_t PickOtherRandomNode(const std::shared_ptr& cluster, + const uint32_t picked_index) { + assert(cluster->tablet_node_num_ >= 2); + + while (true) { + uint32_t node_index = PickRandomNode(cluster); + if (node_index != picked_index) { + return node_index; + } + } + } + + virtual uint32_t PickRandomTabletOfNode(const std::shared_ptr& cluster, + const uint32_t node_index) { + uint32_t tablet_num = cluster->tablets_per_node_[node_index].size(); + + if (tablet_num > 0) { + uint32_t rand = Random::Rand(0, tablet_num); + return cluster->tablets_per_node_[node_index][rand]; + } else { + return kInvalidTabletIndex; + } + } +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_ACTION_GENERATOR_H_ diff --git a/src/load_balancer/action_generators.cc b/src/load_balancer/action_generators.cc new file mode 100644 index 000000000..f0cfe53d1 --- /dev/null +++ b/src/load_balancer/action_generators.cc @@ -0,0 +1,344 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include + +#include + +#include "glog/logging.h" +#include "load_balancer/action_generators.h" +#include "load_balancer/actions.h" +#include "load_balancer/random.h" + +namespace tera { +namespace load_balancer { + +RandomActionGenerator::RandomActionGenerator() : + name_("RandomActionGenerator") { +} + +RandomActionGenerator::~RandomActionGenerator() { +} + +Action* RandomActionGenerator::Generate(const std::shared_ptr& cluster) { + VLOG(20) << "[lb] RandomActionGenerator worked"; + + if (cluster->tablet_node_num_ < 2) { + return new EmptyAction(); + } + + uint32_t source_node_index = PickRandomNode(cluster); + uint32_t dest_node_index = PickOtherRandomNode(cluster, source_node_index); + uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index); + + if (tablet_index == kInvalidTabletIndex || + source_node_index == kInvalidNodeIndex || + dest_node_index == kInvalidNodeIndex) { + return new EmptyAction(); + } + + return new MoveAction(tablet_index, source_node_index, dest_node_index); +} + +std::string RandomActionGenerator::Name() { + return name_; +} + +TabletCountActionGenerator::TabletCountActionGenerator() : + name_("TabletCountActionGenerator") { +} + +TabletCountActionGenerator::~TabletCountActionGenerator() { +} + +Action* TabletCountActionGenerator::Generate(const std::shared_ptr& cluster) { + VLOG(20) << "[lb] TabletCountActionGenerator worked"; + + if (cluster->tablet_node_num_ < 2) { + return new EmptyAction(); + } + + cluster->SortNodesByTabletCount(); + + uint32_t source_node_index = PickMostTabletsNode(cluster); + uint32_t dest_node_index = PickLeastTabletsNode(cluster); + uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index); + + if (tablet_index == kInvalidTabletIndex || + source_node_index == kInvalidNodeIndex || + dest_node_index == kInvalidNodeIndex || + source_node_index == dest_node_index) { + return new EmptyAction(); + } + + return new MoveAction(tablet_index, source_node_index, dest_node_index); +} + 
+namespace {
+
+// NOTE(review): every <...> span (system include names, template arguments)
+// is missing from this patch text as captured; the function signatures below
+// are left exactly as found.
+
+// Returns the last entry of a node index list, or kInvalidNodeIndex when the
+// list is empty. The callers pass lists that Generate() has just sorted in
+// ascending order, so the last entry is the node ranking highest.
+template <typename NodeIndexes>
+uint32_t LastNodeOrInvalid(const NodeIndexes& sorted_nodes) {
+    if (sorted_nodes.empty()) {
+        return kInvalidNodeIndex;
+    }
+    return sorted_nodes.back();
+}
+
+// Returns the first entry of a node index list, or kInvalidNodeIndex when no
+// entry qualifies. When meta table isolation is enabled, entries equal to the
+// cluster's meta table node index are skipped.
+template <typename ClusterT, typename NodeIndexes>
+uint32_t FirstUsableNodeOrInvalid(const ClusterT& cluster, const NodeIndexes& sorted_nodes) {
+    const bool skip_meta_node = cluster.lb_options_.meta_table_isolate_enabled;
+    for (const auto node_index : sorted_nodes) {
+        if (skip_meta_node && node_index == cluster.meta_table_node_index_) {
+            continue;
+        }
+        return node_index;
+    }
+    return kInvalidNodeIndex;
+}
+
+} // namespace
+
+// Node holding the most tablets, or kInvalidNodeIndex when there is none.
+uint32_t TabletCountActionGenerator::PickMostTabletsNode(const std::shared_ptr& cluster) {
+    return LastNodeOrInvalid(cluster->node_index_sorted_by_tablet_count_);
+}
+
+// Node holding the fewest tablets (never the isolated meta table node), or
+// kInvalidNodeIndex when there is none.
+uint32_t TabletCountActionGenerator::PickLeastTabletsNode(const std::shared_ptr& cluster) {
+    return FirstUsableNodeOrInvalid(*cluster, cluster->node_index_sorted_by_tablet_count_);
+}
+
+// Returns the generator's fixed name, "TabletCountActionGenerator".
+std::string TabletCountActionGenerator::Name() {
+    return name_;
+}
+
+SizeActionGenerator::SizeActionGenerator() :
+        name_("SizeActionGenerator") {
+}
+
+SizeActionGenerator::~SizeActionGenerator() {
+}
+
+// Proposes moving one tablet from the node with the largest data size to the
+// node with the smallest. Returns a newly allocated EmptyAction when the
+// cluster has fewer than two nodes, when any pick is invalid, or when both
+// picks name the same node.
+Action* SizeActionGenerator::Generate(const std::shared_ptr& cluster) {
+    VLOG(20) << "[lb] SizeActionGenerator worked";
+
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    // the Pick*SizeNode helpers read the list this call sorts
+    cluster->SortNodesBySize();
+
+    uint32_t source_node_index = PickLargestSizeNode(cluster);
+    uint32_t dest_node_index = PickSmallestSizeNode(cluster);
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+
+    if (tablet_index == kInvalidTabletIndex ||
+            source_node_index == kInvalidNodeIndex ||
+            dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Node with the largest data size, or kInvalidNodeIndex when there is none.
+uint32_t SizeActionGenerator::PickLargestSizeNode(const std::shared_ptr& cluster) {
+    return LastNodeOrInvalid(cluster->node_index_sorted_by_size_);
+}
+
+// Node with the smallest data size (never the isolated meta table node), or
+// kInvalidNodeIndex when there is none.
+uint32_t SizeActionGenerator::PickSmallestSizeNode(const std::shared_ptr& cluster) {
+    return FirstUsableNodeOrInvalid(*cluster, cluster->node_index_sorted_by_size_);
+}
+
+// Returns the generator's fixed name, "SizeActionGenerator".
+std::string SizeActionGenerator::Name() {
+    return name_;
+}
+
+ReadLoadActionGenerator::ReadLoadActionGenerator() :
+        name_("ReadLoadActionGenerator") {
+}
+
+ReadLoadActionGenerator::~ReadLoadActionGenerator() {
+}
+
+// Proposes moving one tablet from the node with the most read load to the
+// node with the least. Returns a newly allocated EmptyAction when the cluster
+// has fewer than two nodes, when any pick is invalid, or when both picks name
+// the same node.
+Action* ReadLoadActionGenerator::Generate(const std::shared_ptr& cluster) {
+    VLOG(20) << "[lb] ReadLoadActionGenerator worked";
+
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    // the Pick*ReadNode helpers read the list this call sorts
+    cluster->SortNodesByReadLoad();
+
+    uint32_t source_node_index = PickMostReadNode(cluster);
+    uint32_t dest_node_index = PickLeastReadNode(cluster);
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+
+    if (tablet_index == kInvalidTabletIndex ||
+            source_node_index == kInvalidNodeIndex ||
+            dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Node with the most read load, or kInvalidNodeIndex when there is none.
+uint32_t ReadLoadActionGenerator::PickMostReadNode(const std::shared_ptr& cluster) {
+    return LastNodeOrInvalid(cluster->node_index_sorted_by_read_load_);
+}
+
+// Node with the least read load (never the isolated meta table node), or
+// kInvalidNodeIndex when there is none.
+uint32_t ReadLoadActionGenerator::PickLeastReadNode(const std::shared_ptr& cluster) {
+    return FirstUsableNodeOrInvalid(*cluster, cluster->node_index_sorted_by_read_load_);
+}
+
+// Returns the generator's fixed name, "ReadLoadActionGenerator".
+std::string ReadLoadActionGenerator::Name() {
+    return name_;
+}
+
+WriteLoadActionGenerator::WriteLoadActionGenerator() :
+        name_("WriteLoadActionGenerator") {
+}
+
+WriteLoadActionGenerator::~WriteLoadActionGenerator() {
+}
+
+// Proposes moving one tablet from the node with the most write load to the
+// node with the least. Returns a newly allocated EmptyAction when the cluster
+// has fewer than two nodes, when any pick is invalid, or when both picks name
+// the same node.
+Action* WriteLoadActionGenerator::Generate(const std::shared_ptr& cluster) {
+    VLOG(20) << "[lb] WriteLoadActionGenerator worked";
+
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    // the Pick*WriteNode helpers read the list this call sorts
+    cluster->SortNodesByWriteLoad();
+
+    uint32_t source_node_index = PickMostWriteNode(cluster);
+    uint32_t dest_node_index = PickLeastWriteNode(cluster);
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+
+    if (tablet_index == kInvalidTabletIndex ||
+            source_node_index == kInvalidNodeIndex ||
+            dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Node with the most write load, or kInvalidNodeIndex when there is none.
+uint32_t WriteLoadActionGenerator::PickMostWriteNode(const std::shared_ptr& cluster) {
+    return LastNodeOrInvalid(cluster->node_index_sorted_by_write_load_);
+}
+
+// Node with the least write load (never the isolated meta table node), or
+// kInvalidNodeIndex when there is none.
+uint32_t WriteLoadActionGenerator::PickLeastWriteNode(const std::shared_ptr& cluster) {
+    return FirstUsableNodeOrInvalid(*cluster, cluster->node_index_sorted_by_write_load_);
+}
+
+// Returns the generator's fixed name, "WriteLoadActionGenerator".
+std::string WriteLoadActionGenerator::Name() {
+    return name_;
+}
+
+ScanLoadActionGenerator::ScanLoadActionGenerator() :
+        name_("ScanLoadActionGenerator") {
+}
+
+ScanLoadActionGenerator::~ScanLoadActionGenerator() {
+}
+
+// Proposes moving one tablet from the node with the most scan load to the
+// node with the least. Returns a newly allocated EmptyAction when the cluster
+// has fewer than two nodes, when any pick is invalid, or when both picks name
+// the same node.
+Action* ScanLoadActionGenerator::Generate(const std::shared_ptr& cluster) {
+    VLOG(20) << "[lb] ScanLoadActionGenerator worked";
+
+    if (cluster->tablet_node_num_ < 2) {
+        return new EmptyAction();
+    }
+
+    // the Pick*ScanNode helpers read the list this call sorts
+    cluster->SortNodesByScanLoad();
+
+    uint32_t source_node_index = PickMostScanNode(cluster);
+    uint32_t dest_node_index = PickLeastScanNode(cluster);
+    uint32_t tablet_index = PickRandomTabletOfNode(cluster, source_node_index);
+
+    if (tablet_index == kInvalidTabletIndex ||
+            source_node_index == kInvalidNodeIndex ||
+            dest_node_index == kInvalidNodeIndex ||
+            source_node_index == dest_node_index) {
+        return new EmptyAction();
+    }
+
+    return new MoveAction(tablet_index, source_node_index, dest_node_index);
+}
+
+// Node with the most scan load, or kInvalidNodeIndex when there is none.
+uint32_t ScanLoadActionGenerator::PickMostScanNode(const std::shared_ptr& cluster) {
+    return LastNodeOrInvalid(cluster->node_index_sorted_by_scan_load_);
+}
+
+// Node with the least scan load (never the isolated meta table node), or
+// kInvalidNodeIndex when there is none.
+uint32_t ScanLoadActionGenerator::PickLeastScanNode(const std::shared_ptr& cluster) {
+    return FirstUsableNodeOrInvalid(*cluster, cluster->node_index_sorted_by_scan_load_);
+}
+
+// Returns the generator's fixed name, "ScanLoadActionGenerator".
+std::string ScanLoadActionGenerator::Name() {
+    return name_;
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/action_generators.h b/src/load_balancer/action_generators.h
new file mode 100644
index 000000000..16c663ae7
--- /dev/null
+++ b/src/load_balancer/action_generators.h
@@ -0,0 +1,134 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_ACTION_GENERATORS_H_
+#define TERA_LOAD_BALANCER_ACTION_GENERATORS_H_
+
+#include
+
+#include "load_balancer/action_generator.h"
+#include "load_balancer/actions.h"
+
+namespace tera {
+namespace load_balancer {
+
+// moves a random tablet of a random node to another random node
+class RandomActionGenerator : public ActionGenerator {
+public:
+    RandomActionGenerator();
+    virtual ~RandomActionGenerator();
+
+    // returns a new MoveAction, or a new EmptyAction when no move can be formed
+    virtual Action* Generate(const std::shared_ptr& cluster) override;
+
+    virtual std::string Name() override;
+
+private:
+    std::string name_;
+};
+
+// moves a tablet
+// from the node holding the most tablets
+// to the node holding the fewest tablets
+class TabletCountActionGenerator : public ActionGenerator {
+public:
+    TabletCountActionGenerator();
+    virtual ~TabletCountActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr& cluster) override;
+
+    virtual std::string Name() override;
+
+private:
+    uint32_t PickMostTabletsNode(const std::shared_ptr& cluster);
+    uint32_t PickLeastTabletsNode(const std::shared_ptr& cluster);
+
+private:
+    std::string name_;
+};
+
+// moves a tablet
+// from the node holding the largest data size
+// to the node holding the smallest data size
+class SizeActionGenerator : public ActionGenerator {
+public:
+    SizeActionGenerator();
+    virtual ~SizeActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr& cluster) override;
+
+    virtual std::string Name() override;
+
+private:
+    uint32_t PickLargestSizeNode(const std::shared_ptr& cluster);
+    uint32_t PickSmallestSizeNode(const std::shared_ptr& cluster);
+
+private:
+    std::string name_;
+};
+
+// moves a tablet
+// from the node with the most read load
+// to the node with the least read load
+class ReadLoadActionGenerator : public ActionGenerator {
+public:
+    ReadLoadActionGenerator();
+    virtual ~ReadLoadActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr& cluster) override;
+
+    virtual std::string Name() override;
+
+private:
+    uint32_t PickMostReadNode(const std::shared_ptr& cluster);
+    uint32_t PickLeastReadNode(const std::shared_ptr& cluster);
+
+private:
+    std::string name_;
+};
+
+// moves a tablet
+// from the node with the most write load
+// to the node with the least write load
+class WriteLoadActionGenerator : public ActionGenerator {
+public:
+    WriteLoadActionGenerator();
+    virtual ~WriteLoadActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr& cluster) override;
+
+    virtual std::string Name() override;
+
+private:
+    uint32_t PickMostWriteNode(const std::shared_ptr& cluster);
+    uint32_t PickLeastWriteNode(const std::shared_ptr& cluster);
+
+private:
+    std::string name_;
+};
+
+// moves a tablet
+// from the node with the most scan load
+// to the node with the least scan load
+class ScanLoadActionGenerator : public ActionGenerator {
+public:
+    ScanLoadActionGenerator();
+    virtual ~ScanLoadActionGenerator();
+
+    virtual Action* Generate(const std::shared_ptr& cluster) override;
+
+    virtual std::string Name() override;
+
+private:
+    uint32_t PickMostScanNode(const std::shared_ptr& cluster);
+    uint32_t PickLeastScanNode(const std::shared_ptr& cluster);
+
+private:
+    std::string name_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_ACTION_GENERATORS_H_
diff --git a/src/load_balancer/actions.cc
b/src/load_balancer/actions.cc
new file mode 100644
index 000000000..0be2d9d5e
--- /dev/null
+++ b/src/load_balancer/actions.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include
+
+#include "load_balancer/actions.h"
+
+namespace tera {
+namespace load_balancer {
+
+EmptyAction::EmptyAction() : Action(Action::Type::EMPTY) {}
+
+EmptyAction::~EmptyAction() {}
+
+// Undoing a no-op yields another newly allocated no-op.
+Action* EmptyAction::UndoAction() {
+    return new EmptyAction();
+}
+
+std::string EmptyAction::ToString() const {
+    return "EmptyAction";
+}
+
+MoveAction::MoveAction(uint32_t tablet_index,
+                       uint32_t source_node_index,
+                       uint32_t dest_node_index)
+    : Action(Action::Type::MOVE),
+      tablet_index_(tablet_index),
+      source_node_index_(source_node_index),
+      dest_node_index_(dest_node_index) {}
+
+MoveAction::~MoveAction() {}
+
+// The inverse move carries the same tablet with source and destination swapped.
+Action* MoveAction::UndoAction() {
+    return new MoveAction(tablet_index_, dest_node_index_, source_node_index_);
+}
+
+// Renders as "move <tablet> from <source> to <dest>", all three as indexes.
+std::string MoveAction::ToString() const {
+    std::string text("move ");
+    text += std::to_string(tablet_index_);
+    text += " from ";
+    text += std::to_string(source_node_index_);
+    text += " to ";
+    text += std::to_string(dest_node_index_);
+    return text;
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/actions.h b/src/load_balancer/actions.h
new file mode 100644
index 000000000..f4751ea9c
--- /dev/null
+++ b/src/load_balancer/actions.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_ACTIONS_H_
+#define TERA_LOAD_BALANCER_ACTIONS_H_
+
+#include
+
+#include "load_balancer/action.h"
+
+namespace tera {
+namespace load_balancer {
+
+class EmptyAction : public Action { // no-op action, constructed with Action::Type::EMPTY
+public:
+    EmptyAction();
+    virtual ~EmptyAction();
+
+    virtual Action* UndoAction() override; // returns a newly allocated EmptyAction
+
+    virtual std::string ToString() const override; // returns "EmptyAction"
+};
+
+class MoveAction : public Action { // moves one tablet between two nodes, all given as indexes
+public:
+    MoveAction(uint32_t tablet_index, uint32_t source_node_index, uint32_t dest_node_index);
+    virtual ~MoveAction();
+
+    virtual Action* UndoAction() override; // newly allocated MoveAction with source and dest swapped
+
+    virtual std::string ToString() const override; // "move <tablet> from <source> to <dest>"
+
+public: // read directly by Cluster::ValidAction and Cluster::DoAction
+    uint32_t tablet_index_;
+    uint32_t source_node_index_;
+    uint32_t dest_node_index_;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_ACTIONS_H_
diff --git a/src/load_balancer/balancer.h b/src/load_balancer/balancer.h
new file mode 100644
index 000000000..2ad1727ea
--- /dev/null
+++ b/src/load_balancer/balancer.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_BALANCER_H_
+#define TERA_LOAD_BALANCER_BALANCER_H_
+
+#include
+#include
+
+#include "load_balancer/lb_node.h"
+#include "load_balancer/options.h"
+#include "load_balancer/plan.h"
+#include "master/tablet_manager.h"
+#include "master/tabletnode_manager.h"
+
+namespace tera {
+namespace load_balancer {
+
+class Balancer { // abstract interface: both overloads are pure virtual
+public:
+    virtual ~Balancer() {}
+
+    // balance the whole cluster; results are written through `plans`
+    virtual bool BalanceCluster( // NOTE(review): presumably returns true on success - confirm
+            const std::vector>& lb_nodes,
+            std::vector* plans) = 0;
+
+    // balance only the table named `table_name`; results are written through `plans`
+    virtual bool BalanceCluster( // NOTE(review): presumably returns true on success - confirm
+            const std::string& table_name,
+            const std::vector>& lb_nodes,
+            std::vector* plans) = 0;
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_BALANCER_H_
diff --git a/src/load_balancer/cluster.cc b/src/load_balancer/cluster.cc
new file mode 100644
index 000000000..72a3f740e
--- /dev/null
+++ b/src/load_balancer/cluster.cc
@@ -0,0 +1,537 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include
+
+#include
+#include
+
+#include "glog/logging.h"
+#include "load_balancer/actions.h"
+#include "load_balancer/cluster.h"
+#include "common/timer.h"
+
+namespace tera {
+namespace load_balancer {
+
+// NOTE(review): every <...> span (system include names, template arguments,
+// cast target types) is missing from this patch text as captured; those
+// tokens are left exactly as found and must be restored before this compiles.
+
+// Builds the index-based view of the cluster from `lb_nodes`.
+// When meta table isolation is enabled, the node whose address equals
+// lb_options_.meta_table_node_addr is left out entirely. Every remaining node
+// and tablet gets a dense index (its position in registration order), and the
+// per-node tablet list, data size and read/write/scan load are accumulated.
+Cluster::Cluster(const std::vector>& lb_nodes,
+                 const LBOptions& options) :
+        meta_table_node_index_(std::numeric_limits::max()),
+        lb_options_(options) {
+    // get_micros() is divided by 1000 below to log milliseconds
+    int64_t start_time_us = get_micros();
+
+    for (const auto& node : lb_nodes) {
+        if (lb_options_.meta_table_isolate_enabled &&
+                node->tablet_node_ptr->GetAddr() == lb_options_.meta_table_node_addr) {
+            VLOG(5) << "skip meta table node:" << lb_options_.meta_table_node_addr;
+        } else {
+            lb_nodes_.emplace_back(node);
+        }
+    }
+
+    table_num_ = 0;
+    tablet_node_num_ = 0;
+    tablet_num_ = 0;
+    tablet_moved_num_ = 0;
+
+    for (const auto& node : lb_nodes_) {
+        uint32_t node_index = nodes_.size();
+        nodes_[node_index] = node;
+
+        std::string addr = node->tablet_node_ptr->GetAddr();
+        assert(nodes_to_index_.find(addr) == nodes_to_index_.end());
+        nodes_to_index_[addr] = node_index;
+
+        // create the per-node entries even for nodes without tablets, so the
+        // size checks at the end of the constructor hold
+        tablets_per_node_[node_index].clear();
+        initial_tablets_not_ready_per_node_[node_index].clear();
+        size_per_node_[node_index] = 0;
+        read_load_per_node_[node_index] = 0;
+        write_load_per_node_[node_index] = 0;
+        scan_load_per_node_[node_index] = 0;
+
+        node_index_sorted_by_tablet_count_.emplace_back(node_index);
+        node_index_sorted_by_size_.emplace_back(node_index);
+        node_index_sorted_by_read_load_.emplace_back(node_index);
+        node_index_sorted_by_write_load_.emplace_back(node_index);
+        node_index_sorted_by_scan_load_.emplace_back(node_index);
+
+        if (node->tablet_node_ptr->GetReadPending() > 0) {
+            read_pending_nodes_index_.insert(node_index);
+        }
+        if (node->tablet_node_ptr->GetWritePending() > 0) {
+            write_pending_nodes_index_.insert(node_index);
+        }
+        if (node->tablet_node_ptr->GetScanPending() > 0) {
+            scan_pending_nodes_index_.insert(node_index);
+        }
+
+        for (const auto& tablet : node->tablets) {
+            uint32_t tablet_index = tablets_.size();
+
+            RegisterTablet(tablet, tablet_index, node_index);
+
+            tablets_per_node_[node_index].emplace_back(tablet_index);
+            if (tablets_[tablet_index]->tablet_ptr->GetStatus() != kTableReady) {
+                initial_tablets_not_ready_per_node_[node_index].emplace_back(tablet_index);
+            }
+            size_per_node_[node_index] += static_cast(tablet->tablet_ptr->GetDataSize());
+            read_load_per_node_[node_index] += static_cast(tablet->tablet_ptr->GetReadQps());
+            write_load_per_node_[node_index] += static_cast(tablet->tablet_ptr->GetWriteQps());
+            scan_load_per_node_[node_index] += static_cast(tablet->tablet_ptr->GetScanQps());
+
+            ++tablet_num_;
+        }
+
+        ++tablet_node_num_;
+    }
+
+    // if not ready tablets' ratio is higher than option, the node is considered abnormal
+    for (uint32_t i = 0; i < tablets_per_node_.size(); ++i) {
+        if (tablets_per_node_[i].size() != 0) {
+            double not_ready_num = static_cast(initial_tablets_not_ready_per_node_[i].size());
+            double total_num = static_cast(tablets_per_node_[i].size());
+            if (not_ready_num / total_num >= lb_options_.abnormal_node_ratio) {
+                abnormal_nodes_index_.insert(i);
+            }
+        }
+    }
+
+    assert(table_num_ == tables_.size());
+    assert(tablet_node_num_ == nodes_.size());
+    assert(tablet_num_ == tablets_.size());
+
+    assert(table_num_ == tables_to_index_.size());
+    assert(tablet_node_num_ == nodes_to_index_.size());
+    assert(tablet_num_ == tablets_to_index_.size());
+
+    assert(tablet_num_ == tablet_index_to_node_index_.size());
+    assert(tablet_num_ == initial_tablet_index_to_node_index_.size());
+    assert(tablet_num_ == tablet_index_to_table_index_.size());
+
+    assert(tablet_node_num_ == tablets_per_node_.size());
+    assert(tablet_node_num_ == initial_tablets_not_ready_per_node_.size());
+    assert(tablet_node_num_ == size_per_node_.size());
+    assert(tablet_node_num_ == read_load_per_node_.size());
+    assert(tablet_node_num_ == write_load_per_node_.size());
+    assert(tablet_node_num_ == scan_load_per_node_.size());
+
+    assert(abnormal_nodes_index_.size() <= tablet_node_num_);
+    assert(read_pending_nodes_index_.size() <= tablet_node_num_);
+    assert(write_pending_nodes_index_.size() <= tablet_node_num_);
+    assert(scan_pending_nodes_index_.size() <= tablet_node_num_);
+
+    assert(tablet_node_num_ == node_index_sorted_by_tablet_count_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_size_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_read_load_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_write_load_.size());
+    assert(tablet_node_num_ == node_index_sorted_by_scan_load_.size());
+
+    VLOG(20) << "[lb] construct Cluster cost time(ms):" << (get_micros() - start_time_us) / 1000;
+}
+
+Cluster::~Cluster() {
+}
+
+// Dumps every bookkeeping structure of the cluster to the INFO log.
+void Cluster::DebugCluster() {
+    LOG(INFO) << "";
+    LOG(INFO) << "DebugCluster begin -----";
+
+    LOG(INFO) << "table_num_:" << table_num_;
+    LOG(INFO) << "tablet_node_num_:" << tablet_node_num_;
+    LOG(INFO) << "tablet_num_:" << tablet_num_;
+    LOG(INFO) << "tablet_moved_num_:" << tablet_moved_num_;
+
+    LOG(INFO) << "[table_index -> table]:";
+    for (const auto& table : tables_) {
+        LOG(INFO) << table.first << " -> " << table.second;
+    }
+
+    LOG(INFO) << "[node_index -> node]:";
+    for (const auto& node : nodes_) {
+        LOG(INFO) << node.first << " -> " << node.second->tablet_node_ptr->GetAddr();
+    }
+    LOG(INFO) << "meta_table_node_index_:" << meta_table_node_index_;
+
+    LOG(INFO) << "[tablet_index -> tablet]:";
+    for (const auto& tablet : tablets_) {
+        LOG(INFO) << tablet.first << " -> " << tablet.second->tablet_ptr->GetPath();
+    }
+
+    LOG(INFO) << "[table -> table_index]:";
+    for (const auto& table : tables_to_index_) {
+        LOG(INFO) << table.first << " -> " << table.second;
+    }
+
+    LOG(INFO) << "[node -> node_index]:";
+    for (const auto& node : nodes_to_index_) {
+        LOG(INFO) << node.first << " -> " << node.second;
+    }
+
+    LOG(INFO) << "[tablet -> tablet_index]:";
+    for (const auto& tablet : tablets_to_index_) {
+        LOG(INFO) << tablet.first << " -> " << tablet.second;
+    }
+
+    LOG(INFO) << "[tablet_index -> node_index]:";
+    for (const auto& it : tablet_index_to_node_index_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[initial tablet_index -> node_index]:";
+    for (const auto& it : initial_tablet_index_to_node_index_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[tablet_index -> table_index]:";
+    for (const auto& it : tablet_index_to_table_index_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[node_index -> tablets index]:";
+    for (const auto& it : tablets_per_node_) {
+        std::string line = std::to_string(it.first) + " ->";
+        for (const auto tablet : it.second) {
+            line += " ";
+            line += std::to_string(tablet);
+        }
+        LOG(INFO) << line;
+    }
+
+    LOG(INFO) << "[node_index -> data size]:";
+    for (const auto& it : size_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second << "B";
+    }
+
+    LOG(INFO) << "[node_index -> read load]:";
+    for (const auto& it : read_load_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[node_index -> write load]:";
+    for (const auto& it : write_load_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[node_index -> scan load]:";
+    for (const auto& it : scan_load_per_node_) {
+        LOG(INFO) << it.first << " -> " << it.second;
+    }
+
+    LOG(INFO) << "[tablets index of moved too frequently]:";
+    for (const auto& tablet : tablets_moved_too_frequently_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[node_index -> not ready tablets index]:";
+    for (const auto& it : initial_tablets_not_ready_per_node_) {
+        std::string line = std::to_string(it.first) + " ->";
+        for (const auto tablet : it.second) {
+            line += " ";
+            line += std::to_string(tablet);
+        }
+        LOG(INFO) << line;
+    }
+
+    LOG(INFO) << "[abnormal nodes index]:";
+    for (const auto& node: abnormal_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to abnormal nodes]:";
+    for (const auto& tablet : tablets_moved_to_abnormal_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[read pending nodes index]:";
+    for (const auto& node: read_pending_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to read pending nodes]:";
+    for (const auto& tablet : tablets_moved_to_read_pending_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[write pending nodes index]:";
+    for (const auto& node: write_pending_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to write pending nodes]:";
+    for (const auto& tablet : tablets_moved_to_write_pending_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "[scan pending nodes index]:";
+    for (const auto& node: scan_pending_nodes_index_) {
+        LOG(INFO) << node;
+    }
+
+    LOG(INFO) << "[tablets index of moved to scan pending nodes]:";
+    for (const auto& tablet : tablets_moved_to_scan_pending_nodes_) {
+        LOG(INFO) << tablet;
+    }
+
+    LOG(INFO) << "DebugCluster end -----";
+    LOG(INFO) << "";
+}
+
+// Tells whether `action` may be applied to this cluster.
+// EMPTY and unknown types are rejected; ASSIGN and SWAP are always accepted.
+// A MOVE is rejected when the tablet is not in kTableReady status, when the
+// tablet belongs to the meta table, or when isolation is enabled and the
+// destination is the meta table node.
+bool Cluster::ValidAction(const std::shared_ptr& action) {
+    switch (action->GetType()) {
+        case Action::Type::EMPTY:
+            return false;
+        case Action::Type::ASSIGN:
+            return true;
+        case Action::Type::MOVE: {
+            MoveAction* move_action = dynamic_cast(action.get());
+            if (move_action == nullptr) {
+                // typed as MOVE but not a MoveAction: reject instead of dereferencing null
+                return false;
+            }
+            if (tablets_[move_action->tablet_index_]->tablet_ptr->GetStatus() != kTableReady) {
+                VLOG(20) << "[lb] invalid action, reason:tablet not ready, tablet status:"
+                        << StatusCodeToString(tablets_[move_action->tablet_index_]->tablet_ptr->GetStatus());
+                return false;
+            }
+
+            if (tables_[tablet_index_to_table_index_[move_action->tablet_index_]] ==
+                    lb_options_.meta_table_name) {
+                VLOG(20) << "[lb] invalid action, reason:move meta table";
+                return false;
+            }
+
+            if (lb_options_.meta_table_isolate_enabled &&
+                    move_action->dest_node_index_ == meta_table_node_index_) {
+                VLOG(20) << "[lb] invalid action, reason:move tablet to meta table node";
+                return false;
+            }
+
+            return true;
+        }
+        case Action::Type::SWAP:
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Applies `action` to the in-memory cluster state. Only MOVE changes anything:
+// the tablet leaves the source node's totals, joins the destination node's
+// totals, and the move bookkeeping is updated. All other types are no-ops.
+void Cluster::DoAction(const std::shared_ptr& action) {
+    switch (action->GetType()) {
+        case Action::Type::EMPTY:
+            break;
+        case Action::Type::ASSIGN:
+            break;
+        case Action::Type::MOVE: {
+            MoveAction* move_action = dynamic_cast(action.get());
+            if (move_action == nullptr) {
+                // typed as MOVE but not a MoveAction: nothing can be applied
+                break;
+            }
+            VLOG(20) << "[lb] DoAction: " << move_action->ToString();
+            assert(move_action->source_node_index_ != move_action->dest_node_index_);
+
+            RemoveTablet(move_action->tablet_index_, move_action->source_node_index_);
+            AddTablet(move_action->tablet_index_, move_action->dest_node_index_);
+            MoveTablet(move_action->tablet_index_, move_action->source_node_index_, move_action->dest_node_index_);
+
+            break;
+        }
+        case Action::Type::SWAP:
+            break;
+        default:
+            break;
+    }
+}
+
+// Sorts node_index_sorted_by_tablet_count_ by ascending tablet count.
+void Cluster::SortNodesByTabletCount() {
+    std::sort(
+        node_index_sorted_by_tablet_count_.begin(),
+        node_index_sorted_by_tablet_count_.end(),
+        [this](int a, int b) {
+            return tablets_per_node_[a].size() < tablets_per_node_[b].size();
+        });
+}
+
+// Sorts node_index_sorted_by_size_ by ascending data size.
+void Cluster::SortNodesBySize() {
+    std::sort(
+        node_index_sorted_by_size_.begin(),
+        node_index_sorted_by_size_.end(),
+        [this](int a, int b) {
+            return size_per_node_[a] < size_per_node_[b];
+        });
+}
+
+// Sorts node_index_sorted_by_read_load_ by ascending read load.
+void Cluster::SortNodesByReadLoad() {
+    std::sort(
+        node_index_sorted_by_read_load_.begin(),
+        node_index_sorted_by_read_load_.end(),
+        [this](int a, int b) {
+            return read_load_per_node_[a] < read_load_per_node_[b];
+        });
+}
+
+// Sorts node_index_sorted_by_write_load_ by ascending write load.
+void Cluster::SortNodesByWriteLoad() {
+    std::sort(
+        node_index_sorted_by_write_load_.begin(),
+        node_index_sorted_by_write_load_.end(),
+        [this](int a, int b) {
+            return write_load_per_node_[a] < write_load_per_node_[b];
+        });
+}
+
+// Sorts node_index_sorted_by_scan_load_ by ascending scan load.
+void Cluster::SortNodesByScanLoad() {
+    std::sort(
+        node_index_sorted_by_scan_load_.begin(),
+        node_index_sorted_by_scan_load_.end(),
+        [this](int a, int b) {
+            return scan_load_per_node_[a] < scan_load_per_node_[b];
+        });
+}
+
+// Records `tablet` under `tablet_index` as living on `node_index`, registering
+// its table on first sight. The node that holds the first registered tablet of
+// the meta table becomes meta_table_node_index_.
+void Cluster::RegisterTablet(const std::shared_ptr& tablet, uint32_t tablet_index, uint32_t node_index) {
+    std::string table_name = tablet->tablet_ptr->GetTableName();
+    if (tables_to_index_.find(table_name) == tables_to_index_.end()) {
+        uint32_t table_index = tables_.size();
+        tables_[table_index] = table_name;
+        tables_to_index_[table_name] = table_index;
+        ++table_num_;
+
+        if (table_name == lb_options_.meta_table_name) {
+            meta_table_node_index_ = node_index;
+        }
+    }
+
+    std::string path = tablet->tablet_ptr->GetPath();
+    tablets_to_index_[path] = tablet_index;
+    tablets_[tablet_index] = tablet;
+
+    tablet_index_to_node_index_[tablet_index] = node_index;
+    initial_tablet_index_to_node_index_[tablet_index] = node_index;
+    tablet_index_to_table_index_[tablet_index] = tables_to_index_[table_name];
+}
+
+// Adds the tablet to `to_node_index`'s tablet list and adds its data size and
+// read/write/scan qps to that node's totals.
+void Cluster::AddTablet(uint32_t tablet_index, uint32_t to_node_index) {
+    tablets_per_node_[to_node_index].emplace_back(tablet_index);
+
+    size_per_node_[to_node_index] += static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetDataSize());
+    read_load_per_node_[to_node_index] += static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetReadQps());
+    write_load_per_node_[to_node_index] += static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetWriteQps());
+    scan_load_per_node_[to_node_index] += static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetScanQps());
+}
+
+// Removes the first occurrence of the tablet from `from_node_index`'s tablet
+// list and subtracts its data size and qps from that node's totals. Does
+// nothing when the node has no tablet list at all.
+void Cluster::RemoveTablet(uint32_t tablet_index, uint32_t from_node_index) {
+    if (tablets_per_node_.find(from_node_index) == tablets_per_node_.end()) {
+        return;
+    }
+    auto& tablets = tablets_per_node_[from_node_index];
+    for (auto it = tablets.begin(); it != tablets.end();) {
+        if (*it == tablet_index) {
+            it = tablets.erase(it);
+            break;
+        } else {
+            ++it;
+        }
+    }
+
+    // NOTE(review): the totals are reduced even when the tablet was not found
+    // in the list above - confirm callers never pass a tablet the node lacks
+    size_per_node_[from_node_index] -= static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetDataSize());
+    read_load_per_node_[from_node_index] -= static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetReadQps());
+    write_load_per_node_[from_node_index] -= static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetWriteQps());
+    scan_load_per_node_[from_node_index] -= static_cast(
+            tablets_[tablet_index]->tablet_ptr->GetScanQps());
+
+    assert(size_per_node_[from_node_index] >= 0);
+    assert(read_load_per_node_[from_node_index] >= 0);
+    assert(write_load_per_node_[from_node_index] >= 0);
+    assert(scan_load_per_node_[from_node_index] >= 0);
+}
+
+// Updates the move bookkeeping after a tablet went from `source_node_index` to
+// `dest_node_index`: its current node, the count of tablets away from their
+// initial node, and the sets of tablets that were moved too frequently or were
+// moved onto abnormal / read-pending / write-pending / scan-pending nodes.
+void Cluster::MoveTablet(uint32_t tablet_index, uint32_t source_node_index, uint32_t dest_node_index) {
+    tablet_index_to_node_index_[tablet_index] = dest_node_index;
+
+    if (initial_tablet_index_to_node_index_[tablet_index] == source_node_index) {
+        // tablet leaves its initial node
+        ++tablet_moved_num_;
+
+        int64_t last_move_time_us = tablets_[tablet_index]->tablet_ptr->LastMoveTime();
+        int64_t current_time_us = get_micros();
+        if (current_time_us - last_move_time_us <
+                1000000 * static_cast(lb_options_.tablet_move_too_frequently_threshold_s)) {
+            tablets_moved_too_frequently_.insert(tablet_index);
+            VLOG(20) << "[lb] add tablet moved too frequently, tablet index: " << tablet_index
+                    << ", last_move_time: " << last_move_time_us << ", current time: " << current_time_us
+                    << ", tablets_moved_too_frequently_ size: " << tablets_moved_too_frequently_.size();
+        }
+    } else if (initial_tablet_index_to_node_index_[tablet_index] == dest_node_index) {
+        // tablet moved back
+        // the counter is unsigned, so check before decrementing; a ">= 0"
+        // check afterwards could never fail
+        assert(tablet_moved_num_ > 0);
+        --tablet_moved_num_;
+
+        if (tablets_moved_too_frequently_.find(tablet_index) != tablets_moved_too_frequently_.end()) {
+            tablets_moved_too_frequently_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved too frequently, tablet index: " << tablet_index
+                    << ", tablets_moved_too_frequently_ size: " << tablets_moved_too_frequently_.size();
+        }
+    }
+
+    if (abnormal_nodes_index_.find(dest_node_index) != abnormal_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_abnormal_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to abnormal node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_abnormal_nodes_ size: " << tablets_moved_to_abnormal_nodes_.size();
+    } else if (abnormal_nodes_index_.find(source_node_index) != abnormal_nodes_index_.end()) {
+        if (tablets_moved_to_abnormal_nodes_.find(tablet_index) != tablets_moved_to_abnormal_nodes_.end()) {
+            tablets_moved_to_abnormal_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to abnormal nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_abnormal_nodes_ size: " << tablets_moved_to_abnormal_nodes_.size();
+        }
+    }
+
+    if (read_pending_nodes_index_.find(dest_node_index) != read_pending_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_read_pending_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to read pending node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_read_pending_nodes_ size: " << tablets_moved_to_read_pending_nodes_.size();
+    } else if (read_pending_nodes_index_.find(source_node_index) != read_pending_nodes_index_.end()) {
+        if (tablets_moved_to_read_pending_nodes_.find(tablet_index) != tablets_moved_to_read_pending_nodes_.end()) {
+            tablets_moved_to_read_pending_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to read pending nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_read_pending_nodes_ size: " << tablets_moved_to_read_pending_nodes_.size();
+        }
+    }
+
+    if (write_pending_nodes_index_.find(dest_node_index) != write_pending_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_write_pending_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to write pending node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_write_pending_nodes_ size: " << tablets_moved_to_write_pending_nodes_.size();
+    } else if (write_pending_nodes_index_.find(source_node_index) != write_pending_nodes_index_.end()) {
+        if (tablets_moved_to_write_pending_nodes_.find(tablet_index) != tablets_moved_to_write_pending_nodes_.end()) {
+            tablets_moved_to_write_pending_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to write pending nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_write_pending_nodes_ size: " << tablets_moved_to_write_pending_nodes_.size();
+        }
+    }
+
+    if (scan_pending_nodes_index_.find(dest_node_index) != scan_pending_nodes_index_.end() &&
+            dest_node_index != initial_tablet_index_to_node_index_[tablet_index]) {
+        tablets_moved_to_scan_pending_nodes_.insert(tablet_index);
+        VLOG(20) << "[lb] add tablet moved to scan pending node, tablet index: " << tablet_index
+                << ", node index: " << dest_node_index
+                << ", tablets_moved_to_scan_pending_nodes_ size: " << tablets_moved_to_scan_pending_nodes_.size();
+    } else if (scan_pending_nodes_index_.find(source_node_index) != scan_pending_nodes_index_.end()) {
+        if (tablets_moved_to_scan_pending_nodes_.find(tablet_index) != tablets_moved_to_scan_pending_nodes_.end()) {
+            tablets_moved_to_scan_pending_nodes_.erase(tablet_index);
+            VLOG(20) << "[lb] remove tablet moved to scan pending nodes, tablet index: " << tablet_index
+                    << ", tablets_moved_to_scan_pending_nodes_ size: " << tablets_moved_to_scan_pending_nodes_.size();
+        }
+    }
+}
+
+} // namespace load_balancer
+} // namespace tera
diff --git a/src/load_balancer/cluster.h b/src/load_balancer/cluster.h
new file mode 100644
index 000000000..8a22acd7c
--- /dev/null
+++ b/src/load_balancer/cluster.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#ifndef TERA_LOAD_BALANCER_CLUSTER_H_ +#define TERA_LOAD_BALANCER_CLUSTER_H_ + +#include +#include +#include +#include + +#include "load_balancer/action.h" +#include "load_balancer/lb_node.h" +#include "load_balancer/options.h" +#include "master/tablet_manager.h" +#include "master/tabletnode_manager.h" + +namespace tera { +namespace load_balancer { + +class Cluster { +public: + Cluster(const std::vector>& tablet_nodes, + const LBOptions& options); + + virtual ~Cluster(); + + void DebugCluster(); + + bool ValidAction(const std::shared_ptr& action); + + void DoAction(const std::shared_ptr& action); + + void SortNodesByTabletCount(); + + void SortNodesBySize(); + + void SortNodesByReadLoad(); + + void SortNodesByWriteLoad(); + + void SortNodesByScanLoad(); + +private: + void RegisterTablet(const std::shared_ptr& tablet, uint32_t tablet_index, uint32_t node_index); + void AddTablet(uint32_t tablet_index, uint32_t to_node_index); + void RemoveTablet(uint32_t tablet_index, uint32_t from_node_index); + void MoveTablet(uint32_t tablet_index, uint32_t source_node_index, uint32_t dest_node_index); + +// cluster info, use index to speed up the calculation +// make these info public also for speeding up +public: + uint32_t table_num_; + uint32_t tablet_node_num_; + uint32_t tablet_num_; + uint32_t tablet_moved_num_; + + // table_index -> table + std::map tables_; + // node_index -> node + std::map> nodes_; + // tablet_index -> tablet + std::map> tablets_; + + // table -> table_index + std::map tables_to_index_; + // node -> node_index + std::map nodes_to_index_; + // tablet -> tablet_index + std::map tablets_to_index_; + + // tablet_index -> node_index + std::map tablet_index_to_node_index_; + // initial tablet_index -> node_index, it's the initial cluster state + std::map initial_tablet_index_to_node_index_; + // tablet_index -> table_index + std::map tablet_index_to_table_index_; + + // node_index -> tablets index on the node + std::map> tablets_per_node_; + // node_index 
-> tablets index of not ready on the node + std::map> initial_tablets_not_ready_per_node_; + // abnormal nodes index + std::unordered_set abnormal_nodes_index_; + // index of tablets moved to abnormal nodes + std::unordered_set tablets_moved_to_abnormal_nodes_; + // read pending nodes index + std::unordered_set read_pending_nodes_index_; + // index of tablets moved to read pending nodes + std::unordered_set tablets_moved_to_read_pending_nodes_; + // write pending nodes index + std::unordered_set write_pending_nodes_index_; + // index of tablets moved to write pending nodes + std::unordered_set tablets_moved_to_write_pending_nodes_; + // scan pending nodes index + std::unordered_set scan_pending_nodes_index_; + // index of tablets moved to scan pending nodes + std::unordered_set tablets_moved_to_scan_pending_nodes_; + // node_index -> data size on the node + std::map size_per_node_; + // node_index -> read load on the node + std::map read_load_per_node_; + // node_index -> write load on the node + std::map write_load_per_node_; + // node_index -> scan load on the node + std::map scan_load_per_node_; + // tablets index of moved too frequently + std::unordered_set tablets_moved_too_frequently_; + + // meta table node index + uint32_t meta_table_node_index_; + + // for ActionGenerator + std::vector node_index_sorted_by_tablet_count_; + std::vector node_index_sorted_by_size_; + std::vector node_index_sorted_by_read_load_; + std::vector node_index_sorted_by_write_load_; + std::vector node_index_sorted_by_scan_load_; + + LBOptions lb_options_; + +private: + std::vector> lb_nodes_; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_CLUSTER_H_ diff --git a/src/load_balancer/cost_function.h b/src/load_balancer/cost_function.h new file mode 100644 index 000000000..862b09285 --- /dev/null +++ b/src/load_balancer/cost_function.h @@ -0,0 +1,125 @@ +// Copyright (c) 2015, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_LOAD_BALANCER_COST_FUNCTION_H_ +#define TERA_LOAD_BALANCER_COST_FUNCTION_H_ + +#include + +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "load_balancer/cluster.h" +#include "load_balancer/options.h" + +namespace tera { +namespace load_balancer { +// Base class for one weighted cost term: subclasses implement Cost() and set their weight with SetWeight(). +class CostFunction { +public: + CostFunction(const LBOptions& options, const std::string& name) : + weight_(0.0), // well-defined value until SetWeight() is called + lb_options_(options), + name_(name) { + } + + virtual ~CostFunction() {} + + virtual double Cost() = 0; + + virtual void Init(const std::shared_ptr& cluster) { + cluster_ = cluster; + } + + double GetWeight() const { + return weight_; + } + + void SetWeight(double w) { + weight_ = w; + } + + std::string Name() const { + return name_; + } + +protected: + double Scale(double min, double max, double value) { + VLOG(20) << "[lb] Scale begin, min:" << min << " max:" << max << " value:" << value; + if (max <= min || value <= min) { + return 0.0; + } + // max > min from here on, so the division is safe; the result is clamped to [0, 1] + double scaled = std::max(0.0, std::min(1.0, (value - min) / (max - min))); + VLOG(20) << "[lb] Scale end, scaled:" << scaled; + return scaled; + } + // Cost of an uneven spread: sum of |mean - stats[i]|, scaled into [0, 1] via Scale(min, max, ...); 0 for empty input. + double ScaleFromArray(const std::vector& stats) { + if (lb_options_.debug_mode_enabled) { + std::string line; + for (const auto& s : stats) { + line += std::to_string(s); + line += " "; + } + LOG(INFO) << "[lb] stats:" << line; + } + // no entries: nothing to balance, and total/count below would be 0/0 + if (stats.empty()) { + return 0.0; + } + + double total_cost = 0; + double total = GetSum(stats); + + double count = stats.size(); + double mean = total/count; + + double max = ((count - 1) * mean) + (total - mean); + + double min; + if (count > total) { + min = ((count - total) * mean) + ((1 - mean) * total); + } else { + int num_high = (int) (total - (floor(mean) * count)); + int num_low = (int) (count - num_high); + + min = (num_high * (ceil(mean) - mean)) + (num_low * (mean - floor(mean))); + } + min =
std::max(0.0, min); + for (size_t i = 0; i < stats.size(); i++) { + double n = stats[i]; + double diff = std::abs(mean - n); + total_cost += diff; + } + + return Scale(min, max, total_cost); + } + +private: + double GetSum(const std::vector& stats) { + double total = 0; + for (const auto& s : stats) { + total += s; + } + return total; + } + +protected: + std::shared_ptr cluster_; + +private: + double weight_; + LBOptions lb_options_; + std::string name_; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_COST_FUNCTION_H_ diff --git a/src/load_balancer/cost_functions.cc b/src/load_balancer/cost_functions.cc new file mode 100644 index 000000000..e459b4337 --- /dev/null +++ b/src/load_balancer/cost_functions.cc @@ -0,0 +1,222 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "load_balancer/cost_functions.h" + +#include +#include + +namespace tera { +namespace load_balancer { + +MoveCountCostFunction::MoveCountCostFunction (const LBOptions& options) : + CostFunction(options, "MoveCountCostFunction"), + kExpensiveCost(1000000), + tablet_max_move_num_(options.tablet_max_move_num), + tablet_max_move_percent_(options.tablet_max_move_percent) { + SetWeight(options.move_count_cost_weight); +} + +MoveCountCostFunction::~MoveCountCostFunction() { +} + +double MoveCountCostFunction::Cost() { + uint32_t max_move_num = std::max(tablet_max_move_num_, static_cast(cluster_->tablet_num_ * tablet_max_move_percent_)); + double cost = cluster_->tablet_moved_num_; + if (cost > static_cast(max_move_num)) { + // return an expensive cost + VLOG(20) << "[lb] reach max move num limit, max_move_num:" << max_move_num; + return kExpensiveCost; + } + + return Scale(0, std::min(cluster_->tablet_num_, max_move_num), cost); +} + +MoveFrequencyCostFunction::MoveFrequencyCostFunction(const LBOptions& options) : + CostFunction(options, 
"MoveFrequencyCostFunction"), + kExpensiveCost(100000) { + SetWeight(options.move_frequency_cost_weight); +} + +MoveFrequencyCostFunction::~MoveFrequencyCostFunction() { +} + +double MoveFrequencyCostFunction::Cost() { + if (cluster_->tablets_moved_too_frequently_.size() > 0) { + // there are tablets moved too frequently, return an expensive cost + VLOG(20) << "[lb] there are " << cluster_->tablets_moved_too_frequently_.size() + << " tablets moved too frequently"; + return kExpensiveCost; + } else { + return 0; + } +} + +AbnormalNodeCostFunction::AbnormalNodeCostFunction(const LBOptions& options) : + CostFunction(options, "AbnormalNodeCostFunction"), + kExpensiveCost(100000) { + SetWeight(options.abnormal_node_cost_weight); +} + +AbnormalNodeCostFunction::~AbnormalNodeCostFunction() { +} + +double AbnormalNodeCostFunction::Cost() { + if (cluster_->tablets_moved_to_abnormal_nodes_.size() > 0) { + // there are tablets moved to abnormal nodes, return an expensive cost + VLOG(20) << "[lb] there are " << cluster_->tablets_moved_to_abnormal_nodes_.size() + << " tablets moved to abnormal nodes"; + return kExpensiveCost; + } else { + return 0; + } +} + +ReadPendingNodeCostFunction::ReadPendingNodeCostFunction(const LBOptions& options) : + CostFunction(options, "ReadPendingNodeCostFunction"), + kExpensiveCost(10000) { + SetWeight(options.read_pending_node_cost_weight); +} + +ReadPendingNodeCostFunction::~ReadPendingNodeCostFunction() { +} + +double ReadPendingNodeCostFunction::Cost() { + if (cluster_->tablets_moved_to_read_pending_nodes_.size() > 0) { + // there are tablets moved to read pending nodes, return an expensive cost + VLOG(20) << "[lb] there are " << cluster_->tablets_moved_to_read_pending_nodes_.size() + << " tablets moved to read pending nodes"; + return kExpensiveCost; + } else { + return 0; + } +} + +WritePendingNodeCostFunction::WritePendingNodeCostFunction(const LBOptions& options) : + CostFunction(options, "WritePendingNodeCostFunction"), + 
kExpensiveCost(10000) { + SetWeight(options.write_pending_node_cost_weight); +} + +WritePendingNodeCostFunction::~WritePendingNodeCostFunction() { +} + +double WritePendingNodeCostFunction::Cost() { + if (cluster_->tablets_moved_to_write_pending_nodes_.size() > 0) { + // there are tablets moved to write pending nodes, return an expensive cost + VLOG(20) << "[lb] there are " << cluster_->tablets_moved_to_write_pending_nodes_.size() + << " tablets moved to write pending nodes"; + return kExpensiveCost; + } else { + return 0; + } +} + +ScanPendingNodeCostFunction::ScanPendingNodeCostFunction(const LBOptions& options) : + CostFunction(options, "ScanPendingNodeCostFunction"), + kExpensiveCost(10000) { + SetWeight(options.scan_pending_node_cost_weight); +} + +ScanPendingNodeCostFunction::~ScanPendingNodeCostFunction() { +} + +double ScanPendingNodeCostFunction::Cost() { + if (cluster_->tablets_moved_to_scan_pending_nodes_.size() > 0) { + // there are tablets moved to scan pending nodes, return an expensive cost + VLOG(20) << "[lb] there are " << cluster_->tablets_moved_to_scan_pending_nodes_.size() + << " tablets moved to scan pending nodes"; + return kExpensiveCost; + } else { + return 0; + } +} + +TabletCountCostFunction::TabletCountCostFunction (const LBOptions& options) : + CostFunction(options, "TabletCountCostFunction") { + SetWeight(options.tablet_count_cost_weight); +} + +TabletCountCostFunction::~TabletCountCostFunction() { +} + +double TabletCountCostFunction::Cost() { + std::vector tablet_nums_per_node; + for (uint32_t i = 0; i < cluster_->tablet_node_num_; ++i) { + tablet_nums_per_node.emplace_back(cluster_->tablets_per_node_[i].size()); + } + + return ScaleFromArray(tablet_nums_per_node); +} + +SizeCostFunction::SizeCostFunction (const LBOptions& options) : + CostFunction(options, "SizeCostFunction") { + SetWeight(options.size_cost_weight); +} + +SizeCostFunction::~SizeCostFunction() { +} + +double SizeCostFunction::Cost() { + std::vector size_per_node; + 
for (uint32_t i = 0; i < cluster_->tablet_node_num_; ++i) { + size_per_node.emplace_back(cluster_->size_per_node_[i]); + } + + return ScaleFromArray(size_per_node); +} + +ReadLoadCostFunction::ReadLoadCostFunction (const LBOptions& options) : + CostFunction(options, "ReadLoadCostFunction") { + SetWeight(options.read_load_cost_weight); +} + +ReadLoadCostFunction::~ReadLoadCostFunction() { +} + +double ReadLoadCostFunction::Cost() { + std::vector read_load_per_node; + for (uint32_t i = 0; i < cluster_->tablet_node_num_; ++i) { + read_load_per_node.emplace_back(cluster_->read_load_per_node_[i]); + } + + return ScaleFromArray(read_load_per_node); +} + +WriteLoadCostFunction::WriteLoadCostFunction (const LBOptions& options) : + CostFunction(options, "WriteLoadCostFunction") { + SetWeight(options.write_load_cost_weight); +} + +WriteLoadCostFunction::~WriteLoadCostFunction() { +} + +double WriteLoadCostFunction::Cost() { + std::vector write_load_per_node; + for (uint32_t i = 0; i < cluster_->tablet_node_num_; ++i) { + write_load_per_node.emplace_back(cluster_->write_load_per_node_[i]); + } + + return ScaleFromArray(write_load_per_node); +} + +ScanLoadCostFunction::ScanLoadCostFunction (const LBOptions& options) : + CostFunction(options, "ScanLoadCostFunction") { + SetWeight(options.scan_load_cost_weight); +} + +ScanLoadCostFunction::~ScanLoadCostFunction() { +} + +double ScanLoadCostFunction::Cost() { + std::vector scan_load_per_node; + for (uint32_t i = 0; i < cluster_->tablet_node_num_; ++i) { + scan_load_per_node.emplace_back(cluster_->scan_load_per_node_[i]); + } + + return ScaleFromArray(scan_load_per_node); +} + +} // namespace load_balancer +} // namespace tera diff --git a/src/load_balancer/cost_functions.h b/src/load_balancer/cost_functions.h new file mode 100644 index 000000000..5f977275a --- /dev/null +++ b/src/load_balancer/cost_functions.h @@ -0,0 +1,135 @@ +// Copyright (c) 2015, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_LOAD_BALANCER_COST_FUNCTIONS_H_ +#define TERA_LOAD_BALANCER_COST_FUNCTIONS_H_ + +#include "load_balancer/cost_function.h" + +namespace tera { +namespace load_balancer { + +// moving too many tablets will cost high +class MoveCountCostFunction : public CostFunction { +public: + MoveCountCostFunction(const LBOptions& options); + virtual ~MoveCountCostFunction(); + + virtual double Cost() override; + +private: + const double kExpensiveCost; + uint32_t tablet_max_move_num_; + double tablet_max_move_percent_; +}; + +// moving a tablet too frequently will cost high +class MoveFrequencyCostFunction : public CostFunction { +public: + MoveFrequencyCostFunction(const LBOptions& options); + virtual ~MoveFrequencyCostFunction(); + + virtual double Cost() override; + +private: + const double kExpensiveCost; +}; + +// moving a tablet to an abnormal node will cost high +class AbnormalNodeCostFunction : public CostFunction { +public: + AbnormalNodeCostFunction(const LBOptions& options); + virtual ~AbnormalNodeCostFunction(); + + virtual double Cost() override; + +private: + const double kExpensiveCost; +}; + +// moving a tablet to a read pending node will cost high +class ReadPendingNodeCostFunction : public CostFunction { +public: + ReadPendingNodeCostFunction(const LBOptions& options); + virtual ~ReadPendingNodeCostFunction(); + + virtual double Cost() override; + +private: + const double kExpensiveCost; +}; + +// moving a tablet to a write pending node will cost high +class WritePendingNodeCostFunction : public CostFunction { +public: + WritePendingNodeCostFunction(const LBOptions& options); + virtual ~WritePendingNodeCostFunction(); + + virtual double Cost() override; + +private: + const double kExpensiveCost; +}; + +// moving a tablet to a scan pending node will cost high +class ScanPendingNodeCostFunction : public CostFunction { +public:
+ ScanPendingNodeCostFunction(const LBOptions& options); + virtual ~ScanPendingNodeCostFunction(); + + virtual double Cost() override; + +private: + const double kExpensiveCost; +}; + +// balance the tablets num for each tablet node +class TabletCountCostFunction : public CostFunction { +public: + TabletCountCostFunction(const LBOptions& options); + virtual ~TabletCountCostFunction(); + + virtual double Cost() override; +}; + +// balance the data size for each tablet node +class SizeCostFunction : public CostFunction { +public: + SizeCostFunction(const LBOptions& options); + virtual ~SizeCostFunction(); + + virtual double Cost() override; +}; + +// balance the read load for each tablet node +class ReadLoadCostFunction : public CostFunction { +public: + ReadLoadCostFunction(const LBOptions& options); + virtual ~ReadLoadCostFunction(); + + virtual double Cost() override; +}; + +// balance the write load for each tablet node +class WriteLoadCostFunction : public CostFunction { +public: + WriteLoadCostFunction(const LBOptions& options); + virtual ~WriteLoadCostFunction(); + + virtual double Cost() override; +}; + +// balance the scan load for each tablet node +class ScanLoadCostFunction : public CostFunction { +public: + ScanLoadCostFunction(const LBOptions& options); + virtual ~ScanLoadCostFunction(); + + virtual double Cost() override; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_COST_FUNCTIONS_H_ diff --git a/src/load_balancer/lb_entry.cc b/src/load_balancer/lb_entry.cc new file mode 100644 index 000000000..abf0b3ad6 --- /dev/null +++ b/src/load_balancer/lb_entry.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file.
+ +#include "load_balancer/lb_entry.h" + +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "common/net/ip_address.h" +#include "common/this_thread.h" +#include "load_balancer/lb_impl.h" +#include "load_balancer/lb_service_impl.h" + +DECLARE_string(tera_lb_server_addr); +DECLARE_string(tera_lb_server_port); + +std::string GetTeraEntryName() { + return "load_balancer"; +} + +tera::TeraEntry* GetTeraEntry() { + return new tera::load_balancer::LBEntry(); +} + +namespace tera { +namespace load_balancer { + +LBEntry::LBEntry() : + rpc_server_(nullptr), + lb_service_impl_(nullptr), + lb_impl_(nullptr) { + sofa::pbrpc::RpcServerOptions rpc_options; + rpc_server_.reset(new sofa::pbrpc::RpcServer(rpc_options)); +} + +LBEntry::~LBEntry() { +} + +bool LBEntry::StartServer() { + IpAddress lb_addr(FLAGS_tera_lb_server_addr, FLAGS_tera_lb_server_port); + LOG(INFO) << "Start load balancer RPC server at: " << lb_addr.ToString(); + + lb_impl_.reset(new LBImpl()); + lb_service_impl_ = new LBServiceImpl(lb_impl_); + + if (!lb_impl_->Init()) { + return false; + } + + rpc_server_->RegisterService(lb_service_impl_); + if (!rpc_server_->Start(lb_addr.ToString())) { + LOG(ERROR) << "start RPC server error"; + return false; + } + + LOG(INFO) << "finish starting load balancer server"; + return true; +} + +bool LBEntry::Run() { + ThisThread::Sleep(1000); + return true; +} + +void LBEntry::ShutdownServer() { + rpc_server_->Stop(); +} + +} // namespace load_balancer +} // namespace tera + diff --git a/src/load_balancer/lb_entry.h b/src/load_balancer/lb_entry.h new file mode 100644 index 000000000..03399bc00 --- /dev/null +++ b/src/load_balancer/lb_entry.h @@ -0,0 +1,38 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_LOAD_BALANCER_LB_ENTRY_H_ +#define TERA_LOAD_BALANCER_LB_ENTRY_H_ + +#include + +#include "sofa/pbrpc/pbrpc.h" + +#include "tera_entry.h" + +namespace tera { +namespace load_balancer { + +class LBServiceImpl; +class LBImpl; + +class LBEntry : public TeraEntry { +public: + LBEntry(); + virtual ~LBEntry(); + + virtual bool StartServer(); + virtual bool Run(); + virtual void ShutdownServer(); + +private: + std::unique_ptr rpc_server_; + LBServiceImpl* lb_service_impl_; + std::shared_ptr lb_impl_; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_LB_ENTRY_H_ diff --git a/src/load_balancer/lb_impl.cc b/src/load_balancer/lb_impl.cc new file mode 100644 index 000000000..690528531 --- /dev/null +++ b/src/load_balancer/lb_impl.cc @@ -0,0 +1,531 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "load_balancer/lb_impl.h" + +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "load_balancer/unity_balancer.h" +#include "proto/tabletnode.pb.h" +#include "tera.h" +#include "common/timer.h" + +DECLARE_bool(tera_master_meta_isolate_enabled); +DECLARE_string(tera_master_meta_table_name); +DECLARE_int32(tera_lb_impl_thread_num); +DECLARE_int32(tera_lb_load_balance_period_s); +DECLARE_int32(tera_lb_max_compute_steps); +DECLARE_int32(tera_lb_max_compute_steps_per_tablet); +DECLARE_int32(tera_lb_max_compute_time_ms); +DECLARE_double(tera_lb_min_cost_need_balance); +DECLARE_double(tera_lb_move_count_cost_weight); +DECLARE_int32(tera_lb_tablet_max_move_num); +DECLARE_double(tera_lb_tablet_max_move_percent); +DECLARE_double(tera_lb_move_frequency_cost_weight); +DECLARE_int32(tera_lb_tablet_move_too_frequently_threshold_s); +DECLARE_double(tera_lb_abnormal_node_cost_weight); +DECLARE_double(tera_lb_abnormal_node_ratio); 
+DECLARE_double(tera_lb_read_pending_node_cost_weight); +DECLARE_double(tera_lb_write_pending_node_cost_weight); +DECLARE_double(tera_lb_scan_pending_node_cost_weight); +DECLARE_double(tera_lb_tablet_count_cost_weight); +DECLARE_double(tera_lb_size_cost_weight); +DECLARE_double(tera_lb_read_load_cost_weight); +DECLARE_double(tera_lb_write_load_cost_weight); +DECLARE_double(tera_lb_scan_load_cost_weight); +DECLARE_bool(tera_lb_debug_mode_enabled); + +using tera::master::NodeState; +using tera::master::Table; +using tera::master::TablePtr; +using tera::master::Tablet; +using tera::master::TabletPtr; +using tera::master::TabletNode; +using tera::master::TabletNodePtr; + +namespace tera { +namespace load_balancer { + +LBImpl::LBImpl() : + thread_pool_(new ThreadPool(FLAGS_tera_lb_impl_thread_num)), + sdk_client_(nullptr), + safemode_(true), + round_(0), + lb_debug_mode_(FLAGS_tera_lb_debug_mode_enabled) { +} + +LBImpl::~LBImpl() { +} + +bool LBImpl::Init() { + if (lb_debug_mode_) { + LOG(INFO) << "[lb] debug mode enabled"; + } + + // tera_entry has init glog + Client::SetGlogIsInitialized(); + + sdk_client_.reset(Client::NewClient()); + if (!sdk_client_) { + LOG(ERROR) << "[lb] open sdk client fail"; + return false; + } + + ScheduleLoadBalance(); + + return true; +} + +void LBImpl::ScheduleLoadBalance() { + int schedule_period = FLAGS_tera_lb_load_balance_period_s * 1000; + VLOG(5) << "[lb] LoadBalance will be scheduled in: " << FLAGS_tera_lb_load_balance_period_s << "s"; + thread_pool_->DelayTask(schedule_period, + [this](int64_t) { + DoLoadBalance(); + ScheduleLoadBalance(); + } + ); +} + +void LBImpl::DoLoadBalance() { + ++round_; + VLOG(5) << "[lb] LoadBalance begin round: " << round_; + int64_t start_time = get_micros(); + + std::vector tablet_nodes; + std::vector tables; + std::vector tablets; + if (!Collect(&tablet_nodes, &tables, &tablets)) { + return; + } + + if (lb_debug_mode_) { + DebugCollect(tablet_nodes, tables, tablets); + } + + std::vector> lb_nodes; + 
CreateLBInput(tables, tablet_nodes, tablets, &lb_nodes); + + if (lb_debug_mode_) { + DebugLBNode(lb_nodes); + } + + LBOptions options; + options.max_compute_steps = FLAGS_tera_lb_max_compute_steps; + options.max_compute_steps_per_tablet = FLAGS_tera_lb_max_compute_steps_per_tablet; + options.max_compute_time_ms = FLAGS_tera_lb_max_compute_time_ms; + options.min_cost_need_balance = FLAGS_tera_lb_min_cost_need_balance; + options.move_count_cost_weight = FLAGS_tera_lb_move_count_cost_weight; + options.tablet_max_move_num = FLAGS_tera_lb_tablet_max_move_num; + options.tablet_max_move_percent = FLAGS_tera_lb_tablet_max_move_percent; + options.move_frequency_cost_weight = FLAGS_tera_lb_move_frequency_cost_weight; + options.tablet_move_too_frequently_threshold_s = FLAGS_tera_lb_tablet_move_too_frequently_threshold_s; + options.abnormal_node_cost_weight = FLAGS_tera_lb_abnormal_node_cost_weight; + options.abnormal_node_ratio = FLAGS_tera_lb_abnormal_node_ratio; + options.read_pending_node_cost_weight = FLAGS_tera_lb_read_pending_node_cost_weight; + options.write_pending_node_cost_weight = FLAGS_tera_lb_write_pending_node_cost_weight; + options.scan_pending_node_cost_weight = FLAGS_tera_lb_scan_pending_node_cost_weight; + options.tablet_count_cost_weight = FLAGS_tera_lb_tablet_count_cost_weight; + options.size_cost_weight = FLAGS_tera_lb_size_cost_weight; + options.read_load_cost_weight = FLAGS_tera_lb_read_load_cost_weight; + options.write_load_cost_weight = FLAGS_tera_lb_write_load_cost_weight; + options.scan_load_cost_weight = FLAGS_tera_lb_scan_load_cost_weight; + options.meta_table_isolate_enabled = FLAGS_tera_master_meta_isolate_enabled; + options.meta_table_name = FLAGS_tera_master_meta_table_name; + options.meta_table_node_addr = GetMetaNodeAddr(); + options.debug_mode_enabled = lb_debug_mode_; + + std::unique_ptr balancer(new UnityBalancer(options)); + std::vector plans; + if (!balancer->BalanceCluster(lb_nodes, &plans)) { + LOG(WARNING) << "[lb] LoadBalance 
failed"; + return; + } + + DebugPlan(plans); + + if (!IsSafemode()) { + bool master_safe_mode = true; + bool get_success = GetMasterSafemode(&master_safe_mode); + + if (get_success && !master_safe_mode) { + ExecutePlan(plans); + } else if (!get_success) { + VLOG(5) << "[lb] skip execute plan due to fail to get master safe mode"; + } else if (master_safe_mode) { + VLOG(5) << "[lb] skip execute plan due to master is in safe mode"; + } else { + } + } else { + VLOG(5) << "[lb] skip execute plan in safe mode"; + } + + int64_t cost_time = get_micros() - start_time; + VLOG(5) << "[lb] LoadBalance end round: " << round_ + <<", cost: " << cost_time / 1000.0 << "ms"; +} + +bool LBImpl::CreateLBInput( + const std::vector& tables, + const std::vector& nodes, + const std::vector& tablets, + std::vector>* lb_nodes) { + lb_nodes->clear(); + + std::map> nodes_map; + for (const auto& node : nodes) { + LBTabletNode* p_lb_node = new LBTabletNode(); + p_lb_node->tablet_node_ptr = node; + nodes_map[node->GetAddr()].reset(p_lb_node); + } + + for (const auto& tablet : tablets) { + std::string addr = tablet->GetServerAddr(); + if (nodes_map.find(addr) != nodes_map.end()) { + LBTablet* p_lb_tablet = new LBTablet(); + p_lb_tablet->tablet_ptr = tablet; + std::shared_ptr lb_tablet(p_lb_tablet); + nodes_map[addr]->tablets.emplace_back(lb_tablet); + } else { + // TODO + // unassigned tablet, skip now + } + } + + for (const auto& pair : nodes_map) { + lb_nodes->emplace_back(pair.second); + } + + return true; +} + +bool LBImpl::Collect(std::vector* nodes, + std::vector* tables, + std::vector* tablets) { + if (nodes == nullptr || tables == nullptr || tablets == nullptr) { + return false; + } + nodes->clear(); + tables->clear(); + tablets->clear(); + + int64_t start_time = get_micros(); + + if (!CollectNodes(nodes)) { + LOG(ERROR) << "[lb] collect nodes fail"; + return false; + } + + if (!CollectTablets(tables, tablets)) { + LOG(ERROR) << "[lb] collect tablets fail"; + return false; + } + + int64_t 
cost_time = get_micros() - start_time; + VLOG(5) << "[lb] Collect cost: " << cost_time / 1000.0 << "ms"; + + return true; +} + +bool LBImpl::CollectNodes(std::vector* nodes) { + tera::ClientImpl* client_impl = static_cast(sdk_client_.get()); + std::vector infos; + ErrorCode err; + if (!client_impl->ShowTabletNodesInfo(&infos, &err)) { + LOG(ERROR) << "[lb] fail to get TabletNodeInfo, err: " << err.ToString(); + return false; + } + + for (const auto& info : infos) { + TabletNodePtr node(new TabletNode()); + NodeInfoToNode(info, node); + nodes->push_back(node); + } + + VLOG(5) << "[lb] collected node size: " << nodes->size(); + + return true; +} + +bool LBImpl::NodeInfoToNode(const TabletNodeInfo& info, TabletNodePtr node) { + node->info_ = info; + + node->addr_ = info.addr(); + node->state_ = StringToNodeState(info.status_m()); + node->data_size_ = info.load(); + node->average_counter_.read_pending_ = info.read_pending(); + node->average_counter_.write_pending_ = info.write_pending(); + node->average_counter_.scan_pending_ = info.scan_pending(); + + return true; +} + +NodeState LBImpl::StringToNodeState(const std::string& str) { + if (str == "kReady") { + return tera::master::kReady; + } else if (str == "kOffLine") { + return tera::master::kOffLine; + } else if (str == "kOnKick") { + return tera::master::kOnKick; + } else if (str == "kWaitKick") { + return tera::master::kWaitKick; + } else { + return tera::master::kOffLine; + } +} + +bool LBImpl::CollectTablets(std::vector* tables, + std::vector* tablets) { + tera::ClientImpl* client_impl = static_cast(sdk_client_.get()); + TableMetaList table_list; + TabletMetaList tablet_list; + bool is_brief = false; + ErrorCode err; + if (!client_impl->ShowTablesInfo(&table_list, &tablet_list, is_brief, &err)) { + LOG(ERROR) << "[lb] fail to get tablets, err: " << err.ToString(); + return false; + } + + std::map table_name_to_ptr; + + for (int i = 0; i < table_list.meta_size(); ++i) { + std::string table_name = 
table_list.meta(i).table_name(); + TablePtr table(new tera::master::Table(table_name)); + TableMetaToTable(table_list.meta(i), table); + tables->push_back(table); + + if (table_name_to_ptr.find(table_name) == table_name_to_ptr.end()) { + table_name_to_ptr[table_name] = table; + } + } + + if (tablet_list.meta_size() != tablet_list.counter_size()) { + LOG(ERROR) << "[lb] invalid TabletMetaList, meta size: " << tablet_list.meta_size() + << " counter size: " << tablet_list.counter_size(); + return false; + } + for (int i = 0; i < tablet_list.meta_size(); ++i) { + std::string table_name = tablet_list.meta(i).table_name(); + if (table_name_to_ptr.find(table_name) == table_name_to_ptr.end()) { + LOG(WARNING) << "[lb] tablet's table not exist " << "tablet path: " + << tablet_list.meta(i).path() << "table: " << table_name; + continue; + } + TabletPtr tablet(new tera::master::Tablet(tablet_list.meta(i), table_name_to_ptr[table_name])); + tablet->SetCounter(tablet_list.counter(i)); + if (tablet_list.meta(i).has_last_move_time_us()) { + tablet->SetLastMoveTime(tablet_list.meta(i).last_move_time_us()); + } else { + // !!! compatible with old master + // !!! 
set last move time to 0 will disable the MoveFrequencyCostFunction strategy + tablet->SetLastMoveTime(0); + } + tablets->push_back(tablet); + + if (table_name == FLAGS_tera_master_meta_table_name) { + SetMetaNodeAddr(tablet->GetServerAddr()); + VLOG(5) << "[lb] meta table node addr: " << GetMetaNodeAddr(); + } + } + + VLOG(5) << "[lb] collected table size: " << tables->size(); + VLOG(5) << "[lb] collected tablet size: " << tablets->size(); + + return true; +} + +bool LBImpl::TableMetaToTable(const TableMeta& meta, TablePtr table) { + table->SetStatus(meta.status()); + table->SetSchema(meta.schema()); + + return true; +} + +void LBImpl::DebugCollect(const std::vector& nodes, + const std::vector& tables, + const std::vector& tablets) { + LOG(INFO) << ""; + LOG(INFO) << "[lb] DebugCollect begin -----"; + + LOG(INFO) << "[lb] " << tables.size() << " table:" ; + for (const auto& table : tables) { + LOG(INFO) << "table:" + table->GetTableName() + << " status:" << StatusCodeToString(table->GetStatus()); + } + + LOG(INFO) << "[lb] " << nodes.size() << " node:"; + for (const auto& node : nodes) { + LOG(INFO) << "addr:" + node->GetAddr() + << " state:" << tera::master::NodeStateToString(node->GetState()) + << " size:" << node->GetSize() << "B" + << " r_pending:" << node->GetReadPending() + << " w_pending:" << node->GetWritePending() + << " s_pending:" << node->GetScanPending(); + } + + LOG(INFO) << "[lb] " << tablets.size() << " tablet:"; + for (const auto& tablet : tablets) { + LOG(INFO) << "path:" + tablet->GetPath() + << " status:" << StatusCodeToString(tablet->GetStatus()) + << " server:" << tablet->GetServerAddr() + << " table:" << tablet->GetTableName() + << " last_move_time_us:" << tablet->LastMoveTime(); + } + + LOG(INFO) << "[lb] DebugCollect end -----"; + LOG(INFO) << ""; +} + +void LBImpl::DebugLBNode(const std::vector>& lb_nodes) { + LOG(INFO) << ""; + LOG(INFO) << "[lb] DebugLBNode begin -----"; + LOG(INFO) << "[lb] " << lb_nodes.size() << " lb_nodes:" ; + + for 
(const auto& node : lb_nodes) { + LOG(INFO) << "[lb] " << node->tablet_node_ptr->GetAddr() << ":"; + for (const auto& lb_tablet : node->tablets) { + LOG(INFO) << "[lb] " << lb_tablet->tablet_ptr->GetPath(); + } + } + + LOG(INFO) << "[lb] DebugLBNode end -----"; + LOG(INFO) << ""; +} + +void LBImpl::DebugPlan(const std::vector& plans) { + VLOG(5) << ""; + VLOG(5) << "[lb] DebugPlan begin ----"; + VLOG(5) << plans.size() << " plans:"; + + for (const auto& plan : plans) { + VLOG(5) << "[lb] " + plan.ToString(); + } + + VLOG(5) << "[lb] DebugPlan end ----"; + VLOG(5) << ""; +} + +void LBImpl::ExecutePlan(const std::vector& plans) { + tera::ClientImpl* client_impl = static_cast(sdk_client_.get()); + for (const auto& plan : plans) { + std::string tablet_path = plan.TabletPath(); + std::string dest_addr = plan.DestAddr(); + + std::vector arg_list; + arg_list.emplace_back("move"); + arg_list.emplace_back(tablet_path); + arg_list.emplace_back(dest_addr); + + ErrorCode err; + if (!client_impl->CmdCtrl("tablet", arg_list, nullptr, nullptr, &err)) { + LOG(ERROR) << "[lb] fail to execute plan:" << plan.ToString() << err.ToString(); + } else { + VLOG(5) << "[lb] execute plan success:" << plan.ToString(); + } + } +} + +bool LBImpl::IsSafemode() const { + MutexLock lock(&mutex_); + return safemode_; +} + +bool LBImpl::SetSafemode(bool value) { + MutexLock lock(&mutex_); + safemode_ = value; + + if (value) { + LOG(INFO) << "[lb] LoadBanlacer enter safemode"; + } else { + LOG(INFO) << "[lb] LoadBanlacer leave safemode"; + } + + return true; +} + +bool LBImpl::GetMasterSafemode(bool* safe_mode) { + if (safe_mode == nullptr) { + return false; + } + + std::string op = "get"; + std::vector arg_list; + arg_list.push_back(op); + + tera::ClientImpl* client_impl = static_cast(sdk_client_.get()); + ErrorCode err; + if (!client_impl->CmdCtrl("safemode", arg_list, safe_mode, NULL, &err)) { + LOG(ERROR) << "[lb] fail to " << op << " master safemode" << err.ToString(); + return false; + } + + 
VLOG(20) << "[lb] master safemode: " << *safe_mode; + return true; +} + +std::string LBImpl::GetMetaNodeAddr() const { + MutexLock lock(&mutex_); + return meta_node_addr_; +} + +bool LBImpl::SetMetaNodeAddr(const std::string& addr) { + MutexLock lock(&mutex_); + meta_node_addr_ = addr; + return true; +} + +void LBImpl::CmdCtrl(const CmdCtrlRequest* request, + CmdCtrlResponse* response, + google::protobuf::Closure* done) { + std::string cmd_line; + for (int32_t i = 0; i < request->arg_list_size(); i++) { + cmd_line += request->arg_list(i); + if (i != request->arg_list_size() - 1) { + cmd_line += " "; + } + } + LOG(INFO) << "[lb] receive cmd: " << request->command() << " " << cmd_line; + + response->set_sequence_id(request->sequence_id()); + + if (request->command() == "safemode") { + SafeModeCmdCtrl(request, response); + } else { + response->set_status(kInvalidArgument); + } + + done->Run(); + return; +} + +void LBImpl::SafeModeCmdCtrl(const CmdCtrlRequest* request, + CmdCtrlResponse* response) { + if (request->arg_list_size() != 1) { + response->set_status(kInvalidArgument); + return; + } + + if (request->arg_list(0) == "enter") { + SetSafemode(true); + response->set_status(kLoadBalancerOk); + } else if (request->arg_list(0) == "leave") { + SetSafemode(false); + response->set_status(kLoadBalancerOk); + } else if (request->arg_list(0) == "get") { + response->set_bool_result(IsSafemode()); + response->set_status(kLoadBalancerOk); + } else { + response->set_status(kInvalidArgument); + } +} + +} // namespace load_balancer +} // namespace tera + diff --git a/src/load_balancer/lb_impl.h b/src/load_balancer/lb_impl.h new file mode 100644 index 000000000..2e2abe88d --- /dev/null +++ b/src/load_balancer/lb_impl.h @@ -0,0 +1,93 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_LOAD_BALANCER_LB_IMPL_H_ +#define TERA_LOAD_BALANCER_LB_IMPL_H_ + +#include +#include +#include +#include + +#include "common/mutex.h" +#include "common/thread_pool.h" +#include "load_balancer/lb_node.h" +#include "load_balancer/plan.h" +#include "master/tablet_manager.h" +#include "master/tabletnode_manager.h" +#include "proto/load_balancer_rpc.pb.h" +#include "sdk/client_impl.h" + +namespace tera { +namespace load_balancer { + +class LBImpl { +public: + LBImpl(); + virtual ~LBImpl(); + + bool Init(); + + void CmdCtrl(const CmdCtrlRequest* request, + CmdCtrlResponse* response, + google::protobuf::Closure* done); + +private: + void ScheduleLoadBalance(); + void DoLoadBalance(); + + bool CreateLBInput(const std::vector& tables, + const std::vector& nodes, + const std::vector& tablets, + std::vector>* lb_nodes); + + bool Collect(std::vector* nodes, + std::vector* tables, + std::vector* tablets); + + bool CollectNodes(std::vector* nodes); + bool NodeInfoToNode(const TabletNodeInfo& info, + tera::master::TabletNodePtr node); + tera::master::NodeState StringToNodeState(const std::string& str); + + bool CollectTablets(std::vector* tables, + std::vector* tablets); + bool TableMetaToTable(const TableMeta& meta, tera::master::TablePtr table); + + void ExecutePlan(const std::vector& plans); + + bool IsSafemode() const; + bool SetSafemode(bool value); + + bool GetMasterSafemode(bool* safe_mode); + + std::string GetMetaNodeAddr() const; + bool SetMetaNodeAddr(const std::string& addr); + + void DebugCollect(const std::vector& nodes, + const std::vector& tables, + const std::vector& tablets); + void DebugLBNode(const std::vector>& lb_nodes); + void DebugPlan(const std::vector& plans); + + void SafeModeCmdCtrl(const CmdCtrlRequest* request, + CmdCtrlResponse* response); + +private: + mutable Mutex mutex_; + + std::unique_ptr thread_pool_; + std::unique_ptr sdk_client_; + + bool safemode_; + uint64_t round_; + std::string meta_node_addr_; + + bool lb_debug_mode_; 
+}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_LB_IMPL_H_ diff --git a/src/load_balancer/lb_node.h b/src/load_balancer/lb_node.h new file mode 100644 index 000000000..b3b4430e2 --- /dev/null +++ b/src/load_balancer/lb_node.h @@ -0,0 +1,30 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_LOAD_BALANCER_LB_NODE_H_ +#define TERA_LOAD_BALANCER_LB_NODE_H_ + +#include +#include +#include + +#include "master/tablet_manager.h" +#include "master/tabletnode_manager.h" + +namespace tera { +namespace load_balancer { + +struct LBTablet { + tera::master::TabletPtr tablet_ptr; +}; + +struct LBTabletNode { + tera::master::TabletNodePtr tablet_node_ptr; + std::vector> tablets; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_LB_NODE_H_ diff --git a/src/load_balancer/lb_service_impl.cc b/src/load_balancer/lb_service_impl.cc new file mode 100644 index 000000000..e67759c1c --- /dev/null +++ b/src/load_balancer/lb_service_impl.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "load_balancer/lb_service_impl.h" + +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "load_balancer/lb_impl.h" +#include "utils/network_utils.h" + +DECLARE_int32(tera_lb_server_thread_num); + +namespace tera { +namespace load_balancer { + +LBServiceImpl::LBServiceImpl(const std::shared_ptr& lb_impl) : + lb_impl_(lb_impl), + thread_pool_(new ThreadPool(FLAGS_tera_lb_server_thread_num)) { +} + +LBServiceImpl::~LBServiceImpl() { +} + +void LBServiceImpl::CmdCtrl(google::protobuf::RpcController* controller, + const CmdCtrlRequest* request, + CmdCtrlResponse* response, + google::protobuf::Closure* done) { + VLOG(20) << "accept RPC (CmdCtrl) from: " << tera::utils::GetRemoteAddress(controller); + ThreadPool::Task task = + std::bind(&LBServiceImpl::DoCmdCtrl, this, controller, request, response, done); + thread_pool_->AddTask(task); +} + +void LBServiceImpl::DoCmdCtrl(google::protobuf::RpcController* controller, + const CmdCtrlRequest* request, + CmdCtrlResponse* response, + google::protobuf::Closure* done) { + VLOG(20) << "run RPC (CmdCtrl)"; + lb_impl_->CmdCtrl(request, response, done); + VLOG(20) << "finish RPC (CmdCtrl)"; +} + +} // namespace load_balancer +} // namespace tera + diff --git a/src/load_balancer/lb_service_impl.h b/src/load_balancer/lb_service_impl.h new file mode 100644 index 000000000..f0754bb6e --- /dev/null +++ b/src/load_balancer/lb_service_impl.h @@ -0,0 +1,42 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_LOAD_BALANCER_LB_SERVICE_IMPL_H_ +#define TERA_LOAD_BALANCER_LB_SERVICE_IMPL_H_ + +#include + +#include "common/thread_pool.h" +#include "proto/load_balancer_rpc.pb.h" + +namespace tera { +namespace load_balancer { + +class LBImpl; + +class LBServiceImpl: public LoadBalancerService { +public: + explicit LBServiceImpl(const std::shared_ptr& lb_impl); + virtual ~LBServiceImpl(); + + void CmdCtrl(google::protobuf::RpcController* controller, + const CmdCtrlRequest* request, + CmdCtrlResponse* response, + google::protobuf::Closure* done); + +private: + void DoCmdCtrl(google::protobuf::RpcController* controller, + const CmdCtrlRequest* request, + CmdCtrlResponse* response, + google::protobuf::Closure* done); + +private: + std::shared_ptr lb_impl_; + std::unique_ptr thread_pool_; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_LB_SERVICE_IMPL_H_ diff --git a/src/load_balancer/options.h b/src/load_balancer/options.h new file mode 100644 index 000000000..4d280c6ce --- /dev/null +++ b/src/load_balancer/options.h @@ -0,0 +1,100 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_LOAD_BALANCER_OPTIONS_H_ +#define TERA_LOAD_BALANCER_OPTIONS_H_ + +#include + +namespace tera { +namespace load_balancer { + +struct LBOptions { + // calculate + uint64_t max_compute_steps; + uint32_t max_compute_steps_per_tablet; + uint64_t max_compute_time_ms; + double min_cost_need_balance; + + // MoveCountCostFunction + double move_count_cost_weight; + uint32_t tablet_max_move_num; + double tablet_max_move_percent; + + // MoveFrequencyCostFunction + double move_frequency_cost_weight; + uint32_t tablet_move_too_frequently_threshold_s; + + // AbnormalNodeCostFunction + double abnormal_node_cost_weight; + // if the ratio of not-ready tablets is higher than this value, + // the node is considered abnormal + double abnormal_node_ratio; + + // ReadPendingNodeCostFunction + double read_pending_node_cost_weight; + + // WritePendingNodeCostFunction + double write_pending_node_cost_weight; + + // ScanPendingNodeCostFunction + double scan_pending_node_cost_weight; + + // CountCostFunction + double tablet_count_cost_weight; + + // SizeCostFunction + double size_cost_weight; + + // LoadCostFunction + double read_load_cost_weight; + double write_load_cost_weight; + double scan_load_cost_weight; + + // meta table + bool meta_table_isolate_enabled; + std::string meta_table_name; + std::string meta_table_node_addr; + + // debug + bool debug_mode_enabled; + + LBOptions() : + max_compute_steps(1000000), + max_compute_steps_per_tablet(1000), + max_compute_time_ms(30 * 1000), + min_cost_need_balance(0.1), + + move_count_cost_weight(10), + tablet_max_move_num(10), + tablet_max_move_percent(0.001), + + move_frequency_cost_weight(10), + tablet_move_too_frequently_threshold_s(600), + + abnormal_node_cost_weight(10), + abnormal_node_ratio(0.5), + + read_pending_node_cost_weight(10), + write_pending_node_cost_weight(10), + scan_pending_node_cost_weight(10), + + tablet_count_cost_weight(0), + size_cost_weight(100), + read_load_cost_weight(0), + write_load_cost_weight(0), +
scan_load_cost_weight(0), + + meta_table_isolate_enabled(true), + meta_table_name("meta_table"), + meta_table_node_addr(""), + + debug_mode_enabled(false) { + } +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_OPTIONS_H_ diff --git a/src/load_balancer/plan.h b/src/load_balancer/plan.h new file mode 100644 index 000000000..6e4ca41ae --- /dev/null +++ b/src/load_balancer/plan.h @@ -0,0 +1,71 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_LOAD_BALANCER_PLAN_H_ +#define TERA_LOAD_BALANCER_PLAN_H_ + +#include + +#include "master/tablet_manager.h" +#include "master/tabletnode_manager.h" + +namespace tera { +namespace load_balancer { + +class Plan { +public: + Plan() {} + + Plan(const tera::master::TabletPtr& tablet, + const tera::master::TabletNodePtr& source, + const tera::master::TabletNodePtr& dest) { + tablet_ = tablet; + source_ = source; + dest_ = dest; + } + + virtual ~Plan() {} + + virtual std::string TabletPath() const { + if (tablet_) { + return tablet_->GetPath(); + } else { + return ""; + } + } + + virtual std::string SourceAddr() const { + if (source_) { + return source_->GetAddr(); + } else { + return ""; + } + } + + virtual std::string DestAddr() const { + if (dest_) { + return dest_->GetAddr(); + } else { + return ""; + } + } + + virtual std::string ToString() const { + std::string str = "tablet:" + (tablet_ ? tablet_->GetPath() : "") + + " source:" + (source_ ? source_->GetAddr() : "") + + " dest:" + (dest_ ? 
dest_->GetAddr() : ""); + + return str; + } + +private: + tera::master::TabletPtr tablet_; + tera::master::TabletNodePtr source_; + tera::master::TabletNodePtr dest_; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_PLAN_H_ diff --git a/src/load_balancer/random.h b/src/load_balancer/random.h new file mode 100644 index 000000000..46a43008f --- /dev/null +++ b/src/load_balancer/random.h @@ -0,0 +1,73 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_LOAD_BALANCER_RANDOM_H_ +#define TERA_LOAD_BALANCER_RANDOM_H_ + +#include + +#include +#include + +#include "common/timer.h" + +namespace tera { +namespace load_balancer { + +class Random { +public: + // random from [a, b) + // a < b should be ensured + // can generate negative number + // avg time cost: 25us + static int RandStd(int a, int b) { + assert(a < b); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(a, b - 1); + + return dis(gen); + } + + // random from [a, b) + // a < b should be ensured + // can not generate negative number + // avg time cost: 150ns + static uint32_t RandTime(uint32_t a, uint32_t b) { + assert(a < b); + + int64_t time_us = get_micros(); + return time_us % (b - a) + a; + } + + // random from [a, b) + // a < b should be ensured + // can not generate negative number + // avg time cost: 15ns + static uint32_t Rand(uint32_t a, uint32_t b) { + assert(a < b); + + uint32_t rand = xorshift32(); + return rand % (b - a) + a; + } + +private: + /* The state word must be initialized to non-zero */ + static uint32_t xorshift32() { + /* Algorithm "xor" from p. 
4 of Marsaglia, "Xorshift RNGs" */ + static uint32_t state = time(NULL); + uint32_t x = state; + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + state = x; + return x; + } +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_RANDOM_H_ diff --git a/src/load_balancer/test/action_generators_test.cc b/src/load_balancer/test/action_generators_test.cc new file mode 100644 index 000000000..6cbe65e4d --- /dev/null +++ b/src/load_balancer/test/action_generators_test.cc @@ -0,0 +1,311 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "load_balancer/action_generators.h" + +namespace tera { +namespace load_balancer { + +class RandomActionGeneratorTest : public ::testing::Test { +public: + virtual void SetUp() { + random_action_generator_.reset(new RandomActionGenerator()); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + } + + virtual void TearDown() { + } + +private: + std::shared_ptr random_action_generator_; + std::shared_ptr cluster_; +}; + +class TabletCountActionGeneratorTest : public ::testing::Test { +public: + virtual void SetUp() { + tablet_count_action_generator_.reset(new TabletCountActionGenerator()); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + } + + virtual void TearDown() { + } + +private: + std::shared_ptr tablet_count_action_generator_; + std::shared_ptr cluster_; +}; + +class SizeActionGeneratorTest : public ::testing::Test { +public: + virtual void SetUp() { + size_action_generator_.reset(new SizeActionGenerator()); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + } + + virtual void TearDown() { + } + +private: 
+ std::shared_ptr size_action_generator_; + std::shared_ptr cluster_; +}; + +class ReadLoadActionGeneratorTest : public ::testing::Test { +public: + virtual void SetUp() { + read_load_action_generator_.reset(new ReadLoadActionGenerator()); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + } + + virtual void TearDown() { + } + +private: + std::shared_ptr read_load_action_generator_; + std::shared_ptr cluster_; +}; + +class WriteLoadActionGeneratorTest : public ::testing::Test { +public: + virtual void SetUp() { + write_load_action_generator_.reset(new WriteLoadActionGenerator()); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + } + + virtual void TearDown() { + } + +private: + std::shared_ptr write_load_action_generator_; + std::shared_ptr cluster_; +}; + +class ScanLoadActionGeneratorTest : public ::testing::Test { +public: + virtual void SetUp() { + scan_load_action_generator_.reset(new ScanLoadActionGenerator()); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + } + + virtual void TearDown() { + } + +private: + std::shared_ptr scan_load_action_generator_; + std::shared_ptr cluster_; +}; + +TEST_F(RandomActionGeneratorTest, PickNodeTest) { + cluster_->tablet_node_num_ = 10; + + uint32_t index = random_action_generator_->PickRandomNode(cluster_); + ASSERT_GE(index, 0); + ASSERT_LT(index, cluster_->tablet_node_num_); + + uint32_t other_index = random_action_generator_->PickOtherRandomNode(cluster_, index); + ASSERT_GE(other_index, 0); + ASSERT_LT(other_index, cluster_->tablet_node_num_); + ASSERT_NE(index, other_index); +} + +TEST_F(RandomActionGeneratorTest, PickRandomTabletOfNodeTest) { + cluster_->tablet_node_num_ = 1; + ASSERT_EQ(random_action_generator_->PickRandomTabletOfNode(cluster_, 0), std::numeric_limits::max()); + + cluster_->tablets_per_node_[0].emplace_back(0); + 
ASSERT_EQ(random_action_generator_->PickRandomTabletOfNode(cluster_, 0), 0); +} + +TEST_F(RandomActionGeneratorTest, GenerateTest) { + cluster_->tablet_node_num_ = 1; + std::shared_ptr action(random_action_generator_->Generate(cluster_)); + ASSERT_EQ(Action::Type::EMPTY, action->GetType()); + + cluster_->tablet_node_num_ = 2; + cluster_->tablets_per_node_[0].emplace_back(0); + cluster_->tablets_per_node_[1].emplace_back(1); + std::shared_ptr action_0(random_action_generator_->Generate(cluster_)); + ASSERT_EQ(Action::Type::MOVE, action_0->GetType()); +} + +TEST_F(TabletCountActionGeneratorTest, GenerateTest) { + uint32_t more_tablets_node_index = 0; + uint32_t less_tablets_node_index = 1; + cluster_->tablets_per_node_[more_tablets_node_index].emplace_back(0); + cluster_->tablets_per_node_[more_tablets_node_index].emplace_back(1); + cluster_->tablets_per_node_[less_tablets_node_index].emplace_back(2); + + cluster_->tablet_node_num_ = 2; + + cluster_->node_index_sorted_by_tablet_count_.emplace_back(more_tablets_node_index); + cluster_->node_index_sorted_by_tablet_count_.emplace_back(less_tablets_node_index); + + cluster_->SortNodesByTabletCount(); + ASSERT_EQ(more_tablets_node_index, tablet_count_action_generator_->PickMostTabletsNode(cluster_)); + ASSERT_EQ(less_tablets_node_index, tablet_count_action_generator_->PickLeastTabletsNode(cluster_)); + + std::shared_ptr action(tablet_count_action_generator_->Generate(cluster_)); + ASSERT_EQ(Action::Type::MOVE, action->GetType()); + MoveAction* move_action = dynamic_cast(action.get()); + ASSERT_EQ(more_tablets_node_index, move_action->source_node_index_); + ASSERT_EQ(less_tablets_node_index, move_action->dest_node_index_); + + cluster_->meta_table_node_index_ = less_tablets_node_index; + ASSERT_EQ(more_tablets_node_index, tablet_count_action_generator_->PickMostTabletsNode(cluster_)); + ASSERT_EQ(more_tablets_node_index, tablet_count_action_generator_->PickLeastTabletsNode(cluster_)); +} + +TEST_F(SizeActionGeneratorTest, 
GenerateTest) { + uint32_t larger_size_node_index = 0; + uint32_t smaller_size_node_index = 1; + cluster_->size_per_node_[larger_size_node_index] = 20; + cluster_->size_per_node_[smaller_size_node_index] = 10; + + uint32_t tablet_index_on_larger_size_node = 0; + uint32_t tablet_index_on_smaller_size_node = 1; + cluster_->tablet_node_num_ = 2; + cluster_->tablets_per_node_[larger_size_node_index].emplace_back(tablet_index_on_larger_size_node); + cluster_->tablets_per_node_[smaller_size_node_index].emplace_back(tablet_index_on_smaller_size_node); + + cluster_->node_index_sorted_by_size_.emplace_back(larger_size_node_index); + cluster_->node_index_sorted_by_size_.emplace_back(smaller_size_node_index); + + cluster_->SortNodesBySize(); + ASSERT_EQ(larger_size_node_index, size_action_generator_->PickLargestSizeNode(cluster_)); + ASSERT_EQ(smaller_size_node_index, size_action_generator_->PickSmallestSizeNode(cluster_)); + + std::shared_ptr action(size_action_generator_->Generate(cluster_)); + ASSERT_EQ(Action::Type::MOVE, action->GetType()); + MoveAction* move_action = dynamic_cast(action.get()); + ASSERT_EQ(tablet_index_on_larger_size_node, move_action->tablet_index_); + ASSERT_EQ(larger_size_node_index, move_action->source_node_index_); + ASSERT_EQ(smaller_size_node_index, move_action->dest_node_index_); + + cluster_->meta_table_node_index_ = smaller_size_node_index; + ASSERT_EQ(larger_size_node_index, size_action_generator_->PickLargestSizeNode(cluster_)); + ASSERT_EQ(larger_size_node_index, size_action_generator_->PickSmallestSizeNode(cluster_)); +} + +TEST_F(ReadLoadActionGeneratorTest, GenerateTest) { + uint32_t more_read_node_index = 0; + uint32_t less_read_node_index = 1; + cluster_->read_load_per_node_[more_read_node_index] = 20; + cluster_->read_load_per_node_[less_read_node_index] = 10; + + uint32_t tablet_index_on_more_read_node = 0; + uint32_t tablet_index_on_less_read_node = 1; + cluster_->tablet_node_num_ = 2; + 
cluster_->tablets_per_node_[more_read_node_index].emplace_back(tablet_index_on_more_read_node); + cluster_->tablets_per_node_[less_read_node_index].emplace_back(tablet_index_on_less_read_node); + + cluster_->node_index_sorted_by_read_load_.emplace_back(more_read_node_index); + cluster_->node_index_sorted_by_read_load_.emplace_back(less_read_node_index); + + cluster_->SortNodesByReadLoad(); + ASSERT_EQ(more_read_node_index, read_load_action_generator_->PickMostReadNode(cluster_)); + ASSERT_EQ(less_read_node_index, read_load_action_generator_->PickLeastReadNode(cluster_)); + + std::shared_ptr action(read_load_action_generator_->Generate(cluster_)); + ASSERT_EQ(Action::Type::MOVE, action->GetType()); + MoveAction* move_action = dynamic_cast(action.get()); + ASSERT_EQ(tablet_index_on_more_read_node, move_action->tablet_index_); + ASSERT_EQ(more_read_node_index, move_action->source_node_index_); + ASSERT_EQ(less_read_node_index, move_action->dest_node_index_); + + cluster_->meta_table_node_index_ = less_read_node_index; + ASSERT_EQ(more_read_node_index, read_load_action_generator_->PickMostReadNode(cluster_)); + ASSERT_EQ(more_read_node_index, read_load_action_generator_->PickLeastReadNode(cluster_)); +} + +TEST_F(WriteLoadActionGeneratorTest, GenerateTest) { + uint32_t more_write_node_index = 0; + uint32_t less_write_node_index = 1; + cluster_->write_load_per_node_[more_write_node_index] = 20; + cluster_->write_load_per_node_[less_write_node_index] = 10; + + uint32_t tablet_index_on_more_write_node = 0; + uint32_t tablet_index_on_less_write_node = 1; + cluster_->tablet_node_num_ = 2; + cluster_->tablets_per_node_[more_write_node_index].emplace_back(tablet_index_on_more_write_node); + cluster_->tablets_per_node_[less_write_node_index].emplace_back(tablet_index_on_less_write_node); + + cluster_->node_index_sorted_by_write_load_.emplace_back(more_write_node_index); + cluster_->node_index_sorted_by_write_load_.emplace_back(less_write_node_index); + + 
cluster_->SortNodesByWriteLoad(); + ASSERT_EQ(more_write_node_index, write_load_action_generator_->PickMostWriteNode(cluster_)); + ASSERT_EQ(less_write_node_index, write_load_action_generator_->PickLeastWriteNode(cluster_)); + + std::shared_ptr action(write_load_action_generator_->Generate(cluster_)); + ASSERT_EQ(Action::Type::MOVE, action->GetType()); + MoveAction* move_action = dynamic_cast(action.get()); + ASSERT_EQ(tablet_index_on_more_write_node, move_action->tablet_index_); + ASSERT_EQ(more_write_node_index, move_action->source_node_index_); + ASSERT_EQ(less_write_node_index, move_action->dest_node_index_); + + cluster_->meta_table_node_index_ = less_write_node_index; + ASSERT_EQ(more_write_node_index, write_load_action_generator_->PickMostWriteNode(cluster_)); + ASSERT_EQ(more_write_node_index, write_load_action_generator_->PickLeastWriteNode(cluster_)); +} + +TEST_F(ScanLoadActionGeneratorTest, GenerateTest) { + uint32_t more_scan_node_index = 0; + uint32_t less_scan_node_index = 1; + cluster_->scan_load_per_node_[more_scan_node_index] = 20; + cluster_->scan_load_per_node_[less_scan_node_index] = 10; + + uint32_t tablet_index_on_more_scan_node = 0; + uint32_t tablet_index_on_less_scan_node = 1; + cluster_->tablet_node_num_ = 2; + cluster_->tablets_per_node_[more_scan_node_index].emplace_back(tablet_index_on_more_scan_node); + cluster_->tablets_per_node_[less_scan_node_index].emplace_back(tablet_index_on_less_scan_node); + + cluster_->node_index_sorted_by_scan_load_.emplace_back(more_scan_node_index); + cluster_->node_index_sorted_by_scan_load_.emplace_back(less_scan_node_index); + + cluster_->SortNodesByScanLoad(); + ASSERT_EQ(more_scan_node_index, scan_load_action_generator_->PickMostScanNode(cluster_)); + ASSERT_EQ(less_scan_node_index, scan_load_action_generator_->PickLeastScanNode(cluster_)); + + std::shared_ptr action(scan_load_action_generator_->Generate(cluster_)); + ASSERT_EQ(Action::Type::MOVE, action->GetType()); + MoveAction* move_action = 
dynamic_cast(action.get()); + ASSERT_EQ(tablet_index_on_more_scan_node, move_action->tablet_index_); + ASSERT_EQ(more_scan_node_index, move_action->source_node_index_); + ASSERT_EQ(less_scan_node_index, move_action->dest_node_index_); + + cluster_->meta_table_node_index_ = less_scan_node_index; + ASSERT_EQ(more_scan_node_index, scan_load_action_generator_->PickMostScanNode(cluster_)); + ASSERT_EQ(more_scan_node_index, scan_load_action_generator_->PickLeastScanNode(cluster_)); +} + +} // namespace load_balancer +} // namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/load_balancer/test/actions_test.cc b/src/load_balancer/test/actions_test.cc new file mode 100644 index 000000000..28096efa3 --- /dev/null +++ b/src/load_balancer/test/actions_test.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "load_balancer/actions.h" + +namespace tera { +namespace load_balancer { + +class ActionsTest : public ::testing::Test { +}; + +TEST_F(ActionsTest, MoveActionTest) { + MoveAction move_action(0, 0, 1); + std::shared_ptr undo_action(dynamic_cast(move_action.UndoAction())); + + ASSERT_EQ(move_action.tablet_index_, undo_action->tablet_index_); + ASSERT_EQ(move_action.source_node_index_, undo_action->dest_node_index_); + ASSERT_EQ(move_action.dest_node_index_, undo_action->source_node_index_); +} + +} // namespace load_balancer +} // namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/load_balancer/test/balancer_test_main.cc b/src/load_balancer/test/balancer_test_main.cc new file mode 100644 index 000000000..c08b2451d --- /dev/null +++ b/src/load_balancer/test/balancer_test_main.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2015, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "utils/utils_cmd.h" + +int main(int argc, char** argv) { + ::google::InitGoogleLogging(argv[0]); + FLAGS_v = 16; + FLAGS_minloglevel=0; + FLAGS_log_dir = "./log"; + if (access(FLAGS_log_dir.c_str(), F_OK)) { + mkdir(FLAGS_log_dir.c_str(), 0777); + } + std::string pragram_name("load balancer"); + tera::utils::SetupLog(pragram_name); + ::google::ParseCommandLineFlags(&argc, &argv, true); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/load_balancer/test/cluster_test.cc b/src/load_balancer/test/cluster_test.cc new file mode 100644 index 000000000..026ad1b78 --- /dev/null +++ b/src/load_balancer/test/cluster_test.cc @@ -0,0 +1,391 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "load_balancer/actions.h" +#include "load_balancer/cluster.h" +#include "load_balancer/lb_node.h" +#include "common/timer.h" + +namespace tera { +namespace load_balancer { + +class ClusterTest : public ::testing::Test { +public: + virtual void SetUp() { + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + } + + virtual void TearDown() { + } + +private: + std::shared_ptr cluster_; +}; + +TEST_F(ClusterTest, ValidActionTest) { + TabletMeta tablet_meta_meta; + TabletMeta tablet_meta_other; + tera::master::TabletPtr tablet_ptr_meta(new tera::master::Tablet(tablet_meta_meta)); + tera::master::TabletPtr tablet_ptr_other(new tera::master::Tablet(tablet_meta_other)); + std::shared_ptr lb_tablet_meta = std::make_shared(); + std::shared_ptr lb_tablet_other = std::make_shared(); + lb_tablet_meta->tablet_ptr = tablet_ptr_meta; + lb_tablet_other->tablet_ptr = tablet_ptr_other; + + cluster_->lb_options_.meta_table_name = "meta_table"; + uint32_t table_index_meta = 0; + uint32_t table_index_other= 1; + cluster_->tables_[table_index_meta] = "meta_table"; + cluster_->tables_[table_index_other] = "other_table"; + uint32_t tablet_index_meta = 0; + uint32_t tablet_index_other = 1; + cluster_->tablet_index_to_table_index_[tablet_index_meta] = table_index_meta; + cluster_->tablet_index_to_table_index_[tablet_index_other] = table_index_other; + cluster_->tablets_[tablet_index_meta] = lb_tablet_meta; + cluster_->tablets_[tablet_index_other] = lb_tablet_other; + + uint32_t meta_table_node_index = 0; + uint32_t other_node_index = 1; + cluster_->meta_table_node_index_ = meta_table_node_index; + + // empty action is invalid + std::shared_ptr empty_action(new EmptyAction()); + ASSERT_FALSE(cluster_->ValidAction(empty_action)); + + std::shared_ptr normal_move_action(new MoveAction(tablet_index_meta, 0, 1)); + // move not ready tablet is invalid 
+ ASSERT_TRUE(cluster_->tablets_[tablet_index_meta]->tablet_ptr->SetStatus(kTableOffLine)); + ASSERT_FALSE(cluster_->ValidAction(normal_move_action)); + + // move meta table is invalid + std::shared_ptr move_meta_table_action(new MoveAction(tablet_index_meta, 0, 1)); + ASSERT_TRUE(cluster_->tablets_[tablet_index_meta]->tablet_ptr->SetStatus(kTableReady)); + ASSERT_FALSE(cluster_->ValidAction(move_meta_table_action)); + // move normal tablet is valid + std::shared_ptr move_other_table_action(new MoveAction(tablet_index_other, 0, 1)); + ASSERT_TRUE(cluster_->tablets_[tablet_index_other]->tablet_ptr->SetStatus(kTableReady)); + ASSERT_TRUE(cluster_->ValidAction(move_other_table_action)); + + std::shared_ptr move_to_meta_table_node_action(new MoveAction(tablet_index_other, 0, meta_table_node_index)); + std::shared_ptr move_to_other_node_action(new MoveAction(tablet_index_other, 0, other_node_index)); + cluster_->lb_options_.meta_table_isolate_enabled = true; + // move tablet to meta node is invalid if meta_table_isolate_enabled is true + ASSERT_FALSE(cluster_->ValidAction(move_to_meta_table_node_action)); + // move tablet to normal node is valid even if meta_table_isolate_enabled is true + ASSERT_TRUE(cluster_->ValidAction(move_to_other_node_action)); + cluster_->lb_options_.meta_table_isolate_enabled = false; + // move tablet to any node (including the meta table node) is valid if meta_table_isolate_enabled is false + ASSERT_TRUE(cluster_->ValidAction(move_to_meta_table_node_action)); + ASSERT_TRUE(cluster_->ValidAction(move_to_other_node_action)); +} + +TEST_F(ClusterTest, RegisterTabletTest) { + TabletMeta tablet_meta_meta; + tablet_meta_meta.set_table_name("meta_table"); + tablet_meta_meta.set_path("path/meta_table"); + tera::master::TabletPtr tablet_ptr_meta(new tera::master::Tablet(tablet_meta_meta)); + std::shared_ptr lb_tablet_meta = std::make_shared(); + lb_tablet_meta->tablet_ptr = tablet_ptr_meta; + + uint32_t tablet_index_0 = 0; + uint32_t node_index_0 = 0; +
cluster_->RegisterTablet(lb_tablet_meta, tablet_index_0, node_index_0); + + ASSERT_EQ(1, cluster_->table_num_); + ASSERT_EQ(1, cluster_->tables_.size()); + ASSERT_STREQ("meta_table", cluster_->tables_[0].c_str()); + ASSERT_EQ(0, cluster_->tables_to_index_["meta_table"]); + + ASSERT_EQ(tablet_index_0, cluster_->tablets_to_index_["path/meta_table"]); + + ASSERT_EQ(node_index_0, cluster_->tablet_index_to_node_index_[tablet_index_0]); + ASSERT_EQ(node_index_0, cluster_->initial_tablet_index_to_node_index_[tablet_index_0]); + ASSERT_EQ(0, cluster_->tablet_index_to_table_index_[tablet_index_0]); +} + +TEST_F(ClusterTest, AddTabletTest) { + TabletMeta tablet_meta_meta; + tablet_meta_meta.set_size(10); + tera::master::TabletPtr tablet_ptr_meta(new tera::master::Tablet(tablet_meta_meta)); + tablet_ptr_meta->average_counter_.set_read_rows(20); + tablet_ptr_meta->average_counter_.set_write_rows(30); + tablet_ptr_meta->average_counter_.set_scan_rows(40); + std::shared_ptr lb_tablet_meta = std::make_shared(); + lb_tablet_meta->tablet_ptr = tablet_ptr_meta; + + uint32_t tablet_index = 0; + cluster_->tablets_[tablet_index] = lb_tablet_meta; + + uint32_t node_index = 0; + cluster_->size_per_node_[node_index] = 0; + cluster_->read_load_per_node_[node_index] = 0; + cluster_->write_load_per_node_[node_index] = 0; + cluster_->scan_load_per_node_[node_index] = 0; + + cluster_->AddTablet(tablet_index, node_index); + + ASSERT_EQ(1, cluster_->tablets_per_node_.size()); + ASSERT_EQ(10, cluster_->size_per_node_[node_index]); + ASSERT_EQ(20, cluster_->read_load_per_node_[node_index]); + ASSERT_EQ(30, cluster_->write_load_per_node_[node_index]); + ASSERT_EQ(40, cluster_->scan_load_per_node_[node_index]); +} + +TEST_F(ClusterTest, RemoveTabletTest) { + TabletMeta tablet_meta_meta; + tablet_meta_meta.set_size(10); + tera::master::TabletPtr tablet_ptr_meta(new tera::master::Tablet(tablet_meta_meta)); + tablet_ptr_meta->average_counter_.set_read_rows(20); + 
tablet_ptr_meta->average_counter_.set_write_rows(30); + tablet_ptr_meta->average_counter_.set_scan_rows(40); + std::shared_ptr lb_tablet_meta = std::make_shared(); + lb_tablet_meta->tablet_ptr = tablet_ptr_meta; + + uint32_t tablet_index = 0; + cluster_->tablets_[tablet_index] = lb_tablet_meta; + + uint32_t node_index = 0; + cluster_->tablets_per_node_[node_index].emplace_back(tablet_index); + + cluster_->size_per_node_[node_index] = 10; + cluster_->read_load_per_node_[node_index] = 20; + cluster_->write_load_per_node_[node_index] = 30; + cluster_->scan_load_per_node_[node_index] = 40; + + cluster_->RemoveTablet(tablet_index, node_index); + + ASSERT_EQ(0, cluster_->tablets_per_node_[node_index].size()); + ASSERT_EQ(0, cluster_->size_per_node_[node_index]); + ASSERT_EQ(0, cluster_->read_load_per_node_[node_index]); + ASSERT_EQ(0, cluster_->write_load_per_node_[node_index]); + ASSERT_EQ(0, cluster_->scan_load_per_node_[node_index]); +} + +TEST_F(ClusterTest, MoveTabletTest) { + TabletMeta tablet_meta_meta; + tablet_meta_meta.set_size(10); + tera::master::TabletPtr tablet_ptr_meta(new tera::master::Tablet(tablet_meta_meta)); + tablet_ptr_meta->average_counter_.set_read_rows(20); + tablet_ptr_meta->average_counter_.set_write_rows(30); + tablet_ptr_meta->average_counter_.set_scan_rows(40); + std::shared_ptr lb_tablet_meta = std::make_shared(); + lb_tablet_meta->tablet_ptr = tablet_ptr_meta; + + uint32_t tablet_index = 0; + uint32_t first_node_index = 0; + uint32_t second_node_index = 1; + uint32_t third_node_index = 2; + + cluster_->tablets_[tablet_index] = lb_tablet_meta; + cluster_->tablet_moved_num_ = 0; + cluster_->initial_tablet_index_to_node_index_[tablet_index] = first_node_index; + cluster_->tablet_index_to_node_index_[tablet_index] = first_node_index; + cluster_->abnormal_nodes_index_.insert(second_node_index); + cluster_->read_pending_nodes_index_.insert(second_node_index); + cluster_->write_pending_nodes_index_.insert(second_node_index); + 
cluster_->scan_pending_nodes_index_.insert(second_node_index); + + ASSERT_EQ(0, cluster_->tablets_moved_too_frequently_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_abnormal_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_read_pending_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_write_pending_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_scan_pending_nodes_.size()); + ASSERT_TRUE(cluster_->tablets_[tablet_index]->tablet_ptr->SetStatus(kTableReady)); + int64_t current_time_us = tera::get_micros(); + cluster_->lb_options_.tablet_move_too_frequently_threshold_s = 600; + cluster_->tablets_[tablet_index]->tablet_ptr->last_move_time_us_ = current_time_us; + + cluster_->MoveTablet(tablet_index, first_node_index, second_node_index); + ASSERT_EQ(first_node_index, cluster_->initial_tablet_index_to_node_index_[tablet_index]); + ASSERT_EQ(second_node_index, cluster_->tablet_index_to_node_index_[tablet_index]); + ASSERT_EQ(1, cluster_->tablet_moved_num_); + ASSERT_EQ(1, cluster_->tablets_moved_too_frequently_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_abnormal_nodes_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_read_pending_nodes_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_write_pending_nodes_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_scan_pending_nodes_.size()); + + cluster_->MoveTablet(tablet_index, second_node_index, third_node_index); + ASSERT_EQ(first_node_index, cluster_->initial_tablet_index_to_node_index_[tablet_index]); + ASSERT_EQ(third_node_index, cluster_->tablet_index_to_node_index_[tablet_index]); + ASSERT_EQ(1, cluster_->tablet_moved_num_); + ASSERT_EQ(1, cluster_->tablets_moved_too_frequently_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_abnormal_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_read_pending_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_write_pending_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_scan_pending_nodes_.size()); + + 
cluster_->MoveTablet(tablet_index, third_node_index, first_node_index); + ASSERT_EQ(first_node_index, cluster_->initial_tablet_index_to_node_index_[tablet_index]); + ASSERT_EQ(first_node_index, cluster_->tablet_index_to_node_index_[tablet_index]); + ASSERT_EQ(0, cluster_->tablet_moved_num_); + ASSERT_EQ(0, cluster_->tablets_moved_too_frequently_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_abnormal_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_read_pending_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_write_pending_nodes_.size()); + ASSERT_EQ(0, cluster_->tablets_moved_to_scan_pending_nodes_.size()); + + cluster_->tablets_[tablet_index]->tablet_ptr->last_move_time_us_ = current_time_us - 2 * cluster_->lb_options_.tablet_move_too_frequently_threshold_s * 1000000; + cluster_->MoveTablet(tablet_index, first_node_index, second_node_index); + ASSERT_EQ(0, cluster_->tablets_moved_too_frequently_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_abnormal_nodes_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_read_pending_nodes_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_write_pending_nodes_.size()); + ASSERT_EQ(1, cluster_->tablets_moved_to_scan_pending_nodes_.size()); +} + +TEST_F(ClusterTest, AbnormalNodeConstructTest) { + TabletMeta tablet_meta_0; + tablet_meta_0.set_path("path/meta_0"); + tera::master::TabletPtr tablet_ptr_0(new tera::master::Tablet(tablet_meta_0)); + std::shared_ptr lb_tablet_0 = std::make_shared(); + lb_tablet_0->tablet_ptr = tablet_ptr_0; + + TabletMeta tablet_meta_1; + tablet_meta_1.set_path("path/meta_1"); + tera::master::TabletPtr tablet_ptr_1(new tera::master::Tablet(tablet_meta_1)); + std::shared_ptr lb_tablet_1 = std::make_shared(); + lb_tablet_1->tablet_ptr = tablet_ptr_1; + + TabletMeta tablet_meta_2; + tablet_meta_2.set_path("path/meta_2"); + tera::master::TabletPtr tablet_ptr_2(new tera::master::Tablet(tablet_meta_2)); + std::shared_ptr lb_tablet_2 = std::make_shared(); + lb_tablet_2->tablet_ptr = 
tablet_ptr_2; + + tera::master::TabletNodePtr tablet_node_ptr(new tera::master::TabletNode()); + tablet_node_ptr->addr_ = "127.0.0.1:2200"; + std::shared_ptr lb_node = std::make_shared(); + lb_node->tablet_node_ptr = tablet_node_ptr; + lb_node->tablets.emplace_back(lb_tablet_0); + lb_node->tablets.emplace_back(lb_tablet_1); + lb_node->tablets.emplace_back(lb_tablet_2); + + std::vector> lb_nodes; + lb_nodes.emplace_back(lb_node); + + LBOptions options; + options.abnormal_node_ratio = 0.5; + + tablet_ptr_0->SetStatus(kTableReady); + tablet_ptr_1->SetStatus(kTableReady); + tablet_ptr_2->SetStatus(kTableReady); + cluster_.reset(new Cluster(lb_nodes, options)); + ASSERT_EQ(0, cluster_->initial_tablets_not_ready_per_node_[0].size()); + ASSERT_EQ(0, cluster_->abnormal_nodes_index_.size()); + + tablet_ptr_0->SetStatus(kTableOffLine); + cluster_.reset(new Cluster(lb_nodes, options)); + ASSERT_EQ(1, cluster_->initial_tablets_not_ready_per_node_[0].size()); + ASSERT_EQ(0, cluster_->abnormal_nodes_index_.size()); + + tablet_ptr_1->SetStatus(kTableOffLine); + cluster_.reset(new Cluster(lb_nodes, options)); + ASSERT_EQ(2, cluster_->initial_tablets_not_ready_per_node_[0].size()); + ASSERT_EQ(1, cluster_->abnormal_nodes_index_.size()); +} + +TEST_F(ClusterTest, SortNodesByTabletCount) { + cluster_->tablets_per_node_[0].emplace_back(0); + cluster_->tablets_per_node_[0].emplace_back(1); + cluster_->tablets_per_node_[1].emplace_back(2); + cluster_->tablets_per_node_[2].emplace_back(3); + cluster_->tablets_per_node_[2].emplace_back(4); + cluster_->tablets_per_node_[2].emplace_back(5); + + cluster_->node_index_sorted_by_tablet_count_.emplace_back(0); + cluster_->node_index_sorted_by_tablet_count_.emplace_back(1); + cluster_->node_index_sorted_by_tablet_count_.emplace_back(2); + ASSERT_EQ(0, cluster_->node_index_sorted_by_tablet_count_[0]); + ASSERT_EQ(1, cluster_->node_index_sorted_by_tablet_count_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_tablet_count_[2]); + + 
cluster_->SortNodesByTabletCount(); + ASSERT_EQ(1, cluster_->node_index_sorted_by_tablet_count_[0]); + ASSERT_EQ(0, cluster_->node_index_sorted_by_tablet_count_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_tablet_count_[2]); +} + +TEST_F(ClusterTest, SortNodesBySizeTest) { + cluster_->size_per_node_[0] = 20; + cluster_->size_per_node_[1] = 10; + cluster_->size_per_node_[2] = 30; + + cluster_->node_index_sorted_by_size_.emplace_back(0); + cluster_->node_index_sorted_by_size_.emplace_back(1); + cluster_->node_index_sorted_by_size_.emplace_back(2); + ASSERT_EQ(0, cluster_->node_index_sorted_by_size_[0]); + ASSERT_EQ(1, cluster_->node_index_sorted_by_size_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_size_[2]); + + cluster_->SortNodesBySize(); + ASSERT_EQ(1, cluster_->node_index_sorted_by_size_[0]); + ASSERT_EQ(0, cluster_->node_index_sorted_by_size_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_size_[2]); +} + +TEST_F(ClusterTest, SortNodesByReadLoad) { + cluster_->read_load_per_node_[0] = 20; + cluster_->read_load_per_node_[1] = 10; + cluster_->read_load_per_node_[2] = 30; + + cluster_->node_index_sorted_by_read_load_.emplace_back(0); + cluster_->node_index_sorted_by_read_load_.emplace_back(1); + cluster_->node_index_sorted_by_read_load_.emplace_back(2); + ASSERT_EQ(0, cluster_->node_index_sorted_by_read_load_[0]); + ASSERT_EQ(1, cluster_->node_index_sorted_by_read_load_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_read_load_[2]); + + cluster_->SortNodesByReadLoad(); + ASSERT_EQ(1, cluster_->node_index_sorted_by_read_load_[0]); + ASSERT_EQ(0, cluster_->node_index_sorted_by_read_load_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_read_load_[2]); +} + +TEST_F(ClusterTest, SortNodesByWriteLoad) { + cluster_->write_load_per_node_[0] = 20; + cluster_->write_load_per_node_[1] = 10; + cluster_->write_load_per_node_[2] = 30; + + cluster_->node_index_sorted_by_write_load_.emplace_back(0); + 
cluster_->node_index_sorted_by_write_load_.emplace_back(1); + cluster_->node_index_sorted_by_write_load_.emplace_back(2); + ASSERT_EQ(0, cluster_->node_index_sorted_by_write_load_[0]); + ASSERT_EQ(1, cluster_->node_index_sorted_by_write_load_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_write_load_[2]); + + cluster_->SortNodesByWriteLoad(); + ASSERT_EQ(1, cluster_->node_index_sorted_by_write_load_[0]); + ASSERT_EQ(0, cluster_->node_index_sorted_by_write_load_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_write_load_[2]); +} + +TEST_F(ClusterTest, SortNodesByScanLoad) { + cluster_->scan_load_per_node_[0] = 20; + cluster_->scan_load_per_node_[1] = 10; + cluster_->scan_load_per_node_[2] = 30; + + cluster_->node_index_sorted_by_scan_load_.emplace_back(0); + cluster_->node_index_sorted_by_scan_load_.emplace_back(1); + cluster_->node_index_sorted_by_scan_load_.emplace_back(2); + ASSERT_EQ(0, cluster_->node_index_sorted_by_scan_load_[0]); + ASSERT_EQ(1, cluster_->node_index_sorted_by_scan_load_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_scan_load_[2]); + + cluster_->SortNodesByScanLoad(); + ASSERT_EQ(1, cluster_->node_index_sorted_by_scan_load_[0]); + ASSERT_EQ(0, cluster_->node_index_sorted_by_scan_load_[1]); + ASSERT_EQ(2, cluster_->node_index_sorted_by_scan_load_[2]); +} + +} // namespace load_balancer +} // namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/load_balancer/test/cost_functions_test.cc b/src/load_balancer/test/cost_functions_test.cc new file mode 100644 index 000000000..84f546fba --- /dev/null +++ b/src/load_balancer/test/cost_functions_test.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "load_balancer/cost_functions.h" +#include "load_balancer/random.h" + +namespace tera { +namespace load_balancer { + +class CostFunctionTest : public ::testing::Test { +public: + virtual void SetUp() { + move_cost_function_.reset(new MoveCountCostFunction(lb_options_)); + } + + virtual void TearDown() { + } + +private: + LBOptions lb_options_; + std::shared_ptr move_cost_function_; +}; + +class MoveCountCostFunctionTest : public ::testing::Test { +public: + virtual void SetUp() { + move_cost_function_.reset(new MoveCountCostFunction(lb_options_)); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + + move_cost_function_->Init(cluster_); + } + + virtual void TearDown() { + } + +private: + LBOptions lb_options_; + std::shared_ptr move_cost_function_; + std::shared_ptr cluster_; +}; + +class TabletCountCostFunctionTest : public ::testing::Test { +public: + virtual void SetUp() { + tablet_count_cost_function_.reset(new TabletCountCostFunction(lb_options_)); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + + tablet_count_cost_function_->Init(cluster_); + } + + virtual void TearDown() { + } + +private: + LBOptions lb_options_; + std::shared_ptr tablet_count_cost_function_; + std::shared_ptr cluster_; +}; + +class SizeCostFunctionTest : public ::testing::Test { +public: + virtual void SetUp() { + size_cost_function_.reset(new SizeCostFunction(lb_options_)); + + std::vector> empty_lb_nodes; + LBOptions options; + cluster_.reset(new Cluster(empty_lb_nodes, options)); + + size_cost_function_->Init(cluster_); + } + + virtual void TearDown() { + } + +private: + LBOptions lb_options_; + std::shared_ptr size_cost_function_; + std::shared_ptr cluster_; +}; + +TEST_F(CostFunctionTest, WeightTest) { + double w = 3.14; + move_cost_function_->SetWeight(w); + 
ASSERT_DOUBLE_EQ(w, move_cost_function_->GetWeight()); +} + +TEST_F(CostFunctionTest, SumTest) { + std::vector stats = {1, 2, 3}; + ASSERT_DOUBLE_EQ(6, move_cost_function_->GetSum(stats)); +} + +TEST_F(CostFunctionTest, ScaleTest) { + // value <= min + ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 10, -1)); + ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 10, 0)); + + // max <= min + ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 0, 5)); + ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, -1, 5)); + + // normal case + ASSERT_DOUBLE_EQ(0, move_cost_function_->Scale(0, 10, 0)); + ASSERT_DOUBLE_EQ(0.5, move_cost_function_->Scale(0, 10, 5)); + ASSERT_DOUBLE_EQ(1, move_cost_function_->Scale(0, 10, 10)); + + // random case + size_t times = 100; + int min = 0; + int max = 10; + for (size_t i = 0; i < times; ++i) { + int value = Random::Rand(min, max + 1); + ASSERT_TRUE(move_cost_function_->Scale(min, max, value) >= 0); + ASSERT_TRUE(move_cost_function_->Scale(min, max, value) <= 1); + } +} + +TEST_F(CostFunctionTest, ScaleFromArrayTest) { + std::vector stats_0 = {0, 0}; // all-equal input, all zero + ASSERT_DOUBLE_EQ(0, move_cost_function_->ScaleFromArray(stats_0)); + + std::vector stats_1 = {10, 10}; // all-equal input, non-zero: must be checked itself, not stats_0 again + ASSERT_DOUBLE_EQ(0, move_cost_function_->ScaleFromArray(stats_1)); + + int begin = 0; + int end = 100; + size_t times = 100; + std::vector stats_2; + for (size_t i = 0; i < times; ++i) { + stats_2.clear(); + stats_2.emplace_back(Random::Rand(begin, end)); + stats_2.emplace_back(Random::Rand(begin, end)); + + ASSERT_TRUE(move_cost_function_->ScaleFromArray(stats_2) >= 0); + ASSERT_TRUE(move_cost_function_->ScaleFromArray(stats_2) <= 1); + } +} + +TEST_F(MoveCountCostFunctionTest, CostTest) { + move_cost_function_->tablet_max_move_num_ = 10; + move_cost_function_->tablet_max_move_percent_ = 0.05; + cluster_->tablet_num_ = 100; + + cluster_->tablet_moved_num_ = 1; + ASSERT_DOUBLE_EQ(0.1, move_cost_function_->Cost()); + + cluster_->tablet_moved_num_ = 6; + ASSERT_DOUBLE_EQ(0.6,
move_cost_function_->Cost()); + + cluster_->tablet_moved_num_ = 10; + ASSERT_DOUBLE_EQ(1, move_cost_function_->Cost()); + + cluster_->tablet_moved_num_ = 11; + ASSERT_DOUBLE_EQ(move_cost_function_->kExpensiveCost, move_cost_function_->Cost()); +} + +TEST_F(TabletCountCostFunctionTest, CostTest) { +} + +TEST_F(SizeCostFunctionTest, CostTest) { +} + +} // namespace load_balancer +} // namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/load_balancer/test/random_test.cc b/src/load_balancer/test/random_test.cc new file mode 100644 index 000000000..385b76877 --- /dev/null +++ b/src/load_balancer/test/random_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "load_balancer/random.h" + +namespace tera { +namespace load_balancer { + +class RandomTest : public ::testing::Test { +}; + +TEST_F(RandomTest, CommonTest) { + int start = 0; + int end = 3; + size_t times = 100; + + for (size_t i = 0; i < times; ++i) { + int rand = Random::Rand(start, end); + ASSERT_TRUE(rand >= start); + ASSERT_TRUE(rand < end); + } +} + +TEST_F(RandomTest, NegativeTest) { + int start = -10; + int end = 10; + size_t times = 100; + + for (size_t i = 0; i < times; ++i) { + int rand = Random::RandStd(start, end); + ASSERT_TRUE(rand >= start); + ASSERT_TRUE(rand < end); + } +} + +} // namespace load_balancer +} // namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/load_balancer/unity_balancer.cc b/src/load_balancer/unity_balancer.cc new file mode 100644 index 000000000..a6279d16f --- /dev/null +++ b/src/load_balancer/unity_balancer.cc @@ -0,0 +1,264 @@ +// Copyright (c) 2015, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "load_balancer/unity_balancer.h" + +#include +#include + +#include "glog/logging.h" +#include "load_balancer/random.h" +#include "common/timer.h" + +namespace tera { +namespace load_balancer { + +using tera::master::TabletNodePtr; +using tera::master::TabletPtr; + +UnityBalancer::UnityBalancer(const LBOptions& options) : + lb_options_(options) { + // cost functions + if (lb_options_.move_count_cost_weight > 0) { + cost_functions_.emplace_back(new MoveCountCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.move_frequency_cost_weight > 0){ + cost_functions_.emplace_back(new MoveFrequencyCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.abnormal_node_cost_weight > 0) { + cost_functions_.emplace_back(new AbnormalNodeCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.read_pending_node_cost_weight > 0) { + cost_functions_.emplace_back(new ReadPendingNodeCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.write_pending_node_cost_weight > 0) { + cost_functions_.emplace_back(new WritePendingNodeCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.scan_pending_node_cost_weight > 0) { + cost_functions_.emplace_back(new ScanPendingNodeCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.tablet_count_cost_weight > 0) { + cost_functions_.emplace_back(new TabletCountCostFunction(options)); + VLOG(20) << "[lb] " << 
cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.size_cost_weight > 0) { + cost_functions_.emplace_back(new SizeCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.read_load_cost_weight > 0) { + cost_functions_.emplace_back(new ReadLoadCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.write_load_cost_weight > 0) { + cost_functions_.emplace_back(new WriteLoadCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.scan_load_cost_weight > 0) { + cost_functions_.emplace_back(new ScanLoadCostFunction(options)); + VLOG(20) << "[lb] " << cost_functions_[cost_functions_.size() - 1]->Name() << " enabled"; + } + + // action generators + action_generators_.emplace_back(new RandomActionGenerator()); + VLOG(20) << "[lb] " << action_generators_[action_generators_.size() - 1]->Name() << " enabled"; + if (lb_options_.tablet_count_cost_weight > 0) { + action_generators_.emplace_back(new TabletCountActionGenerator()); + VLOG(20) << "[lb] " << action_generators_[action_generators_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.size_cost_weight > 0) { + action_generators_.emplace_back(new SizeActionGenerator()); + VLOG(20) << "[lb] " << action_generators_[action_generators_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.read_load_cost_weight > 0) { + action_generators_.emplace_back(new ReadLoadActionGenerator()); + VLOG(20) << "[lb] " << action_generators_[action_generators_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.write_load_cost_weight > 0) { + action_generators_.emplace_back(new WriteLoadActionGenerator()); + VLOG(20) << "[lb] " << action_generators_[action_generators_.size() - 1]->Name() << " enabled"; + } + if (lb_options_.scan_load_cost_weight > 0) { + 
action_generators_.emplace_back(new ScanLoadActionGenerator()); + VLOG(20) << "[lb] " << action_generators_[action_generators_.size() - 1]->Name() << " enabled"; + } +} + +UnityBalancer::~UnityBalancer() { +} + +bool UnityBalancer::BalanceCluster( + const std::vector>& lb_nodes, + std::vector* plans) { + return BalanceCluster("", lb_nodes, plans); +} + +bool UnityBalancer::BalanceCluster( + const std::string& table_name, + const std::vector>& lb_nodes, + std::vector* plans) { + if (lb_nodes.size() <= 1 || plans == nullptr) { + return false; + } + + VLOG (5) << "[lb] BalanceCluster for table:" << table_name << " begin"; + + std::shared_ptr cluster = std::make_shared(lb_nodes, lb_options_); + + if (lb_options_.debug_mode_enabled) { + cluster->DebugCluster(); + } + + InitCostFunctions(cluster); + + if (!NeedBalance(cluster)) { + return true; + } + + uint64_t max_steps = std::min(lb_options_.max_compute_steps, static_cast(lb_options_.max_compute_steps_per_tablet * cluster->tablet_num_)); + double init_cost = ComputeCost(std::numeric_limits::max()); + double current_cost = init_cost; + + VLOG(5) << "[lb] compute begin, max_steps:" << max_steps << " init_cost:" << init_cost; + + int64_t start_time_us = get_micros(); // get_micros() timestamp; divided by 1000 below to obtain milliseconds + int64_t cost_time_ms = 0; + uint64_t step = 0; + for (step = 0; step < max_steps; ++step) { + std::shared_ptr action(NextAction(cluster)); + VLOG(20) << "[lb] step:" << step << " action:" << action->ToString(); + + if (!cluster->ValidAction(action)) { + continue; + } + + cluster->DoAction(action); + + if (lb_options_.debug_mode_enabled) { + cluster->DebugCluster(); + } + + double new_cost = ComputeCost(current_cost); + if (new_cost < current_cost) { + VLOG(20) << "[lb] got lower cost!"; + current_cost = new_cost; + } else { + std::shared_ptr undo_action(action->UndoAction()); + VLOG(20) << "[lb] undo action:" << undo_action->ToString(); + cluster->DoAction(undo_action); + + if (lb_options_.debug_mode_enabled) { + cluster->DebugCluster(); + } + } + +
cost_time_ms = (get_micros() - start_time_us) / 1000; + if (static_cast(cost_time_ms) > lb_options_.max_compute_time_ms) { + VLOG(5) << "[lb] stop computing since time reach to max_compute_time_ms_:" + << lb_options_.max_compute_time_ms; + break; + } + } + + VLOG(5) << "[lb] compute end, cost time(ms):" << cost_time_ms + << " cost steps:" << step + << " init cost:" << init_cost + << " new cost:" << current_cost; + + if (current_cost < init_cost) { + CreatePlans(cluster, plans); + VLOG(5) << "[lb] balance plan size:" << plans->size(); + } else { + VLOG(5) << "[lb] no better balance plan"; + } + + VLOG (5) << "[lb] BalanceCluster for table:" << table_name << " end"; + + return true; +} + +bool UnityBalancer::NeedBalance(const std::shared_ptr& cluster) { + double total_cost = 0.0; + double total_weight = 0.0; + + for (const auto& cost_func : cost_functions_) { + double weight = cost_func->GetWeight(); + if (weight <= 0) { + continue; + } + + total_weight += weight; + total_cost += cost_func->Cost() * weight; + } + double cost = total_weight == 0 ?
0 : total_cost / total_weight; + + VLOG(5) << "[lb] NeedBalance compute, total_cost:" << total_cost + << " total_weight:" << total_weight + << " cost:" << cost + << " min_cost_need_balance:" << lb_options_.min_cost_need_balance; + + if (total_cost <= 0 || total_weight <= 0 || cost < lb_options_.min_cost_need_balance) { + LOG(INFO) << "[lb] no need to balance"; + return false; + } else { + return true; + } +} + +void UnityBalancer::InitCostFunctions(const std::shared_ptr& cluster) { + for (const auto& cost_func : cost_functions_) { + cost_func->Init(cluster); + } +} + +double UnityBalancer::ComputeCost(double previous_cost) { + VLOG(20) << "[lb] ComputeCost begin, previous cost:" << previous_cost; + double total_cost = 0.0; + + for (const auto& cost_func : cost_functions_) { + double weight = cost_func->GetWeight(); + if (weight <= 0) { + continue; + } + double cost = cost_func->Cost(); + total_cost += cost * weight; + VLOG(20) << "[lb] " << cost_func->Name() << " cost:" << cost << " weight:" << weight; + if (total_cost > previous_cost) { + break; + } + } + + VLOG(20) << "[lb] ComputeCost end, new cost:" << total_cost; + return total_cost; +} + +Action* UnityBalancer::NextAction(const std::shared_ptr& cluster) { + uint32_t rand = Random::Rand(0, action_generators_.size()); + return action_generators_[rand]->Generate(cluster); +} + +void UnityBalancer::CreatePlans(const std::shared_ptr& cluster, std::vector* plans) { + plans->clear(); + + for (uint32_t i = 0; i < cluster->tablet_index_to_node_index_.size(); ++i) { + uint32_t initial_node_index = cluster->initial_tablet_index_to_node_index_[i]; + uint32_t new_node_index = cluster->tablet_index_to_node_index_[i]; + + if (initial_node_index != new_node_index) { + // tablet has been moved to another tablet node + Plan plan(cluster->tablets_[i]->tablet_ptr, + cluster->nodes_[initial_node_index]->tablet_node_ptr, + cluster->nodes_[new_node_index]->tablet_node_ptr); + plans->emplace_back(plan); + } + } +} + +} // namespace 
load_balancer +} // namespace tera diff --git a/src/load_balancer/unity_balancer.h b/src/load_balancer/unity_balancer.h new file mode 100644 index 000000000..522acabff --- /dev/null +++ b/src/load_balancer/unity_balancer.h @@ -0,0 +1,58 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_LOAD_BALANCER_UNITY_BALANCER_H_ +#define TERA_LOAD_BALANCER_UNITY_BALANCER_H_ + +#include +#include + +#include "load_balancer/action_generators.h" +#include "load_balancer/actions.h" +#include "load_balancer/balancer.h" +#include "load_balancer/cluster.h" +#include "load_balancer/cost_functions.h" + +namespace tera { +namespace load_balancer { + +class UnityBalancer : public Balancer { +public: + explicit UnityBalancer(const LBOptions& options); + virtual ~UnityBalancer(); + + virtual bool BalanceCluster( + const std::vector>& lb_nodes, + std::vector* plans) override; + + // if table_name is empty, balance whole cluster, + // otherwise balance the specified table of table_name (NOTE(review): the .cc in this change only logs table_name -- confirm lb_nodes is pre-filtered by the caller) + virtual bool BalanceCluster( + const std::string& table_name, + const std::vector>& lb_nodes, + std::vector* plans) override; + + virtual bool NeedBalance(const std::shared_ptr& cluster); + +protected: + virtual void InitCostFunctions(const std::shared_ptr& cluster); + + virtual double ComputeCost(double previous_cost); + + virtual Action* NextAction(const std::shared_ptr& cluster); + + // diff the initial cluster state with the current cluster state, then create plans + virtual void CreatePlans(const std::shared_ptr& cluster, std::vector* plans); + +private: + std::vector> cost_functions_; + std::vector> action_generators_; + + LBOptions lb_options_; +}; + +} // namespace load_balancer +} // namespace tera + +#endif // TERA_LOAD_BALANCER_UNITY_BALANCER_H_ diff --git a/src/master/availability.cc b/src/master/availability.cc index 998c14a8e..1b5c85b71 100644 ---
a/src/master/availability.cc +++ b/src/master/availability.cc @@ -24,15 +24,50 @@ DECLARE_string(tera_master_meta_table_path); namespace tera { namespace master { +static std::string GetNameFromPath(const std::string& path) { + if (path == FLAGS_tera_master_meta_table_path) { + return FLAGS_tera_master_meta_table_name; + } + std::vector t; + SplitString(path, "/", &t); // table_name/tablet00...001 + if (!t.empty()) { + return t[0]; + } else { + return ""; + } +} + + TabletAvailability::TabletAvailability(std::shared_ptr t) : tablet_manager_(t) { start_ts_ = get_micros(); } -void TabletAvailability::AddNotReadyTablet(const std::string& path) { +void TabletAvailability::AddNotReadyTablet(const std::string& path, + const TabletStatus& tablet_status, + const TableStatus& table_status) { + if (tablet_status == kTableReady || table_status == kTableDisable) { + return; + } + MutexLock lock(&mutex_); int64_t ts = get_micros(); tablets_.insert(std::pair(path, ts)); + auto iter = not_ready_tablet_metrics_.emplace( + path, + MetricCounter{ + metric_name_, + "table:" + GetNameFromPath(path) + ",tablet:" + path, + {SubscriberType::LATEST}, + false + }); + + if (iter.second) { + VLOG(12) << "[Add NotReady To Metric]: " << static_cast(TabletErrorStatus::kNotReady); + iter.first->second.Set(static_cast(TabletErrorStatus::kNotReady)); + } else { + VLOG(12) << "[Add NotReady To Metric Failed]: " << static_cast(TabletErrorStatus::kNotReady); + } if (tablets_hist_cost_[path].start_ts > 0) { VLOG(10) << "notready again " << path; @@ -51,6 +86,7 @@ void TabletAvailability::AddNotReadyTablet(const std::string& path) { void TabletAvailability::EraseNotReadyTablet(const std::string& path) { MutexLock lock(&mutex_); tablets_.erase(path); + not_ready_tablet_metrics_.erase(path); if (tablets_hist_cost_.find(path) == tablets_hist_cost_.end() || tablets_hist_cost_[path].start_ts == 0) { @@ -71,22 +107,13 @@ void TabletAvailability::EraseNotReadyTablet(const std::string& path) { << ", reready " 
<< tablets_hist_cost_[path].reready_num; } -static std::string GetNameFromPath(const std::string& path) { - if (path == FLAGS_tera_master_meta_table_path) { - return FLAGS_tera_master_meta_table_name; - } - std::vector t; - SplitString(path, "/", &t); // table_name/tablet00...001 - return t[0]; -} - void TabletAvailability::LogAvailability() { MutexLock lock(&mutex_); int64_t not_avai_count = 0; int64_t not_avai_warning = 0; int64_t not_avai_error = 0; int64_t not_avai_fatal = 0; - int64_t start = ::common::timer::get_micros(); + int64_t start = get_micros(); std::map::iterator it; for (it = tablets_.begin(); it != tablets_.end(); ++it) { std::string table_name = GetNameFromPath(it->first); @@ -99,22 +126,28 @@ void TabletAvailability::LogAvailability() { continue; } + auto metric_iter = not_ready_tablet_metrics_.find(it->first); + assert(metric_iter != not_ready_tablet_metrics_.end()); + if ((start - it->second) > FLAGS_tera_master_not_available_threshold * 1000 * 1000LL) { VLOG(12) << "[availability] not available:" << it->first; not_avai_count++; } if ((start - it->second) > FLAGS_tera_master_availability_fatal_threshold * 1000 * 1000LL) { not_avai_fatal++; + metric_iter->second.Set(static_cast(TabletErrorStatus::kFatal)); if (FLAGS_tera_master_availability_show_details_enabled) { LOG(INFO) << "[availability] fatal-tablet:" << it->first; } } else if ((start - it->second) > FLAGS_tera_master_availability_error_threshold * 1000 * 1000LL) { not_avai_error++; + metric_iter->second.Set(static_cast(TabletErrorStatus::kError)); if (FLAGS_tera_master_availability_show_details_enabled) { LOG(INFO) << "[availability] error-tablet:" << it->first; } } else if ((start - it->second) > FLAGS_tera_master_availability_warning_threshold * 1000 * 1000LL) { not_avai_warning++; + metric_iter->second.Set(static_cast(TabletErrorStatus::kWarning)); } } @@ -155,6 +188,9 @@ void TabletAvailability::LogAvailability() { } } int64_t nr_notready_tablets = tablets_hist_cost_.size(); + double 
time_percent = 1.0 - (double)total_time / (all_time * all_tablets + 1); + ready_time_percent.Set(static_cast(time_percent * 100)); + LOG(INFO) << "[availability][tablet_staticstic] time_interval: " << all_time / 1000 << ", notready_time: " << total_time / 1000 << ", total_time: " << (all_time * all_tablets) / 1000 @@ -165,7 +201,7 @@ void TabletAvailability::LogAvailability() { << ", notready_count: " << total_notready << ", reready_count: " << total_reready; - int64_t cost = ::common::timer::get_micros() - start; + int64_t cost = get_micros() - start; LOG(INFO) << "[availability] cost time:" << cost/1000 << " ms"; } diff --git a/src/master/availability.h b/src/master/availability.h index ddbe6a5f4..d6c133c72 100644 --- a/src/master/availability.h +++ b/src/master/availability.h @@ -6,10 +6,12 @@ #define TERA_MASTER_TABLET_AVAILABILITY_H_ #include +#include #include "master/tablet_manager.h" #include "common/mutex.h" +#include "common/metric/metric_counter.h" namespace tera { namespace master { @@ -25,16 +27,32 @@ class TabletAvailability { public: TabletAvailability(std::shared_ptr t); void LogAvailability(); - void AddNotReadyTablet(const std::string& id); + void AddNotReadyTablet(const std::string& path, + const TabletStatus& tablet_status, + const TableStatus& table_status); void EraseNotReadyTablet(const std::string& id); private: + + enum class TabletErrorStatus { + kNotReady = 1, + kFatal = 2, + kError = 3, + kWarning = 4 + }; + Mutex mutex_; std::shared_ptr tablet_manager_; + std::map tablets_; + std::map not_ready_tablet_metrics_; + MetricCounter ready_time_percent{"tera_master_tablet_ready_time_percent", + {SubscriberType::LATEST}, + false}; int64_t start_ts_; std::map tablets_hist_cost_; + const std::string metric_name_{"tera_master_tablet_availability"}; }; } // master diff --git a/src/master/gc_strategy.cc b/src/master/gc_strategy.cc index d87c96848..b87d113b1 100644 --- a/src/master/gc_strategy.cc +++ b/src/master/gc_strategy.cc @@ -8,12 +8,12 @@ 
#include "db/filename.h" #include "io/utils_leveldb.h" - +#include "leveldb/env_dfs.h" DECLARE_string(tera_tabletnode_path_prefix); DECLARE_string(tera_master_meta_table_name); DECLARE_int32(tera_garbage_collect_debug_log); - +DECLARE_string(tera_leveldb_env_type); namespace tera { namespace master { @@ -147,7 +147,15 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint env->GetChildren(tablet_path, &children); list_count_.Inc(); if (children.size() == 0) { - LOG(INFO) << "[gc] delete empty tablet dir: " << tablet_path; + leveldb::FileLock* file_lock = nullptr; + // NEVER remove the trailing character '/', otherwise you will lock the parent directory + leveldb::Status s = env->LockFile(tablet_path + "/", &file_lock); + if (!s.ok()) { + LOG(WARNING) << "lock path failed, path: " << tablet_path << ", status: " << s.ToString(); + } + + delete file_lock; + env->DeleteDir(tablet_path); return false; } @@ -157,6 +165,14 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint uint64_t number = 0; if (ParseFileName(children[lg], &number, &type)) { LOG(INFO) << "[gc] delete: " << lg_path; + + leveldb::FileLock* file_lock = nullptr; + // NEVER remove the trailing character '/', otherwise you will lock the parent directory + leveldb::Status s = env->LockFile(tablet_path + "/", &file_lock); + if (!s.ok()) { + LOG(WARNING) << "lock path failed, path: " << tablet_path << ", status: " << s.ToString(); + } + env->DeleteFile(lg_path); continue; } @@ -173,6 +189,13 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint list_count_.Inc(); if (files.size() == 0) { LOG(INFO) << "[gc] delete empty lg dir: " << lg_path; + leveldb::FileLock* file_lock = nullptr; + // NEVER remove the trailing character '/', otherwise you will lock the parent directory + leveldb::Status s = env->LockFile(tablet_path + "/", &file_lock); + if (!s.ok()) { + LOG(WARNING) << "lock path failed, path: " << tablet_path << ", 
status: " << s.ToString(); + } + delete file_lock; env->DeleteDir(lg_path); continue; } @@ -184,6 +207,13 @@ bool BatchGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint if (!ParseFileName(files[f], &number, &type) || type != leveldb::kTableFile) { // only keep sst, delete rest files + leveldb::FileLock* file_lock = nullptr; + // NEVER remove the trailing character '/', otherwise you will lock the parent directory + leveldb::Status s = env->LockFile(lg_path + "/", &file_lock); + if (!s.ok()) { + LOG(WARNING) << "lock path failed, path: " << lg_path << ", status: " << s.ToString(); + } + delete file_lock; io::DeleteEnvDir(file_path); continue; } @@ -214,7 +244,20 @@ void BatchGcStrategy::DeleteObsoleteFiles() { for (size_t lg = 0; lg < file_set.size(); ++lg) { std::set::iterator it = file_set[lg].begin(); for (; it != file_set[lg].end(); ++it) { - std::string file_path = leveldb::BuildTableFilePath(tablepath, lg, *it); + uint64_t tablet = 0; + uint64_t number = 0; + leveldb::ParseFullFileNumber(*it, &tablet, &number); + std::string file_path = leveldb::BuildTableFilePath(tablepath, tablet, lg, number); + std::string lg_path = leveldb::BuildTabletLgPath(tablepath, tablet, lg); + + leveldb::FileLock* file_lock = nullptr; + // NEVER remove the trailing character '/', otherwise you will lock the parent directory + leveldb::Status s = env->LockFile(lg_path + "/", &file_lock); + if (!s.ok()) { + LOG(WARNING) << "lock path failed, path: " << lg_path << ", status: " << s.ToString(); + } + delete file_lock; + LOG(INFO) << "[gc] delete: " << file_path; env->DeleteFile(file_path); file_delete_num_++; @@ -223,390 +266,5 @@ void BatchGcStrategy::DeleteObsoleteFiles() { } } -IncrementalGcStrategy::IncrementalGcStrategy(std::shared_ptr tablet_manager) - : tablet_manager_(tablet_manager), - last_gc_time_(std::numeric_limits::max()), - max_ts_(std::numeric_limits::max()) {} - -bool IncrementalGcStrategy::PreQuery () { - int64_t start_ts = get_micros(); - 
std::vector tables; - tablet_manager_->ShowTable(&tables, NULL); - - for (size_t i = 0; i < tables.size(); ++i) { - TabletFiles tablet_files; - std::string table_name = tables[i]->GetTableName(); - if (table_name == FLAGS_tera_master_meta_table_name) continue; - dead_tablet_files_.insert(std::make_pair(table_name, tablet_files)); - live_tablet_files_.insert(std::make_pair(table_name, tablet_files)); - - std::set live_tablets, dead_tablets; - if (!tables[i]->GetTabletsForGc(&live_tablets, &dead_tablets, true)) { - continue; - } - std::set::iterator it; - // update dead tablets - for (it = dead_tablets.begin(); it != dead_tablets.end(); ++it) { - TabletFiles& temp_tablet_files = dead_tablet_files_[table_name]; - TabletFileSet tablet_file_set(get_micros() / 1000000, 0); - bool ret = temp_tablet_files.insert(std::make_pair(*it, tablet_file_set)).second; - if (ret) { - VLOG(10) << "[gc] newly dead talbet: " << leveldb::GetTabletPathFromNum(table_name, *it); - if (!CollectSingleDeadTablet(table_name, *it)) { - // collect from DFS fails, so rollback memory status, retry in the next time - assert(dead_tablet_files_[table_name].erase(*it) == 1); - } - } else { - VLOG(20) << "[gc] old dead talbet: " << leveldb::GetTabletPathFromNum(table_name, *it); - } - } - - // erase newly dead tablets from live tablets - for (TabletFiles::iterator it = live_tablet_files_[table_name].begin(); - it != live_tablet_files_[table_name].end();) { - if (dead_tablet_files_[table_name].find(static_cast(it->first)) != dead_tablet_files_[table_name].end()) { - live_tablet_files_[table_name].erase(it++); - } else { - ++it; - } - } - - // add new live tablets - for (it = live_tablets.begin(); it != live_tablets.end(); ++it) { - TabletFiles& temp_tablet_files = live_tablet_files_[table_name]; - TabletFileSet tablet_file_set; - temp_tablet_files.insert(std::make_pair(*it, tablet_file_set)); - } - } - if (FLAGS_tera_garbage_collect_debug_log) { - DEBUG_print_files(true); - DEBUG_print_files(false); - } - 
LOG(INFO) << "[gc] Gather dead tablets, cost: " << (get_micros() - start_ts) / 1000 << "ms."; - - // do not need gc if there is no new dead tablet - if (dead_tablet_files_.size() == 0) { - LOG(INFO) << "[gc] Do not need gc this time"; - } - return dead_tablet_files_.size() != 0; -} - -void IncrementalGcStrategy::ProcessQueryCallbackForGc(QueryResponse* response) { - LOG(INFO) << "[gc] ProcessQueryCallbackForGc"; - MutexLock lock(&gc_mutex_); - - std::set ready_tables; - for (int table = 0; table < response->inh_live_files_size(); ++table) { - ready_tables.insert(response->inh_live_files(table).table_name()); - } - - // update tablet ready time - for (int i = 0; i < response->tabletmeta_list().meta_size(); ++i) { - const TabletMeta& meta = response->tabletmeta_list().meta(i); - std::string table_name = meta.table_name(); - if (table_name == FLAGS_tera_master_meta_table_name) continue; - if (live_tablet_files_.find(table_name) == live_tablet_files_.end() || - ready_tables.find(table_name) == ready_tables.end()) { - continue; - } - int64_t tablet_number = static_cast(leveldb::GetTabletNumFromPath(meta.path())); - VLOG(15) << "[gc] see live tablet " << leveldb::GetTabletPathFromNum(table_name, tablet_number); - if (live_tablet_files_[table_name].find(tablet_number) == live_tablet_files_[table_name].end()) continue; - live_tablet_files_[table_name][tablet_number].ready_time_ = get_micros() / 1000000; - } - - // insert live files - for (int table = 0; table < response->inh_live_files_size(); ++table) { - InheritedLiveFiles live_files = response->inh_live_files(table); - std::string table_name = live_files.table_name(); - if (table_name == FLAGS_tera_master_meta_table_name) continue; - VLOG(12) << "[gc] inh pb: " << response->inh_live_files(table).ShortDebugString(); - if (live_tablet_files_.find(table_name) == live_tablet_files_.end()) continue; - // collect live files - TabletFiles temp_tablet_files; - for (int lg = 0; lg < live_files.lg_live_files_size(); ++lg) { - 
LgInheritedLiveFiles lg_live_files = live_files.lg_live_files(lg); - uint32_t lg_no = lg_live_files.lg_no(); - for (int i = 0; i < lg_live_files.file_number_size(); ++i) { - uint64_t tablet_number, file; - uint64_t file_number = lg_live_files.file_number(i); - leveldb::ParseFullFileNumber(file_number, &tablet_number, &file); - if (dead_tablet_files_[table_name].find(tablet_number) == - dead_tablet_files_[table_name].end()) { - VLOG(12) << "[gc] skip live tablet " << tablet_number; - continue; - } - TabletFileSet tablet_file_set; - temp_tablet_files.insert(std::make_pair(tablet_number, tablet_file_set)); - TabletFileSet& temp_tablet_file_set = temp_tablet_files[tablet_number]; - LgFileSet lg_files; - temp_tablet_file_set.files_.insert(std::make_pair(lg_no, lg_files)); - temp_tablet_file_set.files_[lg_no].live_files_.insert(file_number); - VLOG(12) << "[gc] insert live file " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no << "/" << file; - const LgFileSet& check = ((dead_tablet_files_[table_name][tablet_number]).files_)[lg_no]; - if (check.storage_files_.find(file_number) == check.storage_files_.end()) { - LOG(WARNING) << "[gc] insert error " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no << "/" << file; - } - } - } - // update live files in dead tablets - TabletFiles::iterator tablet_it = temp_tablet_files.begin(); - TabletFiles& dead_tablets = dead_tablet_files_[table_name]; - for (; tablet_it != temp_tablet_files.end(); ++tablet_it) { - uint64_t tablet_number = tablet_it->first; - if (dead_tablets.find(tablet_number) == dead_tablets.end()) { - VLOG(12) << "[gc] skip live tablet " << table_name << "/" << tablet_number; - continue; - } - std::map& live_lg = (tablet_it->second).files_; - std::map& dead_lg = dead_tablets[tablet_number].files_; - std::map::iterator lg_it = live_lg.begin(); - for (; lg_it != live_lg.end(); ++lg_it) { - uint32_t lg_no = lg_it->first; - LgFileSet lg_file_set; - 
dead_lg.insert(std::make_pair(lg_no, lg_file_set)); - for (std::set::iterator it = live_lg[lg_no].live_files_.begin(); it != live_lg[lg_no].live_files_.end(); ++it) { - dead_lg[lg_no].live_files_.insert(*it); - } - VLOG(12) << "[gc] dead tablet's live lg: " << leveldb::GetTabletPathFromNum(table_name, tablet_number) << "/" << lg_no; - } - } - } - if (FLAGS_tera_garbage_collect_debug_log) { - DEBUG_print_files(true); - } -} - -void IncrementalGcStrategy::PostQuery () { - LOG(INFO) << "[gc] PostQuery"; - if (FLAGS_tera_garbage_collect_debug_log) { - DEBUG_print_files(true); - DEBUG_print_files(false); - } - int64_t start_ts = get_micros(); - TableFiles::iterator table_it = dead_tablet_files_.begin(); - for (; table_it != dead_tablet_files_.end(); ++table_it) { - DeleteTableFiles(table_it->first); - } - if (FLAGS_tera_garbage_collect_debug_log) { - DEBUG_print_files(true); - DEBUG_print_files(false); - } - LOG(INFO) << "[gc] Delete useless sst, cost: " << (get_micros() - start_ts) / 1000 << "ms. 
list_times " << list_count_.Get(); - list_count_.Clear(); -} - -void IncrementalGcStrategy::Clear(std::string tablename) { - LOG(INFO) << "[gc] Clear " << tablename; - MutexLock lock(&gc_mutex_); - dead_tablet_files_.erase(tablename); - live_tablet_files_.erase(tablename); -} - -void IncrementalGcStrategy::DeleteTableFiles(const std::string& table_name) { - std::string table_path = FLAGS_tera_tabletnode_path_prefix + table_name; - leveldb::Env* env = io::LeveldbBaseEnv(); - TabletFiles& dead_tablets = dead_tablet_files_[table_name]; - TabletFiles& live_tablets = live_tablet_files_[table_name]; - int64_t earliest_ready_time = max_ts_; - TabletFiles::iterator tablet_it = live_tablets.begin(); - for (; tablet_it != live_tablets.end(); ++tablet_it) { - if (tablet_it->second.ready_time_ < earliest_ready_time) { - earliest_ready_time = tablet_it->second.ready_time_; - } - } - - if (earliest_ready_time != max_ts_) { - VLOG(12) << "[gc] earliest ready time " << earliest_ready_time << " : " << common::timer::get_time_str(earliest_ready_time); - } else { - VLOG(12) << "[gc] " << table_name << "'s tablets not ready"; - } - std::set gc_tablets; - for (tablet_it = dead_tablets.begin(); tablet_it != dead_tablets.end(); ++tablet_it) { - if (tablet_it->second.dead_time_ < earliest_ready_time) { - gc_tablets.insert(tablet_it->first); - VLOG(12) << "[gc] will gc tablet: " << leveldb::GetTabletPathFromNum(table_name, tablet_it->first); - } - } - - for (std::set::iterator gc_it = gc_tablets.begin(); gc_it != gc_tablets.end();) { - std::map& lg_files = dead_tablets[*gc_it].files_; - std::map::iterator lg_it = lg_files.begin(); - std::string tablet_path = leveldb::GetTabletPathFromNum(table_path, *gc_it); - for (; lg_it != lg_files.end();) { - VLOG(12) << "[gc] entry lg gc lg=" << lg_it->first; - LgFileSet& lg_file_set = lg_it->second; - std::set::iterator file_it = lg_file_set.storage_files_.begin(); - for (; file_it != lg_file_set.storage_files_.end();) { - if 
(lg_file_set.live_files_.find(*file_it) == lg_file_set.live_files_.end()) { - std::string file_path = - leveldb::BuildTableFilePath(table_path, lg_it->first, *file_it); - - std::string debug_str; - for (std::set::iterator it = lg_file_set.live_files_.begin(); it != lg_file_set.live_files_.end(); ++it) { - uint64_t file_no; - leveldb::ParseFullFileNumber(*it, NULL, &file_no); - debug_str += " " + std::to_string(file_no); - } - // VLOG(12) << "[gc] live = " << debug_str; - LOG(INFO) << "[gc] delete: " << file_path; - if (env->DeleteFile(file_path).ok()) { - lg_file_set.storage_files_.erase(file_it++); - } else { - ++file_it; - // do nothing, try to delete next time - // TODO: if retry times > MAX ? - // TODO: if failed due to timeout but delete ok in DFS, it will always retry - } - } else { - ++file_it; - } - } - if (lg_file_set.storage_files_.size() == 0) { - if (lg_file_set.live_files_.size() != 0) { - uint64_t full_number = *(lg_file_set.live_files_.begin()); - uint64_t tablet_number, file_number; - leveldb::ParseFullFileNumber(full_number, &tablet_number, &file_number); - LOG(ERROR) << "[gc] empty tablet still has live files: " << tablet_number << "/" << lg_it->first << "/" << file_number; - } else { - std::string lg_str = std::to_string(lg_it->first); - std::string lg_path = tablet_path + "/" + lg_str; - LOG(INFO) << "[gc] delete empty lg dir: " << lg_path; - if (io::DeleteEnvDir(lg_path).ok()) { - lg_files.erase(lg_it++); - } else { - ++lg_it; - // do nothing, try to delete next time - // TODO: iff retry times > MAX ? - // TODO: if failed due to timeout but delete ok in DFS, it will always retry - } - } - } else { - ++lg_it; - } - } - - if (lg_files.size() == 0) { - LOG(INFO) << "[gc] delete empty tablet dir: " << tablet_path; - if (env->DeleteDir(tablet_path).ok()) { - dead_tablets.erase(*gc_it); - } else { - LOG(ERROR) << "[gc] rm dir fail: " << tablet_path; - // do nothing, try to delete next time - // TODO: iff retry times > MAX ? 
- // TODO: if failed due to timeout but delete ok in DFS, it will always retry - } - } else { - // clear live_files_ in dead_tablets for next round of gc - for (lg_it = lg_files.begin(); lg_it != lg_files.end(); ++lg_it) { - VLOG(12) << "[gc] clear live_files_(lg_no/file_no): " << *gc_it << "/" << lg_it->first; - lg_it->second.live_files_.clear(); - } - dead_tablets[*gc_it].dead_time_ = get_micros() / 1000000; - VLOG(12) << "[gc] update dead_time_ " << dead_tablets[*gc_it].dead_time_ << " " << common::timer::get_time_str(dead_tablets[*gc_it].dead_time_); - } - gc_it++; - } -} - -bool IncrementalGcStrategy::CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum) { - std::string tablepath = FLAGS_tera_tabletnode_path_prefix + tablename; - std::string tablet_path = leveldb::GetTabletPathFromNum(tablepath, tabletnum); - leveldb::Env* env = io::LeveldbBaseEnv(); - std::vector children; - leveldb::Status s = env->GetChildren(tablet_path, &children); - if (!s.ok()) { - LOG(ERROR) << "[gc] list directory fail: " << tablet_path; - return false; - } - list_count_.Inc(); - - for (size_t lg = 0; lg < children.size(); ++lg) { - std::string lg_path = tablet_path + "/" + children[lg]; - leveldb::FileType type = leveldb::kUnknown; - uint64_t number = 0; - if (ParseFileName(children[lg], &number, &type)) { - LOG(INFO) << "[gc] delete: " << lg_path; - env->DeleteFile(lg_path); - continue; - } - - leveldb::Slice rest(children[lg]); - uint64_t lg_num = 0; - if (!leveldb::ConsumeDecimalNumber(&rest, &lg_num)) { - LOG(INFO) << "[gc] skip unknown dir: " << lg_path; - continue; - } - - std::vector files; - env->GetChildren(lg_path, &files); - list_count_.Inc(); - - int64_t lg_no = std::stoll(children[lg]); - std::map& tablet_files = dead_tablet_files_[tablename][tabletnum].files_; - LgFileSet lg_file_set; - tablet_files.insert(std::make_pair(lg_no, lg_file_set)); - LgFileSet& temp_lg_files_set = tablet_files[lg_no]; - for (size_t f = 0; f < files.size(); ++f) { - 
std::string file_path = lg_path + "/" + files[f]; - type = leveldb::kUnknown; - number = 0; - if (!ParseFileName(files[f], &number, &type) || - type != leveldb::kTableFile) { - // skip manifest/CURRENT - continue; - } - - uint64_t full_number = leveldb::BuildFullFileNumber(lg_path, number); - temp_lg_files_set.storage_files_.insert(full_number); - } - } - return true; -} - -void IncrementalGcStrategy::DEBUG_print_files(bool print_dead) { - TableFiles all_tablet_files; - if (print_dead == true) { - LOG(INFO) << "----------------------------[gc] Test print DEAD"; - all_tablet_files = dead_tablet_files_; - } else { - LOG(INFO) << "----------------------------[gc] Test print LIVE"; - all_tablet_files = live_tablet_files_; - } - TableFiles::iterator table_it; - for (table_it = all_tablet_files.begin(); table_it != all_tablet_files.end(); ++table_it) { - LOG(INFO) << "[gc] table=" << table_it->first; - TabletFiles& tablet_files = table_it->second; - TabletFiles::iterator tablet_it; - for (tablet_it = tablet_files.begin(); tablet_it != tablet_files.end(); ++tablet_it) { - LOG(INFO) << "[gc] tablet -- " << tablet_it->first; - TabletFileSet tablet_file_set = tablet_it->second; - LOG(INFO) << "[gc] ready -- " << tablet_file_set.ready_time_; - LOG(INFO) << "[gc] dead -- " << tablet_file_set.dead_time_; - std::map& files = tablet_file_set.files_; - std::map::iterator lg_it; - for (lg_it = files.begin(); lg_it != files.end(); ++lg_it) { - std::set& f = (lg_it->second).storage_files_; - std::string debug_str = ""; - for (std::set::iterator it = f.begin(); it != f.end(); ++it) { - uint64_t file_no; - leveldb::ParseFullFileNumber(*it, NULL, &file_no); - debug_str += " " + std::to_string(file_no); - } - LOG(INFO) << "[gc] lg stor -- " << lg_it->first << "-" << (lg_it->second).storage_files_.size() << debug_str; - f = (lg_it->second).live_files_; - debug_str = ""; - for (std::set::iterator it = f.begin(); it != f.end(); ++it) { - uint64_t file_no; - leveldb::ParseFullFileNumber(*it, 
NULL, &file_no); - debug_str += " " + std::to_string(file_no); - } - LOG(INFO) << "[gc] lg live -- " << lg_it->first << "-" << (lg_it->second).live_files_.size() << debug_str; - } - } - } - LOG(INFO) << "----------------------------[gc] Done Test print"; -} - } // namespace master } // namespace tera diff --git a/src/master/gc_strategy.h b/src/master/gc_strategy.h index cccbd91b0..c68364502 100644 --- a/src/master/gc_strategy.h +++ b/src/master/gc_strategy.h @@ -7,7 +7,7 @@ #include "master/tablet_manager.h" #include "proto/tabletnode_client.h" #include "types.h" -#include "utils/counter.h" +#include "common/counter.h" namespace tera { namespace master { @@ -68,58 +68,6 @@ class BatchGcStrategy : public GcStrategy { tera::Counter list_count_; }; -class IncrementalGcStrategy : public GcStrategy{ -public: - IncrementalGcStrategy(std::shared_ptr tablet_manager); - virtual ~IncrementalGcStrategy() {} - - // get dead tablets - virtual bool PreQuery (); - - // gather live files - virtual void ProcessQueryCallbackForGc(QueryResponse* response); - - // delete dead files - virtual void PostQuery (); - - // clear memory when table is deleted - virtual void Clear(std::string tablename); - -private: - void DEBUG_print_files(bool print_dead); - bool CollectSingleDeadTablet(const std::string& tablename, uint64_t tabletnum); - void DeleteTableFiles(const std::string& table_name); - - struct LgFileSet { - std::set storage_files_; - std::set live_files_; - }; - - struct TabletFileSet { - int64_t dead_time_; - int64_t ready_time_; - std::map files_; // lg_no -> files - TabletFileSet() { - dead_time_ = std::numeric_limits::max(); - ready_time_ = 0; - }; - TabletFileSet(int64_t dead_time, int64_t ready_time) { - dead_time_ = dead_time; - ready_time_ = ready_time; - } - }; - - typedef std::map TabletFiles; // tablet_number -> files - typedef std::map TableFiles; // table_name -> files - mutable Mutex gc_mutex_; - std::shared_ptr tablet_manager_; - int64_t last_gc_time_; - TableFiles 
dead_tablet_files_; - TableFiles live_tablet_files_; - int64_t max_ts_; - tera::Counter list_count_; -}; - } // namespace master } // namespace tera diff --git a/src/master/master_entry.cc b/src/master/master_entry.cc index 1e958c028..13c9d276d 100644 --- a/src/master/master_entry.cc +++ b/src/master/master_entry.cc @@ -7,6 +7,7 @@ #include #include +#include "common/metric/collector_report.h" #include "common/net/ip_address.h" #include "master/master_impl.h" #include "master/remote_master.h" @@ -15,6 +16,8 @@ DECLARE_string(tera_master_port); DECLARE_int32(tera_master_rpc_server_max_inflow); DECLARE_int32(tera_master_rpc_server_max_outflow); +DECLARE_bool(tera_metric_http_server_enable); +DECLARE_int32(tera_metric_http_server_listen_port); std::string GetTeraEntryName() { return "master"; @@ -30,7 +33,8 @@ namespace master { MasterEntry::MasterEntry() : master_impl_(NULL), remote_master_(NULL), - rpc_server_(NULL) { + rpc_server_(NULL), + metric_http_server_(new tera::MetricHttpServer()) { sofa::pbrpc::RpcServerOptions rpc_options; rpc_options.max_throughput_in = FLAGS_tera_master_rpc_server_max_inflow; rpc_options.max_throughput_out = FLAGS_tera_master_rpc_server_max_outflow; @@ -57,10 +61,20 @@ bool MasterEntry::StartServer() { } LOG(INFO) << "finish starting master server"; + + // start metric http server + if (FLAGS_tera_metric_http_server_enable) { + if(!metric_http_server_->Start(FLAGS_tera_metric_http_server_listen_port)) { + LOG(WARNING) << "Start metric http server failed. 
Ignore"; + } + } else { + LOG(INFO) << "Metric http server is disabled."; + } return true; } bool MasterEntry::Run() { + CollectorReportPublisher::GetInstance().Refresh(); static int64_t timer_ticks = 0; ++timer_ticks; @@ -73,6 +87,7 @@ bool MasterEntry::Run() { } void MasterEntry::ShutdownServer() { + metric_http_server_->Stop(); rpc_server_->Stop(); master_impl_.reset(); } diff --git a/src/master/master_entry.h b/src/master/master_entry.h index c8f738916..919da4928 100644 --- a/src/master/master_entry.h +++ b/src/master/master_entry.h @@ -8,6 +8,7 @@ #include #include "common/base/scoped_ptr.h" +#include "common/metric/metric_http_server.h" #include "tera_entry.h" namespace tera { @@ -33,6 +34,7 @@ class MasterEntry : public TeraEntry { // scoped_ptr remote_master_; RemoteMaster* remote_master_; scoped_ptr rpc_server_; + scoped_ptr metric_http_server_; }; } // namespace master diff --git a/src/master/master_impl.cc b/src/master/master_impl.cc index 598faa4ec..f667c7d0f 100644 --- a/src/master/master_impl.cc +++ b/src/master/master_impl.cc @@ -26,7 +26,7 @@ #include "utils/config_utils.h" #include "utils/schema_utils.h" #include "utils/string_util.h" -#include "utils/timer.h" +#include "common/timer.h" #include "utils/utils_cmd.h" DECLARE_string(tera_master_port); @@ -48,11 +48,15 @@ DECLARE_string(tera_master_meta_table_name); DECLARE_string(tera_master_meta_table_path); DECLARE_int32(tera_master_meta_retry_times); +DECLARE_string(tera_coord_type); DECLARE_bool(tera_zk_enabled); DECLARE_bool(tera_mock_zk_enabled); DECLARE_double(tera_master_workload_split_threshold); +DECLARE_double(tera_master_workload_merge_threshold); DECLARE_int64(tera_master_split_tablet_size); +DECLARE_int64(tera_master_min_split_size); +DECLARE_double(tera_master_min_split_ratio); DECLARE_int64(tera_master_merge_tablet_size); DECLARE_bool(tera_master_kick_tabletnode_enabled); DECLARE_int32(tera_master_kick_tabletnode_query_fail_times); @@ -84,6 +88,8 @@ 
DECLARE_bool(tera_master_stat_table_enabled); DECLARE_int64(tera_master_stat_table_splitsize); DECLARE_int32(tera_master_gc_period); +DECLARE_bool(tera_master_gc_trash_enabled); +DECLARE_int64(tera_master_gc_trash_clean_period_s); DECLARE_string(tera_tabletnode_path_prefix); DECLARE_string(tera_leveldb_env_type); @@ -108,6 +114,7 @@ DECLARE_int32(tera_master_schema_update_retry_times); DECLARE_int64(tera_master_availability_check_period); DECLARE_bool(tera_master_availability_check_enabled); +DECLARE_bool(tera_master_update_split_meta); using namespace std::placeholders; namespace tera { @@ -131,6 +138,8 @@ MasterImpl::MasterImpl() thread_pool_(new ThreadPool(FLAGS_tera_master_impl_thread_max_num)), is_stat_table_(false), stat_table_(NULL), + gc_trash_clean_enabled_(false), + gc_trash_clean_timer_id_(kInvalidTimerId), gc_enabled_(false), gc_timer_id_(kInvalidTimerId), gc_query_enable_(false), @@ -152,15 +161,12 @@ MasterImpl::MasterImpl() if (FLAGS_tera_master_gc_strategy == "default") { LOG(INFO) << "[gc] gc strategy is BatchGcStrategy"; gc_strategy_ = std::shared_ptr(new BatchGcStrategy(tablet_manager_)); - } else if (FLAGS_tera_master_gc_strategy == "incremental") { - LOG(INFO) << "[gc] gc strategy is IncrementalGcStrategy"; - gc_strategy_ = std::shared_ptr(new IncrementalGcStrategy(tablet_manager_)); } else if (FLAGS_tera_master_gc_strategy == "trackable") { LOG(INFO) << "[gc] gc strategy is Trackable"; } else { - LOG(WARNING) << "Unknown gc strategy: " << FLAGS_tera_master_gc_strategy - << ", default gc strategy: BatchGcStrategy will take effect"; - gc_strategy_ = std::shared_ptr(new BatchGcStrategy(tablet_manager_)); + LOG(ERROR) << "Unknown gc strategy: " << FLAGS_tera_master_gc_strategy + << ", exit"; + exit(EXIT_FAILURE); } } @@ -171,18 +177,29 @@ MasterImpl::~MasterImpl() { } bool MasterImpl::Init() { - if (FLAGS_tera_zk_enabled) { + if (FLAGS_tera_coord_type.empty()) { + LOG(ERROR) << "Note: We don't recommend that use '" + << 
"--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord" + << " replace by '--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]'" + << " flag is usually recommended."; + } + if (FLAGS_tera_coord_type == "zk" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) { zk_adapter_.reset(new MasterZkAdapter(this, local_addr_)); - } else if (FLAGS_tera_ins_enabled) { + } else if (FLAGS_tera_coord_type == "ins" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) { LOG(INFO) << "ins mode" ; zk_adapter_.reset(new InsMasterZkAdapter(this, local_addr_)); - } else if (FLAGS_tera_mock_zk_enabled) { + } else if (FLAGS_tera_coord_type == "mock_zk" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) { LOG(INFO) << "mock zk mode" ; zk_adapter_.reset(new MockMasterZkAdapter(this, local_addr_)); - } else if (FLAGS_tera_mock_ins_enabled) { + } else if (FLAGS_tera_coord_type == "mock_ins" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) { LOG(INFO) << "mock ins mode" ; zk_adapter_.reset(new MockInsMasterZkAdapter(this, local_addr_)); - } else { + } else if (FLAGS_tera_coord_type == "fake_zk" + || FLAGS_tera_coord_type.empty()) { LOG(INFO) << "fake zk mode!"; zk_adapter_.reset(new FakeMasterZkAdapter(this, local_addr_)); } @@ -406,18 +423,22 @@ void MasterImpl::RestoreUserTablet(const std::vector& report_meta_li VLOG(8) << "READY Tablet, " << tablet; continue; } - tablet_availability_->AddNotReadyTablet(tablet->GetPath()); CHECK(tablet->GetStatus() == kTableNotInit); + tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(), + tablet->GetTable()->GetStatus()); TabletNodePtr node; if (server_addr.empty()) { tablet->SetStatus(kTableOffLine); + ProcessOffLineTablet(tablet); VLOG(8) << "OFFLINE Tablet with empty addr, " << tablet; } else if (!tabletnode_manager_->FindTabletNode(server_addr, &node)) { tablet->SetStatus(kTableOffLine); + ProcessOffLineTablet(tablet); VLOG(8) << "OFFLINE Tablet of Dead 
TS, " << tablet; } else if (node->state_ == kReady) { tablet->SetStatus(kTableOffLine); + ProcessOffLineTablet(tablet); VLOG(8) << "OFFLINE Tablet of Alive TS, " << tablet; TryLoadTablet(tablet, server_addr); } else { @@ -1142,6 +1163,7 @@ void MasterImpl::ShowTables(const ShowTablesRequest* request, TabletPtr tablet = tablet_list[i]; TabletMeta meta; tablet->ToMeta(&meta); + meta.set_last_move_time_us(tablet->LastMoveTime()); tablet_meta_list->add_meta()->CopyFrom(meta); tablet_meta_list->add_counter()->CopyFrom(tablet->GetCounter()); tablet_meta_list->add_timestamp(tablet->UpdateTime()); @@ -1266,6 +1288,8 @@ void MasterImpl::CmdCtrl(const CmdCtrlRequest* request, ReloadConfig(response); } else if (request->command() == "kick") { KickTabletNodeCmdCtrl(request, response); + } else if (request->command() == "table") { + TableCmdCtrl(request, response); } else { response->set_status(kInvalidArgument); } @@ -1458,13 +1482,42 @@ void MasterImpl::ReloadConfig(CmdCtrlResponse* response) { } } -void MasterImpl::TabletCmdCtrl(const CmdCtrlRequest* request, - CmdCtrlResponse* response) { +void MasterImpl::TableCmdCtrl(const CmdCtrlRequest* request, + CmdCtrlResponse* response) { if (request->arg_list_size() < 2) { response->set_status(kInvalidArgument); return; } + if (request->arg_list(0) == "split") { + TabletPtr tablet; + StatusCode status; + for (int32_t i = 2; i < request->arg_list_size(); i++) { + if (!tablet_manager_->SearchTablet(request->arg_list(1), + request->arg_list(i), + &tablet, &status)) { + response->set_status(kInvalidArgument); + return; + } + VLOG(10) << "table split: key " << request->arg_list(i) + << ", " << tablet; + TrySplitTablet(tablet, request->arg_list(i)); + } + response->set_status(kMasterOk); + } else { + response->set_status(kInvalidArgument); + } + return; +} + +void MasterImpl::TabletCmdCtrl(const CmdCtrlRequest* request, + CmdCtrlResponse* response) { + int32_t request_argc = request->arg_list_size(); + if (request_argc < 2) { + 
response->set_status(kInvalidArgument); + return; + } + const std::string& op = request->arg_list(0); const std::string& tablet_id = request->arg_list(1); TabletPtr tablet; bool found = false; @@ -1483,40 +1536,35 @@ void MasterImpl::TabletCmdCtrl(const CmdCtrlRequest* request, return; } - if (request->arg_list(0) == "reload") { + if (op == "reload" && request_argc == 2) { std::string current_server_addr = tablet->GetServerAddr(); TryMoveTablet(tablet, current_server_addr, true); // force to unload and load tablet even it on the same ts - - } else if (request->arg_list(0) == "move") { - if (request->arg_list_size() > 3) { - response->set_status(kInvalidArgument); - return; - } - std::string expect_server_addr; - if (request->arg_list_size() == 3) { - expect_server_addr = request->arg_list(2); - } + response->set_status(kMasterOk); + } else if (op == "reloadx" && request_argc == 3 + && tablet->SetErrorIgnoredLGs(request->arg_list(2))) { + std::string current_server_addr = tablet->GetServerAddr(); + TryMoveTablet(tablet, current_server_addr, true); + response->set_status(kMasterOk); + } else if (op == "move" && request_argc == 3) { + std::string expect_server_addr = request->arg_list(2); TryMoveTablet(tablet, expect_server_addr); response->set_status(kMasterOk); - } else if (request->arg_list(0) == "split") { - if (request->arg_list_size() > 3) { - response->set_status(kInvalidArgument); - return; - } + } else if (op == "movex" && request_argc == 4 + && tablet->SetErrorIgnoredLGs(request->arg_list(3))) { + std::string expect_server_addr = request->arg_list(2); + TryMoveTablet(tablet, expect_server_addr); + response->set_status(kMasterOk); + } else if (op == "split" && (request_argc == 2 || request_argc == 3)) { std::string split_key; - if (request->arg_list_size() == 3) { + if (request_argc == 3) { split_key = request->arg_list(2); LOG(INFO) << "User specified split key: " << split_key; } TrySplitTablet(tablet, split_key); response->set_status(kMasterOk); - } else if 
(request->arg_list(0) == "merge") { - if (request->arg_list_size() > 3) { - response->set_status(kInvalidArgument); - return; - } + } else if (op == "merge" && request_argc == 2) { TryMergeTablet(tablet); response->set_status(kMasterOk); } else { @@ -1892,9 +1940,12 @@ bool MasterImpl::TabletNodeLoadBalance(TabletNodePtr tabletnode, Scheduler* sche split_size = tablet->GetSchema().split_size(); } if (write_workload > FLAGS_tera_master_workload_split_threshold) { - split_size /= 2; - VLOG(6) << tablet->GetPath() << " write_workload too large, split it by size: " - << split_size; + if (split_size > FLAGS_tera_master_min_split_size) { + split_size = std::max(FLAGS_tera_master_min_split_size, + static_cast(split_size * FLAGS_tera_master_min_split_ratio)); + } + VLOG(6) << tablet->GetPath() << ", trigger workload split, write_workload: " << write_workload + << ", split it by size(M): " << split_size; } int64_t merge_size = FLAGS_tera_master_merge_tablet_size; if (tablet->GetSchema().has_merge_size() && tablet->GetSchema().merge_size() > 0) { @@ -1903,12 +1954,14 @@ bool MasterImpl::TabletNodeLoadBalance(TabletNodePtr tabletnode, Scheduler* sche if (tablet->GetDataSize() < 0) { // tablet size is error, skip it continue; - } else if (tablet->GetDataSize() > (split_size << 20)) { + } else if (tablet->GetDataSize() > (split_size << 20) && + tablet->TestAndSetSplitTimeStamp(get_micros())) { TrySplitTablet(tablet); any_tablet_split = true; continue; } else if (tablet->GetDataSize() < (merge_size << 20)) { - if (write_workload < 1) { + if (!tablet->IsBusy() && + write_workload < FLAGS_tera_master_workload_merge_threshold) { TryMergeTablet(tablet); } else { VLOG(6) << "[merge] skip high workload tablet: " @@ -2133,14 +2186,15 @@ void MasterImpl::DeleteTabletNode(const std::string& tabletnode_addr) { std::vector::iterator it; for (it = tablet_list.begin(); it != tablet_list.end(); ++it) { TabletPtr tablet = *it; - tablet_availability_->AddNotReadyTablet(tablet->GetPath()); if 
(FLAGS_tera_master_tabletnode_timeout > 0 && tablet->GetTableName() != FLAGS_tera_master_meta_table_name) { - tablet->SetStatusIf(kTabletPending, kTableReady); - } else if (tablet->SetStatusIf(kTableOffLine, kTableReady)) { + tablet->SetStatusIf(kTabletPending, kTableReady, tabletnode_addr); + } else if (tablet->SetStatusIf(kTableOffLine, kTableReady, tabletnode_addr)) { ProcessOffLineTablet(tablet); } + tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(), + tablet->GetTable()->GetStatus()); if (tablet->GetStatus() == kTableUnLoadFail && tablet->GetMergeParam() != NULL) { MergeTabletUnloadCallback(tablet); @@ -2238,6 +2292,7 @@ bool MasterImpl::EnterSafeMode(StatusCode* status) { tablet_manager_->Stop(); DisableTabletNodeGcTimer(); DisableLoadBalance(); + DisableGcTrashCleanTimer(); return true; } @@ -2271,6 +2326,7 @@ bool MasterImpl::LeaveSafeMode(StatusCode* status) { EnableQueryTabletNodeTimer(); EnableTabletNodeGcTimer(); EnableLoadBalance(); + EnableGcTrashCleanTimer(); std::vector node_array; tabletnode_manager_->GetAllTabletNodeInfo(&node_array); @@ -2481,6 +2537,14 @@ void MasterImpl::LoadTabletAsync(TabletPtr tablet, LoadClosure done, uint64_t) { request->add_parent_tablets(meta.parent_tablets(i)); } + std::vector ignore_err_lgs; + tablet->GetErrorIgnoredLGs(&ignore_err_lgs); + for (uint32_t i = 0; i < ignore_err_lgs.size(); ++i) { + VLOG(6) << "Add ignore err lg to request :" << ignore_err_lgs[i]; + request->add_ignore_err_lgs(ignore_err_lgs[i]); + } + tablet->SetErrorIgnoredLGs(); // clean error lg, only for this request once + LOG(INFO) << "LoadTabletAsync id: " << request->sequence_id() << ", " << tablet; node_client.LoadTablet(request, response, done); @@ -3694,6 +3758,7 @@ void MasterImpl::SplitTabletAsync(TabletPtr tablet, const std::string& split_key request->add_child_tablets(tablet->GetTable()->GetNextTabletNo()); request->add_child_tablets(tablet->GetTable()->GetNextTabletNo()); request->set_split_key(split_key); + 
request->set_master_update_meta(FLAGS_tera_master_update_split_meta); tablet->ToMeta(request->mutable_tablet_meta()); std::vector snapshots; @@ -3704,7 +3769,8 @@ void MasterImpl::SplitTabletAsync(TabletPtr tablet, const std::string& split_key LOG(INFO) << "SplitTabletAsync id: " << request->sequence_id() << ", " << tablet; - tablet_availability_->AddNotReadyTablet(tablet->GetPath()); + tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(), + tablet->GetTable()->GetStatus()); node_client.SplitTablet(request, response, done); } @@ -3714,11 +3780,11 @@ void MasterImpl::SplitTabletCallback(TabletPtr tablet, bool failed, int error_code) { CHECK(tablet->GetStatus() == kTableOnSplit); StatusCode status = response->status(); - delete request; - delete response; + std::unique_ptr response_deleter(response); + std::unique_ptr request_deleter(request); const std::string& server_addr = tablet->GetServerAddr(); - // fail + // fail, RPC fail or unexpected return status if (failed || (status != kTabletNodeOk && status != kTableNotSupport && status != kMetaTabletError)) { if (failed) { @@ -3740,11 +3806,12 @@ void MasterImpl::SplitTabletCallback(TabletPtr tablet, if (status == kTabletNodeOk) { // tabletnode unloaded the tablet LOG(INFO) << "RPC SplitTablet success"; - } else if (status == kTableNotSupport) { + } else if (status == kTableNotSupport) { // TODO: use TryLoadAsync will be more safe. 
// tabletnode refused to split and didn't unload the tablet tablet->SetStatusIf(kTableReady, kTableOnSplit); ProcessReadyTablet(tablet); } else { + // this will not be true once Master is responsible for write child tablets info, will be deleted CHECK(status == kMetaTabletError); // meta table is not ok LOG(ERROR) << "fail to split: " << StatusCodeToString(status) << ", " @@ -3781,14 +3848,153 @@ void MasterImpl::SplitTabletCallback(TabletPtr tablet, tablet_availability_->EraseNotReadyTablet(tablet->GetPath()); return; } - + // old TS write child tablets info to meta table directly without sending back child tablets info + // we need scan MetaTable to get children meta info + if (response->split_keys_size() == 0) { // scan meta tablet - if (tablet->GetStatus() == kTableOnSplit) { - ScanClosure done = - std::bind(&MasterImpl::ScanMetaCallbackForSplit, this, tablet, _1, _2, _3, _4); - ScanMetaTableAsync(tablet->GetTableName(), tablet->GetKeyStart(), - tablet->GetKeyEnd(), done); + if (tablet->GetStatus() == kTableOnSplit) { + ScanClosure done = + std::bind(&MasterImpl::ScanMetaCallbackForSplit, this, tablet, _1, _2, _3, _4); + ScanMetaTableAsync(tablet->GetTableName(), tablet->GetKeyStart(), + tablet->GetKeyEnd(), done); + } + } else { + if (response->split_keys_size() > 1) { + LOG(INFO) << "currently we only support one split key, tablet " + << tablet << " will be split by key: " << response->split_keys(0); + } + SplitTabletWriteMetaAsync(tablet, response->split_keys(0)); + } +} + +void MasterImpl::SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key) { + const std::string& key_start = tablet->GetKeyStart(); + const std::string& key_end = tablet->GetKeyEnd(); + if (split_key <= key_start || (key_end != "" && split_key >= key_end)) { + LOG(ERROR) << kSms << "two splits are not successive, " + << tablet << ", split_key: " << split_key; + // the tablet has alreay been unloaded, so we just mark it as kTableOffLine and try to reload it + 
tablet->SetStatus(kTableOffLine); + ProcessOffLineTablet(tablet); + TryLoadTablet(tablet); + return; + } + std::string meta_addr; + if (!tablet_manager_->GetMetaTabletAddr(&meta_addr)) { + LOG(ERROR) << "[split] meta table is not ready, try to load parent tablet"; + tablet->SetStatus(kTableOffLine); + ProcessOffLineTablet(tablet); + TryLoadTablet(tablet); + return; + } + + WriteTabletRequest* meta_request = new WriteTabletRequest; + WriteTabletResponse* meta_response = new WriteTabletResponse; + meta_request->set_sequence_id(this_sequence_id_.Inc()); + meta_request->set_tablet_name(FLAGS_tera_master_meta_table_name); + meta_request->set_is_sync(true); + meta_request->set_is_instant(true); + + const std::string& parent_path = tablet->GetPath(); + int64_t parent_size = tablet->GetDataSize(); + TablePtr table = tablet->GetTable(); + + std::string child_start_key = key_start; + std::string child_end_key = split_key; + std::vector child_tablets; + for (int i = 0; i < 2; ++i) { + TabletMeta child_meta; + tablet->ToMeta(&child_meta); + child_meta.clear_parent_tablets(); + child_meta.add_parent_tablets(leveldb::GetTabletNumFromPath(parent_path)); + child_meta.set_path(leveldb::GetChildTabletPath(parent_path, table->GetNextTabletNo())); + child_meta.mutable_key_range()->set_key_start(child_start_key); + child_meta.mutable_key_range()->set_key_end(child_end_key); + child_meta.set_size(parent_size / 2); + std::string meta_key, meta_value; + MakeMetaTableKeyValue(child_meta, &meta_key, &meta_value); + RowMutationSequence* mu_seq = meta_request->add_row_list(); + mu_seq->set_row_key(meta_key); + Mutation* mutation = mu_seq->add_mutation_sequence(); + mutation->set_type(kPut); + mutation->set_value(meta_value); + child_tablets.emplace_back(new Tablet(child_meta, table)); + child_start_key = child_end_key; + child_end_key = key_end; } + + WriteClosure done = std::bind(&MasterImpl::SplitTabletWriteMetaCallback, this, tablet, + child_tablets, FLAGS_tera_master_meta_retry_times, _1, 
_2, _3, _4); + + tabletnode::TabletNodeClient meta_node_client(meta_addr); + meta_node_client.WriteTablet(meta_request, meta_response, done); + return; +} + +void MasterImpl::SplitTabletWriteMetaCallback(TabletPtr parent_tablet, + std::vector child_tablets, + int32_t retry_times, + WriteTabletRequest* request, + WriteTabletResponse* response, + bool failed, int error_code) { + StatusCode status = response->status(); + if (!failed && status == kTabletNodeOk) { + CHECK_EQ(response->row_status_list_size(), 2); + CHECK_EQ(child_tablets.size(), 2); + status = response->row_status_list(0); + } + delete request; + delete response; + if (failed || status != kTabletNodeOk) { + if (failed) { + LOG(ERROR) << "[split] fail to add to meta tablet " + << sofa::pbrpc::RpcErrorCodeToString(error_code) << "," + << parent_tablet; + } else { + LOG(ERROR) << "[split] fail to add to meta tablet" + << StatusCodeToString(status) << "," << parent_tablet; + } + if (retry_times <= 0) { + LOG(ERROR) << kSms << "[split] fail to update meta tablet in max retry" + <<" times, parent_tablet: " << parent_tablet; + parent_tablet->SetStatus(kTableOffLine); + ProcessOffLineTablet(parent_tablet); + TryLoadTablet(parent_tablet); + } else { + std::vector meta_entries; + for (std::size_t idx = 0; idx < child_tablets.size(); ++idx) { + meta_entries.push_back(std::bind( + &Tablet::ToMetaTableKeyValue, child_tablets[idx], _1, _2)); + } + WriteClosure done = std::bind(&MasterImpl::SplitTabletWriteMetaCallback, this, + parent_tablet, child_tablets, retry_times - 1, _1, _2, _3, _4); + SuspendMetaOperation(meta_entries, false, done); + } + return; + } + + TabletMeta first_meta, second_meta; + child_tablets[0]->ToMeta(&first_meta); + first_meta.set_status(kTableOffLine); + child_tablets[1]->ToMeta(&second_meta); + second_meta.set_status(kTableOffLine); + TablePtr table = parent_tablet->GetTable(); + table->SplitTablet(parent_tablet, first_meta, second_meta, &child_tablets[0], &child_tablets[1]); + + 
tablet_availability_->EraseNotReadyTablet(parent_tablet->GetPath()); + tablet_availability_->AddNotReadyTablet(child_tablets[0]->GetPath(), child_tablets[0]->GetStatus(), + table->GetStatus()); + tablet_availability_->AddNotReadyTablet(child_tablets[1]->GetPath(), child_tablets[1]->GetStatus(), + table->GetStatus()); + LOG(INFO) << "split finish," << parent_tablet << ", try load child tablets," + << "\nfirst: " << first_meta.ShortDebugString() + << "\nsecond: " << second_meta.ShortDebugString(); + + ProcessOffLineTablet(child_tablets[0]); + TryLoadTablet(child_tablets[0]); + ProcessOffLineTablet(child_tablets[1]); + TryLoadTablet(child_tablets[1]); + return; } void MasterImpl::TryLoadTablet(TabletPtr tablet, std::string server_addr) { @@ -3971,6 +4177,18 @@ bool MasterImpl::TrySplitTablet(TabletPtr tablet, const std::string& split_key) // abort if status switch to offline (server down / disable) if (!tablet->SetStatusIf(kTableOnSplit, kTableReady)) { LOG(ERROR) << "error state, abort split table " << tablet->GetPath(); + node->FinishSplit(); + + TabletPtr next_tablet; + std::string split_key; + while (node->SplitNextWaitTablet(&next_tablet, &split_key)) { + if (next_tablet->SetStatusIf(kTableOnSplit, kTableReady)) { + next_tablet->SetServerId(node->uuid_); + SplitTabletAsync(next_tablet, split_key); + break; + } + node->FinishSplit(); + } return false; } @@ -4005,7 +4223,7 @@ bool MasterImpl::TryMergeTablet(TabletPtr tablet) { if (tablet2->GetStatus() != kTableReady || tablet2->IsBusy() || - tablet2->GetCounter().write_workload() >= 1) { + tablet2->GetCounter().write_workload() >= FLAGS_tera_master_workload_merge_threshold) { VLOG(20) << "[merge] merge failed, none proper tablet." 
<< " status:" << tablet2->GetStatus() << " isbusy:" << tablet2->IsBusy() @@ -4051,8 +4269,10 @@ void MasterImpl::MergeTabletAsync(TabletPtr tablet_p1, TabletPtr tablet_p2) { std::bind(&MasterImpl::UnloadTabletCallback, this, tablet_p2, FLAGS_tera_master_impl_retry_times, _1, _2, _3, _4); - tablet_availability_->AddNotReadyTablet(tablet_p1->GetPath()); - tablet_availability_->AddNotReadyTablet(tablet_p2->GetPath()); + tablet_availability_->AddNotReadyTablet(tablet_p1->GetPath(), tablet_p1->GetStatus(), + tablet_p1->GetTable()->GetStatus()); + tablet_availability_->AddNotReadyTablet(tablet_p2->GetPath(), tablet_p2->GetStatus(), + tablet_p2->GetTable()->GetStatus()); UnloadTabletAsync(tablet_p1, done1); UnloadTabletAsync(tablet_p2, done2); } @@ -4230,8 +4450,9 @@ void MasterImpl::MergeTabletWriteMetaCallback(TabletPtr tablet_c, tablet_availability_->EraseNotReadyTablet(tablet_p1->GetPath()); tablet_availability_->EraseNotReadyTablet(tablet_p2->GetPath()); - tablet_availability_->AddNotReadyTablet(tablet_c->GetPath()); ProcessOffLineTablet(tablet_c); + tablet_availability_->AddNotReadyTablet(tablet_c->GetPath(), tablet_c->GetStatus(), + tablet_c->GetTable()->GetStatus()); TryLoadTablet(tablet_c); delete request; delete response; @@ -4475,6 +4696,8 @@ void MasterImpl::UpdateTableRecordForEnableCallback(TablePtr table, int32_t retr LOG(ERROR) << "fail to load tablet: " << tablet->GetPath() << ", tablet status: " << StatusCodeToString(tablet->GetStatus()); } + tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(), + tablet->GetTable()->GetStatus()); } } @@ -4871,8 +5094,10 @@ void MasterImpl::ScanMetaCallbackForSplit(TabletPtr tablet, table->SplitTablet(tablet, first_meta, second_meta, &first_tablet, &second_tablet); tablet_availability_->EraseNotReadyTablet(tablet->GetPath()); - tablet_availability_->AddNotReadyTablet(first_tablet->GetPath()); - tablet_availability_->AddNotReadyTablet(second_tablet->GetPath()); + 
tablet_availability_->AddNotReadyTablet(first_tablet->GetPath(), first_tablet->GetStatus(), + first_tablet->GetTable()->GetStatus()); + tablet_availability_->AddNotReadyTablet(second_tablet->GetPath(), second_tablet->GetStatus(), + second_tablet->GetTable()->GetStatus()); LOG(INFO) << "split finish, " << tablet << ", try load child tablets, " << "\nfirst: " << first_meta.ShortDebugString() << "\nsecond: " << second_meta.ShortDebugString(); @@ -5074,12 +5299,14 @@ void MasterImpl::TryMoveTablet(TabletPtr tablet, const std::string& server_addr, << " to " << server_addr; if (tablet->SetStatusIf(kTableUnLoading, kTableReady)) { tablet->SetExpectServerAddr(server_addr); + tablet->SetLastMoveTime(get_micros()); TabletNodePtr node; if (!server_addr.empty() && tabletnode_manager_->FindTabletNode(server_addr, &node)) { node->PlanToMoveIn(); } - tablet_availability_->AddNotReadyTablet(tablet->GetPath()); + tablet_availability_->AddNotReadyTablet(tablet->GetPath(), tablet->GetStatus(), + tablet->GetTable()->GetStatus()); UnloadClosure done = std::bind(&MasterImpl::UnloadTabletCallback, this, tablet, FLAGS_tera_master_impl_retry_times, _1, _2, _3, _4); @@ -5209,6 +5436,60 @@ void MasterImpl::EnableTabletNodeGcTimer() { gc_enabled_ = true; } +void MasterImpl::DoGcTrashClean() { /* Timer task: if the trash-clean timer has been disabled, clear its id and stop; otherwise run io::CleanTrackableGcTrash() once, log the elapsed time, and re-arm the timer. */ + { + MutexLock lock(&mutex_); + if (!gc_trash_clean_enabled_) { + gc_trash_clean_timer_id_ = kInvalidTimerId; + return; + } + } + + int64_t start_ts = get_micros(); + io::CleanTrackableGcTrash(); /* runs outside mutex_ */ + LOG(INFO) << "[gc] clean trackable gc trash, cost: " + << (get_micros() - start_ts) / 1000 << " ms"; + + MutexLock lock(&mutex_); + ScheduleGcTrashClean(); +} + +void MasterImpl::ScheduleGcTrashClean() { /* Queues DoGcTrashClean on thread_pool_ and records the returned timer id; mutex_ must be held by the caller. */ + mutex_.AssertHeld(); + VLOG(10) << "[gc] ScheduleGcTrashClean"; + ThreadPool::Task task = + std::bind(&MasterImpl::DoGcTrashClean, this); + gc_trash_clean_timer_id_ = thread_pool_->DelayTask( /* fix: was stored in gc_timer_id_, which belongs to the tabletnode gc timer */ + FLAGS_tera_master_gc_trash_clean_period_s * 1000, task); +} + +void MasterImpl::EnableGcTrashCleanTimer() { /* No-op when FLAGS_tera_master_gc_trash_enabled is false; otherwise arms the timer if no task is currently recorded and marks it enabled. */ + if
(!FLAGS_tera_master_gc_trash_enabled) { + return; + } + + MutexLock lock(&mutex_); + if (gc_trash_clean_timer_id_ == kInvalidTimerId) { + ScheduleGcTrashClean(); + } + gc_trash_clean_enabled_ = true; +} + +void MasterImpl::DisableGcTrashCleanTimer() { /* No-op when FLAGS_tera_master_gc_trash_enabled is false; otherwise tries a non-blocking cancel of the recorded task and marks the timer disabled. */ + if (!FLAGS_tera_master_gc_trash_enabled) { + return; + } + + MutexLock lock(&mutex_); + if (gc_trash_clean_timer_id_ != kInvalidTimerId) { + bool non_block = true; + if (thread_pool_->CancelTask(gc_trash_clean_timer_id_, non_block)) { /* fix: cancel the trash-clean task, not gc_timer_id_; if the cancel fails the id is kept and DoGcTrashClean clears it on its next run */ + gc_trash_clean_timer_id_ = kInvalidTimerId; + } + } + gc_trash_clean_enabled_ = false; +} + void MasterImpl::DoAvailableCheck() { MutexLock lock(&mutex_); if (FLAGS_tera_master_availability_check_enabled) { @@ -5285,9 +5566,9 @@ void MasterImpl::DoTabletNodeGcPhase2() { } LOG(INFO) << "[gc] try clean trash dir."; - int64_t start = common::timer::get_micros(); + int64_t start = get_micros(); io::CleanTrashDir(); - int64_t cost = (common::timer::get_micros() - start) / 1000; + int64_t cost = (get_micros() - start) / 1000; LOG(INFO) << "[gc] clean trash dir done, cost: " << cost << "ms."; MutexLock lock(&mutex_); diff --git a/src/master/master_impl.h b/src/master/master_impl.h index a8959c703..3a7a17b7e 100644 --- a/src/master/master_impl.h +++ b/src/master/master_impl.h @@ -52,7 +52,6 @@ class MetaTable; class Scheduler; class TabletManager; class TabletNodeManager; -class MasterImplTest; class MasterImpl { public: @@ -233,6 +232,8 @@ class MasterImpl { CmdCtrlResponse* response); void TabletCmdCtrl(const CmdCtrlRequest* request, CmdCtrlResponse* response); + void TableCmdCtrl(const CmdCtrlRequest* request, + CmdCtrlResponse* response); void MetaCmdCtrl(const CmdCtrlRequest* request, CmdCtrlResponse* response); @@ -363,6 +364,15 @@ class MasterImpl { SplitTabletResponse* response, bool failed, int error_code); + virtual void SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key); + + void SplitTabletWriteMetaCallback(TabletPtr parent_tablet, + std::vector child_tablets, +
int32_t retry_times, + WriteTabletRequest* request, + WriteTabletResponse* response, + bool failed, int err_code); + void MergeTabletAsync(TabletPtr tablet_p1, TabletPtr tablet_p2); virtual void MergeTabletAsyncPhase2(TabletPtr tablet_p1, TabletPtr tablet_p2); void MergeTabletUnloadCallback(TabletPtr tablet); @@ -440,7 +450,7 @@ class MasterImpl { WriteTabletResponse* response, bool failed, int error_code); - void ScanMetaTableAsync(const std::string& table_name, + virtual void ScanMetaTableAsync(const std::string& table_name, const std::string& tablet_key_start, const std::string& tablet_key_end, ScanClosure done); @@ -535,6 +545,10 @@ class MasterImpl { void DumpStatToTable(const TabletNode& stat); // garbage clean + void EnableGcTrashCleanTimer(); + void DisableGcTrashCleanTimer(); + void ScheduleGcTrashClean(); + void DoGcTrashClean(); void EnableTabletNodeGcTimer(); void DisableTabletNodeGcTimer(); void ScheduleTabletNodeGc(); @@ -609,6 +623,8 @@ class MasterImpl { TableImpl* stat_table_; // tabletnode garbage clean + bool gc_trash_clean_enabled_; + int64_t gc_trash_clean_timer_id_; bool gc_enabled_; int64_t gc_timer_id_; bool gc_query_enable_; diff --git a/src/master/master_zk_adapter.cc b/src/master/master_zk_adapter.cc index 6f481f225..7227a43ae 100644 --- a/src/master/master_zk_adapter.cc +++ b/src/master/master_zk_adapter.cc @@ -387,14 +387,6 @@ void MasterZkAdapter::OnSafeModeMarkDeleted() { LOG(ERROR) << "safemode mark node is deleted"; } -void MasterZkAdapter::OnMasterLockLost() { - LOG(ERROR) << "master lock lost"; - master_impl_->SetMasterStatus(MasterImpl::kIsSecondary); - master_impl_->DisableQueryTabletNodeTimer(); - DeleteMasterNode(); - Reset(); -} - void MasterZkAdapter::OnTabletNodeListDeleted() { LOG(ERROR) << "ts dir node is deleted"; if (!MarkSafeMode()) { diff --git a/src/master/master_zk_adapter.h b/src/master/master_zk_adapter.h index 618dbc984..7419a1246 100644 --- a/src/master/master_zk_adapter.h +++ b/src/master/master_zk_adapter.h @@ 
-66,7 +66,6 @@ class MasterZkAdapter : public MasterZkAdapterBase { virtual void OnSafeModeMarkCreated(); virtual void OnSafeModeMarkDeleted(); - virtual void OnMasterLockLost(); virtual void OnTabletNodeListDeleted(); virtual void OnRootTabletNodeDeleted(); virtual void OnMasterNodeDeleted(); diff --git a/src/master/tablet_manager.cc b/src/master/tablet_manager.cc index d8049e26c..e45f99bd8 100644 --- a/src/master/tablet_manager.cc +++ b/src/master/tablet_manager.cc @@ -31,15 +31,17 @@ DECLARE_string(tera_working_dir); DECLARE_string(tera_master_meta_table_path); DECLARE_string(tera_master_meta_table_name); -DECLARE_bool(tera_zk_enabled); DECLARE_string(tera_master_gc_strategy); +DECLARE_bool(tera_master_gc_trash_enabled); DECLARE_int32(tera_master_impl_retry_times); DECLARE_int32(tera_tabletnode_connect_retry_period); DECLARE_bool(tera_delete_obsolete_tabledir_enabled); DECLARE_string(tera_tabletnode_path_prefix); +DECLARE_int64(tera_master_split_history_time_interval); +DECLARE_string(tera_leveldb_env_type); namespace tera { namespace master { @@ -63,20 +65,22 @@ std::ostream& operator << (std::ostream& o, const TabletPtr& tablet) { return o; } -Tablet::Tablet(const TabletMeta& meta) - : meta_(meta), - update_time_(common::timer::get_micros()), - ready_time_(std::numeric_limits::max()), - merge_param_(NULL), - gc_reported_(false) {} +Tablet::Tablet(const TabletMeta& meta): + meta_(meta), + update_time_(get_micros()), + ready_time_(std::numeric_limits::max()), + last_move_time_us_(0), + merge_param_(NULL), + gc_reported_(false) { } -Tablet::Tablet(const TabletMeta& meta, TablePtr table) - : meta_(meta), - table_(table), - update_time_(common::timer::get_micros()), - ready_time_(std::numeric_limits::max()), - merge_param_(NULL), - gc_reported_(false) {} +Tablet::Tablet(const TabletMeta& meta, TablePtr table): + meta_(meta), + table_(table), + update_time_(get_micros()), + ready_time_(std::numeric_limits::max()), + last_move_time_us_(0), + merge_param_(NULL), + 
gc_reported_(false) { } Tablet::~Tablet() { table_.reset(); @@ -131,6 +135,21 @@ int64_t Tablet::GetQps() { + average_counter_.scan_rows(); } +int64_t Tablet::GetReadQps() { + MutexLock lock(&mutex_); + return average_counter_.read_rows(); +} + +int64_t Tablet::GetWriteQps() { + MutexLock lock(&mutex_); + return average_counter_.write_rows(); +} + +int64_t Tablet::GetScanQps() { + MutexLock lock(&mutex_); + return average_counter_.scan_rows(); +} + const std::string& Tablet::GetKeyStart() { MutexLock lock(&mutex_); return meta_.key_range().key_start(); @@ -188,8 +207,47 @@ bool Tablet::IsBusy() { if (counter_list_.size() > 0) { return counter_list_.back().is_on_busy(); } else { - return false; + return average_counter_.is_on_busy(); + } +} + +bool Tablet::TestAndSetSplitTimeStamp(int64_t ts) { // timestamp in us + ts /= 1000; // transalte into ms + //MutexLock lock(&mutex_); + if (split_history_.last_split_ts < (ts - FLAGS_tera_master_split_history_time_interval)) { + split_history_.last_split_ts = ts; + return true; + } + return false; +} + +void Tablet::GetErrorIgnoredLGs(std::vector* lgs) { + MutexLock lock(&mutex_); + *lgs = ignore_err_lgs_; +} + +bool Tablet::SetErrorIgnoredLGs(const std::string& lg_list_str) { + if (lg_list_str.empty()) { + MutexLock lock(&mutex_); + ignore_err_lgs_.clear(); + return true; } + std::vector lgs; + SplitString(lg_list_str, ":", &lgs); + const TableSchema& schema = GetSchema(); + std::set lg_schema_set; + for (int i = 0; i < schema.locality_groups_size(); ++i) { + lg_schema_set.insert(schema.locality_groups(i).name()); + } + for (const auto& lg : lgs) { + if (lg_schema_set.find(lg) == lg_schema_set.end()) { + LOG(WARNING) << "set error ignored locality group ["<< lg << "] failed."; + return false; + } + } + MutexLock lock(&mutex_); + ignore_err_lgs_ = lgs; + return true; } std::string Tablet::DebugString() { @@ -220,8 +278,8 @@ void Tablet::SetCounter(const TabletCounter& counter) { average_counter_.set_write_size( 
CounterWeightedSum(counter.write_size(), average_counter_.write_size())); average_counter_.set_write_workload(counter.write_workload()); - average_counter_.set_is_on_busy( - CounterWeightedSum(counter.is_on_busy(), average_counter_.is_on_busy())); + average_counter_.set_is_on_busy(counter.is_on_busy()); + average_counter_.set_db_status(counter.db_status()); } void Tablet::UpdateSize(const TabletMeta& meta) { @@ -282,6 +340,22 @@ bool Tablet::SetStatusIf(TabletStatus new_status, TabletStatus if_status, return false; } +bool Tablet::SetStatusIf(TabletStatus new_status, + TabletStatus if_status, + const std::string& if_addr) { + MutexLock lock(&mutex_); + if (meta_.status() == if_status && + meta_.server_addr() == if_addr && + CheckStatusSwitch(meta_.status(), new_status)) { + meta_.set_status(new_status); + if (new_status == kTableReady) { + ready_time_ = get_micros(); + } + return true; + } + return false; +} + bool Tablet::SetStatusIf(TabletStatus new_status, TabletStatus if_status, TableStatus if_table_status, TabletStatus* old_status) { if (!IsBound()) { @@ -368,12 +442,22 @@ int64_t Tablet::SetUpdateTime(int64_t timestamp) { int64_t Tablet::ReadyTime() { MutexLock lock(&mutex_); if (meta_.status() != kTableReady) { - return std::numeric_limits::max(); + return std::numeric_limits::max(); } else { return ready_time_; } } +int64_t Tablet::LastMoveTime() const { + MutexLock lock(&mutex_); + return last_move_time_us_; +} + +void Tablet::SetLastMoveTime(int64_t time) { + MutexLock lock(&mutex_); + last_move_time_us_ = time; +} + int32_t Tablet::AddSnapshot(uint64_t snapshot) { MutexLock lock(&mutex_); meta_.add_snapshot_list(snapshot); @@ -582,6 +666,7 @@ Table::Table(const std::string& table_name) deleted_tablet_num_(0), max_tablet_no_(0), create_time_((int64_t)time(NULL)), + metric_(table_name), schema_is_syncing_(false), rangefragment_(NULL), update_rpc_response_(NULL), @@ -936,6 +1021,10 @@ void Table::RefreshCounter() { sspeed += counter.scan_size(); } + 
metric_.SetTableSize(size); + metric_.SetTabletNum(tablet_num); + metric_.SetNotReady(notready); + counter_.set_size(size); counter_.set_tablet_num(tablet_num); counter_.set_notready_num(notready); @@ -1175,9 +1264,14 @@ bool Table::TryCollectInheritedFile() { std::vector tablet_files; CollectInheritedFileFromFilesystem(name_, *it, &tablet_files); - for (uint32_t i = 0; i < tablet_files.size(); i++) { + if (tablet_files.empty()) { MutexLock l(&mutex_); - AddInheritedFile(tablet_files[i], false); + AddEmptyDeadTablet(*it); + } else { + for (uint32_t i = 0; i < tablet_files.size(); i++) { + MutexLock l(&mutex_); + AddInheritedFile(tablet_files[i], false); + } } } return dead_tablets.size() > 0; @@ -1269,6 +1363,10 @@ bool Table::GetTabletsForGc(std::set* live_tablets, VLOG(10) << "[gc] add dead tablet: " << path; dead_tablets->insert(tabletnum); } + + if (0 == tabletnum) { + LOG(WARNING) << "[gc] invalid tablet path found: <" << path << ">"; + } } if (dead_tablets->size() == 0) { VLOG(10) << "[gc] there is none dead tablets: " << name_; @@ -1300,6 +1398,17 @@ void Table::AddInheritedFile(const TabletFile& file, bool need_ref) { VLOG(10) << "[gc] [" << name_ << "] file " << file << " ref increment to " << file_info.ref; } +void Table::AddEmptyDeadTablet(uint64_t tablet_id) { + mutex_.AssertHeld(); + + if (useful_inh_files_.find(tablet_id) == useful_inh_files_.end()) { + LOG(INFO) << "[gc] [" << name_ << "] new empty dead tablet " + << tablet_id << ", gc disabled"; + gc_disabled_dead_tablets_.insert(tablet_id); + useful_inh_files_[tablet_id]; + } +} + uint64_t Table::CleanObsoleteFile() { leveldb::Env* env = io::LeveldbBaseEnv(); std::string table_path = FLAGS_tera_tabletnode_path_prefix + name_; @@ -1314,13 +1423,38 @@ uint64_t Table::CleanObsoleteFile() { leveldb::Status s; if (file.lg_id == 0 && file.file_id == 0) { std::string path = leveldb::BuildTabletPath(table_path, file.tablet_id); + leveldb::FileLock* file_lock = nullptr; + // NEVER remove the trailing 
character '/', otherwise you will lock the parent directory + s = env->LockFile(path + "/", &file_lock); + if (!s.ok()) { + LOG(WARNING) << "lock path failed, path: " << path << ", status: " << s.ToString(); + } + delete file_lock; + LOG(INFO) << "[gc] [" << name_ << "] delete dir " << path; s = io::DeleteEnvDir(path); //safely delete dir and all file in it } else { + std::string lg_path = leveldb::BuildTabletLgPath(table_path, file.tablet_id, file.lg_id); + leveldb::FileLock* file_lock = nullptr; + // NEVER remove the trailing character '/', otherwise you will lock the parent directory + s = env->LockFile(lg_path + "/", &file_lock); + if (!s.ok()) { + LOG(WARNING) << "lock path failed, path: " << lg_path << ", status: " << s.ToString(); + } + + delete file_lock; + std::string path = leveldb::BuildTableFilePath(table_path, file.tablet_id, file.lg_id, file.file_id); - LOG(INFO) << "[gc] [" << name_ << "] delete file " << file << " path " << path; - s = env->DeleteFile(path); + if (FLAGS_tera_master_gc_trash_enabled) { + LOG(INFO) << "[gc] [" << name_ << "] move file to trash, file: " + << file << ", path: " << path; + // move sst to trackable gc trash instead of deleting it directly + s = io::MoveSstToTrackableGcTrash(name_, file.tablet_id, file.lg_id, file.file_id); + } else { + LOG(INFO) << "[gc] [" << name_ << "] delete file " << file << " path " << path; + s = env->DeleteFile(path); + } } mutex_.Lock(); if (!s.ok()) { @@ -1554,6 +1688,40 @@ bool TabletManager::FindOverlappedTablets(const std::string& table_name, return true; } +bool TabletManager::SearchTablet(const std::string& table_name, + const std::string& key, + TabletPtr* tablet, + StatusCode* ret_status) { + // lock table list + mutex_.Lock(); + + // search table + TableList::iterator it = all_tables_.find(table_name); + if (it == all_tables_.end()) { + mutex_.Unlock(); + VLOG(5) << "table: " << table_name << " not exist"; + SetStatusCode(kTableNotFound, ret_status); + return false; + } + Table& table = 
*it->second; + + // lock table + table.mutex_.Lock(); + mutex_.Unlock(); + + // search tablet + Table::TabletList::reverse_iterator rit2 = table.tablets_list_.rbegin(); + for (; rit2 != table.tablets_list_.rend(); ++rit2) { + if (rit2->first <= key) { + *tablet = rit2->second; + break; + } + } + + table.mutex_.Unlock(); + return true; +} + bool TabletManager::FindTable(const std::string& table_name, std::vector* tablet_meta_list, StatusCode* ret_status) { diff --git a/src/master/tablet_manager.h b/src/master/tablet_manager.h index 1e58d62cf..07e942ecb 100644 --- a/src/master/tablet_manager.h +++ b/src/master/tablet_manager.h @@ -16,11 +16,12 @@ #include "common/mutex.h" #include "common/thread_pool.h" +#include "common/metric/metric_counter.h" #include "proto/master_rpc.pb.h" #include "proto/table_meta.pb.h" #include "proto/tabletnode_rpc.pb.h" -#include "utils/counter.h" +#include "common/counter.h" #include "utils/fragment.h" using namespace std::placeholders; @@ -83,7 +84,9 @@ class Tablet { friend std::ostream& operator << (std::ostream& o, const Tablet& tablet); public: - Tablet(); + Tablet() = delete; + Tablet(const Tablet&) = delete; + Tablet& operator=(const Tablet&) = delete; explicit Tablet(const TabletMeta& meta); Tablet(const TabletMeta& meta, TablePtr table); ~Tablet(); @@ -95,6 +98,9 @@ class Tablet { int64_t GetDataSize(); void GetDataSize(int64_t* size, std::vector* lg_size); int64_t GetQps(); + int64_t GetReadQps(); + int64_t GetWriteQps(); + int64_t GetScanQps(); const std::string& GetKeyStart(); const std::string& GetKeyEnd(); @@ -117,6 +123,9 @@ class Tablet { bool SetStatus(TabletStatus new_status, TabletStatus* old_status = NULL); bool SetStatusIf(TabletStatus new_status, TabletStatus if_status, TabletStatus* old_status = NULL); + bool SetStatusIf(TabletStatus new_status, + TabletStatus if_status, + const std::string& if_addr); bool SetStatusIf(TabletStatus new_status, TabletStatus if_status, TableStatus if_table_status, TabletStatus* 
old_status = NULL); bool SetAddrIf(const std::string& server_addr, TabletStatus if_status, @@ -151,13 +160,20 @@ class Tablet { int64_t UpdateTime(); int64_t SetUpdateTime(int64_t timestamp); int64_t ReadyTime(); + int64_t LastMoveTime() const; + void SetLastMoveTime(int64_t time); void* GetMergeParam(); void SetMergeParam(void* merge_param); + bool TestAndSetSplitTimeStamp(int64_t ts); + + // Will set a flag to ignore lost file error when tabletserver load tablet. + // We should set specific locality_groups that avoid missing some of the + // exceptions in others locality_groups. + void GetErrorIgnoredLGs(std::vector* lgs); + bool SetErrorIgnoredLGs(const std::string& lg_list_str = ""); private: - Tablet(const Tablet&) {} - Tablet& operator=(const Tablet&) {return *this;} static bool CheckStatusSwitch(TabletStatus old_status, TabletStatus new_status); @@ -167,8 +183,10 @@ class Tablet { TablePtr table_; int64_t update_time_; int64_t ready_time_; + int64_t last_move_time_us_; std::string server_id_; std::string expect_server_addr_; + std::vector ignore_err_lgs_; // lg array for ignore_err_ std::list counter_list_; TabletCounter average_counter_; struct TabletAccumulateCounter { @@ -189,6 +207,14 @@ class Tablet { } accumu_counter_; void* merge_param_; + // Tablet Split History Tracing + struct TabletSplitHistory { + int64_t last_split_ts; + + TabletSplitHistory() + : last_split_ts(0) {} + } split_history_; + // protected by Table::mutex_ bool gc_reported_; std::multiset inh_files_; @@ -199,6 +225,42 @@ std::ostream& operator << (std::ostream& o, const TabletPtr& tablet); std::ostream& operator << (std::ostream& o, const TablePtr& table); class Table { + + class TableMetric { + public: + TableMetric(const std::string& name): + table_name_(name), + tablet_num_("tera_master_tablet_num", GetTableNameLabel(), + {SubscriberType::LATEST}, false), + not_ready_("tera_master_tablet_not_ready_num", GetTableNameLabel(), + {SubscriberType::LATEST}, false), + 
table_size_("tera_master_table_size", GetTableNameLabel(), + {SubscriberType::LATEST}, false) + {} + + void SetTabletNum(int64_t tablet_num) { + tablet_num_.Set(tablet_num); + } + + void SetNotReady(int64_t not_ready) { + not_ready_.Set(not_ready); + } + + void SetTableSize(int64_t table_size) { + table_size_.Set(table_size); + } + + private: + std::string GetTableNameLabel() { + return "table:" + table_name_; + } + + const std::string table_name_; + tera::MetricCounter tablet_num_; + tera::MetricCounter not_ready_; + tera::MetricCounter table_size_; + }; + friend class Tablet; friend class TabletManager; friend std::ostream& operator << (std::ostream& o, const Table& tablet); @@ -262,11 +324,12 @@ class Table { void EnableDeadTabletGarbageCollect(uint64_t tablet_id); void ReleaseInheritedFile(const TabletFile& file); void AddInheritedFile(const TabletFile& file, bool need_ref); + void AddEmptyDeadTablet(uint64_t tablet_id); uint64_t CleanObsoleteFile(); private: - Table(const Table&) {} - Table& operator=(const Table&) {return *this;} + Table(const Table&) = delete; + Table& operator=(const Table&) = delete; typedef std::map TabletList; TabletList tablets_list_; mutable Mutex mutex_; @@ -279,6 +342,7 @@ class Table { uint64_t max_tablet_no_; int64_t create_time_; TableCounter counter_; + TableMetric metric_; bool schema_is_syncing_; // is schema syncing to all ts(all tablets) RangeFragment* rangefragment_; UpdateTableResponse* update_rpc_response_; @@ -348,6 +412,11 @@ class TabletManager { std::vector* tablet_meta_list, StatusCode* ret_status = NULL); + bool SearchTablet(const std::string& table_name, + const std::string& key, + TabletPtr* tablet, + StatusCode* ret_status); + bool FindTable(const std::string& table_name, TablePtr* tablet); int64_t SearchTable(std::vector* tablet_meta_list, diff --git a/src/master/tabletnode_manager.cc b/src/master/tabletnode_manager.cc index 383526a02..ff1767ee4 100644 --- a/src/master/tabletnode_manager.cc +++ 
b/src/master/tabletnode_manager.cc @@ -6,7 +6,7 @@ #include "master/master_impl.h" #include "master/workload_scheduler.h" -#include "utils/timer.h" +#include "common/timer.h" DECLARE_string(tera_master_meta_table_name); DECLARE_int32(tera_master_max_load_concurrency); diff --git a/src/master/test/master_impl_test.cc b/src/master/test/master_impl_test.cc index e9e130e33..9e0573f04 100644 --- a/src/master/test/master_impl_test.cc +++ b/src/master/test/master_impl_test.cc @@ -14,36 +14,19 @@ #include "utils/utils_cmd.h" #include "version.h" +DECLARE_string(tera_master_port); +DECLARE_string(log_dir); +DECLARE_string(tera_coord_type); +DECLARE_string(tera_leveldb_env_type); + namespace tera { namespace master { class MasterImplTest : public ::testing::Test, public MasterImpl { public: MasterImplTest() : merge_enter_phase2(false) { - } - - void SplitTabletTest() { - SplitTabletRequest* request = NULL; - SplitTabletResponse* response = NULL; - bool failed; - int error_code; - TablePtr table; - TabletPtr tablet; - TabletMeta meta; - - table.reset(new Table("splittest")); - tablet.reset(new Tablet(meta, table)); - request = new SplitTabletRequest; - response = new SplitTabletResponse; - - tablet->SetStatus(kTableReady); - tablet->SetStatus(kTableOnSplit); - response->set_status(kTableNotSupport); - failed = false; - error_code = 0; - - MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code); - EXPECT_TRUE(tablet->GetStatus() == kTableOffLine); + FLAGS_tera_coord_type = "fake_zk"; + FLAGS_tera_leveldb_env_type = "local"; } bool merge_enter_phase2; @@ -95,6 +78,55 @@ class MasterImplTest : public ::testing::Test, public MasterImpl { return tablet; } + void DeleteTabletNodeTest() { + // add server + std::string addr1 = "127.0.0.1:22000"; + std::string addr2 = "127.0.0.2:22000"; + tabletnode_manager_->AddTabletNode(addr1, addr1); + tabletnode_manager_->AddTabletNode(addr2, addr2); + + // add tabelt + StatusCode s; + TabletMeta meta; + TablePtr table(new 
Table("table001")); + TabletPtr tablet = MakeTabletPtr("a", "z", table); + tablet->SetStatus(kTableReady); + tablet->SetAddr(addr1); + tablet->ToMeta(&meta); + tablet_manager_->AddTablet(meta, tablet->GetSchema(), &tablet, &s); + tablet->SetServerId(addr1); + + // thread1: get tablet from addr1 + std::vector tablet_list; + std::vector::iterator it; + tablet_manager_->FindTablet(addr1, &tablet_list, true); + EXPECT_TRUE(it != tablet_list.end()); + EXPECT_TRUE(tablet_list.size() == 1); + + // thread2: load tablet into addr2 + LoadTabletRequest* request = new LoadTabletRequest; + LoadTabletResponse* response = new LoadTabletResponse; + tablet->SetAddr(addr2); + tablet->SetServerId(addr2); + + TabletNodePtr node; + tabletnode_manager_->FindTabletNode(addr2, &node); + node->TryLoad(tablet); + tablet->SetStatus(kTableOffLine); + tablet->SetStatus(kTableOnLoad); + response->set_status(kTabletNodeOk); + LoadTabletCallback(tablet, 10, request, response, 0, 0); + EXPECT_TRUE(tablet->GetStatus() == kTableReady); + + // thread1: check addr1 and set status + for (it = tablet_list.begin(); it != tablet_list.end(); ++it) { + TabletPtr t = *it; + t->SetStatusIf(kTabletPending, kTableReady, addr1); + } + EXPECT_TRUE(tablet->GetStatus() == kTableReady); + EXPECT_STREQ(tablet->GetServerAddr().c_str(), addr2.c_str()); + } + // This unload function will not send unload request // Tablet will stay in kTableUnLoading status forever // It can be used to simulate a slow unload @@ -119,9 +151,7 @@ class MasterImplTest : public ::testing::Test, public MasterImpl { LOG(ERROR) << t1->GetStatus() << ";" << t2->GetStatus() << ";" << t3->GetStatus(); EXPECT_TRUE((t1->GetStatus() == kTableUnLoading) && (t2->GetStatus() == kTableUnLoading) - && (t3->GetStatus() == kTableReady)); - - // t2 & t3's merge should fail since t1 & t2 is merging + && (t3->GetStatus() == kTableReady)); // t2 & t3's merge should fail since t1 & t2 is merging MergeTabletAsync(t2, t3); LOG(ERROR) << t1->GetStatus() << ";" << 
t2->GetStatus() << ";" << t3->GetStatus(); EXPECT_TRUE((t1->GetStatus() == kTableUnLoading) @@ -135,10 +165,49 @@ class MasterImplTest : public ::testing::Test, public MasterImpl { && (t2->GetStatus() == kTableUnLoading) && (t3->GetStatus() == kTableReady)); } + + virtual void ScanMetaTableAsync(const std::string& table_name, + const std::string& tablet_key_start, + const std::string& tablet_end_key, + ScanClosure done); + + virtual void SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key); }; -TEST_F(MasterImplTest, SplitTest) { - SplitTabletTest(); +void MasterImplTest::ScanMetaTableAsync(const std::string& table_name, + const std::string& tablet_key_start, + const std::string& tablet_end_key, + ScanClosure done) { + + const ::testing::TestInfo* test_case = ::testing::UnitTest::GetInstance()->current_test_info(); + std::string case_name(test_case->test_case_name()); + if (case_name == "InteractWithOldTS") { + EXPECT_TRUE(true); + } + if (case_name.find("InteractWithNewTS") != std::string::npos) { + EXPECT_TRUE(false); + } +} + +void MasterImplTest::SplitTabletWriteMetaAsync(TabletPtr tablet, const std::string& split_key) { + const ::testing::TestInfo* test_case = ::testing::UnitTest::GetInstance()->current_test_info(); + std::string case_name(test_case->test_case_name()); + if (case_name.find("InteractWithOldTS") != std::string::npos) { + EXPECT_TRUE(false); + } + if (case_name.find("InteractWithNewTS") != std::string::npos) { + EXPECT_TRUE(true); + } + EXPECT_EQ(tablet->GetStatus(), kTableOnSplit); + EXPECT_FALSE(split_key.empty()); + EXPECT_GT(split_key, tablet->GetKeyStart()); + if (!tablet->GetKeyEnd().empty()) { + EXPECT_GT(tablet->GetKeyEnd(), split_key); + } +} + +TEST_F(MasterImplTest, DeleteTabletNodeTest) { + DeleteTabletNodeTest(); } TEST_F(MasterImplTest, MergeTest) { @@ -149,6 +218,163 @@ TEST_F(MasterImplTest, MergeTabletBrokenTest) { MergeTabletBrokenTest(); } +TEST_F(MasterImplTest, SplitNotSupport) { + SplitTabletRequest* 
request = NULL; + SplitTabletResponse* response = NULL; + bool failed; + int error_code; + TablePtr table; + TabletPtr tablet; + TabletMeta meta; + + table.reset(new Table("splittest")); + tablet.reset(new Tablet(meta, table)); + request = new SplitTabletRequest; + response = new SplitTabletResponse; + + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + response->set_status(kTableNotSupport); + failed = false; + error_code = 0; + + MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code); + EXPECT_TRUE(tablet->GetStatus() == kTableOffLine); +} + +TEST_F(MasterImplTest, InteractWithOldTS) { + SplitTabletRequest* request = NULL; + SplitTabletResponse* response = NULL; + TablePtr table; + TabletPtr tablet; + TabletMeta meta; + + table.reset(new Table("splittest")); + tablet.reset(new Tablet(meta, table)); + request = new SplitTabletRequest; + response = new SplitTabletResponse; + + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + response->set_status(kTabletNodeOk); + + bool failed = false; + int error_code = 0; + MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code); +} + +TEST_F(MasterImplTest, InteractWithNewTSOK){ + TablePtr table; + TabletPtr tablet; + TabletMeta meta; + + table.reset(new Table("splittest")); + tablet.reset(new Tablet(meta, table)); + SplitTabletRequest* request = new SplitTabletRequest; + SplitTabletResponse* response = new SplitTabletResponse; + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + response->set_status(kTabletNodeOk); + response->add_split_keys("abc"); + bool failed = false; + int error_code = 0; + MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code); + + meta.mutable_key_range()->set_key_start("ab"); + meta.mutable_key_range()->set_key_end("bc"); + tablet.reset(new Tablet(meta, table)); + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + request = new SplitTabletRequest; + 
response = new SplitTabletResponse; + response->add_split_keys("b"); + MasterImpl::SplitTabletCallback(tablet, request, response, failed, error_code); + EXPECT_EQ(tablet->GetStatus(), kTableOnSplit); +} + +TEST_F(MasterImplTest, NewTSReturnInvalidSplitKey){ + TablePtr table; + TabletPtr tablet; + TabletMeta meta; + + meta.mutable_key_range()->set_key_start("aa"); + meta.mutable_key_range()->set_key_end("cc"); + table.reset(new Table("splittest")); + tablet.reset(new Tablet(meta, table)); + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + MasterImpl::SplitTabletWriteMetaAsync(tablet, ""); + EXPECT_EQ(tablet->GetStatus(), kTableOffLine); + + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + MasterImpl::SplitTabletWriteMetaAsync(tablet, "aa"); + EXPECT_EQ(tablet->GetStatus(), kTableOffLine); + + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + MasterImpl::SplitTabletWriteMetaAsync(tablet, "cc"); + EXPECT_EQ(tablet->GetStatus(), kTableOffLine); + + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + MasterImpl::SplitTabletWriteMetaAsync(tablet, "d"); + EXPECT_EQ(tablet->GetStatus(), kTableOffLine); + + meta.mutable_key_range()->set_key_end(""); + tablet.reset(new Tablet(meta, table)); + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + MasterImpl::SplitTabletWriteMetaAsync(tablet, ""); + EXPECT_EQ(tablet->GetStatus(), kTableOffLine); + + meta.Clear(); + tablet.reset(new Tablet(meta, table)); + tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + MasterImpl::SplitTabletWriteMetaAsync(tablet, ""); + EXPECT_EQ(tablet->GetStatus(), kTableOffLine); + +} + +TEST_F(MasterImplTest, SplitTabletWriteMetaCallback) { + TablePtr table; + TabletPtr tablet; + TabletMeta meta; + + meta.mutable_key_range()->set_key_start("a"); + meta.mutable_key_range()->set_key_end("c"); + table.reset(new Table("splittest")); + tablet.reset(new Tablet(meta, table)); + 
tablet->SetStatus(kTableReady); + tablet->SetStatus(kTableOnSplit); + std::vector child_tablets; + meta.mutable_key_range()->set_key_end("b"); + child_tablets.emplace_back(new Tablet(meta)); + meta.mutable_key_range()->set_key_start("b"); + meta.mutable_key_range()->set_key_end("c"); + child_tablets.emplace_back(new Tablet(meta)); + bool failed = false; + int error_code = 0; + + WriteTabletRequest* request = new WriteTabletRequest; + WriteTabletResponse* response = new WriteTabletResponse; + + response->set_status(kTabletNodeOk); + response->add_row_status_list(kTabletNodeOk); + response->add_row_status_list(kTabletNodeOk); + + MasterImpl::SplitTabletWriteMetaCallback(tablet, + child_tablets, 1, request, response, failed, error_code); + EXPECT_EQ(table->tablets_list_.size(), 2); + TabletPtr t1, t2; + table->FindTablet("a", &t1); + table->FindTablet("b", &t2); + EXPECT_EQ(t1->GetStatus(), kTableOffLine); + EXPECT_EQ(t2->GetStatus(), kTableOffLine); + EXPECT_STREQ(t1->GetKeyEnd().c_str(), t2->GetKeyStart().c_str()); +} + } // master } // tera diff --git a/src/master/test/master_test.cc b/src/master/test/master_test.cc index 89b44c208..d0ecfb87f 100644 --- a/src/master/test/master_test.cc +++ b/src/master/test/master_test.cc @@ -8,20 +8,13 @@ #include "utils/utils_cmd.h" -DECLARE_string(tera_master_port); -DECLARE_string(log_dir); -DECLARE_bool(tera_zk_enabled); DECLARE_string(tera_leveldb_env_type); -DECLARE_string(tera_fake_zk_path_prefix); int main(int argc, char** argv) { ::google::ParseCommandLineFlags(&argc, &argv, true); ::google::InitGoogleLogging(argv[0]); - - FLAGS_tera_zk_enabled = false; - FLAGS_tera_leveldb_env_type = "local"; - tera::utils::SetupLog("master_test"); + FLAGS_tera_leveldb_env_type = "local"; ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/master/test/trackable_gc_test.cc b/src/master/test/trackable_gc_test.cc index 7d6c78dd6..09cee5dda 100644 --- a/src/master/test/trackable_gc_test.cc +++ 
b/src/master/test/trackable_gc_test.cc @@ -11,7 +11,7 @@ #include "master/tablet_manager.h" #include "utils/utils_cmd.h" -DECLARE_bool(tera_zk_enabled); +DECLARE_string(tera_coord_type); DECLARE_string(tera_leveldb_env_type); DECLARE_string(tera_master_gc_strategy); DECLARE_string(tera_tabletnode_path_prefix); @@ -500,7 +500,7 @@ class TrackableGcTest : public ::testing::Test { static void SetUpTestCase() { std::cout << "SetUpTestCase" << std::endl; - FLAGS_tera_zk_enabled = false; + FLAGS_tera_coord_type = "fake_zk"; FLAGS_tera_leveldb_env_type = "local"; FLAGS_tera_master_gc_strategy = "trackable"; FLAGS_tera_tabletnode_path_prefix = "./"; diff --git a/src/master/workload_scheduler.cc b/src/master/workload_scheduler.cc index f0f70540c..5933827cb 100644 --- a/src/master/workload_scheduler.cc +++ b/src/master/workload_scheduler.cc @@ -11,6 +11,7 @@ DECLARE_double(tera_master_load_balance_size_ratio_trigger); DECLARE_int32(tera_master_load_balance_ts_load_threshold); +DECLARE_int64(tera_master_load_balance_ts_size_threshold); DECLARE_int32(tera_master_load_balance_scan_weight); namespace tera { @@ -76,8 +77,8 @@ bool SizeScheduler::MayMoveOut(const TabletNodePtr& node, const std::string& table_name) { VLOG(16) << "[size-sched] MayMoveOut()"; int64_t node_size = node->GetSize(table_name); - if (node_size <= 0) { - VLOG(16) << "[size-sched] node has no data"; + if (node_size <= FLAGS_tera_master_load_balance_ts_size_threshold) { + VLOG(16) << "[size-sched] node do not need loadbalance"; return false; } return true; diff --git a/src/monitor/teramo_main.cc b/src/monitor/teramo_main.cc index d2a5d6417..169437948 100644 --- a/src/monitor/teramo_main.cc +++ b/src/monitor/teramo_main.cc @@ -19,7 +19,7 @@ #include "proto/tabletnode.pb.h" #include "tera.h" #include "utils/utils_cmd.h" -#include "utils/timer.h" +#include "common/timer.h" DEFINE_string(tera_monitor_default_request_filename, "tera_monitor.request", ""); DEFINE_string(tera_monitor_default_response_filename, 
"tera_monitor.response", ""); @@ -34,6 +34,7 @@ DECLARE_string(tera_ins_addr_list); DECLARE_string(tera_ins_root_path); DECLARE_bool(tera_zk_enabled); DECLARE_bool(tera_ins_enabled); +DECLARE_string(tera_coord_type); DECLARE_int64(tera_master_stat_table_interval); using namespace tera; @@ -296,9 +297,11 @@ void InitFlags(int32_t argc, char** argv, const MonitorRequest& request) { if (request.has_tera_zk_root()) { FLAGS_tera_ins_root_path = request.tera_zk_root(); } + FLAGS_tera_coord_type = "ins"; FLAGS_tera_ins_enabled = true; FLAGS_tera_zk_enabled = false; } else { + FLAGS_tera_coord_type = "zk"; if (request.has_tera_zk_addr()) { FLAGS_tera_zk_addr_list = request.tera_zk_addr(); } diff --git a/src/observer/executor/key_selector.h b/src/observer/executor/key_selector.h new file mode 100644 index 000000000..b6746b612 --- /dev/null +++ b/src/observer/executor/key_selector.h @@ -0,0 +1,29 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_EXECUTOR_KEY_SELECTOR_H_ +#define TERA_OBSERVER_EXECUTOR_KEY_SELECTOR_H_ + +#include +#include + +#include "tera.h" + +namespace tera { +namespace observer { + +class KeySelector { +public: + virtual ~KeySelector() {} + + // output: selected table name, selected start key + virtual bool SelectStart(std::string* table_name, + std::string* start_key) = 0; + virtual ErrorCode Observe(const std::string& table_name) = 0; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_EXECUTOR_KEY_SELECTOR_H_ diff --git a/src/observer/executor/notification.h b/src/observer/executor/notification.h new file mode 100644 index 000000000..a73cbb255 --- /dev/null +++ b/src/observer/executor/notification.h @@ -0,0 +1,38 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_EXECUTOR_NOTIFICATION_H_ +#define TERA_OBSERVER_EXECUTOR_NOTIFICATION_H_ + +#include +#include + +#include "tera.h" + +#pragma GCC visibility push(default) + +namespace tera { +namespace observer { + +class Notification { +public: + virtual ~Notification() {} + + virtual void Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) = 0; + + virtual void Notify(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) = 0; +}; + +} // namespace observer +} // namespace tera + +#pragma GCC visibility pop + +#endif // TERA_OBSERVER_EXECUTOR_NOTIFICATION_H_ diff --git a/src/observer/executor/notification_impl.cc b/src/observer/executor/notification_impl.cc new file mode 100644 index 000000000..125509d79 --- /dev/null +++ b/src/observer/executor/notification_impl.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "observer/executor/notification_impl.h" + +#include + +#include "common/timer.h" +#include "common/base/string_number.h" +#include "sdk/global_txn_internal.h" +#include "types.h" + +namespace tera { +namespace observer { + +Notification* GetNotification(Transaction* transaction) { + return new NotificationImpl(transaction); +} + +NotificationImpl::NotificationImpl(Transaction* transaction) + : transaction_(transaction), + start_timestamp_(get_micros()), + notify_timestamp_(0) {} + + void NotificationImpl::Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) { + if (transaction_ != NULL) { + transaction_->Ack(t, row_key, column_family, qualifier); + return; + } + + // kNoneTransaction + tera::RowMutation* mutation = t->NewRowMutation(row_key); + std::string notify_qulifier = PackNotifyName(column_family, qualifier); + mutation->DeleteColumns(kNotifyColumnFamily, notify_qulifier, start_timestamp_); + t->ApplyMutation(mutation); + delete mutation; + } + +void NotificationImpl::Notify(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) { + if (transaction_ != NULL) { + transaction_->Notify(t, row_key, column_family, qualifier); + return; + } + + // kNoneTransaction + if (notify_timestamp_ == 0) { + notify_timestamp_ = get_micros(); + } + + tera::ErrorCode err; + std::string notify_qulifier = PackNotifyName(column_family, qualifier); + t->Put(row_key, kNotifyColumnFamily, notify_qulifier, NumberToString(notify_timestamp_), notify_timestamp_, &err); + if (err.GetType() != tera::ErrorCode::kOK) { + LOG(ERROR) << "Notify error. 
table: " << t->GetName() << " row " + << row_key << " pos: " << column_family << ":" << qualifier; + } +} + +} // namespace observer +} // namespace tera diff --git a/src/observer/executor/notification_impl.h b/src/observer/executor/notification_impl.h new file mode 100644 index 000000000..a88399d79 --- /dev/null +++ b/src/observer/executor/notification_impl.h @@ -0,0 +1,42 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_EXECUTOR_NOTIFICATION_IMPL_H_ +#define TERA_OBSERVER_EXECUTOR_NOTIFICATION_IMPL_H_ + +#include +#include + +#include "observer/executor/notification.h" +#include "tera.h" + +namespace tera { +namespace observer { + +Notification* GetNotification(Transaction* transaction); + +class NotificationImpl : public Notification { +public: + explicit NotificationImpl(Transaction* transaction); + virtual ~NotificationImpl() {} + + virtual void Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier); + + virtual void Notify(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier); +private: + Transaction* transaction_; + int64_t start_timestamp_; + int64_t notify_timestamp_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_EXECUTOR_NOTIFICATION_IMPL_H_ diff --git a/src/observer/executor/notify_cell.h b/src/observer/executor/notify_cell.h new file mode 100644 index 000000000..9567c7231 --- /dev/null +++ b/src/observer/executor/notify_cell.h @@ -0,0 +1,110 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#ifndef TERA_OBSERVER_EXECUTOR_NOTIFY_CELL_H_
+#define TERA_OBSERVER_EXECUTOR_NOTIFY_CELL_H_
+
+#include
+#include
+#include
+#include
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "observer/executor/observer.h"
+#include "observer/rowlocknode/fake_rowlock_client.h"
+#include "sdk/rowlock_client.h"
+#include "tera.h"
+
+DECLARE_string(rowlock_server_port);
+DECLARE_string(rowlock_server_ip);
+DECLARE_bool(mock_rowlock_enable);
+
+
+namespace tera {
+namespace observer {
+// Names one observed column as a (table, column family, qualifier) triple.
+struct Column {
+  std::string table_name;
+  std::string family;
+  std::string qualifier;
+  // Orders columns by table_name, then family, then qualifier.
+  bool operator<(const Column& other) const {
+    int32_t result = 0;
+    result = table_name.compare(other.table_name);
+    if (result != 0) {
+      return result < 0;
+    }
+    result = family.compare(other.family);
+    if (result != 0) {
+      return result < 0;
+    }
+    result = qualifier.compare(other.qualifier);
+
+    return result < 0;
+  }
+
+  bool operator==(const Column& other) const {
+    return table_name == other.table_name && family == other.family
+           && qualifier == other.qualifier;
+  }
+};
+// Sends an UnLock request for (table_name, row) from its destructor.
+struct AutoRowUnlocker {
+  AutoRowUnlocker(const std::string& table,
+                  const std::string& unlock_row)
+      : table_name(table),
+        row(unlock_row) {}
+  AutoRowUnlocker() {}
+  // FLAGS_mock_rowlock_enable selects FakeRowlockClient instead of RowlockClient.
+  ~AutoRowUnlocker() {
+    // UnLockRow
+
+    if (FLAGS_mock_rowlock_enable == true) {
+      client.reset(new FakeRowlockClient());
+    } else {
+      client.reset(new RowlockClient());
+    }
+
+    RowlockRequest request;
+    RowlockResponse response;
+
+    request.set_row(row);
+    request.set_table_name(table_name);
+
+    client->UnLock(&request, &response);
+    VLOG(12) <<"[time] Transaction finish. [row] " << row;
+  }
+
+  std::unique_ptr client;
+  std::string table_name;
+  std::string row;
+};
+
+// One notification being processed by the scanner. Takes ownership of the transaction passed in (deleted in the destructor).
+struct NotifyCell {
+  NotifyCell(tera::Transaction* t) : timestamp(0), transaction(t),
+                                     table(NULL) {}
+  ~NotifyCell() {
+    if (transaction) {
+      delete transaction;
+    }
+  }
+
+  std::string row;
+  std::string value;
+  int64_t timestamp;  // 0 until the observed cell has been read
+
+  Column observed_column;
+  tera::Transaction* transaction;  // may be NULL
+  tera::Table* table;  // not deleted by NotifyCell
+
+  std::shared_ptr unlocker;  // shared by all cells of the same row
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif // TERA_OBSERVER_EXECUTOR_NOTIFY_CELL_H_
diff --git a/src/observer/executor/observer.h b/src/observer/executor/observer.h
new file mode 100644
index 000000000..db1d912ae
--- /dev/null
+++ b/src/observer/executor/observer.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_H_
+#define TERA_OBSERVER_H_
+
+#include
+
+#include "tera/client.h"
+#include "tera/error_code.h"
+#include "tera/transaction.h"
+#include "observer/executor/notification.h"
+
+#pragma GCC visibility push(default)
+namespace tera {
+namespace observer {
+// Kind of transaction an observer runs its callback in.
+enum TransactionType {
+  kGlobalTransaction = 0,
+  kSingleRowTransaction = 1,
+  kNoneTransaction = 2,
+};
+// Interface implemented by user code that wants to be called back when an observed column is notified.
+class Observer {
+public:
+  virtual ~Observer() {}
+
+  // Called once per notified cell. If Ack or Notify are needed during
+  // OnNotify, call notification->Ack and notification->Notify
+  // before the transaction commits.
+  virtual ErrorCode OnNotify(tera::Transaction* t,
+                             tera::Client* client,
+                             const std::string& table_name,
+                             const std::string& family,
+                             const std::string& qualifier,
+                             const std::string& row,
+                             const std::string& value,
+                             int64_t timestamp,
+                             Notification* notification) = 0;
+  // return observer name
+  virtual std::string GetObserverName() const = 0;
+
+  // return TransactionType
+  virtual TransactionType GetTransactionType() const = 0;
+};
+
+} // namespace observer
+} // namespace tera
+#pragma GCC visibility pop
+
+#endif // TERA_OBSERVER_H_
diff --git a/src/observer/executor/random_key_selector.cc b/src/observer/executor/random_key_selector.cc
new file mode 100644
index 000000000..75b0129ab
--- /dev/null
+++ b/src/observer/executor/random_key_selector.cc
@@ -0,0 +1,163 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/executor/random_key_selector.h"
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "types.h"
+
+DECLARE_string(flagfile);
+
+namespace tera {
+namespace observer {
+
+// Creates the tera client from FLAGS_flagfile and starts the background
+// thread that runs Update().
+RandomKeySelector::RandomKeySelector()
+    : tables_(new std::map>),
+      quit_(false),
+      cond_(&quit_mutex_) {
+  tera::ErrorCode err;
+  client_ = tera::Client::NewClient(FLAGS_flagfile, &err);
+  update_thread_.Start(std::bind(&RandomKeySelector::Update, this));
+}
+
+// Signals the update thread to quit, waits for it, then releases the client.
+RandomKeySelector::~RandomKeySelector() {
+  {
+    MutexLock locker(&quit_mutex_);
+    quit_ = true;
+    cond_.Broadcast();
+  }
+
+  update_thread_.Join();
+  if (client_ != NULL) {
+    delete client_;
+  }
+}
+
+// Picks a random observed table and the start key of one of its tablets.
+// Returns false when nothing is observed yet or when the chosen table has no
+// tablet information in the current snapshot.
+bool RandomKeySelector::SelectStart(std::string* table_name,
+                                    std::string* start_key) {
+  // Seed the generator exactly once (C++11 guarantees thread-safe
+  // initialization of function-local statics).  Re-seeding with time(NULL) on
+  // every call made all calls within the same second -- i.e. all scanner
+  // threads -- choose the same table and tablet.
+  static const bool seeded = (srand((unsigned)time(NULL)), true);
+  (void)seeded;
+
+  std::shared_ptr>> table_read_copy;
+  {
+    MutexLock locker(&table_mutex_);
+    // copy for copy-on-write, ref +1
+    table_read_copy = tables_;
+
+    if (table_read_copy->size() == 0 || observe_tables_.empty()) {
+      return false;
+    }
+
+    // random table; observe_tables_ is appended to by Observe() under
+    // table_mutex_, so it has to be read under the same lock
+    uint32_t table_no = rand() % observe_tables_.size();
+    *table_name = observe_tables_[table_no];
+  }
+
+  // random key; use find() rather than operator[] so that a reader never
+  // inserts into the snapshot shared with other threads
+  auto tablets_it = table_read_copy->find(*table_name);
+  if (tablets_it == table_read_copy->end() || tablets_it->second.empty()) {
+    LOG(ERROR) << "No tablet";
+    return false;
+  }
+
+  uint32_t tablet_no = rand() % tablets_it->second.size();
+  *start_key = tablets_it->second[tablet_no].start_key;
+
+  VLOG(25) << "Random StartKey=" << *start_key << " TabletNo=" << tablet_no;
+  return true;
+}
+
+// Registers table_name for selection.  The first time a table is seen its
+// tablet locations are fetched and stored in the snapshot map.
+ErrorCode RandomKeySelector::Observe(const std::string& table_name) {
+  tera::ErrorCode err;
+
+  MutexLock locker(&table_mutex_);
+
+  if (!tables_.unique()) {
+    // In this case other threads may be reading this copy.
+    // Shared_ptr construct a new copy from the original one.
+    // Later requests will operate on the new copy.
+    tables_.reset(new std::map>(*tables_));
+  }
+  if (tables_->find(table_name) == tables_->end()) {
+
+    std::vector tablets;
+    client_->GetTabletLocation(table_name, &tablets, &err);
+    if (tera::ErrorCode::kOK != err.GetType()) {
+      LOG(ERROR) << "Observe table failed, " << err.ToString();
+      return err;
+    }
+    observe_tables_.push_back(table_name);
+    (*tables_)[table_name] = tablets;
+  }
+  return err;
+}
+
+// Body of update_thread_: after each wait on cond_ it re-fetches the tablet
+// locations of all observed tables and publishes them as a new snapshot.
+// Returns once quit_ is set.
+void RandomKeySelector::Update() {
+  tera::ErrorCode err;
+  while (true) {
+    {
+      MutexLock locker(&quit_mutex_);
+      if (quit_) {
+        return;
+      }
+      cond_.TimeWaitInUs(kObserverWaitTime);
+      // woken by the destructor: do not start another round of RPCs
+      if (quit_) {
+        return;
+      }
+    }
+
+    // copy the table list under table_mutex_; Observe() may append to
+    // observe_tables_ concurrently
+    decltype(observe_tables_) observed_tables;
+    {
+      MutexLock locker(&table_mutex_);
+      observed_tables = observe_tables_;
+    }
+
+    // update data first
+    std::shared_ptr>> table_update_copy(
+        new std::map>);
+
+    // updated table
+    for (uint32_t i = 0; i < observed_tables.size(); ++i) {
+      std::string table_name = observed_tables[i];
+
+      std::vector tablets;
+      client_->GetTabletLocation(table_name, &tablets, &err);
+      if (tera::ErrorCode::kOK != err.GetType()) {
+        LOG(ERROR) << "Observe table failed, " << err.ToString();
+        continue;
+      }
+
+      table_update_copy->insert(std::pair>(table_name, tablets));
+    }
+
+    // update pointer
+    MutexLock locker(&table_mutex_);
+    tables_.swap(table_update_copy);
+  }
+}
+
+} // namespace observer
+} // namespace tera
diff --git a/src/observer/executor/random_key_selector.h b/src/observer/executor/random_key_selector.h
new file mode 100644
index 000000000..5a20fb4f3
--- /dev/null
+++ b/src/observer/executor/random_key_selector.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_RANDOM_KEY_SELECTOR_H_
+#define TERA_OBSERVER_EXECUTOR_RANDOM_KEY_SELECTOR_H_
+
+#include
+#include
+#include
+#include
+
+#include "common/mutex.h"
+#include "common/thread.h"
+#include "observer/executor/key_selector.h"
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+// KeySelector that starts each scan at the start key of a randomly chosen tablet of a randomly chosen observed table.
+class RandomKeySelector : public KeySelector {
+public:
+  RandomKeySelector();
+  virtual ~RandomKeySelector();
+  // Writes the chosen table and tablet start key to the out-parameters; returns false if nothing can be selected.
+  virtual bool SelectStart(std::string* table_name,
+                           std::string* start_key);
+  virtual ErrorCode Observe(const std::string& table_name);  // adds a table to the set SelectStart() chooses from
+private:
+  void Update();  // body of update_thread_: periodically refreshes tables_
+
+private:
+  tera::Client* client_;  // created in the constructor, deleted in the destructor
+  mutable Mutex table_mutex_;  // held whenever the tables_ pointer is read or replaced
+  std::vector observe_tables_;  // names of the observed tables, in registration order
+  std::shared_ptr>> tables_;  // table name -> tablet list; readers take a reference, writers replace the map
+  common::Thread update_thread_;
+
+  mutable Mutex quit_mutex_;  // protects quit_; paired with cond_
+  bool quit_;
+  common::CondVar cond_;
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif // TERA_OBSERVER_EXECUTOR_RANDOM_KEY_SELECTOR_H_
\ No newline at end of file
diff --git a/src/observer/executor/scanner.h b/src/observer/executor/scanner.h
new file mode 100644
index 000000000..a11a8646d
--- /dev/null
+++ b/src/observer/executor/scanner.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_SCANNER_H_
+#define TERA_SCANNER_H_
+
+#include
+
+#include "observer/executor/observer.h"
+#include "tera/error_code.h"
+
+#pragma GCC visibility push(default)
+namespace tera {
+namespace observer {
+// Public interface of the observer scanner; exported with default visibility.
+class Scanner {
+public:
+  static Scanner* GetScanner();  // implemented in scanner_impl.cc, returns ScannerImpl::GetInstance()
+
+  virtual ~Scanner() {}
+
+  // Register a user-defined observer for table_name:column_family:qualifier.
+  // Users should not destruct observers; the scanner deletes them.
+  virtual ErrorCode Observe(const std::string& table_name,
+                            const std::string& column_family,
+                            const std::string& qualifier,
+                            Observer* observer) = 0;
+  // ScannerImpl::Observe() fails until Init() has created the client, so call Init() first.
+  virtual bool Init() = 0;
+  // Starts scanning; in ScannerImpl this queues the scan loops on a thread pool.
+  virtual bool Start() = 0;
+  // Asks the scanner's threads to stop.
+  virtual void Exit() = 0;
+};
+
+} // namespace observer
+} // namespace tera
+#pragma GCC visibility pop
+
+#endif // TERA_SCANNER_H_
diff --git a/src/observer/executor/scanner_entry.cc b/src/observer/executor/scanner_entry.cc
new file mode 100644
index 000000000..5b012b339
--- /dev/null
+++ b/src/observer/executor/scanner_entry.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/executor/scanner_entry.h"
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "observer/executor/scanner_impl.h"
+
+namespace tera {
+namespace observer {
+
+ScannerEntry::ScannerEntry() {}
+
+ScannerEntry::~ScannerEntry() {}
+// Takes the scanner, initializes it, registers observers through Observe() and starts scanning.
+bool ScannerEntry::StartServer() {
+  scanner_.reset(tera::observer::Scanner::GetScanner());
+  // NOTE(review): scanner_ now owns the instance returned by GetScanner(); confirm nothing uses it after reset()
+  if(!scanner_->Init()) {
+    LOG(ERROR) << "fail to init scanner_impl";
+    return false;
+  }
+  // observe observers to scanner
+  ErrorCode err = Observe();
+  if (tera::ErrorCode::kOK != err.GetType()) {
+    LOG(ERROR) << "Observe failed, reason: " << err.ToString();
+    return false;
+  }
+  if(!scanner_->Start()) {
+    LOG(ERROR) << "fail to start scanner_impl";
+    return false;
+  }
+  return true;
+}
+// Stops and releases the scanner. Safe when StartServer() never ran or when called more than once.
+void ScannerEntry::ShutdownServer() {
+  LOG(INFO) << "shut down scanner";
+  if (scanner_.get() != NULL) {
+    scanner_->Exit();
+    scanner_.reset();
+  }
+  LOG(INFO) << "scanner stop done!";
+}
+
+bool ScannerEntry::Run() {
+  ThisThread::Sleep(1000);
+  return true;
+}
+
+ErrorCode ScannerEntry::Observe() {
+  ErrorCode err;
+  return err;
+}
+
+Scanner* ScannerEntry::GetScanner() const {
+  return scanner_.get();
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/executor/scanner_entry.h b/src/observer/executor/scanner_entry.h
new file mode 100644
index 000000000..ed5e5c325
--- /dev/null
+++ b/src/observer/executor/scanner_entry.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_SCANNER_ENTRY_H_
+#define TERA_OBSERVER_EXECUTOR_SCANNER_ENTRY_H_
+
+#include
+#include
+
+#include "common/this_thread.h"
+#include "observer/executor/observer.h"
+#include "tera.h"
+#include "tera_entry.h"
+
+namespace tera {
+namespace observer {
+
+class Scanner;
+// TeraEntry that drives the observer scanner; subclasses override Observe() to register their observers.
+class ScannerEntry : public TeraEntry {
+public:
+  ScannerEntry();
+  virtual ~ScannerEntry();
+
+  virtual bool StartServer();  // init scanner, call Observe(), start scanning
+  virtual bool Run();  // sleeps via ThisThread::Sleep(1000) and returns true
+  virtual void ShutdownServer();  // stops and releases the scanner
+  // Hook called by StartServer() after the scanner is initialized; the base version registers nothing and returns a default ErrorCode.
+  virtual ErrorCode Observe();
+  Scanner* GetScanner() const;  // returns scanner_.get(); ownership stays with this entry
+private:
+  std::unique_ptr scanner_;  // set in StartServer(), reset in ShutdownServer()
+};
+
+
+} // namespace observer
+} // namespace tera
+
+#endif // TERA_OBSERVER_EXECUTOR_SCANNER_ENTRY_H_
\ No newline at end of file
diff --git a/src/observer/executor/scanner_impl.cc b/src/observer/executor/scanner_impl.cc
new file mode 100644
index 000000000..f42ba6b05
--- /dev/null
+++ b/src/observer/executor/scanner_impl.cc
@@ -0,0 +1,657 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/executor/scanner_impl.h"
+
+#include
+#include
+#include
+
+#include
+
+#include "gflags/gflags.h"
+
+#include "common/base/string_number.h"
+#include "observer/executor/random_key_selector.h"
+#include "observer/executor/notification.h"
+#include "observer/executor/notification_impl.h"
+#include "observer/rowlocknode/fake_rowlock_client.h"
+#include "sdk/table_impl.h"
+#include "sdk/sdk_utils.h"
+#include "tera.h"
+#include "types.h"
+
+DECLARE_int32(observer_proc_thread_num);
+DECLARE_int32(observer_scanner_thread_num);
+DECLARE_int32(observer_ack_conflict_timeout);
+DECLARE_int64(observer_max_pending_task);
+DECLARE_int64(observer_ack_timeout_time);
+DECLARE_string(flagfile);
+DECLARE_string(rowlock_server_ip);
+DECLARE_string(rowlock_server_port);
+DECLARE_int32(observer_rowlock_client_thread_num);
+DECLARE_bool(mock_rowlock_enable);
+
+namespace tera {
+namespace observer {
+// The single scanner instance, created during static initialization.
+ScannerImpl* ScannerImpl::scanner_instance_ = new ScannerImpl();
+Scanner* Scanner::GetScanner() {
+  return ScannerImpl::GetInstance();
+}
+
+ScannerImpl* ScannerImpl::GetInstance() {
+  return scanner_instance_;
+}
+// Sizes both thread pools from gflags and starts the profiling thread; tera_client_ stays NULL until Init().
+ScannerImpl::ScannerImpl()
+    : tera_client_(NULL),
+      table_observe_info_(new std::map),
+      scan_table_threads_(new common::ThreadPool(FLAGS_observer_scanner_thread_num)),
+      transaction_threads_(new common::ThreadPool(FLAGS_observer_proc_thread_num)),
+      quit_(false),
+      cond_(&quit_mutex_) {
+  profiling_thread_.Start(std::bind(&ScannerImpl::Profiling, this));
+}
+// Signals quit, stops both pools, joins the profiling thread, then deletes the opened tables, the client and all registered observers.
+ScannerImpl::~ScannerImpl() {
+  Exit();
+
+  scan_table_threads_->Stop(true);
+  transaction_threads_->Stop(true);
+  profiling_thread_.Join();
+
+  MutexLock locker(&table_mutex_);
+  // close table
+  for (auto it = table_observe_info_->begin(); it != table_observe_info_->end(); ++it) {
+    if (it->second.table != NULL) {
+      delete it->second.table;
+    }
+  }
+
+  if (tera_client_ != NULL) {
+    delete tera_client_;
+  }
+
+  for (auto it = observers_.begin(); it != observers_.end(); ++it) {
+    delete *it;
+  }
+}
+// Registers observer for table_name:column_family:qualifier. Fails before Init(), if the table cannot be opened, on a transaction type mismatch, or on a duplicate registration.
+ErrorCode ScannerImpl::Observe(const std::string& table_name,
+                               const std::string& column_family,
+                               const std::string& qualifier,
+                               Observer* observer) {
+  // Observe before init
+  tera::ErrorCode err;
+  if (NULL == tera_client_) {
+    LOG(ERROR) << "Init scanner first!";
+    err.SetFailed(ErrorCode::kSystem, "observe before scanner init");
+    return err;
+  }
+
+  Column column = {table_name, column_family, qualifier};
+
+  {
+    MutexLock locker(&table_mutex_);
+    if (!table_observe_info_.unique()) {
+      // Shared_ptr construct a new copy from the original one.
+      // Former requests still reading the original shared_ptr
+      // Write operation executed on the new copy, so as the later requests
+      table_observe_info_.reset(new std::map(*table_observe_info_));
+    }
+
+    if ((*table_observe_info_)[table_name].table == NULL) {
+      // init table
+      tera::Table* table = tera_client_->OpenTable(table_name, &err);
+      if (tera::ErrorCode::kOK != err.GetType()) {
+        LOG(ERROR) << "open tera table [" << table_name << "] failed, " << err.ToString();
+        return err;
+      }
+      LOG(INFO) << "open tera table [" << table_name << "] succ";
+
+      // build map
+      (*table_observe_info_)[table_name].table = table;
+      (*table_observe_info_)[table_name].type = GetTableTransactionType(table);
+    }
+
+    if (!CheckTransactionTypeLegalForTable(observer->GetTransactionType(),
+                                           (*table_observe_info_)[table_name].type)) {
+      LOG(ERROR) << "Transaction type does not match table. table_name: " << table_name
+                 << " type: " << (*table_observe_info_)[table_name].type << " , observer name: " <<
+                 observer->GetObserverName() << " type: " << observer->GetTransactionType();
+      err.SetFailed(ErrorCode::kSystem, "Transaction type does not match table");
+      return err;
+    }
+
+    auto it = (*table_observe_info_)[table_name].observe_columns[column].insert(observer);
+    if (!it.second) {
+      LOG(ERROR) << "Observer " << observer->GetObserverName() << " observe " << table_name
+                 << ":" << column_family << ":" << qualifier << " more than once!";
+      err.SetFailed(ErrorCode::kSystem, "the same observer observe the same column more than once");
+      return err;
+    }
+    observers_.insert(observer);
+  }
+
+  err = key_selector_->Observe(table_name);
+  LOG(INFO) << "Observer start. table: " << table_name << " cf:qu " << column_family << ":" <<
+            qualifier << " observer: " << observer->GetObserverName();
+
+  return err;
+}
+// Creates the tera client from FLAGS_flagfile (only if not created yet) and installs a RandomKeySelector.
+bool ScannerImpl::Init() {
+  tera::ErrorCode err;
+  if (NULL == tera_client_) {
+    tera_client_ = tera::Client::NewClient(FLAGS_flagfile, &err);
+
+    if (tera::ErrorCode::kOK != err.GetType()) {
+      LOG(ERROR) << "init tera client [" << FLAGS_flagfile << "] failed, " << err.ToString();
+      return false;
+    }
+  }
+
+  // init key_selector_
+  // different selector started by different flags
+  key_selector_.reset(new RandomKeySelector());
+
+  return true;
+}
+// Queues FLAGS_observer_scanner_thread_num ScanTable() loops on the scan pool.
+bool ScannerImpl::Start() {
+  for (int32_t idx = 0; idx < FLAGS_observer_scanner_thread_num; ++idx) {
+    scan_table_threads_->AddTask(std::bind(&ScannerImpl::ScanTable, this));
+  }
+  return true;
+}
+// Sets quit_ and wakes every thread waiting on cond_.
+void ScannerImpl::Exit() {
+  // the scope of quit_mutex only covers cond_ broadcast
+  MutexLock locker(&quit_mutex_);
+  quit_ = true;
+  cond_.Broadcast();
+}
+
+tera::Client* ScannerImpl::GetTeraClient() const {
+  return tera_client_;
+}
+// Scan loop: scans from a selected start key onwards and, when that scan completes, from the beginning up to the start key.
+void ScannerImpl::ScanTable() {
+  std::string start_key;
+  std::string table_name;
+  std::set columns;
+  tera::Table* table = NULL;
+
+  // table and start key will be refreshed.
+  while (true) {
+    {
+      MutexLock locker(&quit_mutex_);
+      if (quit_) {
+        break;
+      }
+      cond_.TimeWaitInUs(kObserverWaitTime);
+    }
+
+    if (key_selector_->SelectStart(&table_name, &start_key)) {
+      GetObserveColumns(table_name, &columns);
+    } else {
+      continue;
+    }
+
+    table = GetTable(table_name);
+    if (DoScanTable(table, columns, start_key, "")) {
+      DoScanTable(table, columns, "", start_key);
+    }
+  }
+}
+// Scans the notify column family from start_key to end_key; each row with observed cells is row-locked and its cells are passed to DoReadValue().
+bool ScannerImpl::DoScanTable(tera::Table* table,
+                              const std::set& columns,
+                              const std::string& start_key,
+                              const std::string& end_key) {
+  if (table == NULL) {
+    return false;
+  }
+
+  LOG(INFO) << "Start scan table. Table name: [" << table->GetName()
+            << "]. Start key: [" << start_key << "]";
+
+  tera::ScanDescriptor desc(start_key);
+  desc.SetEnd(end_key);
+  // Notify stores in single lg
+  desc.AddColumnFamily(kNotifyColumnFamily);
+  tera::ErrorCode err;
+  std::unique_ptr result_stream(table->Scan(desc, &err));
+  if (tera::ErrorCode::kOK != err.GetType()) {
+    LOG(ERROR) << "table scan failed, " << err.ToString();
+    return false;
+  }
+
+  if (result_stream->Done(&err)) {
+    LOG(ERROR) << " ERR " << err.GetReason();
+  }
+
+  bool finished = false;
+  std::string rowkey;
+  std::vector vec_col;
+  while (NextRow(columns, result_stream.get(), table->GetName(), &finished, &rowkey, &vec_col)) {
+    // lock row
+    if (!TryLockRow(table->GetName(), rowkey)) {
+      // collision
+      LOG(INFO) <<"[rowlock failed] table=" << table->GetName() << " row=" << rowkey;
+      return false;
+    }
+    VLOG(12) <<"[time] Transaction start. [row] " << rowkey;
+
+    // automatic unlock
+    std::shared_ptr unlocker(
+        new AutoRowUnlocker(table->GetName(), rowkey));
+
+    for (uint32_t i = 0; i < vec_col.size(); ++i ) {
+      tera::Transaction* t = NULL;
+      TransactionType type;
+      {
+        MutexLock locker(&table_mutex_);
+        type = (*table_observe_info_)[table->GetName()].type;
+      }
+
+      switch (type) {
+        case kGlobalTransaction:
+          t = tera_client_->NewGlobalTransaction();
+          if (t == NULL) {
+            LOG(ERROR) << "NewGlobalTransaction failed. Notify cell ignored. table: " << table->GetName()
+                       << " row: " << rowkey << " family: " << vec_col[i].family
+                       << " qualifier: " << vec_col[i].qualifier;
+            continue;
+          }
+          break;
+        case kSingleRowTransaction:
+          t = table->StartRowTransaction(rowkey);
+          if (t == NULL) {
+            LOG(ERROR) << "StartRowTransaction failed. Notify cell ignored. table: " << table->GetName()
+                       << " row: " << rowkey << " family: " << vec_col[i].family
+                       << " qualifier: " << vec_col[i].qualifier;
+            continue;
+          }
+          break;
+        default:
+          break;
+      }
+      std::shared_ptr notify_cell(new NotifyCell(t));
+      notify_cell->table = table;
+      notify_cell->row = rowkey;
+      notify_cell->observed_column = vec_col[i];
+      notify_cell->unlocker = unlocker;
+
+      DoReadValue(notify_cell);
+    }
+
+    MutexLock locker(&quit_mutex_);
+    if (quit_) {
+      return false;
+    }
+  }
+  return finished;  // true only when NextRow() reached the end of the result stream
+}
+// Collects the observed columns of the next row into *vec_col and its key into *row. Returns false at end of stream (*finished = true), on scan error or on quit.
+bool ScannerImpl::NextRow(const std::set& columns, tera::ResultStream* result_stream,
+                          const std::string& table_name, bool* finished,
+                          std::string* row, std::vector* vec_col) {
+  tera::ErrorCode err;
+
+  // check finish
+  if (result_stream->Done(&err)) {
+    *finished = true;
+    return false;
+  }
+
+  if (tera::ErrorCode::kOK != err.GetType()) {
+    LOG(ERROR) << "scanning failed" << err.ToString();
+    return false;
+  }
+
+  vec_col->clear();
+  *row = result_stream->RowName();
+
+  // scan cell
+  while (!result_stream->Done(&err) && result_stream->RowName() == *row) {
+    while (transaction_threads_->PendingNum() > FLAGS_observer_max_pending_task) {
+      VLOG(12) << "transaction_threads pending: " << transaction_threads_->PendingNum();
+      MutexLock locker(&quit_mutex_);
+      if (quit_) {
+        return false;
+      }
+      cond_.TimeWaitInUs(kObserverWaitTime);
+    }
+    std::string ob_cf;
+    std::string ob_qu;
+
+    if (!ParseNotifyQualifier(result_stream->Qualifier(), &ob_cf, &ob_qu)) {
+      LOG(WARNING) << "parse notify qualifier failed: " << result_stream->Qualifier();
+      result_stream->Next();
+      continue;
+    }
+
+    Column ob_col = {table_name, ob_cf, ob_qu};
+    if (columns.end() == columns.find(ob_col)) {
+      LOG(WARNING) << "miss observed column, table_name" << table_name <<
+                   " cf=" << ob_cf << " qu=" << ob_qu;
+      result_stream->Next();
+      continue;
+    }
+    vec_col->push_back(ob_col);
+    result_stream->Next();
+  }
+  return true;
+}
+
+// Splits a notify qualifier "cf:qualifier" at the first ':' (example: "C:url" gives cf "C" and qualifier "url").
+// Returns false when there is no ':' or when either part is empty.
+bool ScannerImpl::ParseNotifyQualifier(const std::string& notify_qualifier,
+                                       std::string* data_family,
+                                       std::string* data_qualifier) {
+
+  std::vector frags;
+  std::size_t pos = std::string::npos;
+  std::size_t start_pos = 0;
+  std::string frag;
+
+  // parse cf
+  pos = notify_qualifier.find_first_of(':', start_pos);
+  if (pos == std::string::npos) {
+    LOG(ERROR) << "Parse notify qualifier error: " << notify_qualifier;
+    return false;
+  }
+  frag = notify_qualifier.substr(start_pos, pos - start_pos);
+  frags.push_back(frag);
+  start_pos = pos + 1;
+
+  pos = notify_qualifier.size();
+  frag = notify_qualifier.substr(start_pos, pos - start_pos);
+  frags.push_back(frag);
+  if (2 != frags.size()) {
+    return false;
+  }
+  if (frags[0] == "" || frags[1] == "") {
+    return false;
+  }
+  *data_family = frags[0];
+  *data_qualifier = frags[1];
+
+  return true;
+}
+// Reads the observed cell (through the cell's transaction when it has one) and queues one OnNotify task per registered observer; global-transaction observers pass the Ack check first.
+bool ScannerImpl::DoReadValue(std::shared_ptr notify_cell) {
+  VLOG(12) <<"[time] do read value start. [row] " << notify_cell->row;
+  std::unique_ptr row_reader(notify_cell->table->NewRowReader(notify_cell->row));
+  assert(row_reader.get() != NULL);
+  row_reader->AddColumn(notify_cell->observed_column.family, notify_cell->observed_column.qualifier);
+  // transaction read
+  if (notify_cell->transaction != NULL) {
+    notify_cell->transaction->Get(row_reader.get());
+  } else {
+    notify_cell->table->Get(row_reader.get());
+  }
+  VLOG(12) <<"[time] do read value finish. [row] " << notify_cell->row;
+  if (tera::ErrorCode::kOK == row_reader->GetError().GetType()) {
+    notify_cell->value = row_reader->Value();
+    notify_cell->timestamp = row_reader->Timestamp();
+
+    std::shared_ptr> table_observe_info_read_copy;
+    {
+      MutexLock locker(&table_mutex_);
+      // shared_ptr ref +1
+      table_observe_info_read_copy = table_observe_info_;
+    }
+
+    auto it = table_observe_info_read_copy->find(notify_cell->observed_column.table_name);
+    if (it == table_observe_info_read_copy->end()) {
+      LOG(WARNING) << "table not found: " << notify_cell->observed_column.table_name;
+      return false;
+    }
+
+    if (it->second.observe_columns.find(notify_cell->observed_column) == it->second.observe_columns.end()) {
+      LOG(WARNING) << "column not found. cf: " << notify_cell->observed_column.family
+                   << " qu: " << notify_cell->observed_column.qualifier;
+      return false;
+    }
+
+    if (it->second.observe_columns[notify_cell->observed_column].size() == 0) {
+      LOG(WARNING) << "no match observers, table=" << notify_cell->observed_column.table_name <<
+                   " cf=" << notify_cell->observed_column.family << " qu=" << notify_cell->observed_column.qualifier;
+      return false;
+    }
+
+    std::set& observer_set = (*table_observe_info_read_copy)[notify_cell->observed_column.table_name].observe_columns[notify_cell->observed_column];
+
+    // only gtxn check ack
+    if ((*observer_set.begin())->GetTransactionType() == kGlobalTransaction
+        && !CheckConflictOnAckColumn(notify_cell, observer_set)) {
+      LOG(WARNING) << "Ack failed ! row=" << notify_cell->row << " cf=" << notify_cell->observed_column.family <<
+                   " qu=" << notify_cell->observed_column.qualifier;
+      return false;
+    }
+    // every column may have more than one observers
+    for (auto observer = observer_set.begin(); observer != observer_set.end(); ++observer) {
+      Observer* target = *observer;  // capture the pointer, not the iterator: the snapshot may be released before the task runs
+      transaction_threads_->AddTask( [=] (int64_t) {
+        total_counter_.Inc();
+        std::unique_ptr notification(GetNotification(notify_cell->transaction));
+        tera::ErrorCode err = target->OnNotify(notify_cell->transaction, tera_client_, notify_cell->observed_column.table_name,
+                                               notify_cell->observed_column.family, notify_cell->observed_column.qualifier,
+                                               notify_cell->row, notify_cell->value, notify_cell->timestamp, notification.get());
+        if (err.GetType() != tera::ErrorCode::kOK) {
+          LOG(WARNING) << "OnNotify failed! reason: " << err.GetReason();
+          fail_counter_.Inc();
+        }
+      });
+    }
+
+  } else {
+    LOG(WARNING) << "[read failed] table=" << notify_cell->table->GetName() << " cf=" << notify_cell->observed_column.family <<
+                 " qu=" << notify_cell->observed_column.qualifier << " row=" << notify_cell->row <<
+                 " err=" << row_reader->GetError().GetType() << row_reader->GetError().GetReason();
+    return false;
+  }
+
+  return true;
+}
+// Fills *columns with the columns currently observed on table_name; leaves it empty when the table is unknown.
+void ScannerImpl::GetObserveColumns(const std::string& table_name, std::set* columns) {
+  columns->clear();
+  std::shared_ptr> table_observe_info_read_copy;
+  {
+    MutexLock locker(&table_mutex_);
+    // shared_ptr ref +1
+    table_observe_info_read_copy = table_observe_info_;
+  }
+  // find() instead of operator[]: a reader must never insert into the snapshot shared with other threads
+  auto info = table_observe_info_read_copy->find(table_name);
+  if (info == table_observe_info_read_copy->end()) return;
+  for (const auto& column_observers : info->second.observe_columns) {
+    columns->insert(column_observers.first);
+  }
+}
+// Returns the opened table for table_name, or NULL when the table is not observed.
+tera::Table* ScannerImpl::GetTable(const std::string table_name) {
+  std::shared_ptr> table_observe_info_read_copy;
+  {
+    MutexLock locker(&table_mutex_);
+    table_observe_info_read_copy = table_observe_info_;
+  }
+  auto info = table_observe_info_read_copy->find(table_name);
+  return info == table_observe_info_read_copy->end() ? NULL : info->second.table;
+}
+// Body of profiling_thread_: periodically logs and resets the notify counters until quit_ is set.
+void ScannerImpl::Profiling() {
+  while (true) {
+    {
+      MutexLock locker(&quit_mutex_);
+      if (quit_) {
+        return;
+      }
+      cond_.TimeWaitInUs(kObserverWaitTime);
+    }
+    LOG(INFO) << "[Observer Profiling Info] total: " << total_counter_.Get() <<
+              " failed: " << fail_counter_.Get() << " transaction pending: " <<
+              transaction_threads_->PendingNum();
+    total_counter_.Clear();
+    fail_counter_.Clear();
+  }
+}
+// Reads the per-observer Ack cells of the row in a row transaction; when none conflicts, writes new Acks stamped with this transaction's start timestamp. Returns true when processing may proceed.
+bool ScannerImpl::CheckConflictOnAckColumn(std::shared_ptr notify_cell,
+                                           const std::set& observers) {
+  VLOG(12) <<"[time] Check ACK start. [cf:qu] " << notify_cell->observed_column.family
+           << notify_cell->observed_column.qualifier;
+  bool is_collision = false;
+  std::vector ack_qualifier_list;
+  std::string ack_qualifier_prefix = GetAckQualifierPrefix(notify_cell->observed_column.family,
+                                                           notify_cell->observed_column.qualifier);
+
+  // use transaction to read column Ack
+  std::unique_ptr row_transaction(notify_cell->table->StartRowTransaction(notify_cell->row));
+  if (row_transaction.get() == NULL) {
+    // StartRowTransaction can return NULL (DoScanTable checks for it too); report it as a conflict
+    LOG(WARNING) << "StartRowTransaction failed when checking Ack, row=" << notify_cell->row;
+    return false;
+  }
+  // read Acks
+  std::unique_ptr row_reader(notify_cell->table->NewRowReader(notify_cell->row));
+  for (auto it : observers) {
+    std::string ack_qualifier = GetAckQualifier(ack_qualifier_prefix, it->GetObserverName());
+    ack_qualifier_list.push_back(ack_qualifier);
+
+    row_reader->AddColumn(notify_cell->observed_column.family, ack_qualifier);
+  }
+  row_transaction->Get(row_reader.get());
+  if (tera::ErrorCode::kOK == row_reader->GetError().GetType()) {
+    while (!row_reader->Done()) {
+      int64_t latest_observer_start_ts = 0;
+      if (!StringToNumber(row_reader->Value(), &latest_observer_start_ts)) {
+        LOG(ERROR) << "Convert string to timestamp failed! String: " << row_reader->Value() <<
+                   " row=" << notify_cell->row << " cf=" << notify_cell->observed_column.family <<
+                   " qu=" << notify_cell->observed_column.qualifier;
+        is_collision = true;
+        break;
+      }
+
+      // collision check: ack ts later than notify ts &&
+      if (latest_observer_start_ts >= notify_cell->timestamp &&
+          notify_cell->transaction->GetStartTimestamp() - latest_observer_start_ts
+          < FLAGS_observer_ack_conflict_timeout) {
+        // time too short, collisision, ignore
+
+        is_collision = true;
+        LOG(INFO) << "own collision. row=" << notify_cell->row <<
+                  " cf=" << notify_cell->observed_column.family << " qu=" <<
+                  notify_cell->observed_column.qualifier <<
+                  ", latest observer start_ts=" << latest_observer_start_ts <<
+                  ", observer start_ts=" << notify_cell->transaction->GetStartTimestamp() <<
+                  ", data commit_ts=" << notify_cell->timestamp;
+        break;
+
+      }
+      row_reader->Next();
+    }
+  } else {
+    LOG(INFO) << "read Acks failed, err=" << row_reader->GetError().GetReason() <<
+              " row=" << notify_cell->row << " cf=" << notify_cell->observed_column.family <<
+              " qu=" << notify_cell->observed_column.qualifier;
+  }
+
+  if (!is_collision) {
+    // set Acks
+    std::unique_ptr mutation(notify_cell->table->NewRowMutation(notify_cell->row));
+    for (size_t idx = 0; idx < ack_qualifier_list.size(); ++idx) {
+      mutation->Put(notify_cell->observed_column.family, ack_qualifier_list[idx],
+                    std::to_string(notify_cell->transaction->GetStartTimestamp()));
+    }
+    row_transaction->ApplyMutation(mutation.get());
+    notify_cell->table->CommitRowTransaction(row_transaction.get());
+    if (row_transaction->GetError().GetType() != tera::ErrorCode::kOK) {
+      LOG(INFO) << "write Ack failed, row=" << notify_cell->row << " err=" <<
+                row_transaction->GetError().GetReason() << " cf=" <<
+                notify_cell->observed_column.family << " qu=" <<
+                notify_cell->observed_column.qualifier;
+      is_collision = true;
+    }
+  }
+  VLOG(12) <<"[time] Check ACK finish. [cf:qu] " << notify_cell->observed_column.family
+           << notify_cell->observed_column.qualifier;
+
+  return !is_collision;
+}
+// Returns family + ":" + qualifier, the prefix shared by all Ack qualifiers of one observed column.
+std::string ScannerImpl::GetAckQualifierPrefix(const std::string& family,
+                                               const std::string& qualifier) const {
+  return family + ":" + qualifier;
+}
+// Returns prefix + "+ack_" + observer_name, so that every observer gets its own Ack qualifier.
+std::string ScannerImpl::GetAckQualifier(const std::string& prefix,
+                                         const std::string& observer_name) const {
+  return prefix + "+ack_" + observer_name;
+}
+// Tries to take the row lock through a fake or real rowlock client (FLAGS_mock_rowlock_enable); true only when the call succeeded and the status is kLockSucc.
+bool ScannerImpl::TryLockRow(const std::string& table_name,
+                             const std::string& row) const {
+  VLOG(12) << "[time] trylock " << table_name << " " << row;
+  RowlockRequest request;
+  RowlockResponse response;
+
+  std::shared_ptr rowlock_client;
+
+  if (FLAGS_mock_rowlock_enable == true) {
+    rowlock_client.reset(new FakeRowlockClient());
+  } else {
+    rowlock_client.reset(new RowlockClient());
+  }
+
+  request.set_table_name(table_name);
+  request.set_row(row);
+
+  if (!rowlock_client->TryLock(&request, &response)) {
+    LOG(ERROR) << "TryLock rpc fail, row: " << row;
+    return false;
+  }
+  if (response.lock_status() != kLockSucc) {
+    LOG(INFO) << "Lock row fail, row: " << row;
+    return false;
+  }
+  VLOG(12) << "[time] trylock finish " << table_name << " " << row;
+  return true;
+}
+// An observer fits a table when both types are equal, or when a kNoneTransaction observer watches a kSingleRowTransaction table.
+bool ScannerImpl::CheckTransactionTypeLegalForTable(TransactionType type,
+                                                    TransactionType table_type) {
+  if (type == table_type) {
+    return true;
+  }
+
+  if (type == kNoneTransaction && table_type == kSingleRowTransaction) {
+    return true;
+  }
+
+  return false;
+}
+// Derives the table's transaction type from its schema. NOTE(review): table_impl is neither NULL-checked nor released here -- confirm who owns it.
+TransactionType ScannerImpl::GetTableTransactionType(tera::Table* table) {
+  tera::ErrorCode err;
+  TableImpl* table_impl(dynamic_cast(tera_client_)->OpenTableInternal(table->GetName(), &err));
+  TableSchema schema = table_impl->GetTableSchema();
+
+  if (IsTransactionTable(schema)) {
+    std::set gtxn_cfs;
+    FindGlobalTransactionCfs(schema, &gtxn_cfs);
+    if (gtxn_cfs.size() > 0) {
+      return kGlobalTransaction;
+    }
+    return kSingleRowTransaction;
+  }
+  return kNoneTransaction;
+}
+
+} // namespace
observer
+} // namespace tera
diff --git a/src/observer/executor/scanner_impl.h b/src/observer/executor/scanner_impl.h
new file mode 100644
index 000000000..833ff3fa4
--- /dev/null
+++ b/src/observer/executor/scanner_impl.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_OBSERVER_EXECUTOR_SCANNER_IMPL_H_
+#define TERA_OBSERVER_EXECUTOR_SCANNER_IMPL_H_
+
+#include
+#include
+
+#include "common/counter.h"
+#include "common/mutex.h"
+#include "common/thread_pool.h"
+#include "common/thread.h"
+#include "common/this_thread.h"
+#include "observer/executor/notify_cell.h"
+#include "observer/executor/observer.h"
+#include "observer/executor/scanner.h"
+#include "tera.h"
+
+namespace tera {
+namespace observer {
+
+class Observer;
+class KeySelector;
+// Scanner implementation: scan threads look for notify cells in observed tables and hand each one to the registered observers on a second thread pool.
+class ScannerImpl : public Scanner {
+private:
+  struct TableObserveInfo {
+    std::map> observe_columns;  // observed column -> observers registered for it
+    tera::Table* table;  // opened in Observe(), deleted in ~ScannerImpl()
+    TransactionType type;  // computed by GetTableTransactionType() when the table is opened
+  };
+
+public:
+  ScannerImpl();
+  virtual ~ScannerImpl();
+  // Registers observer for table_name:column_family:qualifier; the scanner deletes registered observers in its destructor.
+  virtual ErrorCode Observe(const std::string& table_name,
+                            const std::string& column_family,
+                            const std::string& qualifier,
+                            Observer* observer);
+  // Creates the tera client and the key selector; Observe() fails until this has run.
+  virtual bool Init();
+  // Queues the ScanTable() loops on scan_table_threads_.
+  virtual bool Start();
+  // Sets quit_ and wakes the waiting threads.
+  virtual void Exit();
+
+  tera::Client* GetTeraClient() const;
+
+  static ScannerImpl* GetInstance();  // returns scanner_instance_
+
+private:
+  void ScanTable();  // loop executed by each scan thread
+
+  bool DoScanTable(tera::Table* table,
+                   const std::set& column_set,
+                   const std::string& start_key,
+                   const std::string& end_key);
+
+  bool DoReadValue(std::shared_ptr notify_cell);
+  // Splits "cf:qualifier" at the first ':'; false when malformed.
+  bool ParseNotifyQualifier(const std::string& notify_qualifier,
+                            std::string* data_family,
+                            std::string* data_qualfier);
+
+  void GetObserveColumns(const std::string& table_name,
+                         std::set* column_set);
+
+  tera::Table* GetTable(const std::string table_name);
+
+  bool NextRow(const std::set& columns, tera::ResultStream* result_stream,
+               const std::string& table_name, bool* finished,
+               std::string* row, std::vector* vec_col);
+
+  void Profiling();  // body of profiling_thread_
+  // Returns true when no conflicting Ack exists and the new Acks were written.
+  bool CheckConflictOnAckColumn(std::shared_ptr notify_cell,
+                                const std::set& observers);
+  std::string GetAckQualifierPrefix(const std::string& family, const std::string& qualifier) const;
+  std::string GetAckQualifier(const std::string& prefix, const std::string& observer_name) const;
+  bool TryLockRow(const std::string& table_name,
+                  const std::string& row) const;
+
+  bool CheckTransactionTypeLegalForTable(TransactionType type, TransactionType table_type);
+  TransactionType GetTableTransactionType(tera::Table* table);
+
+private:
+  mutable Mutex table_mutex_;  // held while table_observe_info_ is read or replaced
+  tera::Client* tera_client_;  // NULL until Init()
+  std::unique_ptr key_selector_;
+
+  // map: table name -> TableObserveInfo; Observe() replaces the map with a copy when readers still hold the old one
+  std::shared_ptr> table_observe_info_;
+  // This set stores unique user-defined observer addresses.
+  // User-defined observers are released when the scanner is destructed.
+  std::set observers_;
+
+  std::unique_ptr scan_table_threads_;  // runs ScanTable()
+  std::unique_ptr transaction_threads_;  // runs the OnNotify tasks
+
+  // for quit
+  bool quit_;
+  mutable Mutex quit_mutex_;
+  common::CondVar cond_;
+
+  common::Thread profiling_thread_;
+  Counter total_counter_;  // OnNotify calls since the last Profiling() log line
+  Counter fail_counter_;  // failed OnNotify calls since the last Profiling() log line
+
+  static ScannerImpl* scanner_instance_;
+};
+
+} // namespace observer
+} // namespace tera
+
+#endif // TERA_OBSERVER_EXECUTOR_SCANNER_IMPL_H_
diff --git a/src/observer/observer_demo/demo_entry.cc b/src/observer/observer_demo/demo_entry.cc
new file mode 100644
index 000000000..7d6e3a361
--- /dev/null
+++ b/src/observer/observer_demo/demo_entry.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "observer/observer_demo/demo_entry.h"
+
+#include "observer/executor/observer.h"
+#include "observer/executor/scanner.h"
+#include "observer/observer_demo/demo_observer.h"
+#include "tera.h"
+
+std::string GetTeraEntryName() {
+  return "DemoEntry";
+}
+
+tera::TeraEntry* GetTeraEntry() {
+  return new tera::observer::DemoEntry();
+}
+
+namespace tera {
+namespace observer {
+
+DemoEntry::DemoEntry() {}
+
+ErrorCode DemoEntry::Observe() {
+  ErrorCode err;
+  // new an observer ptr and do not delete it
+  Observer* demo = new DemoObserver();
+  Observer* parser = new ParseObserver();
+  Observer* single_row_observer = new SingleRowObserver();
+  Observer* none_txn_observer = new NoneTransactionObserver();
+
+  Scanner* scanner = GetScanner();
+  err = scanner->Observe("observer_test_table", "Data", "Page", demo);
+  if (tera::ErrorCode::kOK != err.GetType()) {
+    return err;
+  }
+  err = scanner->Observe("observer_test_table", "Data", "Link", demo);
+  if (tera::ErrorCode::kOK != err.GetType()) {
+    return
err; + } + + err = scanner->Observe("observer_test_table", "Data", "Link", parser); + if (tera::ErrorCode::kOK != err.GetType()) { + return err; + } + + err = scanner->Observe("single_row_test_table", "Data", "Link", single_row_observer); + if (tera::ErrorCode::kOK != err.GetType()) { + return err; + } + + err = scanner->Observe("none_txn_test_table", "Data", "Link", none_txn_observer); + return err; + +} + +} // namespace observer +} // namespace tera \ No newline at end of file diff --git a/src/observer/observer_demo/demo_entry.h b/src/observer/observer_demo/demo_entry.h new file mode 100644 index 000000000..5f01ec840 --- /dev/null +++ b/src/observer/observer_demo/demo_entry.h @@ -0,0 +1,30 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_OBSERVER_DEMO_DEMO_ENTRY_H_ +#define TERA_OBSERVER_OBSERVER_DEMO_DEMO_ENTRY_H_ + +#include +#include + +#include "observer/executor/scanner_entry.h" +#include "tera.h" + +namespace tera { +namespace observer { + +class DemoEntry : public ScannerEntry { +public: + DemoEntry(); + virtual ~DemoEntry() {} + + virtual ErrorCode Observe(); +}; + + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_OBSERVER_DEMO_DEMO_ENTRY_H_ + diff --git a/src/observer/observer_demo/demo_observer.cc b/src/observer/observer_demo/demo_observer.cc new file mode 100644 index 000000000..07048af92 --- /dev/null +++ b/src/observer/observer_demo/demo_observer.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#include "observer/observer_demo/demo_observer.h"
+
+#include <glog/logging.h>
+
+namespace tera {
+namespace observer {
+
+// Handles a notification for DemoObserver: writes the "ForwordIndex" cell of
+// the notified row through transaction `t`, acks the notification and commits.
+//
+// Returns the ErrorCode filled in by OpenTable() when the table cannot be
+// opened (nothing is written in that case), otherwise the result of Commit().
+ErrorCode DemoObserver::OnNotify(tera::Transaction* t,
+                                 tera::Client* client,
+                                 const std::string& table_name,
+                                 const std::string& family,
+                                 const std::string& qualifier,
+                                 const std::string& row,
+                                 const std::string& value,
+                                 int64_t timestamp,
+                                 Notification* notification) {
+    VLOG(12) <<"[time] OnNotify start. [row] " << row;
+    LOG(INFO) << "[Notify DemoObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+
+    tera::ErrorCode err;
+    tera::Table* table = client->OpenTable(table_name, &err);
+    if (table == NULL) {
+        // Without this guard table->NewRowMutation() below dereferences NULL.
+        LOG(ERROR) << "[Notify DemoObserver] open table failed, table=" << table_name;
+        return err;
+    }
+
+    // write ForwordIndex column
+    tera::RowMutation* mutation = table->NewRowMutation(row);
+    mutation->Put("Data", "ForwordIndex", "FIValue_" + row);
+    t->ApplyMutation(mutation);
+
+    notification->Ack(table, row, family, qualifier);
+    tera::ErrorCode commit_err = t->Commit();
+    // The mutation is released only after Commit(), as in the original flow.
+    delete mutation;
+    VLOG(12) <<"[time] OnNotify finish. [row] " << row;
+    return commit_err;
+}
+
+// Name under which this observer is identified.
+std::string DemoObserver::GetObserverName() const {
+    return "DemoObserver";
+}
+
+// DemoObserver runs inside a global transaction.
+TransactionType DemoObserver::GetTransactionType() const {
+    return kGlobalTransaction;
+}
+
+// Handles a notification for ParseObserver: performs no data change, only acks
+// the notification and commits `t`.
+//
+// Returns the ErrorCode filled in by OpenTable() when the table cannot be
+// opened, otherwise the result of Commit().
+ErrorCode ParseObserver::OnNotify(tera::Transaction* t,
+                                  tera::Client* client,
+                                  const std::string& table_name,
+                                  const std::string& family,
+                                  const std::string& qualifier,
+                                  const std::string& row,
+                                  const std::string& value,
+                                  int64_t timestamp,
+                                  Notification* notification) {
+    LOG(INFO) << "[Notify ParseObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+
+    tera::ErrorCode err;
+    // do nothing
+    tera::Table* table = client->OpenTable(table_name, &err);
+    if (table == NULL) {
+        // Do not hand a NULL table to Ack().
+        LOG(ERROR) << "[Notify ParseObserver] open table failed, table=" << table_name;
+        return err;
+    }
+    notification->Ack(table, row, family, qualifier);
+    err = t->Commit();
+    return err;
+}
+
+// Name under which this observer is identified.
+std::string ParseObserver::GetObserverName() const {
+    return "ParseObserver";
+}
+
+// ParseObserver runs inside a global transaction.
+TransactionType ParseObserver::GetTransactionType() const {
+    return kGlobalTransaction;
+}
+
+// Handles a notification for SingleRowObserver: writes <family>:another_qu on
+// the notified row, acks the notification (plus a demo ack on "another_table")
+// and commits `t`.
+//
+// Returns the ErrorCode filled in by OpenTable() when the notified table
+// cannot be opened, otherwise the result of Commit().
+ErrorCode SingleRowObserver::OnNotify(tera::Transaction* t,
+                                      tera::Client* client,
+                                      const std::string& table_name,
+                                      const std::string& family,
+                                      const std::string& qualifier,
+                                      const std::string& row,
+                                      const std::string& value,
+                                      int64_t timestamp,
+                                      Notification* notification) {
+    LOG(INFO) << "[Notify SingleRowObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+
+    tera::ErrorCode err;
+    tera::Table* table = client->OpenTable(table_name, &err);
+    if (table == NULL) {
+        // Without this guard table->NewRowMutation() below dereferences NULL.
+        LOG(ERROR) << "[Notify SingleRowObserver] open table failed, table=" << table_name;
+        return err;
+    }
+
+    // single row txn
+    tera::RowMutation* mutation = table->NewRowMutation(row);
+    mutation->Put(family, "another_qu", "value");
+    t->ApplyMutation(mutation);
+
+    notification->Ack(table, row, family, qualifier);
+    tera::Table* another_table = client->OpenTable("another_table", &err);
+    if (another_table != NULL) {
+        notification->Ack(another_table, "somerow", "family", "qualifier");
+    } else {
+        // The secondary ack is best-effort in this demo; the row mutation is
+        // still committed, as it was before this guard existed.
+        LOG(WARNING) << "[Notify SingleRowObserver] open table failed, table=another_table";
+    }
+    tera::ErrorCode commit_err = t->Commit();
+    delete mutation;
+    return commit_err;
+}
+
+// Name under which this observer is identified.
+std::string SingleRowObserver::GetObserverName() const {
+    return "SingleRowObserver";
+}
+
+// SingleRowObserver runs inside a single-row transaction.
+TransactionType SingleRowObserver::GetTransactionType() const {
+    return kSingleRowTransaction;
+}
+
+// Handles a notification for NoneTransactionObserver: acks the notification
+// and raises a new notification on "notify_table"; `t` is not used and nothing
+// is committed.
+//
+// Returns the ErrorCode filled in by the last OpenTable() call.
+ErrorCode NoneTransactionObserver::OnNotify(tera::Transaction* t,
+                                            tera::Client* client,
+                                            const std::string& table_name,
+                                            const std::string& family,
+                                            const std::string& qualifier,
+                                            const std::string& row,
+                                            const std::string& value,
+                                            int64_t timestamp,
+                                            Notification* notification) {
+    LOG(INFO) << "[Notify NoneTransactionObserver] table:family:qualifer=" <<
+        table_name << ":" << family << ":" <<
+        qualifier << " row=" << row <<
+        " value=" << value << " timestamp=" << timestamp;
+
+    tera::ErrorCode err;
+    tera::Table* table = client->OpenTable(table_name, &err);
+    if (table == NULL) {
+        // Do not hand a NULL table to Ack().
+        LOG(ERROR) << "[Notify NoneTransactionObserver] open table failed, table=" << table_name;
+        return err;
+    }
+
+    // do something
+    // kNoneTransaction ack
+    notification->Ack(table, row, family, qualifier);
+
+    // kNoneTransaction notify
+    tera::Table* notify_table = client->OpenTable("notify_table", &err);
+    if (notify_table == NULL) {
+        // Do not hand a NULL table to Notify().
+        LOG(ERROR) << "[Notify NoneTransactionObserver] open table failed, table=notify_table";
+        return err;
+    }
+    notification->Notify(notify_table, "notify_row", "family", "qualifier");
+    return err;
+}
+
+// Name under which this observer is identified.
+std::string NoneTransactionObserver::GetObserverName() const {
+    return "NoneTransactionObserver";
+}
+
+// NoneTransactionObserver does not run inside a transaction.
+TransactionType NoneTransactionObserver::GetTransactionType() const {
+    return kNoneTransaction;
+}
+
+} // namespace observer
+} // namespace tera
\ No newline at end of file
diff --git a/src/observer/observer_demo/demo_observer.h b/src/observer/observer_demo/demo_observer.h
new file mode 100644
index 000000000..201feebf2
--- /dev/null
+++ b/src/observer/observer_demo/demo_observer.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#ifndef TERA_OBSERVER_OBSERVER_DEMO_DEMO_OBSERVER_H_ +#define TERA_OBSERVER_OBSERVER_DEMO_DEMO_OBSERVER_H_ + +#include "observer/executor/observer.h" +#include "tera.h" + +namespace tera { +namespace observer { + +class DemoObserver : public tera::observer::Observer { +public: + DemoObserver() {} + virtual ~DemoObserver() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification); + virtual std::string GetObserverName() const; + virtual TransactionType GetTransactionType() const; +}; + +class ParseObserver : public tera::observer::Observer { +public: + ParseObserver() {} + virtual ~ParseObserver() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification); + virtual std::string GetObserverName() const; + virtual TransactionType GetTransactionType() const; +}; + +class SingleRowObserver : public tera::observer::Observer { +public: + SingleRowObserver() {} + virtual ~SingleRowObserver() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification); + virtual std::string GetObserverName() const; + virtual TransactionType GetTransactionType() const; +}; + +class NoneTransactionObserver : public tera::observer::Observer { +public: + NoneTransactionObserver() {} + virtual ~NoneTransactionObserver() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& 
family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification); + virtual std::string GetObserverName() const; + virtual TransactionType GetTransactionType() const; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_OBSERVER_DEMO_DEMO_OBSERVER_H_ + diff --git a/src/observer/observer_demo/observe_demo_main.cc b/src/observer/observer_demo/observe_demo_main.cc new file mode 100644 index 000000000..af633255a --- /dev/null +++ b/src/observer/observer_demo/observe_demo_main.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include + +#include +#include + +#include "common/base/scoped_ptr.h" +#include "common/log/log_cleaner.h" +#include "tera_entry.h" +#include "utils/utils_cmd.h" +#include "version.h" + +DECLARE_string(tera_log_prefix); +DECLARE_string(tera_local_addr); +DECLARE_bool(tera_info_log_clean_enable); + +extern std::string GetTeraEntryName(); +extern tera::TeraEntry* GetTeraEntry(); + +volatile sig_atomic_t g_quit = 0; + +static void SignalIntHandler(int sig) { + g_quit = 1; +} + +int main(int argc, char** argv) { + ::google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_tera_log_prefix.empty()) { + FLAGS_tera_log_prefix = GetTeraEntryName(); + if (FLAGS_tera_log_prefix.empty()) { + FLAGS_tera_log_prefix = "tera"; + } + } + tera::utils::SetupLog(FLAGS_tera_log_prefix); + + if (argc > 1) { + std::string ext_cmd = argv[1]; + if (ext_cmd == "version") { + PrintSystemVersion(); + return 0; + } + } + + signal(SIGINT, SignalIntHandler); + signal(SIGTERM, SignalIntHandler); + + scoped_ptr entry(GetTeraEntry()); + if (entry.get() == NULL) { + return -1; + } + + if (!entry->Start()) { + return -1; + } + + // start log cleaner + if (FLAGS_tera_info_log_clean_enable) { + common::LogCleaner::StartCleaner(); + 
LOG(INFO) << "start log cleaner"; + } else { + LOG(INFO) << "log cleaner is disable"; + } + + while (!g_quit) { + if (!entry->Run()) { + LOG(ERROR) << "Server run error ,and then exit now "; + break; + } + } + if (g_quit) { + LOG(INFO) << "received interrupt signal from user, will stop"; + } + + common::LogCleaner::StopCleaner(); + + if (!entry->Shutdown()) { + return -1; + } + + return 0; +} diff --git a/src/observer/rowlocknode/fake_rowlock_client.h b/src/observer/rowlocknode/fake_rowlock_client.h new file mode 100644 index 000000000..d884d15e4 --- /dev/null +++ b/src/observer/rowlocknode/fake_rowlock_client.h @@ -0,0 +1,41 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_EXECUTOR_FAKE_ROWLOCK_CLIENT_H_ +#define TERA_OBSERVER_EXECUTOR_FAKE_ROWLOCK_CLIENT_H_ + +#include +#include + +#include "proto/rpc_client.h" +#include "sdk/rowlock_client.h" + +namespace tera { +namespace observer { + +class FakeRowlockClient : public RowlockClient { +public: + FakeRowlockClient() : RowlockClient("127.0.0.1:22222") {}; + ~FakeRowlockClient() {} + + virtual bool TryLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done = NULL) { + response->set_lock_status(kLockSucc); + return true; + } + + virtual bool UnLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done = NULL) { + response->set_lock_status(kLockSucc); + return true; + } +}; + +} // namespace observer +} // namespace tera +#endif // TERA_OBSERVER_EXECUTOR_FAKE_ROWLOCK_CLIENT_H_ + + diff --git a/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.cc b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.cc new file mode 100644 index 000000000..2cf0d8974 --- /dev/null +++ b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "observer/rowlocknode/fake_rowlocknode_zk_adapter.h" + +#include + +#include + +#include "common/this_thread.h" +#include "ins_sdk.h" +#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h" +#include "types.h" + +DECLARE_string(rowlock_ins_root_path); +DECLARE_int32(rowlock_server_node_num); +DECLARE_string(rowlock_fake_root_path); + +namespace tera { +namespace observer { + +FakeRowlockNodeZkAdapter::FakeRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, + const std::string& server_addr) : + rowlocknode_impl_(rowlocknode_impl), server_addr_(server_addr) { +} + +FakeRowlockNodeZkAdapter::~FakeRowlockNodeZkAdapter() { +} + +void FakeRowlockNodeZkAdapter::Init() { + std::string root_path = FLAGS_rowlock_fake_root_path; + + std::string node_num_key = root_path + kRowlockNodeNumPath; + zk::FakeZkUtil::WriteNode(node_num_key, std::to_string(FLAGS_rowlock_server_node_num)); + + // create node + int id = 0; + std::string id_lock_key; + std::string host_lock_key; + while (true) { + id_lock_key = root_path + kRowlockNodeIdListPath + "/" + std::to_string(id); + std::string file_path = "mkdir -p " + root_path + kRowlockNodeIdListPath; + system(file_path.c_str()); + if (zk::FakeZkUtil::WriteNode(id_lock_key, std::to_string(id))) { + break; + } else { + LOG(ERROR) << "[Fake rowlock zk]: write node " << id_lock_key << " failed"; + } + if (++id >= FLAGS_rowlock_server_node_num) { + id = 0; + } + ThisThread::Sleep(1); + } + + LOG(INFO) << "RowlockNode Id=" << id << " host=" << server_addr_ + << " nodenum=" << FLAGS_rowlock_server_node_num; +} + +void FakeRowlockNodeZkAdapter::OnLockChange(std::string session_id, bool deleted) { + _Exit(EXIT_FAILURE); +} + +} // namespace observer +} // namespace tera + diff --git a/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.h b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.h new file 
mode 100644 index 000000000..686b2cdef --- /dev/null +++ b/src/observer/rowlocknode/fake_rowlocknode_zk_adapter.h @@ -0,0 +1,55 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_ROWLOCKNODE_FAKE_ROWLOCKNODE_ZK_ADAPTER_H_ +#define TERA_OBSERVER_ROWLOCKNODE_FAKE_ROWLOCKNODE_ZK_ADAPTER_H_ + +#include +#include + +#include "observer/rowlocknode/rowlocknode_impl.h" +#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h" +#include "zk/zk_adapter.h" + +namespace galaxy { +namespace ins { +namespace sdk { + class InsSDK; +} // namespace sdk +} // namespace ins +} // namespace galaxy + +namespace tera { +namespace observer { + +class RowlockNodeImpl; + +class FakeRowlockNodeZkAdapter : public RowlockNodeZkAdapterBase { +public: + FakeRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, const std::string& server_addr); + virtual ~FakeRowlockNodeZkAdapter(); + virtual void Init(); + void OnLockChange(std::string session_id, bool deleted); + +private: + virtual void OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) {} + virtual void OnNodeValueChanged(const std::string& path, + const std::string& value) {} + virtual void OnNodeCreated(const std::string& path) {} + virtual void OnNodeDeleted(const std::string& path) {} + virtual void OnWatchFailed(const std::string& path, int watch_type, + int err) {} + virtual void OnSessionTimeout() {} + +private: + RowlockNodeImpl* rowlocknode_impl_; + std::string server_addr_; +}; + +} // namespace observer +} // namespace tera +#endif // TERA_OBSERVER_ROWLOCKNODE_FAKE_ROWLOCKNODE_ZK_ADAPTER_H_ + diff --git a/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc new file mode 100644 index 000000000..01c9e8970 --- /dev/null +++ 
b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "observer/rowlocknode/ins_rowlock_client_zk_adapter.h" + +#include +#include + +#include "ins_sdk.h" + +#include "sdk/rowlock_client.h" +#include "types.h" + +DECLARE_string(rowlock_ins_root_path); +DECLARE_string(tera_ins_addr_list); +DECLARE_int32(rowlock_server_node_num); +DECLARE_int64(tera_zk_retry_period); +DECLARE_int32(tera_zk_timeout); +DECLARE_int32(tera_zk_retry_max_times); + +namespace tera { +namespace observer { + +InsRowlockClientZkAdapter::InsRowlockClientZkAdapter(RowlockClient* server_client, + const std::string& server_addr) + : ZkRowlockClientZkAdapter(server_client, server_addr), + client_(server_client), + server_addr_(server_addr) {} + +bool InsRowlockClientZkAdapter::Init() { + std::string root_path = FLAGS_rowlock_ins_root_path; + std::vector value; + // create session + ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list); + + // put server_node_num + std::string rowlock_proxy_path = root_path + kRowlockProxyPath; + + galaxy::ins::sdk::ScanResult* result = ins_sdk_->Scan(rowlock_proxy_path + "/!", + rowlock_proxy_path + "/~"); + while (!result->Done()) { + CHECK_EQ(result->Error(), galaxy::ins::sdk::kOK); + value.push_back(result->Value()); + result->Next(); + } + delete result; + + client_->Update(value); + return true; +} + +} // namespace observer +} // namespace tera + diff --git a/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.h b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.h new file mode 100644 index 000000000..7f56389ce --- /dev/null +++ b/src/observer/rowlocknode/ins_rowlock_client_zk_adapter.h @@ -0,0 +1,50 @@ +// Copyright (c) 2015, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCK_CLIENT_ZK_ADAPTER_H_ +#define TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCK_CLIENT_ZK_ADAPTER_H_ + +#include "observer/rowlocknode/zk_rowlock_client_zk_adapter.h" +#include "zk/zk_adapter.h" + +namespace galaxy { +namespace ins { +namespace sdk { + class InsSDK; +} // namespace sdk +} // namespace ins +} // namespace galaxy + +namespace tera { +namespace observer { + +class RowlockClient; + +class InsRowlockClientZkAdapter : public ZkRowlockClientZkAdapter { +public: + InsRowlockClientZkAdapter(RowlockClient* server_client, const std::string& server_addr); + virtual ~InsRowlockClientZkAdapter() {}; + virtual bool Init(); +protected: + virtual void OnNodeValueChanged(const std::string& path, + const std::string& value) {} + virtual void OnWatchFailed(const std::string& path, int watch_type, + int err) {} + virtual void OnNodeDeleted(const std::string& path) {} + virtual void OnSessionTimeout() {} + virtual void OnNodeCreated(const std::string& path) {} + virtual void OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) {} + +private: + RowlockClient* client_; + std::string server_addr_; + galaxy::ins::sdk::InsSDK* ins_sdk_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCK_CLIENT_ZK_ADAPTER_H_ diff --git a/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.cc b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.cc new file mode 100644 index 000000000..c0ec709d5 --- /dev/null +++ b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include + +#include "common/this_thread.h" +#include "ins_sdk.h" +#include "observer/rowlocknode/ins_rowlocknode_zk_adapter.h" +#include "types.h" + +DECLARE_int64(tera_zk_retry_period); +DECLARE_string(rowlock_ins_root_path); +DECLARE_string(tera_ins_addr_list); +DECLARE_int32(rowlock_server_node_num); +DECLARE_string(rowlock_fake_root_path); + +namespace tera { +namespace observer { + +InsRowlockNodeZkAdapter::InsRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, + const std::string& server_addr) : + rowlocknode_impl_(rowlocknode_impl), server_addr_(server_addr) { +} + +InsRowlockNodeZkAdapter::~InsRowlockNodeZkAdapter() { +} + +static void InsOnLockChange(const galaxy::ins::sdk::WatchParam& param, + galaxy::ins::sdk::SDKError error) { + LOG(ERROR) << "recv lock change event" ; + InsRowlockNodeZkAdapter* ins_adp = static_cast(param.context); + ins_adp->OnLockChange(param.value, param.deleted); +} + +void InsRowlockNodeZkAdapter::Init() { + std::string root_path = FLAGS_rowlock_ins_root_path; + galaxy::ins::sdk::SDKError err; + // create session + ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list); + // get session id + std::string session_id = ins_sdk_->GetSessionID(); + + // put server_node_num + std::string node_num_key = root_path + kRowlockNodeNumPath; + if (!ins_sdk_->Put(node_num_key, std::to_string(FLAGS_rowlock_server_node_num), &err)) { + LOG(WARNING) << "put NodeNum fail"; + } + + // create node + int id = 0; + std::string id_lock_key; + std::string host_lock_key; + while (true) { + id_lock_key = root_path + kRowlockNodeIdListPath + "/" + std::to_string(id); + if (ins_sdk_->Put(id_lock_key, server_addr_, &err) && galaxy::ins::sdk::kOK == err) { + host_lock_key = root_path + kRowlockNodeHostListPath + "/" + server_addr_; + CHECK(ins_sdk_->Lock(host_lock_key, &err)) << "register fail"; + break; + } + if (++id >= FLAGS_rowlock_server_node_num) { + id = 0; + } + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + + // create watch 
node + CHECK(ins_sdk_->Watch(host_lock_key, &InsOnLockChange, this, &err)) << "watch lock fail"; + + LOG(ERROR) << "RowlockNode Id=" << id << " host=" << server_addr_ + << " nodenum=" << FLAGS_rowlock_server_node_num; +} + +void InsRowlockNodeZkAdapter::OnLockChange(std::string session_id, bool deleted) { + _Exit(EXIT_FAILURE); +} + +} // namespace observer +} // namespace tera + diff --git a/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.h b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.h new file mode 100644 index 000000000..b335115fa --- /dev/null +++ b/src/observer/rowlocknode/ins_rowlocknode_zk_adapter.h @@ -0,0 +1,56 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCKNODE_ZK_ADAPTER_H_ +#define TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCKNODE_ZK_ADAPTER_H_ + +#include +#include + +#include "observer/rowlocknode/rowlocknode_impl.h" +#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h" +#include "zk/zk_adapter.h" + +namespace galaxy { +namespace ins { +namespace sdk { + class InsSDK; +} // namespace sdk +} // namespace ins +} // namespace galaxy + +namespace tera { +namespace observer { + +class RowlockNodeImpl; + +class InsRowlockNodeZkAdapter : public RowlockNodeZkAdapterBase { +public: + InsRowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, const std::string& server_addr); + virtual ~InsRowlockNodeZkAdapter(); + virtual void Init(); + void OnLockChange(std::string session_id, bool deleted); + +private: + virtual void OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) {} + virtual void OnNodeValueChanged(const std::string& path, + const std::string& value) {} + virtual void OnNodeCreated(const std::string& path) {} + virtual void OnNodeDeleted(const std::string& path) {} + virtual void OnWatchFailed(const std::string& 
path, int watch_type, + int err) {} + virtual void OnSessionTimeout() {} + +private: + RowlockNodeImpl* rowlocknode_impl_; + std::string server_addr_; + galaxy::ins::sdk::InsSDK* ins_sdk_; +}; + +} // namespace observer +} // namespace tera +#endif // TERA_OBSERVER_ROWLOCKNODE_INS_ROWLOCKNODE_ZK_ADAPTER_H_ + diff --git a/src/observer/rowlocknode/remote_rowlocknode.cc b/src/observer/rowlocknode/remote_rowlocknode.cc new file mode 100644 index 000000000..533672607 --- /dev/null +++ b/src/observer/rowlocknode/remote_rowlocknode.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "observer/rowlocknode/remote_rowlocknode.h" + +#include "gflags/gflags.h" + +DECLARE_int32(rowlock_thread_max_num); + +namespace tera { +namespace observer { + +RemoteRowlockNode::RemoteRowlockNode(RowlockNodeImpl* rowlocknode_impl) : + rowlocknode_impl_(rowlocknode_impl) { +} + +RemoteRowlockNode::~RemoteRowlockNode() { +} + +void RemoteRowlockNode::Lock(google::protobuf::RpcController* controller, + const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + rowlocknode_impl_->TryLock(request, response, done); +} + +void RemoteRowlockNode::UnLock(google::protobuf::RpcController* controller, + const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + rowlocknode_impl_->UnLock(request, response, done); +} + +} // namespace observer +} // namespace tera diff --git a/src/observer/rowlocknode/remote_rowlocknode.h b/src/observer/rowlocknode/remote_rowlocknode.h new file mode 100644 index 000000000..6c65d79d2 --- /dev/null +++ b/src/observer/rowlocknode/remote_rowlocknode.h @@ -0,0 +1,37 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#ifndef TERA_OBSERVER_ROWLOCKNODE_REMOTE_ROWLOCKNODE_H_
+#define TERA_OBSERVER_ROWLOCKNODE_REMOTE_ROWLOCKNODE_H_
+
+#include "common/base/scoped_ptr.h"
+#include "common/thread_pool.h"
+#include "observer/rowlocknode/rowlocknode_impl.h"
+
+namespace tera {
+namespace observer {
+
+// RowlockService implementation that forwards each RPC to a RowlockNodeImpl
+// (see remote_rowlocknode.cc: Lock -> TryLock, UnLock -> UnLock).
+class RemoteRowlockNode : public RowlockService {
+public:
+    // Keeps the raw pointer only; the destructor in remote_rowlocknode.cc is
+    // empty, so rowlocknode_impl is not deleted by this class.
+    // NOTE(review): presumably the impl must outlive this object - confirm
+    // against the owner (RowlockNodeEntry).
+    explicit RemoteRowlockNode(RowlockNodeImpl* rowlocknode_impl);
+    ~RemoteRowlockNode();
+
+    // Forwards request/response/done to RowlockNodeImpl::TryLock.
+    // `controller` is not used by the implementation.
+    void Lock(google::protobuf::RpcController* controller,
+              const RowlockRequest* request,
+              RowlockResponse* response,
+              google::protobuf::Closure* done);
+
+    // Forwards request/response/done to RowlockNodeImpl::UnLock.
+    // `controller` is not used by the implementation.
+    void UnLock(google::protobuf::RpcController* controller,
+                const RowlockRequest* request,
+                RowlockResponse* response,
+                google::protobuf::Closure* done);
+
+private:
+    // Target of the forwarded calls; not owned.
+    RowlockNodeImpl* rowlocknode_impl_;
+};
+
+} // namespace observer
+} // namespace tera
+#endif // TERA_OBSERVER_ROWLOCKNODE_REMOTE_ROWLOCKNODE_H_
+
+
diff --git a/src/observer/rowlocknode/rowlock_db.h b/src/observer/rowlocknode/rowlock_db.h
new file mode 100644
index 000000000..94c98889c
--- /dev/null
+++ b/src/observer/rowlocknode/rowlock_db.h
@@ -0,0 +1,161 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCK_DB_H_ +#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCK_DB_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "common/base/scoped_ptr.h" +#include "common/mutex.h" +#include "common/thread_pool.h" +#include "common/timer.h" + +DECLARE_int32(rowlock_db_sharding_number); +DECLARE_int32(rowlock_db_ttl); +DECLARE_int32(rowlock_timing_wheel_patch_num); + +namespace tera { +namespace observer { + +class RowlockDB { +public: + RowlockDB() + : timing_wheel_pos_(0), + timing_wheel_patch_num_(FLAGS_rowlock_timing_wheel_patch_num) { + timing_wheel_.resize(timing_wheel_patch_num_); + } + + ~RowlockDB() {} + + bool TryLock(uint64_t row) { + MutexLock locker(&mutex_); + if (locks_.find(row) == locks_.end()) { + locks_[row].reset(new uint64_t(row)); + std::weak_ptr ptr = locks_[row]; + timing_wheel_[timing_wheel_pos_].push_back(ptr); + return true; + } else { + return false; + } + } + + void UnLock(uint64_t row) { + MutexLock locker(&mutex_); + locks_.erase(row); + } + + // call this function ever timeout period + // 1. pointer of timing wheel move forward by one step + // 2. clear all the rowlock keys and remove them from locks_ + // 3. 
the next 60 seconds all new rowlock keys will be put into this wheel patch
+    //
+    // UnLock() may release a key at any moment, so every weak_ptr taken from
+    // the wheel is promoted while mutex_ is held and skipped when the
+    // promotion yields an empty pointer.
+    void ClearTimeout() {
+        std::vector<std::weak_ptr<uint64_t>> buffer;
+        {
+            MutexLock locker(&mutex_);
+            // pointer forward
+            timing_wheel_pos_ = (timing_wheel_pos_ + 1) % timing_wheel_patch_num_;
+            // release memory
+            buffer.swap(timing_wheel_[timing_wheel_pos_]);
+        }
+
+        // remove key from locks_
+        for (size_t i = 0; i < buffer.size(); ++i) {
+            // Cheap pre-check without the mutex; it is only a hint because the
+            // key can still be released before the mutex is taken below.
+            if (buffer[i].expired()) {
+                continue;
+            }
+            MutexLock locker(&mutex_);
+            std::shared_ptr<uint64_t> key = buffer[i].lock();
+            if (key) {
+                // `key` keeps the value alive while its map entry is erased.
+                locks_.erase(*key);
+            }
+        }
+    }
+
+    // Number of rows currently locked in this shard.
+    size_t Size() const {
+        MutexLock locker(&mutex_);
+        return locks_.size();
+    }
+
+private:
+    // Guards locks_, timing_wheel_ and timing_wheel_pos_.
+    mutable Mutex mutex_;
+
+    // row -> owning pointer; the timing wheel only holds weak references, so
+    // erasing an entry here expires the matching wheel slot.
+    std::unordered_map<uint64_t, std::shared_ptr<uint64_t>> locks_;
+
+    // timing wheel
+    uint32_t timing_wheel_pos_;
+    uint32_t timing_wheel_patch_num_;
+    std::vector<std::vector<std::weak_ptr<uint64_t>>> timing_wheel_;
+};
+
+// Spreads row locks over FLAGS_rowlock_db_sharding_number RowlockDB shards
+// (shard index = row % shard count) and periodically expires old locks from a
+// single-threaded ThreadPool.
+class ShardedRowlockDB {
+public:
+    // Creates all shards, then runs the first cleanup pass and schedules the
+    // following ones.
+    ShardedRowlockDB() : thread_pool_(new ThreadPool(1)) {
+        lock_map_.resize(FLAGS_rowlock_db_sharding_number);
+
+        for (int32_t i = 0; i < FLAGS_rowlock_db_sharding_number; ++i) {
+            lock_map_[i].reset(new RowlockDB());
+        }
+        ScheduleClearTimeout();
+    }
+
+    ~ShardedRowlockDB() {}
+
+    // Returns true when `row` was not locked and is now locked by the caller.
+    bool TryLock(uint64_t row) {
+        std::unique_ptr<RowlockDB>& db_node = lock_map_[row % FLAGS_rowlock_db_sharding_number];
+        return db_node->TryLock(row);
+    }
+
+    // Releases the lock on `row`; a row that is not locked is ignored.
+    void UnLock(uint64_t row) {
+        std::unique_ptr<RowlockDB>& db_node = lock_map_[row % FLAGS_rowlock_db_sharding_number];
+        db_node->UnLock(row);
+    }
+
+    // Sum of the shard sizes; each shard is read under its own mutex, so the
+    // total is not one atomic snapshot.
+    size_t Size() const {
+        size_t size = 0;
+        for (size_t i = 0; i < lock_map_.size(); ++i) {
+            size += lock_map_[i]->Size();
+        }
+        return size;
+    }
+
+private:
+    // Runs one cleanup pass and re-schedules itself on thread_pool_.
+    void ScheduleClearTimeout() {
+        ClearTimeout();
+
+        ThreadPool::Task task = std::bind(&ShardedRowlockDB::ScheduleClearTimeout, this);
+        // everytime timing wheel move forward one step, every patch_num steps data will be cleared
+        thread_pool_->DelayTask(FLAGS_rowlock_db_ttl / FLAGS_rowlock_timing_wheel_patch_num, task);
+    }
+
+    
void ClearTimeout() { + for (int32_t i = 0; i < FLAGS_rowlock_db_sharding_number; ++i) { + lock_map_[i]->ClearTimeout(); + } + } + +private: + std::vector> lock_map_; + scoped_ptr thread_pool_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKNODE_ROWLOCK_DB_H_ diff --git a/src/observer/rowlocknode/rowlocknode_entry.cc b/src/observer/rowlocknode/rowlocknode_entry.cc new file mode 100644 index 000000000..eb2eb4e17 --- /dev/null +++ b/src/observer/rowlocknode/rowlocknode_entry.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "observer/rowlocknode/rowlocknode_entry.h" + +#include +#include + +#include "common/base/string_ext.h" +#include "common/base/string_number.h" +#include "common/net/ip_address.h" +#include "common/this_thread.h" +#include "common/thread_attributes.h" +#include "common/timer.h" +#include "common/counter.h" +#include "utils/rpc_timer_list.h" +#include "common/timer.h" +#include "observer/rowlocknode/remote_rowlocknode.h" + +DECLARE_string(rowlock_server_port); +DECLARE_int32(rowlock_io_service_pool_size); +DECLARE_int32(rowlock_rpc_work_thread_num); + +std::string GetTeraEntryName() { + return "rowlock"; +} + +tera::TeraEntry* GetTeraEntry() { + return new tera::observer::RowlockNodeEntry(); +} + +namespace tera { +namespace observer { + +RowlockNodeEntry::RowlockNodeEntry() : rowlocknode_impl_(NULL), remote_rowlocknode_(NULL) { + sofa::pbrpc::RpcServerOptions rpc_options; + rpc_options.max_throughput_in = -1; + rpc_options.max_throughput_out = -1; + rpc_options.work_thread_num = FLAGS_rowlock_rpc_work_thread_num; + rpc_options.io_service_pool_size = FLAGS_rowlock_io_service_pool_size; + rpc_options.no_delay = false; //use Nagle's Algorithm + rpc_options.write_buffer_base_block_factor = 0; //64Bytes per malloc + rpc_options.read_buffer_base_block_factor = 7; 
//8kBytes per malloc + rpc_server_.reset(new sofa::pbrpc::RpcServer(rpc_options)); +} + +RowlockNodeEntry::~RowlockNodeEntry() {} + +bool RowlockNodeEntry::StartServer() { + SetProcessorAffinity(); + IpAddress rowlocknode_addr("0.0.0.0", FLAGS_rowlock_server_port); + LOG(INFO) << "Start RPC server at: " << rowlocknode_addr.ToString(); + rowlocknode_impl_.reset(new RowlockNodeImpl()); + remote_rowlocknode_ = new RemoteRowlockNode(rowlocknode_impl_.get()); + rpc_server_->RegisterService(remote_rowlocknode_); + if (!rpc_server_->Start(rowlocknode_addr.ToString())) { + LOG(ERROR) << "start RPC server error"; + return false; + } + if (!rowlocknode_impl_->Init()) { + LOG(ERROR) << "fail to init rowlocknode_impl"; + return false; + } + LOG(INFO) << "finish starting RPC server"; + + return true; +} + +void RowlockNodeEntry::ShutdownServer() { + LOG(INFO) << "shut down server"; + rpc_server_->Stop(); + rowlocknode_impl_->Exit(); + rowlocknode_impl_.reset(); + LOG(INFO) << "RowlockNodeEntry stop done!"; +} + +bool RowlockNodeEntry::Run() { + ThisThread::Sleep(3000); + rowlocknode_impl_->PrintQPS(); + return true; +} + +void RowlockNodeEntry::SetProcessorAffinity() {} + +} // namespace observer +} // namespace tera diff --git a/src/observer/rowlocknode/rowlocknode_entry.h b/src/observer/rowlocknode/rowlocknode_entry.h new file mode 100644 index 000000000..b968e8d4e --- /dev/null +++ b/src/observer/rowlocknode/rowlocknode_entry.h @@ -0,0 +1,39 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ENTRY_H_ +#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ENTRY_H_ + +#include + +#include "common/base/scoped_ptr.h" +#include "observer/rowlocknode/remote_rowlocknode.h" +#include "observer/rowlocknode/rowlocknode_impl.h" +#include "tera_entry.h" + +namespace tera { +namespace observer { + +class RowlockNodeEntry : public tera::TeraEntry { +public: + RowlockNodeEntry(); + virtual ~RowlockNodeEntry(); + + virtual bool StartServer(); + virtual bool Run(); + virtual void ShutdownServer(); + void SetProcessorAffinity(); + +private: + common::Mutex mutex_; + + scoped_ptr rowlocknode_impl_; + RemoteRowlockNode* remote_rowlocknode_; + scoped_ptr rpc_server_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ENTRY_H_ diff --git a/src/observer/rowlocknode/rowlocknode_impl.cc b/src/observer/rowlocknode/rowlocknode_impl.cc new file mode 100644 index 000000000..a8563a156 --- /dev/null +++ b/src/observer/rowlocknode/rowlocknode_impl.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "observer/rowlocknode/rowlocknode_impl.h" + +#include "common/timer.h" +#include "observer/rowlocknode/fake_rowlocknode_zk_adapter.h" +#include "observer/rowlocknode/ins_rowlocknode_zk_adapter.h" +#include "observer/rowlocknode/rowlocknode_zk_adapter.h" +#include "utils/utils_cmd.h" + +DECLARE_string(rowlock_server_port); +DECLARE_string(tera_coord_type); + +namespace tera { +namespace observer { + +RowlockNodeImpl::RowlockNodeImpl() {} + +RowlockNodeImpl::~RowlockNodeImpl() {} + +bool RowlockNodeImpl::Init() { + std::string local_addr = tera::utils::GetLocalHostName() + ":" + FLAGS_rowlock_server_port; + if (FLAGS_tera_coord_type == "zk") { + zk_adapter_.reset(new RowlockNodeZkAdapter(this, local_addr)); + } else if (FLAGS_tera_coord_type == "ins") { + zk_adapter_.reset(new InsRowlockNodeZkAdapter(this, local_addr)); + } else { + zk_adapter_.reset(new FakeRowlockNodeZkAdapter(this, local_addr)); + } + + zk_adapter_->Init(); + + LOG(INFO) << "Rowlock node init finish"; + return true; +} + +bool RowlockNodeImpl::Exit() { + return true; +} + +void RowlockNodeImpl::TryLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + uint64_t rowlock_key = GetRowlockKey(request->table_name(), request->row()); + if (rowlock_db_.TryLock(rowlock_key)) { + response->set_lock_status(kLockSucc); + } else { + response->set_lock_status(kLockFail); + LOG(WARNING) << " table name: " << request->table_name() + << " row :" << request->row(); + } + + done->Run(); +} + +void RowlockNodeImpl::UnLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + uint64_t rowlock_key = GetRowlockKey(request->table_name(), request->row()); + rowlock_db_.UnLock(rowlock_key); + response->set_lock_status(kLockSucc); + done->Run(); +} + +void RowlockNodeImpl::PrintQPS() { + return; +} + +uint64_t RowlockNodeImpl::GetRowlockKey(const std::string& table_name, + const std::string& row) const { + // RowlockKey 
: TableName + Row + std::string rowlock_key_str = table_name + row; + return std::hash()(rowlock_key_str); + +} + + +} // namespace observer +} // namespace tera + diff --git a/src/observer/rowlocknode/rowlocknode_impl.h b/src/observer/rowlocknode/rowlocknode_impl.h new file mode 100644 index 000000000..a60b89dde --- /dev/null +++ b/src/observer/rowlocknode/rowlocknode_impl.h @@ -0,0 +1,52 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_IMPL_H_ +#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_IMPL_H_ + +#include +#include +#include + +#include "common/base/scoped_ptr.h" +#include "common/counter.h" +#include "common/mutex.h" +#include "observer/rowlocknode/fake_rowlocknode_zk_adapter.h" +#include "observer/rowlocknode/rowlock_db.h" +#include "observer/rowlocknode/rowlocknode_zk_adapter.h" +#include "proto/rowlocknode_rpc.pb.h" +#include "zk/zk_adapter.h" + +namespace tera { +namespace observer { + +class RowlockNodeImpl { +public: + RowlockNodeImpl(); + ~RowlockNodeImpl(); + + bool Init(); + + bool Exit(); + + void TryLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done); + + void UnLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done); + + void PrintQPS(); +private: + uint64_t GetRowlockKey(const std::string& table_name, const std::string& row) const; +private: + ShardedRowlockDB rowlock_db_; + std::unique_ptr zk_adapter_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_IMPL_H_ diff --git a/src/observer/rowlocknode/rowlocknode_zk_adapter.cc b/src/observer/rowlocknode/rowlocknode_zk_adapter.cc new file mode 100644 index 000000000..9d079a502 --- /dev/null +++ b/src/observer/rowlocknode/rowlocknode_zk_adapter.cc @@ -0,0 +1,119 @@ +// Copyright (c) 
2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "observer/rowlocknode/rowlocknode_zk_adapter.h" + +#include + +#include "common/this_thread.h" +#include "ins_sdk.h" +#include "types.h" + +DECLARE_string(rowlock_zk_root_path); +DECLARE_string(tera_zk_addr_list); +DECLARE_int32(rowlock_server_node_num); +DECLARE_int64(tera_zk_retry_period); +DECLARE_int32(tera_zk_timeout); +DECLARE_int32(tera_zk_retry_max_times); + +namespace tera { +namespace observer { + +RowlockNodeZkAdapter::RowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, + const std::string& server_addr) : + rowlocknode_impl_(rowlocknode_impl), server_addr_(server_addr) { +} + +RowlockNodeZkAdapter::~RowlockNodeZkAdapter() { +} + +void RowlockNodeZkAdapter::Init() { + std::string root_path = FLAGS_rowlock_zk_root_path; + std::string node_num_key = root_path + kRowlockNodeNumPath; + + int zk_errno = zk::ZE_OK;; + // init zk client + while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list, + FLAGS_rowlock_zk_root_path, FLAGS_tera_zk_timeout, + server_addr_, &zk_errno)) { + LOG(ERROR) << "fail to init zk : " << zk::ZkErrnoToString(zk_errno); + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + LOG(INFO) << "init zk success"; + + // get session id + int64_t session_id_int = 0; + if (!GetSessionId(&session_id_int, &zk_errno)) { + LOG(ERROR) << "get session id fail : " << zk::ZkErrnoToString(zk_errno); + return; + } + + // put server_node_num + zk_errno = zk::ZE_OK; + bool is_exist = true; + int32_t retry_count = 0; + std::string value = std::to_string(FLAGS_rowlock_server_node_num); + CheckExist(node_num_key, &is_exist, &zk_errno); + if (!is_exist) { + while (!CreateEphemeralNode(node_num_key, value, &zk_errno)) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to create master node"; + return; + } + LOG(ERROR) << "retry create rowlock number node in " + << 
FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + } else { + WriteNode(node_num_key, value, &zk_errno); + zk_errno = zk::ZE_OK; + } + + value = server_addr_; + + // create node + int id = 0; + std::string id_lock_key; + std::string host_lock_key; + + while (true) { + id_lock_key = root_path + kRowlockNodeIdListPath + "/" + std::to_string(id); + zk_errno = zk::ZE_OK; + + if (!CreateEphemeralNode(id_lock_key, server_addr_, &zk_errno)) { + LOG(ERROR) << "create rowlock node fail: " << id_lock_key; + } else { + break; + } + LOG(ERROR) << "fail to create serve-node : " << zk::ZkErrnoToString(zk_errno); + + if (++id >= FLAGS_rowlock_server_node_num) { + id = 0; + } + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + LOG(INFO) << "create serve-node success"; + + is_exist = false; + + // watch my node + while (!CheckAndWatchExist(id_lock_key, &is_exist, &zk_errno)) { + LOG(ERROR) << "fail to watch serve-node : " << zk::ZkErrnoToString(zk_errno); + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + LOG(INFO) << "watch rowlock-node success"; + + if (!is_exist) { + OnLockChange(); + } +} + +void RowlockNodeZkAdapter::OnLockChange() { + _Exit(EXIT_FAILURE); +} + +} // namespace observer +} // namespace tera + diff --git a/src/observer/rowlocknode/rowlocknode_zk_adapter.h b/src/observer/rowlocknode/rowlocknode_zk_adapter.h new file mode 100644 index 000000000..67324f85f --- /dev/null +++ b/src/observer/rowlocknode/rowlocknode_zk_adapter.h @@ -0,0 +1,55 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_H_ +#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_H_ + +#include +#include + +#include "observer/rowlocknode/rowlocknode_impl.h" +#include "observer/rowlocknode/rowlocknode_zk_adapter_base.h" +#include "zk/zk_adapter.h" + +namespace galaxy { +namespace ins { +namespace sdk { + class InsSDK; +} // namespace sdk +} // namespace ins +} // namespace galaxy + +namespace tera { +namespace observer { + +class RowlockNodeImpl; + +class RowlockNodeZkAdapter : public RowlockNodeZkAdapterBase { +public: + RowlockNodeZkAdapter(RowlockNodeImpl* rowlocknode_impl, const std::string& server_addr); + virtual ~RowlockNodeZkAdapter(); + virtual void Init(); + void OnLockChange(); + +private: + virtual void OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) {} + virtual void OnNodeValueChanged(const std::string& path, + const std::string& value) {} + virtual void OnNodeCreated(const std::string& path) {} + virtual void OnNodeDeleted(const std::string& path) {} + virtual void OnWatchFailed(const std::string& path, int watch_type, + int err) {} + virtual void OnSessionTimeout() {} + +private: + RowlockNodeImpl* rowlocknode_impl_; + std::string server_addr_; +}; + +} // namespace observer +} // namespace tera +#endif // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_H_ + diff --git a/src/observer/rowlocknode/rowlocknode_zk_adapter_base.h b/src/observer/rowlocknode/rowlocknode_zk_adapter_base.h new file mode 100644 index 000000000..1ef93ccfb --- /dev/null +++ b/src/observer/rowlocknode/rowlocknode_zk_adapter_base.h @@ -0,0 +1,21 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +#ifndef TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_BASE_H_ +#define TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_BASE_H_ + +#include "zk/zk_adapter.h" + +namespace tera { +namespace observer { + +class RowlockNodeZkAdapterBase : public tera::zk::ZooKeeperAdapter { +public: + virtual ~RowlockNodeZkAdapterBase() {} + virtual void Init() = 0; +}; + +} // namespace observer +} // namespace tera +#endif // TERA_OBSERVER_ROWLOCKNODE_ROWLOCKNODE_ZK_ADAPTER_BASE_H_ diff --git a/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc new file mode 100644 index 000000000..cacd993fc --- /dev/null +++ b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "observer/rowlocknode/zk_rowlock_client_zk_adapter.h" + +#include +#include + +#include "sdk/rowlock_client.h" +#include "types.h" + +DECLARE_string(rowlock_zk_root_path); +DECLARE_string(tera_zk_addr_list); +DECLARE_int32(rowlock_server_node_num); +DECLARE_int64(tera_zk_retry_period); +DECLARE_int32(tera_zk_timeout); +DECLARE_int32(tera_zk_retry_max_times); + +namespace tera { +namespace observer { + +ZkRowlockClientZkAdapter::ZkRowlockClientZkAdapter(RowlockClient* server_client, + const std::string& server_addr) + : client_(server_client), + server_addr_(server_addr) {} + +ZkRowlockClientZkAdapter::~ZkRowlockClientZkAdapter() { + ZooKeeperAdapter::Finalize(); +} + +bool ZkRowlockClientZkAdapter::Init() { + std::string root_path = FLAGS_rowlock_zk_root_path; + std::string proxy_path = root_path + kRowlockProxyPath; + + int zk_errno = zk::ZE_OK;; + // init zk client + while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list, + FLAGS_rowlock_zk_root_path, FLAGS_tera_zk_timeout, + server_addr_, &zk_errno)) { + LOG(ERROR) << "fail to init zk : " << 
zk::ZkErrnoToString(zk_errno); + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + LOG(INFO) << "init zk success"; + + std::vector child; + std::vector value; + + while (!ListChildren(proxy_path, &child, &value, &zk_errno)) { + LOG(ERROR) << "fail to get proxy addr : " << zk::ZkErrnoToString(zk_errno); + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + client_->Update(value); + return true; +} + +} // namespace observer +} // namespace tera \ No newline at end of file diff --git a/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.h b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.h new file mode 100644 index 000000000..76a388895 --- /dev/null +++ b/src/observer/rowlocknode/zk_rowlock_client_zk_adapter.h @@ -0,0 +1,29 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_ROWLOCKNODE_ZK_ROWLOCK_CLIENT_ZK_ADAPTER_H_ +#define TERA_OBSERVER_ROWLOCKNODE_ZK_ROWLOCK_CLIENT_ZK_ADAPTER_H_ + +#include "zk/zk_adapter.h" + +namespace tera { +namespace observer { + +class RowlockClient; + +class ZkRowlockClientZkAdapter : public zk::ZooKeeperLightAdapter { +public: + ZkRowlockClientZkAdapter(RowlockClient* server_client, const std::string& server_addr); + virtual ~ZkRowlockClientZkAdapter(); + virtual bool Init(); + +private: + RowlockClient* client_; + std::string server_addr_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKNODE_ZK_ROWLOCK_CLIENT_ZK_ADAPTER_H_ diff --git a/src/observer/rowlockproxy/remote_rowlock_proxy.cc b/src/observer/rowlockproxy/remote_rowlock_proxy.cc new file mode 100644 index 000000000..845d30fbe --- /dev/null +++ b/src/observer/rowlockproxy/remote_rowlock_proxy.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "observer/rowlockproxy/remote_rowlock_proxy.h" + +#include "gflags/gflags.h" + +DECLARE_int32(rowlock_thread_max_num); + +namespace tera { +namespace observer { + +RemoteRowlockProxy::RemoteRowlockProxy(RowlockProxyImpl* rowlock_proxy_impl) : + rowlock_proxy_impl_(rowlock_proxy_impl) { +} + +RemoteRowlockProxy::~RemoteRowlockProxy() { +} + +void RemoteRowlockProxy::Lock(google::protobuf::RpcController* controller, + const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + rowlock_proxy_impl_->TryLock(request, response, done); +} + +void RemoteRowlockProxy::UnLock(google::protobuf::RpcController* controller, + const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + rowlock_proxy_impl_->UnLock(request, response, done); +} + +} // namespace observer +} // namespace tera diff --git a/src/observer/rowlockproxy/remote_rowlock_proxy.h b/src/observer/rowlockproxy/remote_rowlock_proxy.h new file mode 100644 index 000000000..df8e2c2b8 --- /dev/null +++ b/src/observer/rowlockproxy/remote_rowlock_proxy.h @@ -0,0 +1,38 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#ifndef TERA_OBSERVER_ROWLOCKPROXY_REMOTE_ROWLOCK_PROXY_H_
+#define TERA_OBSERVER_ROWLOCKPROXY_REMOTE_ROWLOCK_PROXY_H_
+
+// NOTE(review): the target of the include below was stripped when this patch
+// was flattened and cannot be recovered from the visible code -- restore it
+// from the upstream file.
+#include
+
+#include "common/base/scoped_ptr.h"
+#include "common/thread_pool.h"
+#include "observer/rowlockproxy/rowlock_proxy_impl.h"
+
+namespace tera {
+namespace observer {
+
+// RPC-facing side of the rowlock proxy: implements the RowlockService
+// methods by forwarding each call to a RowlockProxyImpl.
+class RemoteRowlockProxy : public RowlockService {
+public:
+    // rowlock_proxy_impl is stored as a raw pointer; this class never
+    // deletes it.
+    explicit RemoteRowlockProxy(RowlockProxyImpl* rowlock_proxy_impl);
+    ~RemoteRowlockProxy();
+
+    // Forwards to RowlockProxyImpl::TryLock.
+    void Lock(google::protobuf::RpcController* controller,
+              const RowlockRequest* request,
+              RowlockResponse* response,
+              google::protobuf::Closure* done);
+
+    // Forwards to RowlockProxyImpl::UnLock.
+    void UnLock(google::protobuf::RpcController* controller,
+                const RowlockRequest* request,
+                RowlockResponse* response,
+                google::protobuf::Closure* done);
+
+private:
+    RowlockProxyImpl* rowlock_proxy_impl_;
+};
+
+} // namespace observer
+} // namespace tera
+#endif // TERA_OBSERVER_ROWLOCKPROXY_REMOTE_ROWLOCK_PROXY_H_
diff --git a/src/observer/rowlockproxy/rowlock_proxy_entry.cc b/src/observer/rowlockproxy/rowlock_proxy_entry.cc
new file mode 100644
index 000000000..e9f19faa0
--- /dev/null
+++ b/src/observer/rowlockproxy/rowlock_proxy_entry.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#include "observer/rowlockproxy/rowlock_proxy_entry.h" + +#include +#include + +#include "common/base/string_ext.h" +#include "common/base/string_number.h" +#include "common/net/ip_address.h" +#include "common/this_thread.h" +#include "common/thread_attributes.h" +#include "common/timer.h" +#include "common/counter.h" +#include "utils/rpc_timer_list.h" +#include "observer/rowlockproxy/remote_rowlock_proxy.h" + +DECLARE_string(rowlock_proxy_port); +DECLARE_int32(rowlock_io_service_pool_size); +DECLARE_int32(rowlock_rpc_work_thread_num); + +std::string GetTeraEntryName() { + return "rowlock_proxy"; +} + +tera::TeraEntry* GetTeraEntry() { + return new tera::observer::RowlockProxyEntry(); +} + +namespace tera { +namespace observer { + +RowlockProxyEntry::RowlockProxyEntry() { + sofa::pbrpc::RpcServerOptions rpc_options; + rpc_options.max_throughput_in = -1; + rpc_options.max_throughput_out = -1; + rpc_options.work_thread_num = FLAGS_rowlock_rpc_work_thread_num; + rpc_options.io_service_pool_size = FLAGS_rowlock_io_service_pool_size; + rpc_server_.reset(new sofa::pbrpc::RpcServer(rpc_options)); +} + +RowlockProxyEntry::~RowlockProxyEntry() {} + +bool RowlockProxyEntry::StartServer() { + IpAddress rowlock_proxy_addr("0.0.0.0", FLAGS_rowlock_proxy_port); + LOG(INFO) << "Start RPC server at: " << rowlock_proxy_addr.ToString(); + rowlock_proxy_impl_.reset(new RowlockProxyImpl()); + remote_rowlock_proxy_ = new RemoteRowlockProxy(rowlock_proxy_impl_.get()); + rpc_server_->RegisterService(remote_rowlock_proxy_); + if (!rpc_server_->Start(rowlock_proxy_addr.ToString())) { + LOG(ERROR) << "start RPC server error"; + return false; + } + if (!rowlock_proxy_impl_->Init()) { + LOG(ERROR) << "fail to init rowlocknode_impl"; + return false; + } + LOG(INFO) << "finish starting RPC server"; + + return true; +} + +void RowlockProxyEntry::ShutdownServer() { + LOG(INFO) << "shut down server"; + rpc_server_->Stop(); + + LOG(INFO) << "RowlockProxyEntry stop done!"; + _exit(0); +} + +bool 
RowlockProxyEntry::Run() { + ThisThread::Sleep(1000); + return true; +} + +} // namespace observer +} // namespace tera \ No newline at end of file diff --git a/src/observer/rowlockproxy/rowlock_proxy_entry.h b/src/observer/rowlockproxy/rowlock_proxy_entry.h new file mode 100644 index 000000000..547cf8d04 --- /dev/null +++ b/src/observer/rowlockproxy/rowlock_proxy_entry.h @@ -0,0 +1,37 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ENTRY_H_ +#define TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ENTRY_H_ + +#include + +#include + +#include "observer/rowlockproxy/remote_rowlock_proxy.h" +#include "observer/rowlockproxy/rowlock_proxy_impl.h" +#include "tera_entry.h" + +namespace tera { +namespace observer { + +class RowlockProxyEntry : public tera::TeraEntry { +public: + RowlockProxyEntry(); + virtual ~RowlockProxyEntry(); + + virtual bool StartServer(); + virtual bool Run(); + virtual void ShutdownServer(); + +private: + std::unique_ptr rowlock_proxy_impl_; + RemoteRowlockProxy* remote_rowlock_proxy_; + std::unique_ptr rpc_server_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ENTRY_H_ \ No newline at end of file diff --git a/src/observer/rowlockproxy/rowlock_proxy_impl.cc b/src/observer/rowlockproxy/rowlock_proxy_impl.cc new file mode 100644 index 000000000..0a499dabb --- /dev/null +++ b/src/observer/rowlockproxy/rowlock_proxy_impl.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "observer/rowlockproxy/rowlock_proxy_impl.h" + +#include + +#include "common/timer.h" +#include "utils/utils_cmd.h" + +DECLARE_string(rowlock_proxy_port); +DECLARE_string(tera_coord_type); +DECLARE_bool(rowlock_proxy_async_enable); + +namespace tera { +namespace observer { + +RowlockProxyImpl::RowlockProxyImpl() + : server_addrs_(new std::vector), + clients_(new std::map), + server_number_(1) {} + +RowlockProxyImpl::~RowlockProxyImpl() { + for (auto it = clients_->begin(); it != clients_->end(); ++it) { + delete it->second; + } +} + +bool RowlockProxyImpl::Init() { + if (FLAGS_tera_coord_type == "zk") { + zk_adapter_.reset(new RowlockProxyZkAdapter(this, + tera::utils::GetLocalHostName() + ":" + FLAGS_rowlock_proxy_port)); + } else { + zk_adapter_.reset(new InsRowlockProxyZkAdapter(this, + tera::utils::GetLocalHostName() + ":" + FLAGS_rowlock_proxy_port)); + } + + if (!zk_adapter_->Init()) { + LOG(ERROR) << "init zk adapter fail"; + return false; + } + + LOG(INFO) << "Rowlock node init finish"; + return true; +} + +void RowlockProxyImpl::TryLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + + uint64_t rowlock_key = GetRowKey(request->table_name(), request->row()); + std::string addr = ScheduleRowKey(rowlock_key); + + // read + std::shared_ptr> read_clients; + { + MutexLock locker(&client_mutex_); + // copy-on-write, ref +1 + read_clients = clients_; + } + + if (FLAGS_rowlock_proxy_async_enable == false) { + (*read_clients)[addr]->TryLock(request, response); + done->Run(); + } else { + (*read_clients)[addr]->TryLock(request, response, [&] (RowlockRequest*, RowlockResponse*, bool, int) {done->Run();}); + } + +} + +void RowlockProxyImpl::UnLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done) { + + uint64_t rowlock_key = GetRowKey(request->table_name(), request->row()); + std::string addr = ScheduleRowKey(rowlock_key); + + // read + std::shared_ptr> 
read_clients; + { + MutexLock locker(&client_mutex_); + // copy for copy-on-write, ref +1 + read_clients = clients_; + } + + if (FLAGS_rowlock_proxy_async_enable == false) { + (*read_clients)[addr]->UnLock(request, response); + done->Run(); + } else { + (*read_clients)[addr]->UnLock(request, response, [&] (RowlockRequest*, RowlockResponse*, bool, int) {done->Run();}); + } +} + +uint64_t RowlockProxyImpl::GetRowKey(const std::string& table_name, + const std::string& row) const { + std::string rowkey_str = table_name + row; + return std::hash()(rowkey_str); +} + +std::string RowlockProxyImpl::ScheduleRowKey(uint64_t row_key) { + std::shared_ptr> server_addrs_copy; + + MutexLock locker(&server_addrs_mutex_); + // copy for copy-on-write, ref +1 + server_addrs_copy = server_addrs_; + + return (*server_addrs_copy)[row_key % server_number_]; +} + +void RowlockProxyImpl::SetServerNumber(uint32_t number) { + MutexLock locker(&server_addrs_mutex_); + + server_number_ = number; + + if (server_addrs_->size() < number) { + server_addrs_->resize(number); + } +} + +void RowlockProxyImpl::UpdateServers(uint32_t id, const std::string& addr) { + // update data first + { + MutexLock locker(&server_addrs_mutex_); + (*server_addrs_)[id] = addr; + } + + MutexLock locker(&client_mutex_); + if(!clients_.unique()) { + clients_.reset(new std::map(*clients_)); + } + + if (clients_->find(addr) == clients_->end()) { + clients_->insert(make_pair(addr, new RowlockStub(addr))); + } +} + +uint32_t RowlockProxyImpl::GetServerNumber() { + return server_number_; +} + +} // namespace observer +} // namespace tera + + + diff --git a/src/observer/rowlockproxy/rowlock_proxy_impl.h b/src/observer/rowlockproxy/rowlock_proxy_impl.h new file mode 100644 index 000000000..4417c3973 --- /dev/null +++ b/src/observer/rowlockproxy/rowlock_proxy_impl.h @@ -0,0 +1,68 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_IMPL_H_ +#define TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_IMPL_H_ + +#include +#include +#include + +#include "common/counter.h" +#include "common/mutex.h" +#include "observer/rowlockproxy/rowlock_proxy_zk_adapter.h" +#include "proto/rowlocknode_rpc.pb.h" +#include "sdk/rowlock_client.h" +#include "zk/zk_adapter.h" + +namespace tera { +namespace observer { + +class RowlockProxyZkAdapterBase; +class RowLockStub; + +class RowlockProxyImpl { +public: + RowlockProxyImpl(); + ~RowlockProxyImpl(); + + bool Init(); + + void TryLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done); + + void UnLock(const RowlockRequest* request, + RowlockResponse* response, + google::protobuf::Closure* done); + + // for zk + void SetServerNumber(uint32_t number); + uint32_t GetServerNumber(); + void UpdateServers(uint32_t id, const std::string& addr); +private: + uint64_t GetRowKey(const std::string& table_name, + const std::string& row) const; + // rowkey -> server addr + std::string ScheduleRowKey(uint64_t row_key); + +private: + common::Mutex server_addrs_mutex_; + // a map from virtual node to server addr + // key: vector index, virtual node number + // value: vector value, server address + // shared_ptr: used for copy-on-write + std::shared_ptr> server_addrs_; + + common::Mutex client_mutex_; + std::shared_ptr> clients_; + + uint32_t server_number_; + std::unique_ptr zk_adapter_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_IMPL_H_ diff --git a/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.cc b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.cc new file mode 100644 index 000000000..290c6815c --- /dev/null +++ b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.cc @@ -0,0 +1,411 @@ +// Copyright 
(c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "observer/rowlockproxy/rowlock_proxy_zk_adapter.h" + +#include +#include + +#include "common/base/string_number.h" +#include "observer/rowlockproxy/rowlock_proxy_impl.h" +#include "types.h" +#include "ins_sdk.h" + +DECLARE_string(rowlock_zk_root_path); +DECLARE_string(tera_zk_addr_list); +DECLARE_int32(rowlock_server_node_num); +DECLARE_int64(tera_zk_retry_period); +DECLARE_int32(tera_zk_timeout); +DECLARE_int32(tera_zk_retry_max_times); + +DECLARE_string(rowlock_ins_root_path); +DECLARE_string(tera_ins_addr_list); + +namespace tera { +namespace observer { + +RowlockProxyZkAdapter::RowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl, + const std::string& server_addr) + : rowlock_proxy_impl_(rowlock_proxy_impl), + server_addr_(server_addr) {} + +bool RowlockProxyZkAdapter::Init() { + std::string root_path = FLAGS_rowlock_zk_root_path; + std::string node_num_key = root_path + kRowlockNodeNumPath; + std::string id_lock_path; + std::string proxy_path = root_path + kRowlockProxyPath + "/" + server_addr_; + + int zk_errno = zk::ZE_OK; + int32_t retry_count = 0; + // init zk client + while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list, + FLAGS_rowlock_zk_root_path, FLAGS_tera_zk_timeout, + server_addr_, &zk_errno)) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to init zk: " << zk::ZkErrnoToString(zk_errno); + return false; + } + + LOG(ERROR) << "init zk fail: " << zk::ZkErrnoToString(zk_errno) + << ". 
retry in " << FLAGS_tera_zk_retry_period << " ms, retry: " + << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + LOG(INFO) << "init zk success"; + + // get session id + int64_t session_id_int = 0; + if (!GetSessionId(&session_id_int, &zk_errno)) { + LOG(ERROR) << "get session id fail : " << zk::ZkErrnoToString(zk_errno); + return false; + } + + bool is_exist = false; + uint32_t node_num; + while(!is_exist) { + CheckExist(node_num_key, &is_exist, &zk_errno); + if (!is_exist) { + LOG(ERROR) << "rowlock service number node not found: " << node_num_key + << " make sure rowlock zk available"; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + } + std::string value; + ReadAndWatchNode(node_num_key, &value, &zk_errno); + + if (!StringToNumber(value, &node_num)) { + LOG(ERROR) << "read number node fail"; + return false; + } + + rowlock_proxy_impl_->SetServerNumber(node_num); + + retry_count = 0; + id_lock_path = root_path + kRowlockNodeIdListPath; + std::vector name_list; + std::vector data_list; + + while (!ListAndWatchChildren(id_lock_path, &name_list, &data_list, + &zk_errno) || name_list.size() != node_num) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to watch rowlock server list or lack rowlock server"; + return false; + } + LOG(ERROR) << "retry watch rowlock server list in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count + << " node_num: " << node_num << " list size: " << name_list.size(); + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + size_t list_count = name_list.size(); + for (size_t i = 0; i < list_count; i++) { + const std::string& name = name_list[i]; + const std::string& data = data_list[i]; + + uint32_t id; + StringToNumber(name, &id); + rowlock_proxy_impl_->UpdateServers(id, data); + } + + // create proxy node + retry_count = 0; + while (!CreateEphemeralNode(proxy_path, server_addr_, &zk_errno)) { + if (retry_count++ >= 
FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to create proxy node"; + return false; + } + LOG(ERROR) << "retry create rowlock number node in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + return true; +} +void RowlockProxyZkAdapter::OnNodeValueChanged(const std::string& path, + const std::string& value) { + std::string value_str; + int zk_errno = zk::ZE_OK; + std::string node_num_key = FLAGS_rowlock_zk_root_path + kRowlockNodeNumPath; + + if (path == node_num_key) { + LOG(WARNING) << "rowlock service server number changed to " << value; + uint32_t node_num; + StringToNumber(value, &node_num); + rowlock_proxy_impl_->SetServerNumber(node_num); + ReadAndWatchNode(node_num_key, &value_str, &zk_errno); + } +} + +void RowlockProxyZkAdapter::OnWatchFailed(const std::string& path, int watch_type, + int err) { + LOG(ERROR) << "watch failed ! " << path; + _Exit(EXIT_FAILURE); +} + +void RowlockProxyZkAdapter::OnSessionTimeout() { + LOG(ERROR) << "zk session timeout!"; + _Exit(EXIT_FAILURE); +} + +void RowlockProxyZkAdapter::OnNodeCreated(const std::string& path) { + std::string value; + int zk_errno = zk::ZE_OK; + + if (path == FLAGS_rowlock_zk_root_path + kRowlockNodeNumPath) { + LOG(WARNING) << "rowlock service number node create"; + ReadAndWatchNode(path, &value, &zk_errno); + uint32_t node_num; + StringToNumber(value, &node_num); + rowlock_proxy_impl_->SetServerNumber(node_num); + } else { + std::string id_str = path.substr(path.find_last_of("/"), + path.size() - path.find_last_of("/")); + uint32_t id; + StringToNumber(id_str, &id); + ReadAndWatchNode(path, &value, &zk_errno); + rowlock_proxy_impl_->UpdateServers(id, value); + } +} + +void RowlockProxyZkAdapter::OnNodeDeleted(const std::string& path) { + LOG(ERROR) << "node deleted: " << path; + + int zk_errno = zk::ZE_OK; + bool is_exist = false; + if (path == FLAGS_rowlock_zk_root_path + kRowlockNodeNumPath) { + 
while(!is_exist) { + CheckExist(path, &is_exist, &zk_errno); + if (!is_exist) { + LOG(ERROR) << "rowlock service number node not found: " << path + << " make sure rowlock zk available"; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + + std::string value; + ReadAndWatchNode(path, &value, &zk_errno); + uint32_t node_num; + if (!StringToNumber(value, &node_num)) { + LOG(ERROR) << "read number node fail"; + return; + } + + rowlock_proxy_impl_->SetServerNumber(node_num); + } + return; + } + // server node + std::string id_str = path.substr(path.find_last_of("/"), + path.size() - path.find_last_of("/")); + uint32_t id; + StringToNumber(id_str, &id); + + if (id >= rowlock_proxy_impl_->GetServerNumber()) { + return; + } + + while(!is_exist) { + CheckExist(path, &is_exist, &zk_errno); + if (!is_exist) { + LOG(ERROR) << "rowlock server node not found: " << path; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + + std::string value; + ReadAndWatchNode(path, &value, &zk_errno); + uint32_t node_num; + if (!StringToNumber(value, &node_num)) { + LOG(ERROR) << "read number node fail"; + return; + } + + rowlock_proxy_impl_->UpdateServers(node_num, value); + } +} + +void RowlockProxyZkAdapter::OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) { + std::string root_path = FLAGS_rowlock_ins_root_path; + int32_t retry_count = 0; + int zk_errno = zk::ZE_OK; + std::string id_lock_path = root_path + kRowlockNodeIdListPath; + std::vector names; + std::vector datum; + + while (!ListAndWatchChildren(id_lock_path, &names, &datum, + &zk_errno)) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to watch rowlock server list or lack rowlock server"; + _Exit(EXIT_FAILURE); + } + LOG(ERROR) << "retry watch rowlock server list in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + size_t list_count = name_list.size(); + for (size_t i 
= 0; i < list_count; i++) { + const std::string& name = names[i]; + const std::string& data = datum[i]; + + uint32_t id; + StringToNumber(name, &id); + rowlock_proxy_impl_->UpdateServers(id, data); + } +} + +// ins + +InsRowlockProxyZkAdapter::InsRowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl, + const std::string& server_addr) + : rowlock_proxy_impl_(rowlock_proxy_impl), + server_addr_(server_addr) {} + +static void InsOnNumberChange(const galaxy::ins::sdk::WatchParam& param, + galaxy::ins::sdk::SDKError error) { + InsRowlockProxyZkAdapter* ins_adp = static_cast(param.context); + ins_adp->OnValueChange(param.key, param.value); +} + +static void InsOnServerChange(const galaxy::ins::sdk::WatchParam& param, + galaxy::ins::sdk::SDKError error) { + InsRowlockProxyZkAdapter* ins_adp = static_cast(param.context); + ins_adp->OnServerChange(); +} + +bool InsRowlockProxyZkAdapter::Init() { + std::string root_path = FLAGS_rowlock_ins_root_path; + std::string node_num_key = root_path + kRowlockNodeNumPath; + std::string proxy_path = root_path + kRowlockProxyPath + "/" + server_addr_; + std::string value; + galaxy::ins::sdk::SDKError err; + + ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list); + + LOG(INFO) << "init ins success"; + + if (!ins_sdk_->Get(node_num_key, &value, &err)) { + LOG(ERROR) << "ins rowlock service number node not found: " << node_num_key + << " make sure rowlock ins available"; + return false; + } + + uint32_t node_num; + if (!StringToNumber(value, &node_num)) { + LOG(ERROR) << "read number node fail"; + return false; + } + rowlock_proxy_impl_->SetServerNumber(node_num); + + if (!ins_sdk_->Watch(node_num_key, InsOnNumberChange, this, &err)) { + LOG(ERROR) << "try to watch number node ,path=" << node_num_key << " failed," + << ins_sdk_->ErrorToString(err); + return false; + } + + + // read server addr + int32_t retry_count = 0; + std::string id_lock_path = root_path + kRowlockNodeIdListPath; + + while 
(!ins_sdk_->Watch(id_lock_path, InsOnServerChange, this, &err)) { + LOG(ERROR) << "try to watch server node ,path=" << id_lock_path << " failed," + << ins_sdk_->ErrorToString(err); + if (retry_count++ > FLAGS_tera_zk_retry_max_times) { + return false; + } + } + + galaxy::ins::sdk::ScanResult* result = ins_sdk_->Scan(id_lock_path+"/!", + id_lock_path+"/~"); + while (!result->Done()) { + CHECK_EQ(result->Error(), galaxy::ins::sdk::kOK); + std::string value = result->Value(); + std::string key = result->Key(); + VLOG(12) << "Key: " << key << " value: " << value; + + uint32_t node_num; + uint32_t pos = key.find_last_of("/") + 1; + key = key.substr(pos, key.length() - pos); + VLOG(12) << "key: " << key; + if (!StringToNumber(key, &node_num)) { + LOG(ERROR) << "read number node fail"; + _Exit(EXIT_FAILURE); + } + + rowlock_proxy_impl_->UpdateServers(node_num, value); + result->Next(); + } + delete result; + + // create proxy node + retry_count = 0; + while (!ins_sdk_->Put(proxy_path, server_addr_, &err)) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to create proxy node"; + return false; + } + LOG(ERROR) << "retry create rowlock number node in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + } + return true; +} + +void InsRowlockProxyZkAdapter::OnValueChange(const std::string& path, const std::string& value) { + uint32_t node_num; + galaxy::ins::sdk::SDKError err; + + if (!StringToNumber(value, &node_num)) { + LOG(ERROR) << "read number node fail"; + return; + } + rowlock_proxy_impl_->SetServerNumber(node_num); + + if (!ins_sdk_->Watch(path, InsOnNumberChange, this, &err)) { + LOG(ERROR) << "try to watch number node ,path=" << path << " failed," + << ins_sdk_->ErrorToString(err); + return; + } +} + +void InsRowlockProxyZkAdapter::OnServerChange() { + galaxy::ins::sdk::SDKError err; + std::string root_path = FLAGS_rowlock_ins_root_path; + + int32_t retry_count = 0; + 
std::string id_lock_path = root_path + kRowlockNodeIdListPath; + + while (!ins_sdk_->Watch(id_lock_path, InsOnServerChange, this, &err)) { + LOG(ERROR) << "try to watch server node ,path=" << id_lock_path << " failed," + << ins_sdk_->ErrorToString(err); + if (retry_count++ > FLAGS_tera_zk_retry_max_times) { + _Exit(EXIT_FAILURE); + } + } + + galaxy::ins::sdk::ScanResult* result = ins_sdk_->Scan(id_lock_path+"/!", + id_lock_path+"/~"); + while (!result->Done()) { + CHECK_EQ(result->Error(), galaxy::ins::sdk::kOK); + std::string value = result->Value(); + std::string key = result->Key(); + + uint32_t node_num; + uint32_t pos = key.find_last_of("/") + 1; + key = key.substr(pos, key.length() - pos); + VLOG(12) << "key: " << key; + if (!StringToNumber(key, &node_num)) { + LOG(ERROR) << "read number node fail"; + _Exit(EXIT_FAILURE); + } + + rowlock_proxy_impl_->UpdateServers(node_num, value); + result->Next(); + } + delete result; +} + +} // namespace observer +} // namespace tera \ No newline at end of file diff --git a/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.h b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.h new file mode 100644 index 000000000..02125135c --- /dev/null +++ b/src/observer/rowlockproxy/rowlock_proxy_zk_adapter.h @@ -0,0 +1,83 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ZK_ADAPTER_H_ +#define TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ZK_ADAPTER_H_ + +#include "zk/zk_adapter.h" + +namespace galaxy { +namespace ins { +namespace sdk { + class InsSDK; +} // namespace sdk +} // namespace ins +} // namespace galaxy + +namespace tera { +namespace observer { + +class RowlockProxyImpl; + +class RowlockProxyZkAdapterBase : public zk::ZooKeeperAdapter { +public: + virtual ~RowlockProxyZkAdapterBase() {} + virtual bool Init() = 0; +}; + +class RowlockProxyZkAdapter : public RowlockProxyZkAdapterBase { +public: + RowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl, const std::string& server_addr); + virtual ~RowlockProxyZkAdapter() {} + virtual bool Init(); + +protected: + virtual void OnNodeValueChanged(const std::string& path, + const std::string& value); + virtual void OnWatchFailed(const std::string& path, int watch_type, + int err); + virtual void OnNodeDeleted(const std::string& path); + virtual void OnSessionTimeout(); + virtual void OnNodeCreated(const std::string& path); + virtual void OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list); + +private: + RowlockProxyImpl* rowlock_proxy_impl_; + std::string server_addr_; + +}; + +class InsRowlockProxyZkAdapter : public RowlockProxyZkAdapterBase { +public: + InsRowlockProxyZkAdapter(RowlockProxyImpl* rowlock_proxy_impl, const std::string& server_addr); + virtual ~InsRowlockProxyZkAdapter() {} + virtual bool Init(); + + void OnValueChange(const std::string& path, const std::string& value); + void OnServerChange(); + +protected: + virtual void OnNodeValueChanged(const std::string& path, + const std::string& value) {} + virtual void OnWatchFailed(const std::string& path, int watch_type, + int err) {} + virtual void OnNodeDeleted(const std::string& path) {} + virtual void OnSessionTimeout() {} + virtual void OnNodeCreated(const std::string& path) {} + virtual void 
OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) {} + +private: + RowlockProxyImpl* rowlock_proxy_impl_; + std::string server_addr_; + galaxy::ins::sdk::InsSDK* ins_sdk_; +}; + +} // namespace observer +} // namespace tera + +#endif // TERA_OBSERVER_ROWLOCKPROXY_ROWLOCK_PROXY_ZK_ADAPTER_H_ \ No newline at end of file diff --git a/src/observer/test/observer_test.cc b/src/observer/test/observer_test.cc new file mode 100644 index 000000000..299ec4581 --- /dev/null +++ b/src/observer/test/observer_test.cc @@ -0,0 +1,587 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include + +#include +#include +#include + +#include "common/thread_pool.h" +#include "observer/executor/observer.h" +#include "observer/executor/random_key_selector.h" +#include "observer/executor/scanner.h" +#include "observer/executor/scanner_impl.h" +#include "observer/executor/notification_impl.h" +#include "sdk/client_impl.h" +#include "sdk/sdk_utils.h" +#include "tera.h" +#include "types.h" + +DECLARE_bool(tera_gtxn_test_opened); +DECLARE_int64(start_ts); +DECLARE_int64(begin_commit_ts); +DECLARE_int64(begin_prewrite_ts); +DECLARE_int64(end_prewrite_ts); +DECLARE_int64(commit_ts); +DECLARE_string(flagfile); +DECLARE_string(tera_coord_type); +DECLARE_bool(tera_sdk_client_for_gtxn); +DECLARE_bool(mock_rowlock_enable); + +namespace tera { +namespace observer { + +class TestWorker : public Observer { +public: + TestWorker(): counter_(0), notified_(false) {} + virtual ~TestWorker() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification) { + LOG(INFO) << "[Notify DemoObserver] table:family:qualifer=" 
<< + table_name << ":" << family << ":" << + qualifier << " row=" << row << + " value=" << value << " timestamp=" << timestamp; + + table_name_ = table_name; + family_ = family; + qualifier_ = qualifier; + row_ = row; + value_ = value; + + tera::ErrorCode err; + notified_ = true; + ++counter_; + + tera::Table* table = client->OpenTable(table_name, &err); + notification->Ack(table, row, family, qualifier); + + return err; + } + + virtual std::string GetObserverName() const { + return "DemoObserver"; + } + + virtual TransactionType GetTransactionType() const { + return kGlobalTransaction; + } +private: + std::atomic counter_; + std::atomic notified_; + + std::string table_name_; + std::string family_; + std::string qualifier_; + std::string row_; + std::string value_; +}; + +class TestWorkerGTX : public Observer { +public: + TestWorkerGTX(): counter_(0), notified_(false) {} + virtual ~TestWorkerGTX() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification) { + LOG(INFO) << "[Notify TestWorkerGTX] table:family:qualifer=" << + table_name << ":" << family << ":" << + qualifier << " row=" << row << + " value=" << value << " timestamp=" << timestamp; + + table_name_ = table_name; + family_ = family; + qualifier_ = qualifier; + row_ = row; + value_ = value; + + tera::ErrorCode err; + notified_ = true; + ++counter_; + + tera::Table* table = client->OpenTable(table_name, &err); + + // write ForwordIndex column + tera::RowMutation* mutation = table->NewRowMutation(row); + mutation->Put(family, qualifier + "_test", row + "_"); + t->ApplyMutation(mutation); + + tera::ErrorCode error; + t->Ack(table, row, family, qualifier); + table->CommitRowTransaction(t); + delete mutation; + return error; + + return err; + } + + virtual std::string GetObserverName() const { + 
return "DemoObserver"; + } + + virtual TransactionType GetTransactionType() const { + return kSingleRowTransaction; + } +private: + std::atomic counter_; + std::atomic notified_; + + std::string table_name_; + std::string family_; + std::string qualifier_; + std::string row_; + std::string value_; +}; + +class DemoObserver : public tera::observer::Observer { +public: + DemoObserver() {} + virtual ~DemoObserver() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification) { + LOG(INFO) << "[Notify ParseObserver] table:family:qualifer=" << + table_name << ":" << family << ":" << + qualifier << " row=" << row << + " value=" << value << " timestamp=" << timestamp; + + tera::ErrorCode err; + // do nothing + return err; + } + virtual std::string GetObserverName() const { + return "DemoObserver"; + } + virtual TransactionType GetTransactionType() const { + return kGlobalTransaction; + } +}; + +class TestWorkerNTX : public Observer { +public: + TestWorkerNTX(): counter_(0), notified_(false) {} + virtual ~TestWorkerNTX() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification) { + LOG(INFO) << "[Notify TestWorkerNTX] table:family:qualifer=" << + table_name << ":" << family << ":" << + qualifier << " row=" << row << + " value=" << value << " timestamp=" << timestamp; + + table_name_ = table_name; + family_ = family; + qualifier_ = qualifier; + row_ = row; + value_ = value; + + tera::ErrorCode err; + notified_ = true; + ++counter_; + + // do something without transaction + + return err; + } + + virtual std::string GetObserverName() const { + 
return "DemoObserver"; + } + + virtual TransactionType GetTransactionType() const { + return kNoneTransaction; + } +private: + std::atomic counter_; + std::atomic notified_; + + std::string table_name_; + std::string family_; + std::string qualifier_; + std::string row_; + std::string value_; +}; + +class ObserverImplTest : public ::testing::Test { +public: + void OnNotifyTest() { + tera::ErrorCode err; + tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err); + // for ut test + EXPECT_EQ(tera::ErrorCode::kOK, err.GetType()); + // for no core + if (tera::ErrorCode::kOK != err.GetType()) { + LOG(ERROR) << "new client failed"; + return; + } + + // create table + tera::TableDescriptor table_desc("observer_test_table"); + table_desc.EnableTxn(); + + table_desc.AddLocalityGroup("lg1"); + tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1"); + cf1->EnableGlobalTransaction(); + cf1->EnableNotify(); + ExtendNotifyLgToDescriptor(&table_desc); + + client->CreateTable(table_desc, &err); + if (err.GetType() != tera::ErrorCode::kOK) { + LOG(ERROR) << "Create table fail"; + } + + tera::Table* table = client->OpenTable("observer_test_table", &err); + EXPECT_EQ(tera::ErrorCode::kOK, err.GetType()); + if (tera::ErrorCode::kOK != err.GetType()) { + LOG(ERROR) << "open table failed"; + return; + } + + std::unique_ptr t(table->StartRowTransaction("www.baidu.com")); + + assert(t != NULL); + std::unique_ptr mu0(table->NewRowMutation("www.baidu.com")); + mu0->Put("_N_", "cf:Page", "I am not important"); + t->ApplyMutation(mu0.get()); + t->Commit(); + + std::unique_ptr g_txn(client->NewGlobalTransaction()); + assert(g_txn != NULL); + std::unique_ptr mu1(table->NewRowMutation("www.baidu.com")); + + mu1->Put("cf", "Page", "hello world", -1); + g_txn->ApplyMutation(mu1.get()); + g_txn->Commit(); + + if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << g_txn->GetError().ToString() << std::endl; + } else { + std::cout << "commit 
success" << std::endl; + } + + // varibles for fake timeoracle + FLAGS_start_ts = 10; + FLAGS_begin_commit_ts = 1; + FLAGS_begin_prewrite_ts = 1; + FLAGS_end_prewrite_ts = 1; + FLAGS_commit_ts = 13; + + Observer* observer = new TestWorker(); + Observer* demo = new DemoObserver(); + + Scanner* scanner = new ScannerImpl(); + bool ret = scanner->Init(); + EXPECT_EQ(true, ret); + if(!ret) { + LOG(ERROR) << "fail to init scanner_impl"; + return; + } + + err = scanner->Observe("observer_test_table", "cf", "Page", observer); + EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK); + + err = scanner->Observe("observer_test_table", "cf", "Page", demo); + EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK); + + if(!scanner->Start()) { + LOG(ERROR) << "fail to start scanner_impl"; + return; + } + + while (!static_cast(observer)->notified_) { + sleep(1); + } + + + EXPECT_EQ("www.baidu.com", static_cast(observer)->row_); + EXPECT_EQ("observer_test_table", static_cast(observer)->table_name_); + EXPECT_EQ("cf", static_cast(observer)->family_); + EXPECT_EQ("Page", static_cast(observer)->qualifier_); + EXPECT_EQ("hello world", static_cast(observer)->value_); + + scanner->Exit(); + delete scanner; + } + + void SingleRowTransactionTest() { + tera::ErrorCode err; + tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err); + // for ut test + EXPECT_EQ(tera::ErrorCode::kOK, err.GetType()); + // for no core + if (tera::ErrorCode::kOK != err.GetType()) { + LOG(ERROR) << "new client failed"; + return; + } + + // create table + tera::TableDescriptor table_desc("observer_table_gtx"); + table_desc.EnableTxn(); + + table_desc.AddLocalityGroup("lg1"); + tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1"); + cf1->EnableNotify(); + ExtendNotifyLgToDescriptor(&table_desc); + + client->CreateTable(table_desc, &err); + if (err.GetType() != tera::ErrorCode::kOK) { + LOG(ERROR) << "Create table fail"; + } + + tera::Table* table = client->OpenTable("observer_table_gtx", &err); + 
EXPECT_EQ(tera::ErrorCode::kOK, err.GetType()); + if (tera::ErrorCode::kOK != err.GetType()) { + LOG(ERROR) << "open table failed"; + return; + } + + std::unique_ptr t(table->StartRowTransaction("www.baidu.com")); + + assert(t != NULL); + std::unique_ptr mu0(table->NewRowMutation("www.baidu.com")); + mu0->Put("_N_", "cf:Page", "I am not important"); + mu0->Put("cf", "Page", "hello world", -1); + t->ApplyMutation(mu0.get()); + t->Commit(); + + if (t->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << t->GetError().ToString() << std::endl; + } else { + std::cout << "commit success" << std::endl; + } + + Observer* observer = new TestWorkerGTX(); + + Scanner* scanner = new ScannerImpl(); + bool ret = scanner->Init(); + + EXPECT_EQ(true, ret); + if(!ret) { + LOG(ERROR) << "fail to init scanner_impl"; + return; + } + + err = scanner->Observe("observer_table_gtx", "cf", "Page", observer); + EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK); + + if(!scanner->Start()) { + LOG(ERROR) << "fail to start scanner_impl"; + return; + } + + while (!static_cast(observer)->notified_) { + sleep(1); + } + + EXPECT_EQ("www.baidu.com", static_cast(observer)->row_); + EXPECT_EQ("observer_table_gtx", static_cast(observer)->table_name_); + EXPECT_EQ("cf", static_cast(observer)->family_); + EXPECT_EQ("Page", static_cast(observer)->qualifier_); + EXPECT_EQ("hello world", static_cast(observer)->value_); + scanner->Exit(); + delete scanner; + } + + void NonTransactionTest() { + tera::ErrorCode err; + tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err); + // for ut test + EXPECT_EQ(tera::ErrorCode::kOK, err.GetType()); + // for no core + if (tera::ErrorCode::kOK != err.GetType()) { + LOG(ERROR) << "new client failed"; + return; + } + + // create table + tera::TableDescriptor table_desc("observer_table_ntx"); + + table_desc.AddLocalityGroup("lg1"); + tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1"); + cf1->EnableNotify(); + 
ExtendNotifyLgToDescriptor(&table_desc); + + client->CreateTable(table_desc, &err); + if (err.GetType() != tera::ErrorCode::kOK) { + LOG(ERROR) << "Create table fail"; + } + + tera::Table* table = client->OpenTable("observer_table_ntx", &err); + EXPECT_EQ(tera::ErrorCode::kOK, err.GetType()); + if (tera::ErrorCode::kOK != err.GetType()) { + LOG(ERROR) << "open table failed"; + return; + } + + table->Put("www.baidu.com", "_N_", "cf:Page", "I am not important", &err); + table->Put("www.baidu.com", "cf", "Page", "hello world", -1, &err); + + Observer* observer = new TestWorkerNTX(); + + Scanner* scanner = new ScannerImpl(); + bool ret = scanner->Init(); + + EXPECT_EQ(true, ret); + if(!ret) { + LOG(ERROR) << "fail to init scanner_impl"; + return; + } + + err = scanner->Observe("observer_table_ntx", "cf", "Page", observer); + EXPECT_EQ(err.GetType(), tera::ErrorCode::kOK); + + if(!scanner->Start()) { + LOG(ERROR) << "fail to start scanner_impl"; + return; + } + + while (!static_cast(observer)->notified_) { + sleep(1); + } + + EXPECT_EQ("www.baidu.com", static_cast(observer)->row_); + EXPECT_EQ("observer_table_ntx", static_cast(observer)->table_name_); + EXPECT_EQ("cf", static_cast(observer)->family_); + EXPECT_EQ("Page", static_cast(observer)->qualifier_); + EXPECT_EQ("hello world", static_cast(observer)->value_); + scanner->Exit(); + delete scanner; + } + + void ObserveTest() { + tera::ErrorCode err; + tera::Client* client = tera::Client::NewClient(FLAGS_flagfile, &err); + // for ut test + EXPECT_EQ(tera::ErrorCode::kOK, err.GetType()); + // for no core + if (tera::ErrorCode::kOK != err.GetType()) { + LOG(ERROR) << "new client failed"; + return; + } + + // create table + tera::TableDescriptor table_desc("observer_table"); + table_desc.EnableTxn(); + table_desc.AddLocalityGroup("notify"); + tera::ColumnFamilyDescriptor* cf_t = table_desc.AddColumnFamily(kNotifyColumnFamily, "notify"); + cf_t->EnableGlobalTransaction(); + + table_desc.AddLocalityGroup("lg1"); + 
tera::ColumnFamilyDescriptor* cf1 = table_desc.AddColumnFamily("cf", "lg1"); + cf1->EnableGlobalTransaction(); + cf1->EnableNotify(); + tera::ColumnFamilyDescriptor* cf2 = table_desc.AddColumnFamily("cf_1", "lg1"); + cf2->EnableGlobalTransaction(); + cf2->EnableNotify(); + + ExtendNotifyLgToDescriptor(&table_desc); + + client->CreateTable(table_desc, &err); + if (err.GetType() != tera::ErrorCode::kOK) { + LOG(ERROR) << "Create table fail"; + } + + FLAGS_tera_sdk_client_for_gtxn = true; + FLAGS_tera_coord_type = "ins"; + common::ThreadPool thread_pool(5); + ScannerImpl* scanner = new ScannerImpl(); + Observer* observer = new DemoObserver(); + scanner->key_selector_.reset(new RandomKeySelector()); + + // single thread + + err = scanner->Observe("observer_table", "cf", "qualifier", observer); + EXPECT_TRUE(err.GetType() != tera::ErrorCode::kOK); + + scanner->tera_client_ = tera::Client::NewClient(FLAGS_flagfile, &err); + EXPECT_EQ(scanner->table_observe_info_->size(), 0); + + err = scanner->Observe("observer_table", "cf", "qualifier", observer); + EXPECT_TRUE(err.GetType() == tera::ErrorCode::kOK); + + err = scanner->Observe("observer_table", "cf", "qualifier", observer); + EXPECT_FALSE(err.GetType() == tera::ErrorCode::kOK); + + err = scanner->Observe("observer_table", "cf_1", "qualifier", observer); + EXPECT_TRUE(err.GetType() == tera::ErrorCode::kOK); + + // multi thread + std::string qualifier; + + for (uint32_t i = 0; i < 10; ++i) { + qualifier += 'a'; + thread_pool.AddTask(std::bind(&ScannerImpl::Observe, scanner, "observer_table", "cf", qualifier, observer)); + } + thread_pool.Stop(true); + EXPECT_EQ(1, scanner->observers_.size()); + EXPECT_EQ(10 + 2, (*(scanner->table_observe_info_))["observer_table"].observe_columns.size()); + scanner->Exit(); + delete scanner; + } +}; + +TEST_F(ObserverImplTest, OnNotifyTest) { + FLAGS_tera_gtxn_test_opened = true; + FLAGS_tera_coord_type = "ins"; + FLAGS_mock_rowlock_enable = true; + OnNotifyTest(); +} + 
+TEST_F(ObserverImplTest, SingleRowTransactionTest) { + FLAGS_tera_gtxn_test_opened = true; + FLAGS_tera_coord_type = "ins"; + FLAGS_mock_rowlock_enable = true; + SingleRowTransactionTest(); +} + +TEST_F(ObserverImplTest, NoneTransactionTest) { + FLAGS_tera_gtxn_test_opened = true; + FLAGS_tera_coord_type = "ins"; + FLAGS_mock_rowlock_enable = true; + NonTransactionTest(); +} + +TEST_F(ObserverImplTest, ObserveTest) { + FLAGS_tera_gtxn_test_opened = true; + FLAGS_tera_coord_type = "ins"; + FLAGS_mock_rowlock_enable = true; + ObserveTest(); +} + +} // namespace observer +} // namespace tera + +int main(int argc, char** argv) { + FLAGS_tera_sdk_client_for_gtxn = true; + ::google::ParseCommandLineFlags(&argc, &argv, true); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/src/observer/test/rowlock_proxy_test.cc b/src/observer/test/rowlock_proxy_test.cc new file mode 100644 index 000000000..3b690686b --- /dev/null +++ b/src/observer/test/rowlock_proxy_test.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include "observer/rowlockproxy/remote_rowlock_proxy.h"
+#include "observer/rowlockproxy/rowlock_proxy_impl.h"
+#include "proto/rpc_client.h"
+#include "sdk/rowlock_client.h"
+#include "utils/utils_cmd.h"
+
+class TestClosure : public google::protobuf::Closure {  // no-op completion callback handed to the proxy under test
+public:
+    TestClosure() {}
+    virtual void Run() {}  // does nothing: it neither signals anyone nor deletes itself
+};
+
+namespace tera {
+namespace observer {
+
+class TestClient : public RowlockStub {  // stub whose TryLock/UnLock overrides always answer kLockSucc
+public:
+    TestClient() : RowlockStub("127.0.0.1:22222") {};
+    ~TestClient() {}
+
+    virtual bool TryLock(const RowlockRequest* request,
+                         RowlockResponse* response,
+                         std::function done = NULL) {
+        response->set_lock_status(kLockSucc);  // canned success; request and done are ignored
+        return true;
+    }
+
+    virtual bool UnLock(const RowlockRequest* request,
+                        RowlockResponse* response,
+                        std::function done = NULL) {
+        response->set_lock_status(kLockSucc);  // canned success; request and done are ignored
+        return true;
+    }
+};
+
+TEST(RowlockProxyTest, ValueTest) {  // checks server-number bookkeeping, client-map growth, row-key hashing and scheduling
+    RowlockProxyImpl rowlock_proxy_impl;
+
+    rowlock_proxy_impl.SetServerNumber(100);
+    EXPECT_EQ(100, rowlock_proxy_impl.server_number_);
+    EXPECT_EQ(100, rowlock_proxy_impl.GetServerNumber());
+
+    rowlock_proxy_impl.SetServerNumber(1000);
+    EXPECT_EQ(1000, rowlock_proxy_impl.server_number_);
+    EXPECT_EQ(1000, rowlock_proxy_impl.GetServerNumber());
+
+    rowlock_proxy_impl.SetServerNumber(2);
+    EXPECT_EQ(1000, rowlock_proxy_impl.server_addrs_->size());  // NOTE(review): still expects 1000 after SetServerNumber(2) - presumably the address table is not shrunk here; confirm against RowlockProxyImpl
+    EXPECT_EQ(0, rowlock_proxy_impl.clients_->size());
+    rowlock_proxy_impl.UpdateServers(0, "0.0.0.0:9999");
+
+    EXPECT_EQ(1, rowlock_proxy_impl.clients_->size());
+    rowlock_proxy_impl.UpdateServers(0, "0.0.1.1:9999");  // same slot, new address
+
+    EXPECT_EQ(2, rowlock_proxy_impl.clients_->size());  // the client for the replaced address is expected to stay in the map
+
+    EXPECT_EQ(std::hash()("tablerow"),
+              rowlock_proxy_impl.GetRowKey("table", "row"));  // row key is expected to be the hash of table name + row concatenated
+
+    EXPECT_EQ((*rowlock_proxy_impl.server_addrs_)[0], rowlock_proxy_impl.ScheduleRowKey(0));
+    EXPECT_EQ((*rowlock_proxy_impl.server_addrs_)[1], rowlock_proxy_impl.ScheduleRowKey(1));
+}
+
+TEST(RowlockProxyTest, LockTest) {  // routes TryLock/UnLock through the proxy to a stubbed client
+    RowlockProxyImpl rowlock_proxy_impl;
+
+    rowlock_proxy_impl.SetServerNumber(1);
+    rowlock_proxy_impl.UpdateServers(0, "0.0.0.0:9999");
+    EXPECT_EQ(1, rowlock_proxy_impl.server_addrs_->size());
+    EXPECT_EQ(1, rowlock_proxy_impl.clients_->size());
+
+    EXPECT_TRUE(rowlock_proxy_impl.clients_->find("0.0.0.0:9999") !=
+                rowlock_proxy_impl.clients_->end());
+    delete (*rowlock_proxy_impl.clients_)["0.0.0.0:9999"];  // drop the client UpdateServers created...
+    (*rowlock_proxy_impl.clients_)["0.0.0.0:9999"] = new TestClient();  // ...and install the canned-response stub in its place
+
+    RowlockRequest request;
+    RowlockResponse response;
+    request.set_table_name("table");
+    request.set_row("row");
+
+    google::protobuf::Closure* closure = new TestClosure();  // never deleted in this test (TestClosure::Run is a no-op)
+
+    rowlock_proxy_impl.TryLock(&request, &response, closure);
+    EXPECT_EQ(response.lock_status(), kLockSucc);
+
+    google::protobuf::Closure* unlock_closure = new TestClosure();  // never deleted in this test either
+    rowlock_proxy_impl.UnLock(&request, &response, unlock_closure);
+    EXPECT_EQ(response.lock_status(), kLockSucc);
+}
+
+} // namespace observer
+} // namespace tera
+
diff --git a/src/observer/test/rowlock_test.cc b/src/observer/test/rowlock_test.cc
new file mode 100644
index 000000000..611cf195c
--- /dev/null
+++ b/src/observer/test/rowlock_test.cc
@@ -0,0 +1,184 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "observer/rowlocknode/rowlock_db.h" +#include "common/counter.h" + +DECLARE_int32(rowlock_timing_wheel_patch_num); + +namespace tera { +namespace observer { + +class LockTest { +public: + void Lock(tera::observer::ShardedRowlockDB* db, Counter* succeed) { + for (uint32_t i = 0; i < 10; ++i) { + uint64_t key = 1; + + if (db->TryLock(key) == true) { + succeed->Inc(); + } + } + } +}; + +TEST(ShardedRowlockDB, LockTest) { + ShardedRowlockDB db; + + // test for lock + EXPECT_EQ(0, db.Size()); + + // different keys + EXPECT_TRUE(db.TryLock(0)); + EXPECT_TRUE(db.TryLock(1)); + EXPECT_TRUE(db.TryLock(2)); + + // same key that has been locked + EXPECT_FALSE(db.TryLock(0)); + EXPECT_FALSE(db.TryLock(1)); + EXPECT_FALSE(db.TryLock(2)); + + // test for unlock + db.UnLock(0); + EXPECT_TRUE(db.TryLock(0)); + + // unlock for other locked keys + EXPECT_FALSE(db.TryLock(1)); + EXPECT_FALSE(db.TryLock(2)); + + // double unlock + db.UnLock(0); + db.UnLock(0); + EXPECT_TRUE(db.TryLock(0)); + + // unlock size + EXPECT_EQ(3, db.Size()); + db.UnLock(0); + EXPECT_EQ(2, db.Size()); + db.UnLock(0); + EXPECT_EQ(2, db.Size()); + db.UnLock(1); + EXPECT_EQ(1, db.Size()); + db.UnLock(2); + EXPECT_EQ(0, db.Size()); + + // test for ClearTimeout + for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) { + // all keys will not be unlocked until timeing wheel works + EXPECT_TRUE(db.TryLock(i)); + EXPECT_EQ(i + 1, db.Size()); + db.ClearTimeout(); + } + + // timing wheel has run a circle, oldest key will be unlocked + EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 1, db.Size()); + + // unlock the second oldest key + db.ClearTimeout(); + EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 2, db.Size()); + + // test for ClearTimeout multi keys + for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) { + // all keys will not be unlocked until timeing wheel 
works + EXPECT_TRUE(db.TryLock(i * 10 + 1000000)); + EXPECT_TRUE(db.TryLock(i * 10 + 1000001)); + EXPECT_TRUE(db.TryLock(i * 10 + 1000002)); + db.ClearTimeout(); + } + + // timing wheel has run a circle, oldest 3 keys will be unlocked + EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num * 3 - 3, db.Size()); + + // unlock the oldest 3 keys + db.ClearTimeout(); + EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num * 3 - 6, db.Size()); +} + +TEST(RowlockDB, LockTest) { + RowlockDB db; + + // test for lock + EXPECT_EQ(0, db.Size()); + + // different keys + EXPECT_TRUE(db.TryLock(0)); + EXPECT_TRUE(db.TryLock(1)); + EXPECT_TRUE(db.TryLock(2)); + + // same key that has been locked + EXPECT_FALSE(db.TryLock(0)); + EXPECT_FALSE(db.TryLock(1)); + EXPECT_FALSE(db.TryLock(2)); + + // test for unlock + db.UnLock(0); + EXPECT_TRUE(db.TryLock(0)); + + // unlock for other locked keys + EXPECT_FALSE(db.TryLock(1)); + EXPECT_FALSE(db.TryLock(2)); + + // double unlock + db.UnLock(0); + db.UnLock(0); + EXPECT_TRUE(db.TryLock(0)); + + // unlock size + EXPECT_EQ(3, db.Size()); + db.UnLock(0); + EXPECT_EQ(2, db.Size()); + db.UnLock(0); + EXPECT_EQ(2, db.Size()); + db.UnLock(1); + EXPECT_EQ(1, db.Size()); + db.UnLock(2); + EXPECT_EQ(0, db.Size()); + + // test for ClearTimeout + for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) { + // all keys will not be unlocked until timeing wheel works + EXPECT_TRUE(db.TryLock(i)); + EXPECT_EQ(i + 1, db.Size()); + db.ClearTimeout(); + } + + // timing wheel has run a circle, oldest key will be unlocked + EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 1, db.Size()); + + // unlock the second oldest key + db.ClearTimeout(); + EXPECT_EQ(FLAGS_rowlock_timing_wheel_patch_num - 2, db.Size()); +} + +TEST(ShardedRowlockDB, ParaTest) { + Counter counter; + ShardedRowlockDB db; + LockTest test; + + // 10 threads to lock the same key + ThreadPool thread_pool(10); + for (uint32_t i = 0; i < 10; ++i) { + ThreadPool::Task task = std::bind(&LockTest::Lock, 
&test, &db, &counter); + thread_pool.AddTask(task); + } + sleep(1); + EXPECT_EQ(1, db.Size()); + EXPECT_EQ(1, counter.Get()); + + for (int32_t i = 0; i < FLAGS_rowlock_timing_wheel_patch_num; ++i) { + db.ClearTimeout(); + } + EXPECT_EQ(0, db.Size()); +} + +} // namespace observer +} // namespace tera diff --git a/src/observer/test/scanner_test.cc b/src/observer/test/scanner_test.cc new file mode 100644 index 000000000..fc1b91c05 --- /dev/null +++ b/src/observer/test/scanner_test.cc @@ -0,0 +1,495 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "observer/executor/random_key_selector.h" +#include "observer/executor/scanner_impl.h" +#include "observer/observer_demo/demo_observer.h" +#include "sdk/client_impl.h" +#include "sdk/global_txn.h" +#include "sdk/mutate_impl.h" +#include "sdk/read_impl.h" +#include "sdk/table_impl.h" +#include "sdk/sdk_utils.h" +#include "tera.h" + +DECLARE_bool(tera_sdk_client_for_gtxn); +DECLARE_bool(tera_sdk_tso_client_enabled); +DECLARE_string(tera_coord_type); +DECLARE_bool(rowlock_test); + +namespace tera { +namespace observer { + + +class TestRowReader : public RowReaderImpl { +public: + TestRowReader(TableImpl* table, const std::string& row_key) + : RowReaderImpl(table, row_key), seq_(0) { + if (row_key == "empty") { + // empty case + } else if (row_key == "900") { + value_.push_back("900"); + value_.push_back("900"); + value_.push_back("901"); + value_.push_back("920"); + } else if (row_key == "1100") { + value_.push_back("1000"); + value_.push_back("1000"); + value_.push_back("1100"); + value_.push_back("1100"); + } else if (row_key == "1hour") { + value_.push_back("810"); + value_.push_back("820"); + value_.push_back("830"); + value_.push_back("840"); + } else if (row_key == 
"collision_mix") { + value_.push_back("100"); + value_.push_back("1000"); + value_.push_back("4700"); + value_.push_back("1100"); + } else if (row_key == "error_ts") { + value_.push_back("100:sffaeeew"); + } else if (row_key == "some_error_ts") { + value_.push_back("wrong_string"); + value_.push_back("900"); + value_.push_back("900"); + value_.push_back("900"); + } else { + value_.push_back("1010"); + value_.push_back("1012"); + value_.push_back("1013"); + value_.push_back("1014"); + value_.push_back("1015"); + value_.push_back("1016"); + value_.push_back("1017"); + } + } + virtual std::string Value() { + return value_[seq_]; + + } + virtual int64_t Timestamp() { + return 9999999; + } + virtual void AddColumn(const std::string& family, const std::string& qualifier) {} + virtual bool Done() { + return seq_ == value_.size(); + } + virtual void Next() { + seq_++; + } +private: + std::vector value_; + uint32_t seq_; +}; + +class TestTransaction : public GlobalTxn { +public: + TestTransaction(int64_t start_ts, common::ThreadPool* thread_pool, bool error = false) + : GlobalTxn(NULL, thread_pool, NULL), + start_timestamp_(1000), error_(error) {} + + virtual ~TestTransaction() {} + virtual ErrorCode Get(RowReader* row_reader) { + ErrorCode err; + return err; + } + virtual int64_t GetStartTimestamp() { + return start_timestamp_; + } + virtual const ErrorCode& GetError() { + if (error_ == true) { + err_.SetFailed(ErrorCode::kSystem, ""); + } + return err_; + } +private: + int64_t start_timestamp_; + ErrorCode err_; + bool error_; +}; + +class TestRowMutationImpl : public RowMutationImpl { +public: + TestRowMutationImpl(Table* table, const std::string& row_key) + : RowMutationImpl(table, row_key) {} + virtual void Put(const std::string& value, int32_t ttl = -1) {} + virtual void ApplyMutation(RowMutation* row_mu) {} +}; + +class TestTable : public TableImpl { +public: + TestTable(const std::string& table_name, + ThreadPool* thread_pool, + sdk::ClusterFinder* cluster) + : 
TableImpl(table_name, thread_pool, cluster), + global_txn_(true), + thread_pool_(thread_pool) {} + virtual RowReader* NewRowReader(const std::string& row_key) { + return new TestRowReader(this, row_key); + } + virtual Transaction* StartRowTransaction(const std::string& row_key) { + return new TestTransaction(1, thread_pool_); + } + virtual RowMutation* NewRowMutation(const std::string& row_key) { + return new TestRowMutationImpl(this, row_key); + } + virtual void CommitRowTransaction(Transaction* transaction) {} + virtual bool GetDescriptor(TableDescriptor* schema, ErrorCode* err) { + schema->AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema->AddColumnFamily("cf1"); + cfd1->EnableNotify(); + ExtendNotifyLgToDescriptor(schema); + if (!global_txn_) { + cfd1->DisableGlobalTransaction(); + } + return true; + } +private: + bool global_txn_; + common::ThreadPool* thread_pool_; +}; + +class TestResultStream : public tera::ResultStream{ +public: + virtual bool Done(ErrorCode* err) { + if (next_number_ < row_name_.size()) { + return false; + } else { + return true; + } + } + virtual void Next() { + next_number_++; + } + + virtual std::string RowName() const { + return row_name_[next_number_]; + } + virtual std::string Qualifier() const { + return qualifier_[next_number_]; + } + + + virtual std::string Family() const { + return ""; + } + + virtual int64_t Timestamp() const { + return 0; + } + virtual std::string Value() const { + return ""; + } + + virtual int64_t ValueInt64() const { + return 0; + } + + virtual bool LookUp(const std::string& row_key) { + return true; + } + + virtual std::string ColumnName() const { + return ""; + } +private: + uint32_t next_number_; + std::vector row_name_; + std::vector qualifier_; + bool done_; +}; + +class TestObserver : public tera::observer::Observer { +public: + TestObserver() : count_(0) {} + virtual ~TestObserver() {} + virtual ErrorCode OnNotify(tera::Transaction* t, + tera::Client* client, + const std::string& 
table_name, + const std::string& family, + const std::string& qualifier, + const std::string& row, + const std::string& value, + int64_t timestamp, + Notification* notification) { + LOG(INFO) << "[Notify TestObserver] table:family:qualifer=" << + table_name << ":" << family << ":" << + qualifier << " row=" << row << + " value=" << value << " timestamp=" << timestamp; + + count_++; + + tera::ErrorCode err; + // do nothing + return err; + } + virtual std::string GetObserverName() const { + return "TestObserver"; + } + + virtual TransactionType GetTransactionType() const { + return kGlobalTransaction; + } +private: + std::atomic count_; +}; + +class TestClient : public ClientImpl { +public: + TestClient() : ClientImpl("", "") {} + ~TestClient() {} + virtual Table* OpenTable(const std::string& table_name, ErrorCode* err) { + return static_cast(new TestTable(table_name, &thread_pool_, NULL)); + } +}; + +class TestKeySelector : public RandomKeySelector { +public: + TestKeySelector() {} + virtual ErrorCode Observe(const std::string& table_name) { + tera::ErrorCode err; + return err; + } +}; + +TEST(ScannerImpl, ParseNotifyQualifier) { + FLAGS_tera_sdk_client_for_gtxn = true; + FLAGS_tera_coord_type = "mock_zk"; + ScannerImpl scanner; + + std::string data_family; + std::string data_qualfier; + + EXPECT_TRUE(scanner.ParseNotifyQualifier("C:url", &data_family, &data_qualfier)); + EXPECT_EQ(data_family, "C"); + EXPECT_EQ(data_qualfier, "url"); + + EXPECT_TRUE(scanner.ParseNotifyQualifier("cf:page", &data_family, &data_qualfier)); + EXPECT_EQ(data_family, "cf"); + EXPECT_EQ(data_qualfier, "page"); + + EXPECT_TRUE(scanner.ParseNotifyQualifier("cf::::::", &data_family, &data_qualfier)); + EXPECT_EQ(data_family, "cf"); + EXPECT_EQ(data_qualfier, ":::::"); + + EXPECT_TRUE(scanner.ParseNotifyQualifier("cf:___", &data_family, &data_qualfier)); + EXPECT_EQ(data_family, "cf"); + EXPECT_EQ(data_qualfier, "___"); + + EXPECT_FALSE(scanner.ParseNotifyQualifier("Curl", &data_family, 
&data_qualfier)); + EXPECT_FALSE(scanner.ParseNotifyQualifier("C_url", &data_family, &data_qualfier)); + EXPECT_FALSE(scanner.ParseNotifyQualifier("C.urlN_", &data_family, &data_qualfier)); + EXPECT_FALSE(scanner.ParseNotifyQualifier("++page", &data_family, &data_qualfier)); + +} + +TEST(ScannerImpl, DoReadValue) { + FLAGS_tera_sdk_client_for_gtxn = true; + FLAGS_mock_rowlock_enable = true; + FLAGS_tera_coord_type = "mock_zk"; + common::ThreadPool thread_pool(2); + ScannerImpl scanner; + TestTable table("test_table", &thread_pool, NULL); + + std::shared_ptr notify_cell(new NotifyCell(new TestTransaction(1, &thread_pool))); + Column column = {"test_table", "family", "qualifier"}; + + notify_cell->row = "row"; + notify_cell->value = "value"; + notify_cell->timestamp = 999999999; + notify_cell->observed_column = column; + notify_cell->table = &table; + + // no table name + EXPECT_FALSE(scanner.DoReadValue(notify_cell)); + // no column + ScannerImpl::TableObserveInfo cell; + (*scanner.table_observe_info_)["test_table"] = cell; + EXPECT_FALSE(scanner.DoReadValue(notify_cell)); + // size 0 + (*scanner.table_observe_info_)["test_table"].observe_columns[column].clear(); + EXPECT_FALSE(scanner.DoReadValue(notify_cell)); + + Observer* observer = new TestObserver(); + // normal + (*scanner.table_observe_info_)["test_table"].observe_columns[column].insert(observer); + EXPECT_TRUE(scanner.DoReadValue(notify_cell)); + + // multi observer + Observer* parse = new TestObserver(); + (*scanner.table_observe_info_)["test_table"].observe_columns[column].insert(parse); + EXPECT_TRUE(scanner.DoReadValue(notify_cell)); +} + +TEST(ScannerImpl, MultiThreadDoReadValue) { + FLAGS_tera_sdk_client_for_gtxn = true; + FLAGS_mock_rowlock_enable = true; + FLAGS_tera_coord_type = "mock_zk"; + common::ThreadPool thread_pool(2); + ScannerImpl scanner; + TestTable table("test_table", &thread_pool, NULL); + + std::shared_ptr notify_cell(new NotifyCell(new TestTransaction(1, &thread_pool))); + Column 
column = {"test_table", "family", "qualifier"}; + + notify_cell->row = "row"; + notify_cell->value = "value"; + notify_cell->timestamp = 100; + notify_cell->observed_column = column; + notify_cell->table = &table; + + Observer* observer = new TestObserver(); + (*scanner.table_observe_info_)["test_table"].observe_columns[column].insert(observer); + + common::ThreadPool worker_thread(10); + for (uint32_t i = 0; i < 10; ++i) { + worker_thread.AddTask(std::bind(&ScannerImpl::DoReadValue, &scanner, notify_cell)); + } + worker_thread.Stop(true); + scanner.transaction_threads_->Stop(true); + EXPECT_EQ(((TestObserver*)observer)->count_, 10); +} + +TEST(ScannerImpl, NextRow) { + FLAGS_tera_sdk_client_for_gtxn = true; + FLAGS_tera_coord_type = "mock_zk"; + std::unique_ptr result_stream(new TestResultStream()); + ScannerImpl scanner; + std::set columns; + bool finished = false; + std::string vec_rowkey; + std::vector vec_col; + + // stream done + EXPECT_FALSE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col)); + EXPECT_EQ(true, finished); + + finished = false; + static_cast(result_stream.get())->row_name_.push_back("row1"); + static_cast(result_stream.get())->qualifier_.push_back("cf:page1"); + static_cast(result_stream.get())->row_name_.push_back("row1"); + static_cast(result_stream.get())->qualifier_.push_back("cf:page2"); + static_cast(result_stream.get())->row_name_.push_back("row2"); + static_cast(result_stream.get())->qualifier_.push_back("cf:page3"); + static_cast(result_stream.get())->row_name_.push_back("row2"); + static_cast(result_stream.get())->qualifier_.push_back("cf:page4"); + + Column colum_1 = {"table_name", "cf", "page1"}; + Column colum_2 = {"table_name", "cf", "page2"}; + Column colum_3 = {"table_name", "cf", "page3"}; + Column colum_4 = {"table_name", "cf", "page4"}; + columns.insert(colum_1); + columns.insert(colum_2); + columns.insert(colum_3); + columns.insert(colum_4); + + // row 1 + 
EXPECT_TRUE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col)); + EXPECT_FALSE(finished); + + // row 1 data + EXPECT_EQ(vec_col.size(), 2); + EXPECT_EQ(vec_rowkey, "row1"); + EXPECT_EQ(vec_col[0].qualifier, "page1"); + EXPECT_EQ(vec_col[1].qualifier, "page2"); + + // row 2 + EXPECT_TRUE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col)); + EXPECT_FALSE(finished); + + // row 2 data + EXPECT_EQ(vec_col.size(), 2); + EXPECT_EQ(vec_rowkey, "row2"); + EXPECT_EQ(vec_col[0].qualifier, "page3"); + EXPECT_EQ(vec_col[1].qualifier, "page4"); + + // scan finish + EXPECT_FALSE(scanner.NextRow(columns, result_stream.get(), "table_name", &finished, &vec_rowkey, &vec_col)); + EXPECT_TRUE(finished); +} + + + +TEST(ScannerImpl, CheckConflictOnAckColumn) { + FLAGS_tera_sdk_client_for_gtxn = true; + FLAGS_tera_coord_type = "mock_zk"; + common::ThreadPool thread_pool(2); + ScannerImpl scanner; + TestTable table("test_table", &thread_pool, NULL); + + std::shared_ptr notify_cell(new NotifyCell(new TestTransaction(1, &thread_pool))); + Column column = {"test_table", "family", "qualifier"}; + + notify_cell->row = "row"; + notify_cell->value = "value"; + notify_cell->timestamp = 1000; + notify_cell->observed_column = column; + notify_cell->table = &table; + + std::set observers; + + TestObserver observer; + observers.insert(&observer); + + // empty case + notify_cell->row = "empty"; + EXPECT_TRUE(scanner.CheckConflictOnAckColumn(notify_cell, observers)); + + // row reader ts < transaction(notify) ts + notify_cell->row = "900"; + EXPECT_TRUE(scanner.CheckConflictOnAckColumn(notify_cell, observers)); + + // row reader ts > transaction(notify) ts + notify_cell->row = "1100"; + EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers)); + + // transaction ts - row reader ts < 600 + notify_cell->timestamp = 700; + notify_cell->row = "1hour"; + 
EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers)); + + // collision_mix: some legal, some illegal + notify_cell->row = "collision_mix"; + EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers)); + + // ack parse fail + notify_cell->timestamp = 1000; + notify_cell->row = "error_ts"; + EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers)); + + // some ack parse fail + notify_cell->row = "some_error_ts"; + EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell, observers)); + + // mutation fail + std::shared_ptr notify_cell_fail(new NotifyCell(new TestTransaction(1, &thread_pool, true))); + + notify_cell_fail->row = "row"; + notify_cell_fail->value = "value"; + notify_cell_fail->timestamp = 1000; + notify_cell_fail->observed_column = column; + notify_cell_fail->table = &table; + + // empty case + notify_cell->row = "empty"; + EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell_fail, observers)); + + // row reader ts < transaction(notify) ts + notify_cell->row = "900"; + EXPECT_FALSE(scanner.CheckConflictOnAckColumn(notify_cell_fail, observers)); +} + +} // namespace observer +} // namespace tera + diff --git a/src/proto/lb_client.cc b/src/proto/lb_client.cc new file mode 100644 index 000000000..0b70af707 --- /dev/null +++ b/src/proto/lb_client.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#include
+
+#include "gflags/gflags.h"
+
+#include "proto/lb_client.h"
+
+DECLARE_int32(tera_master_connect_retry_times);  // NOTE(review): none of these three flags is referenced in this file - confirm they are needed
+DECLARE_int32(tera_master_connect_retry_period);
+DECLARE_int32(tera_master_connect_timeout_period);
+
+namespace tera {
+namespace load_balancer {
+
+LBClient::LBClient(const std::string& server_addr,
+                   int32_t rpc_timeout)
+    : RpcClient(server_addr),
+      rpc_timeout_(rpc_timeout) {  // timeout is stored here and passed along with every CmdCtrl call
+}
+
+LBClient::~LBClient() {
+}
+
+bool LBClient::CmdCtrl(const CmdCtrlRequest* request,
+                       CmdCtrlResponse* response) {  // forwards the request to LoadBalancerService::CmdCtrl and returns SendMessageWithRetry's result
+    return SendMessageWithRetry(&LoadBalancerService::Stub::CmdCtrl,
+                                request, response,
+                                (std::function)NULL,  // no completion callback - presumably selects RpcClient's synchronous path; verify in rpc_client.h
+                                "CmdCtrl", rpc_timeout_);
+}
+
+} // namespace load_balancer
+} // namespace tera
+
diff --git a/src/proto/lb_client.h b/src/proto/lb_client.h
new file mode 100644
index 000000000..faf47b59a
--- /dev/null
+++ b/src/proto/lb_client.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_LOAD_BALANCER_LB_CLIENT_H_
+#define TERA_LOAD_BALANCER_LB_CLIENT_H_
+
+#include
+
+#include "proto/load_balancer_rpc.pb.h"
+#include "proto/rpc_client.h"
+
+DECLARE_int32(tera_rpc_timeout_period);  // supplies the default rpc_timeout of the constructor below
+
+namespace tera {
+namespace load_balancer {
+
+class LBClient : public RpcClient {  // RPC client exposing the load balancer's CmdCtrl call
+public:
+    LBClient(const std::string& server_addr = "",
+             int32_t rpc_timeout = FLAGS_tera_rpc_timeout_period);
+    virtual ~LBClient();
+
+    virtual bool CmdCtrl(const CmdCtrlRequest* request,
+                         CmdCtrlResponse* response);  // no callback parameter: only this two-argument form is offered
+
+private:
+    int32_t rpc_timeout_;  // timeout handed to every RPC; unit is whatever FLAGS_tera_rpc_timeout_period uses
+};
+
+} // namespace load_balancer
+} // namespace tera
+
+#endif // TERA_LOAD_BALANCER_LB_CLIENT_H_
+
diff --git a/src/proto/load_balancer_rpc.proto b/src/proto/load_balancer_rpc.proto
new file mode 100644
index 000000000..d7b077fa1
--- /dev/null
+++ b/src/proto/load_balancer_rpc.proto
@@ -0,0 +1,11 @@
+import "sofa/pbrpc/rpc_option.proto";
+import "master_rpc.proto";
+
+package tera;
+
+service LoadBalancerService {
+    rpc CmdCtrl(CmdCtrlRequest) returns(CmdCtrlResponse);
+}
+
+option cc_generic_services = true;
+
diff --git a/src/proto/rowlocknode_rpc.proto b/src/proto/rowlocknode_rpc.proto
new file mode 100644
index 000000000..6e8107710
--- /dev/null
+++ b/src/proto/rowlocknode_rpc.proto
@@ -0,0 +1,19 @@
+import "sofa/pbrpc/rpc_option.proto";
+import "status_code.proto";
+
+package tera;
+
+message RowlockRequest {
+    required string table_name = 1;
+    required string row = 2;
+}
+
+message RowlockResponse {
+    required StatusCode lock_status = 1;
+}
+
+service RowlockService {
+    rpc Lock(RowlockRequest) returns(RowlockResponse);
+    rpc UnLock(RowlockRequest) returns(RowlockResponse);
+}
+option cc_generic_services = true;
diff --git a/src/proto/rpc_client.h b/src/proto/rpc_client.h
index 74ded0212..9067cc96d 100644
--- a/src/proto/rpc_client.h
+++ b/src/proto/rpc_client.h
@@ -144,7 +144,7 @@ class RpcClient : public RpcClientBase {
                               int32_t rpc_timeout, ThreadPool* thread_pool = 0) {
         if (NULL == server_client_.get()) {
// sync call - if (closure == NULL) { + if (!closure) { return false; } @@ -168,7 +168,7 @@ class RpcClient : public RpcClientBase { (server_client_.get()->*func)(rpc_controller, request, response, done); // sync call - if (closure == NULL) { + if (!closure) { sync_call_event.Wait(); return (!sync_call_failed); } @@ -196,7 +196,7 @@ class RpcClient : public RpcClientBase { delete param; // sync call - if (closure == NULL) { + if (!closure) { client->sync_call_failed = failed; client->sync_call_event.Set(); return; diff --git a/src/proto/status_code.proto b/src/proto/status_code.proto index 24b0ff595..cb99c7235 100644 --- a/src/proto/status_code.proto +++ b/src/proto/status_code.proto @@ -96,6 +96,18 @@ enum StatusCode { kTableStatusEnable = 1000; kTableStatusDisable = 1001; + + // Timeoracle + kTimeoracleOk = 2000; + kTimeoracleBusy = 2001; + + // rowlock service + kLockSucc = 2100; + kLockFail = 2101; + + // LoadBalancer + kLoadBalancerOk = 2200; + kLoadBalancerError = 2201; } enum TabletStatus { @@ -118,6 +130,9 @@ enum TabletStatus { kTabletPending = 65; kTabletOnSnapshot = 66; kTabletDelSnapshot = 67; + + // runtime status + kTabletCorruption = 90; } enum TableStatus { diff --git a/src/proto/table_meta.proto b/src/proto/table_meta.proto index c0df47e63..cdf18b689 100644 --- a/src/proto/table_meta.proto +++ b/src/proto/table_meta.proto @@ -64,6 +64,7 @@ message TabletCounter { optional double write_workload = 11 [default = 0.0]; optional bool is_on_busy = 15 [default = false]; + optional TabletStatus db_status = 16; } message TableCounter { @@ -107,6 +108,7 @@ message TabletMeta { repeated uint64 parent_tablets = 12; repeated int64 lg_size = 13; repeated Rollback rollbacks = 14; + optional int64 last_move_time_us = 15; } message TableMetaList { @@ -130,3 +132,12 @@ message SdkCookie { required string table_name = 1; repeated SdkTabletCookie tablets = 2; } + +message PrimaryInfo { + optional string table_name = 1; + optional bytes row_key = 2; + optional bytes 
column_family = 3; + optional bytes qualifier = 4; + optional int64 gtxn_start_ts = 5; + optional string client_session = 6; +} diff --git a/src/proto/table_schema.proto b/src/proto/table_schema.proto index 9f6c8727d..62c716c53 100644 --- a/src/proto/table_schema.proto +++ b/src/proto/table_schema.proto @@ -39,6 +39,8 @@ message ColumnFamilySchema { optional int32 time_to_live = 8 [default = 0]; // 单位:秒(0:不过期, <0:提前过期, >0:延后过期) optional int64 disk_quota = 9; optional string type = 10; + optional bool gtxn = 11 [default = false]; // 'gtxn=on' for global transaction feature availability + optional bool notify = 12 [default = false]; // 'notify=on' for notify feature availability } message TableSchema { diff --git a/src/proto/tabletnode.proto b/src/proto/tabletnode.proto index fff28caa5..d36f5e0f2 100644 --- a/src/proto/tabletnode.proto +++ b/src/proto/tabletnode.proto @@ -14,6 +14,7 @@ message TabletNodeInfo { optional uint64 timestamp = 4; optional uint32 tablet_total = 5; optional uint32 tablet_onbusy = 6; + optional uint32 tablet_corruption = 7; optional uint32 low_read_cell = 11; optional uint32 scan_rows = 12; diff --git a/src/proto/tabletnode_client.cc b/src/proto/tabletnode_client.cc index b6b347d2d..e57a5e8a8 100644 --- a/src/proto/tabletnode_client.cc +++ b/src/proto/tabletnode_client.cc @@ -105,6 +105,14 @@ bool TabletNodeClient::SplitTablet(const SplitTabletRequest* request, request, response, done, "SplitTablet", rpc_timeout_, thread_pool_); } +bool TabletNodeClient::ComputeSplitKey(const SplitTabletRequest* request, + SplitTabletResponse* response, + std::function done) { + return SendMessageWithRetry(&TabletNodeServer::Stub::ComputeSplitKey, + request, response, done, "ComputeSplitKey", + rpc_timeout_, thread_pool_); +} + bool TabletNodeClient::CompactTablet(const CompactTabletRequest* request, CompactTabletResponse* response, diff --git a/src/proto/tabletnode_client.h b/src/proto/tabletnode_client.h index c56e0d7c0..1033841d0 100644 --- 
a/src/proto/tabletnode_client.h +++ b/src/proto/tabletnode_client.h @@ -69,6 +69,9 @@ class TabletNodeClient : public RpcClient { bool SplitTablet(const SplitTabletRequest* request, SplitTabletResponse* response, std::function done = NULL); + bool ComputeSplitKey(const SplitTabletRequest* request, SplitTabletResponse* response, + std::function done = NULL); + bool CompactTablet(const CompactTabletRequest* request, CompactTabletResponse* response, diff --git a/src/proto/tabletnode_rpc.proto b/src/proto/tabletnode_rpc.proto index 0d79ce0c7..45651203e 100644 --- a/src/proto/tabletnode_rpc.proto +++ b/src/proto/tabletnode_rpc.proto @@ -90,6 +90,7 @@ message LoadTabletRequest { repeated uint64 snapshots_sequence = 10; repeated uint64 parent_tablets = 11; repeated Rollback rollbacks = 12; + repeated string ignore_err_lgs = 13; } message LoadTabletResponse { @@ -263,6 +264,7 @@ message ScanTabletRequest { optional int64 timestamp = 18 [default = 0]; optional int64 timeout = 19; optional int64 number_limit = 21; + optional uint64 max_qualifiers = 22; } message ScanTabletResponse { @@ -282,6 +284,7 @@ message RowReaderInfo { optional TimeRange time_range = 3; optional FilterList filter_list = 4; optional uint32 max_version = 5; + optional uint64 max_qualifiers = 6; } message ReadTabletRequest { @@ -309,11 +312,13 @@ message SplitTabletRequest { optional TabletMeta tablet_meta = 4; repeated uint64 child_tablets = 5; optional bytes split_key = 6; + optional bool master_update_meta = 7; } message SplitTabletResponse { required StatusCode status = 1 [default = kTableMergeError]; required uint64 sequence_id = 2; + repeated string split_keys = 3; } message MergeTabletRequest { @@ -367,6 +372,7 @@ service TabletNodeServer { rpc Rollback(SnapshotRollbackRequest) returns(SnapshotRollbackResponse); rpc SplitTablet(SplitTabletRequest) returns(SplitTabletResponse); + rpc ComputeSplitKey(SplitTabletRequest) returns (SplitTabletResponse); rpc CmdCtrl(TsCmdCtrlRequest) 
returns(TsCmdCtrlResponse); rpc Update(UpdateRequest) returns(UpdateResponse); diff --git a/src/proto/timeoracle_rpc.proto b/src/proto/timeoracle_rpc.proto new file mode 100644 index 000000000..f96661b9f --- /dev/null +++ b/src/proto/timeoracle_rpc.proto @@ -0,0 +1,20 @@ +import "sofa/pbrpc/rpc_option.proto"; +import "status_code.proto"; + +package tera; + +message GetTimestampRequest { + optional uint64 count = 1; +} + +message GetTimestampResponse { + optional StatusCode status = 1; + optional int64 start_timestamp = 2; + optional uint64 count = 3; +} + +service TimeoracleServer { + rpc GetTimestamp(GetTimestampRequest) returns(GetTimestampResponse); +} + +option cc_generic_services = true; diff --git a/src/sample/Makefile b/src/sample/Makefile index 81698c729..02268f2ff 100644 --- a/src/sample/Makefile +++ b/src/sample/Makefile @@ -10,15 +10,15 @@ SHARED_LDFLAGS = -shared -Wl,-soname -Wl, INCPATH += -I../../include $(DEPS_INCPATH) CFLAGS += $(OPT) $(SHARED_CFLAGS) $(INCPATH) -CXXFLAGS += $(OPT) $(SHARED_CFLAGS) $(INCPATH) +CXXFLAGS += -std=gnu++11 $(OPT) $(SHARED_CFLAGS) $(INCPATH) LDFLAGS += ../../build/lib/libtera.a $(DEPS_LDPATH) $(DEPS_LDFLAGS) -lpthread -lz -SAMPLE_SRC := ./tera_sample.cc tera_row_txn_sample.cc atomic_sample.cc +SAMPLE_SRC := ./tera_sample.cc tera_row_txn_sample.cc atomic_sample.cc global_txn_async_sample.cc SAMPLE_OBJ := $(SAMPLE_SRC:.cc=.o) .PHONY: clean -all: sample_demo tera_row_txn_sample atomic_sample +all: sample_demo tera_row_txn_sample atomic_sample global_txn_async_sample global_txn_sync_sample sample_demo: ./tera_sample.o $(CXX) -o $@ $^ $(LDFLAGS) @@ -26,6 +26,12 @@ sample_demo: ./tera_sample.o tera_row_txn_sample: tera_row_txn_sample.o $(CXX) -o $@ $^ $(LDFLAGS) +global_txn_async_sample: global_txn_async_sample.o + $(CXX) -o $@ $^ $(LDFLAGS) + +global_txn_sync_sample: global_txn_sync_sample.o + $(CXX) -o $@ $^ $(LDFLAGS) + atomic_sample: atomic_sample.o $(CXX) -o $@ $^ $(LDFLAGS) @@ -36,5 +42,7 @@ clean: rm -f *.o rm -f 
./sample_demo rm -f ./tera_row_txn_sample + rm -f ./global_txn_async_sample + rm -f ./global_txn_sync_sample rm -f ./atomic_sample diff --git a/src/sample/atomic_sample.cc b/src/sample/atomic_sample.cc index ce35fbe6b..3053ec8b4 100644 --- a/src/sample/atomic_sample.cc +++ b/src/sample/atomic_sample.cc @@ -1,4 +1,5 @@ #include +#include #include "tera.h" int main() { diff --git a/src/sample/global_txn_async_sample.cc b/src/sample/global_txn_async_sample.cc new file mode 100644 index 000000000..a2f77896e --- /dev/null +++ b/src/sample/global_txn_async_sample.cc @@ -0,0 +1,143 @@ +#include +#include +#include +#include + +#include +#include + +#include "tera.h" + +std::string read_result = ""; +std::atomic all_gtxn_thread_done(false); +std::atomic finish_cnt(0); + +struct RowReaderContext { + tera::Transaction* gtxn; + tera::Table* t1; + tera::Table* t2; +}; + +tera::Table* InitTable(tera::Client* client, const std::string& tablename) { + tera::ErrorCode error_code; + if (!client->IsTableExist(tablename, &error_code)) { + tera::TableDescriptor schema(tablename); + schema.EnableTxn(); // 参与全局事务的表schema 都需要设置 txn=true + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + cfd2->DisableGlobalTransaction(); + client->CreateTable(schema, &error_code); + assert(error_code.GetType() == tera::ErrorCode::kOK); + } + + tera::Table* table = client->OpenTable(tablename, &error_code); + assert(table && error_code.GetType() == tera::ErrorCode::kOK); + return table; +} + +void TxnCallBack(tera::Transaction* txn) { + if (txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << "txn failed, start_ts= " << txn->GetStartTimestamp() + << ", reason= " << txn->GetError().ToString() + << std::endl; + } else { + std::cout << "gtxn success" << std::endl; + } + delete txn; + all_gtxn_thread_done.store(true); +} + +void 
ReadRowCallBack(tera::RowReader* row_reader) {
+    // Completion callback shared by the two asynchronous reads issued in DoTxn().
+    // Prints every cell of the finished reader, appends each cell value to the
+    // global read_result, frees the reader and, once both reads have reported,
+    // writes the concatenated value to t1:r1:cf1:q1 and t2:r1:cf1:q1 and starts
+    // the asynchronous commit.
+    //
+    // NOTE(review): read_result is a plain std::string appended to from this
+    // callback; if the SDK may run the two reader callbacks concurrently the
+    // appends need a lock -- confirm the callback threading model.
+    RowReaderContext* ctx = (RowReaderContext*)row_reader->GetContext();
+    while (!row_reader->Done()) {
+        // The original literal used "\%", which is not a valid escape sequence;
+        // compilers that accept it drop the backslash, so plain "%s" prints the
+        // same text without the diagnostic.
+        printf("Row: %s%s%ld%s\n",
+               row_reader->RowName().c_str(), row_reader->ColumnName().c_str(),
+               row_reader->Timestamp(), row_reader->Value().c_str());
+        // Consume the current cell before advancing: reading Value() after
+        // Next() skipped the first cell and read past the last one.
+        read_result += row_reader->Value();
+        row_reader->Next();
+    }
+    delete row_reader;
+    // mutations begin at all reader callback done
+    // The pre-increment is one atomic read-modify-write, so exactly one callback
+    // observes 2; a separate increment and load let both callbacks see 2 and
+    // commit twice.
+    if (++finish_cnt == 2) {
+        // write to other columns
+        tera::Transaction* g_txn = ctx->gtxn;
+        tera::RowMutation* m1 = ctx->t1->NewRowMutation("r1");
+        tera::RowMutation* m2 = ctx->t2->NewRowMutation("r1");
+        // The context was heap-allocated by DoTxn(); the last callback owns it.
+        delete ctx;
+        m1->Put( "cf1", "q1", read_result);
+        m2->Put( "cf1", "q1", read_result);
+
+        // ApplyMutation only modifies local memory, so it does not need to be
+        // asynchronous; an asynchronous RowMutation interface is also supported.
+        g_txn->ApplyMutation(m1);
+        g_txn->ApplyMutation(m2);
+        g_txn->SetCommitCallback(TxnCallBack);
+        delete m1;
+        delete m2;
+        // No need to check ApplyMutation here: the transaction is checked
+        // before commit.
+        g_txn->Commit();
+    }
+}
+
+// Starts one global transaction and issues two asynchronous reads
+// (t1:r1:cf1:q2 and t2:r1:cf1:q2). The remainder of the transaction runs in
+// ReadRowCallBack() once both reads have completed.
+void DoTxn(tera::Client* client, tera::Table* t1, tera::Table* t2) {
+
+    // begin global transaction
+    tera::Transaction* g_txn = client->NewGlobalTransaction();
+    if (g_txn == NULL) {
+        // No callback will ever fire, so release the wait loop in main() here
+        // instead of letting it spin forever.
+        all_gtxn_thread_done.store(true);
+        return;
+    }
+
+    // read from different tables
+    tera::RowReader* r1 = t1->NewRowReader("r1");
+    tera::RowReader* r2 = t2->NewRowReader("r1");
+    r1->AddColumn("cf1", "q2");
+    r2->AddColumn("cf1", "q2");
+    r1->SetCallBack(ReadRowCallBack);
+    r2->SetCallBack(ReadRowCallBack);
+    // The callbacks run after this function has returned, so the shared context
+    // must live on the heap; the last ReadRowCallBack() invocation deletes it.
+    RowReaderContext* ctx = new RowReaderContext;
+    ctx->gtxn = g_txn;
+    ctx->t1 = t1;
+    ctx->t2 = t2;
+    r1->SetContext(ctx);
+    r2->SetContext(ctx);
+    // read from t1:r1:cf1:q2 and check
+    g_txn->Get(r1);
+    // read from t2:r1:cf1:q2 and check
+    g_txn->Get(r2);
+}
+
+int main(int argc, char *argv[]) {
+
+    tera::ErrorCode error_code;
+
+    tera::Client* client = tera::Client::NewClient("../conf/tera.flag", "global_txn_sample_async", &error_code);
+    if (client == NULL) {
+        return -1;
+    }
+
+    // create or open tables
+    // before global transaction should be
+    // (1) OpenTable which you will r/w
+    // (2) check OpenTable success
+    tera::Table* t1 = InitTable(client, "t1");
+    tera::Table* t2 = InitTable(client, "t2");
+
+    // the global transaction may add to threadpool, which implements by yourself.
+    //
+    // In this example,
+    //
+    // first, read two cell values from different tables,
+    // next, get all values concat at reader callback,
+    // last, put concat result into different tables.
+    DoTxn(client, t1, t2);
+
+    // global transaction thead always finished before callback
+    // wait for callback thread done at main thread
+    // if your know the program can't exit before callback done, it's not necessary.
+ while (!all_gtxn_thread_done.load()) { + usleep(100); + } + return 0; +} diff --git a/src/sample/global_txn_sync_sample.cc b/src/sample/global_txn_sync_sample.cc new file mode 100644 index 000000000..66bb94b7d --- /dev/null +++ b/src/sample/global_txn_sync_sample.cc @@ -0,0 +1,107 @@ +#include +#include + +#include +#include "tera.h" + +int main(int argc, char *argv[]) { + + tera::ErrorCode error_code; + + tera::Client* client = tera::Client::NewClient("../conf/tera.flag", "global_txn_sample", &error_code); + assert(client); + // create or open tables + tera::Table* t1 = nullptr; + tera::Table* t2 = nullptr; + if (!client->IsTableExist("t1", &error_code)) { + tera::TableDescriptor schema("t1"); + schema.EnableTxn(); // 参与全局事务的表schema 都需要设置 txn=true + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + cfd2->EnableGlobalTransaction(); + client->CreateTable(schema, &error_code); + assert(error_code.GetType() == tera::ErrorCode::kOK); + } + + if (!client->IsTableExist("t2", &error_code)) { + tera::TableDescriptor schema("t2"); + schema.EnableTxn(); // 参与全局事务的表schema 都需要设置 txn=true + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + cfd2->EnableGlobalTransaction(); + client->CreateTable(schema, &error_code); + assert(error_code.GetType() == tera::ErrorCode::kOK); + } + // before global transaction should be + // (1) OpenTable which you will r/w + // (2) check OpenTable success + t1 = client->OpenTable("t1", &error_code); + assert(t1 && error_code.GetType() == tera::ErrorCode::kOK); + + t2 = client->OpenTable("t2", &error_code); + assert(t2 && error_code.GetType() == tera::ErrorCode::kOK); + + // begin global transaction + tera::Transaction* g_txn = 
client->NewGlobalTransaction(); + if (g_txn == NULL) { + return -1; + } + if (error_code.GetType()!=tera::ErrorCode::kOK) { + std::cout << error_code.ToString() << std::endl; + return -1; + } + // read from different tables + std::unique_ptr r1(t1->NewRowReader("r1")); + std::unique_ptr r2(t2->NewRowReader("r1")); + r1->AddColumn("cf1", "q2"); + r2->AddColumn("cf1", "q2"); + // read from t1:r1:cf1:q2 and check + g_txn->Get(r1.get()); + if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << g_txn->GetError().ToString() << std::endl; + return -1; + } + std::string r1_v = ""; + while(!r1->Done()) { + std::cout << r1->Value() << std::endl; + r1_v = r1->Value(); + r1->Next(); + } + + // read from t2:r1:cf1:q2 and check + g_txn->Get(r2.get()); + if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << g_txn->GetError().ToString() << std::endl; + return -1; + } + std::string r2_v = ""; + while(!r2->Done()) { + std::cout << r2->Value() << std::endl; + r2_v = r2->Value(); + r2->Next(); + } + + // write to other columns + std::unique_ptr m1(t1->NewRowMutation("r1")); + std::unique_ptr m2(t2->NewRowMutation("r1")); + m1->Put( "cf1", "q1", r2_v); + m2->Put( "cf1", "q1", r1_v); + + g_txn->ApplyMutation(m1.get()); + g_txn->ApplyMutation(m2.get()); + // need not check ApplyMutation, Transaction will be check before commit. 
+ g_txn->Commit(); + if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << g_txn->GetError().ToString() << std::endl; + } else { + std::cout << "commit success" << std::endl; + } + + delete g_txn; + // end global transaction + return 0; +} diff --git a/src/sample/tera_row_txn_sample.cc b/src/sample/tera_row_txn_sample.cc index 4c9897708..879652dfc 100644 --- a/src/sample/tera_row_txn_sample.cc +++ b/src/sample/tera_row_txn_sample.cc @@ -1,3 +1,6 @@ +#include +#include + #include "tera.h" int main() { diff --git a/src/sdk/client_impl.cc b/src/sdk/client_impl.cc index bc9eb1998..3599b2e9e 100644 --- a/src/sdk/client_impl.cc +++ b/src/sdk/client_impl.cc @@ -10,6 +10,7 @@ #include "gflags/gflags.h" #include "common/file/file_path.h" +#include "common/log/log_cleaner.h" #include "common/mutex.h" #include "proto/kv_helper.h" #include "proto/master_client.h" @@ -17,6 +18,8 @@ #include "proto/table_meta.pb.h" #include "proto/tabletnode_client.h" #include "sdk/table_impl.h" +#include "sdk/global_txn.h" +#include "sdk/sdk_perf.h" #include "sdk/sdk_utils.h" #include "sdk/sdk_zk.h" #include "utils/config_utils.h" @@ -43,6 +46,12 @@ DECLARE_int32(tera_sdk_rpc_max_pending_buffer_size); DECLARE_int32(tera_sdk_rpc_work_thread_num); DECLARE_int32(tera_sdk_show_max_num); DECLARE_bool(tera_online_schema_update_enabled); +DECLARE_string(tera_log_prefix); +DECLARE_bool(tera_info_log_clean_enable); +DECLARE_bool(tera_sdk_perf_collect_enabled); +DECLARE_int32(tera_gtxn_thread_max_num); +DECLARE_bool(tera_sdk_client_for_gtxn); +DECLARE_bool(tera_sdk_tso_client_enabled); namespace tera { @@ -55,14 +64,40 @@ void LogSdkVersionInfo() { ClientImpl::ClientImpl(const std::string& user_identity, const std::string& user_passcode) : thread_pool_(FLAGS_tera_sdk_thread_max_num), + gtxn_thread_pool_(NULL), user_identity_(user_identity), - user_passcode_(user_passcode) { + user_passcode_(user_passcode), + client_zk_adapter_(NULL), + tso_cluster_(NULL), + collecter_(NULL), + 
session_str_("") { tabletnode::TabletNodeClient::SetThreadPool(&thread_pool_); tabletnode::TabletNodeClient::SetRpcOption( FLAGS_tera_sdk_rpc_limit_enabled ? FLAGS_tera_sdk_rpc_limit_max_inflow : -1, FLAGS_tera_sdk_rpc_limit_enabled ? FLAGS_tera_sdk_rpc_limit_max_outflow : -1, FLAGS_tera_sdk_rpc_max_pending_buffer_size, FLAGS_tera_sdk_rpc_work_thread_num); - cluster_ = sdk::NewClusterFinder(); + + if (FLAGS_tera_sdk_client_for_gtxn) { + client_zk_adapter_ = sdk::NewClientZkAdapter(); + client_zk_adapter_->Init(); + cluster_ = sdk::NewClusterFinder(client_zk_adapter_); + if (FLAGS_tera_sdk_tso_client_enabled) { + tso_cluster_ = sdk::NewTimeoracleClusterFinder(); + } + gtxn_thread_pool_ = new ThreadPool(FLAGS_tera_gtxn_thread_max_num); + RegisterSelf(); + } else { + cluster_ = sdk::NewClusterFinder(); + } + + if (FLAGS_tera_sdk_perf_collect_enabled) { + collecter_ = new sdk::PerfCollecter(); + collecter_->Run(); + LOG(INFO) << "start perf collect"; + } else { + LOG(INFO) << "perf collect disable"; + } + pthread_once(&sdk_client_once_control, LogSdkVersionInfo); } @@ -77,6 +112,17 @@ ClientImpl::~ClientImpl() { } } delete cluster_; + if (FLAGS_tera_sdk_perf_collect_enabled) { + collecter_->Stop(); + delete collecter_; + } + if (FLAGS_tera_sdk_client_for_gtxn) { + delete gtxn_thread_pool_; + if (FLAGS_tera_sdk_tso_client_enabled) { + delete tso_cluster_; + } + delete client_zk_adapter_; + } } bool ClientImpl::CreateTable(const TableDescriptor& desc, ErrorCode* err) { @@ -1173,6 +1219,29 @@ bool ClientImpl::ParseTabletEntry(const TabletMeta& meta, std::vectorIsClientAlive(path); + } + return true; +} + +std::string ClientImpl::ClientSession() { + return session_str_; +} + +bool ClientImpl::RegisterSelf() { + if (client_zk_adapter_ != NULL) { + return client_zk_adapter_->RegisterClient(&session_str_); + } else { + return false; + } +} + static Mutex g_mutex; static bool g_is_glog_init = false; @@ -1223,6 +1292,14 @@ static int InitFlags(const std::string& confpath, const 
std::string& log_prefix) if (!g_is_glog_init) { ::google::InitGoogleLogging(log_prefix.c_str()); utils::SetupLog(log_prefix); + FLAGS_tera_log_prefix = log_prefix; + // start log cleaner + if (FLAGS_tera_info_log_clean_enable) { + common::LogCleaner::StartCleaner(); + LOG(INFO) << "start log cleaner"; + } else { + LOG(INFO) << "log cleaner is disable"; + } g_is_glog_init = true; } diff --git a/src/sdk/client_impl.h b/src/sdk/client_impl.h index f401111f3..246e7608d 100644 --- a/src/sdk/client_impl.h +++ b/src/sdk/client_impl.h @@ -8,9 +8,11 @@ #include "common/thread_pool.h" #include "proto/master_rpc.pb.h" #include "proto/tabletnode_client.h" +#include "sdk/sdk_perf.h" #include "sdk/sdk_zk.h" +#include "sdk/timeoracle_client_impl.h" #include "tera.h" -#include "utils/timer.h" +#include "common/timer.h" using std::string; @@ -97,6 +99,8 @@ class ClientImpl : public Client { string* str_result, ErrorCode* err); + virtual Transaction* NewGlobalTransaction(); + bool ShowTableSchema(const string& name, TableSchema* meta, ErrorCode* err); bool ShowTablesInfo(const string& name, TableMeta* meta, @@ -117,6 +121,10 @@ class ClientImpl : public Client { void CloseTable(const string& table_name); TableImpl* OpenTableInternal(const string& table_name, ErrorCode* err); + bool IsClientAlive(const string& path); + + string ClientSession(); + private: bool ListInternal(std::vector* table_list, std::vector* tablet_list, @@ -147,10 +155,13 @@ class ClientImpl : public Client { bool is_brief, ErrorCode* err); + bool RegisterSelf(); + private: ClientImpl(const ClientImpl&); void operator=(const ClientImpl&); ThreadPool thread_pool_; + ThreadPool* gtxn_thread_pool_; std::string user_identity_; std::string user_passcode_; @@ -160,7 +171,11 @@ class ClientImpl : public Client { /// we have to access zookeeper whenever we need master_addr or root_table_addr. /// if there is cluster_, /// we save master_addr & root_table_addr in cluster_, access zookeeper only once. 
+ sdk::ClientZkAdapterBase* client_zk_adapter_; sdk::ClusterFinder* cluster_; + sdk::ClusterFinder* tso_cluster_; + sdk::PerfCollecter* collecter_; + std::string session_str_; Mutex open_table_mutex_; struct TableHandle { diff --git a/src/sdk/global_txn.cc b/src/sdk/global_txn.cc new file mode 100644 index 000000000..a003cbd64 --- /dev/null +++ b/src/sdk/global_txn.cc @@ -0,0 +1,1142 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Author: baorenyi@baidu.com + +#include +#include + +#include "common/metric/metric_counter.h" +#include "common/this_thread.h" +#include "common/thread.h" +#include "proto/table_meta.pb.h" +#include "proto/tabletnode_rpc.pb.h" +#include "sdk/global_txn.h" +#include "sdk/read_impl.h" +#include "sdk/timeoracle_client_impl.h" + +DECLARE_bool(tera_gtxn_test_opened); +DECLARE_string(tera_gtxn_test_flagfile); +DECLARE_int32(tera_gtxn_get_waited_times_limit); +DECLARE_int32(tera_gtxn_timeout_ms); +DECLARE_bool(tera_sdk_tso_client_enabled); + +namespace tera { + +extern tera::MetricCounter gtxn_read_cnt; +extern tera::MetricCounter gtxn_read_fail_cnt; +extern tera::MetricCounter gtxn_read_retry_cnt; +extern tera::MetricCounter gtxn_read_rollback_cnt; +extern tera::MetricCounter gtxn_read_rollforward_cnt; +extern tera::MetricCounter gtxn_commit_cnt; +extern tera::MetricCounter gtxn_commit_fail_cnt; +extern tera::MetricCounter gtxn_prewrite_cnt; +extern tera::MetricCounter gtxn_prewrite_fail_cnt; +extern tera::MetricCounter gtxn_primary_cnt; +extern tera::MetricCounter gtxn_primary_fail_cnt; +extern tera::MetricCounter gtxn_secondaries_cnt; +extern tera::MetricCounter gtxn_secondaries_fail_cnt; +extern tera::MetricCounter gtxn_acks_cnt; +extern tera::MetricCounter gtxn_acks_fail_cnt; +extern tera::MetricCounter gtxn_notifies_cnt; +extern tera::MetricCounter gtxn_notifies_fail_cnt; + +Transaction* 
GlobalTxn::NewGlobalTxn(tera::Client* client,
+                        common::ThreadPool* thread_pool,
+                        sdk::ClusterFinder* tso_cluster) {
+    // Factory for global transactions. Returns NULL when client is NULL.
+    // tso_cluster is not validated here: the constructor only uses it when
+    // FLAGS_tera_sdk_tso_client_enabled is set.
+    // NOTE(review): thread_pool is dereferenced unconditionally by Get() and
+    // Commit(); confirm every caller passes a non-NULL pool.
+    if (client != NULL) {
+        return new GlobalTxn(client, thread_pool, tso_cluster);
+    }
+    // Only client is checked, so the message names only client.
+    LOG(ERROR) << "client is NULL";
+    return NULL;
+}
+
+// Initialises the transaction state and picks the start timestamp from, in
+// order: the test flag file, the local clock (TSO client disabled) or the
+// timestamp oracle. A zero timestamp from the oracle marks the transaction as
+// failed with kGTxnTimestampLost.
+GlobalTxn::GlobalTxn(tera::Client* client,
+                     common::ThreadPool* thread_pool,
+                     sdk::ClusterFinder* tso_cluster) :
+    gtxn_internal_(new GlobalTxnInternal(client)),
+    status_returned_(false),
+    primary_write_(NULL),
+    writes_size_(0),
+    commit_ts_(0),
+    isolation_level_(IsolationLevel::kSnapshot),
+    serialized_primary_(""),
+    finish_(false),
+    finish_cond_(&finish_mutex_),
+    has_commited_(false),
+    user_commit_callback_(NULL),
+    user_commit_context_(NULL),
+    thread_pool_(thread_pool),
+    tso_cluster_(tso_cluster),
+    timeout_ms_(FLAGS_tera_gtxn_timeout_ms),
+    all_task_pushed_(false) {
+    if (FLAGS_tera_gtxn_test_opened) {
+        VLOG(12) << "conf_file = " << FLAGS_tera_gtxn_test_flagfile;
+        start_ts_ = gtxn_internal_->TEST_Init(FLAGS_tera_gtxn_test_flagfile);
+    } else if (!FLAGS_tera_sdk_tso_client_enabled) {
+        start_ts_ = get_micros();
+    } else {
+        timeoracle::TimeoracleClientImpl tsoc(thread_pool_, tso_cluster_);
+        start_ts_ = tsoc.GetTimestamp(1);
+        if (start_ts_ == 0) {
+            status_.SetFailed(ErrorCode::kGTxnTimestampLost);
+            status_returned_ = true;
+        }
+    }
+    // Until InternalCommit() fetches a newer one, prewrite uses the start ts.
+    prewrite_start_ts_ = start_ts_;
+    gtxn_internal_->SetStartTimestamp(start_ts_);
+}
+
+GlobalTxn::~GlobalTxn() {
+}
+
+// Sets the isolation level; asserts that Commit() has not been called yet.
+void GlobalTxn::SetIsolation(const IsolationLevel& isolation_level) {
+    assert(has_commited_ == false);
+    isolation_level_ = isolation_level;
+}
+
+// Sets the timeout (milliseconds) that InternalCommit() hands to
+// SetCommitDuration().
+void GlobalTxn::SetTimeout(int64_t timeout_ms) {
+    timeout_ms_ = timeout_ms;
+}
+
+// Returns the timeout configured via SetTimeout() or the flag default.
+int64_t GlobalTxn::Timeout() {
+    return timeout_ms_;
+}
+
+// Records the final status on the user's reader and schedules the reader's
+// callback on the transaction thread pool.
+void GlobalTxn::SetReaderStatusAndRunCallback(RowReaderImpl* reader_impl,
+                                              ErrorCode* status) {
+    gtxn_read_cnt.Inc();
+    gtxn_internal_->PerfReadDelay(0, get_micros()); // finish_time
+    VLOG(12) << "[gtxn][get][" << start_ts_ << "][status] :" << status->ToString();
+    reader_impl->SetError(status->GetType(), status->GetReason());
+    thread_pool_->AddTask(std::bind(&RowReaderImpl::RunCallback, reader_impl));
+}
+
+// Reads the columns selected on row_reader inside this transaction.
+// One internal read is issued per requested (column family, qualifier) cell.
+// For a synchronous reader the call blocks until the reader is complete and
+// returns its final error; for an asynchronous reader it returns right after
+// dispatch and the outcome is delivered through the reader's callback.
+ErrorCode GlobalTxn::Get(RowReader* row_reader) {
+    assert(row_reader != NULL);
+    gtxn_internal_->PerfReadDelay(get_micros(), 0); // begin_time
+    gtxn_internal_->TEST_GetSleep();
+
+    RowReaderImpl* reader_impl = static_cast<RowReaderImpl*>(row_reader);
+    reader_impl->SetTransaction(this);
+
+    // Pre Check can read
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    if (has_commited_.load()) {
+        // The closing bracket was missing from the original message.
+        std::string reason = "get failed, txn has commited @ ["
+            + std::to_string(start_ts_) + "," + std::to_string(commit_ts_) + "]";
+        LOG(ERROR) << "[gtxn][get][" << start_ts_ <<"] " << reason;
+        status.SetFailed(ErrorCode::kGTxnOpAfterCommit, reason);
+        SetReaderStatusAndRunCallback(reader_impl, &status);
+        return status;
+    }
+
+    Table* table = row_reader->GetTable();
+    const std::string& row_key = row_reader->RowKey();
+    // Check UserReader and Build cells
+    if (!gtxn_internal_->VerifyUserRowReader(row_reader)) {
+        status = reader_impl->GetError();
+        SetReaderStatusAndRunCallback(reader_impl, &status);
+        return status;
+    }
+
+    std::vector<Cell*> cells;
+    // Bind by reference: copying each entry would duplicate a whole qualifier set.
+    for (const auto& it : row_reader->GetReadColumnList()) {
+        const std::string& column_family = it.first;
+        const std::set<std::string>& qualifier_set = it.second;
+
+        for (auto q_it = qualifier_set.begin(); q_it != qualifier_set.end(); ++q_it) {
+            const std::string& qualifier = *q_it;
+            cells.push_back(new Cell(table, row_key, column_family, qualifier));
+        }
+    }
+    int expected_cells_cnt = cells.size();
+    // NOTE(review): with zero cells no internal read is issued, so nothing
+    // completes the reader or frees ctx; presumably VerifyUserRowReader()
+    // rejects such readers -- confirm.
+
+    // The user callback may delete the reader as soon as the last cell read
+    // completes, so the sync/async mode is captured before anything is dispatched.
+    const bool is_async = reader_impl->IsAsync();
+
+    InternalReaderContext* ctx = new InternalReaderContext(expected_cells_cnt, reader_impl, this);
+    // Register every cell before dispatching any read, so that callbacks never
+    // touch cell_map while it is still being populated.
+    for (auto& cell : cells) {
+        ctx->cell_map[cell] = 0; // cell* -> try_time, default = 0
+    }
+    for (auto& cell : cells) {
+        AsyncGetCell(cell, reader_impl, ctx);
+    }
+
+    // sync wait and set status
+    if (!is_async) {
+        reader_impl->Wait();
+        status = reader_impl->GetError();
+        return status;
+    }
+    return status;
+}
+
+void
GlobalTxn::AsyncGetCell(Cell* cell, + RowReaderImpl* user_reader_impl, + InternalReaderContext* ctx) { + VLOG(12) << "[gtxn][get][" << start_ts_ << "] " + << gtxn_internal_->DebugString(*cell, "TryGet times(" + std::to_string(ctx->cell_map[cell]) + ")"); + + Table* table = cell->Table(); + RowReader* reader = table->NewRowReader(cell->RowKey()); + reader->AddColumn(cell->ColFamily(), cell->LockName()); + reader->AddColumn(cell->ColFamily(), cell->WriteName()); + reader->AddColumn(cell->ColFamily(), cell->Qualifier()); + reader->SetTimeRange(0, kMaxTimeStamp); + reader->SetMaxVersions(UINT32_MAX); + reader->SetCallBack([] (RowReader* r) { + CellReaderContext* ctx = (CellReaderContext*)r->GetContext(); + GlobalTxn* gtxn = static_cast(ctx->internal_reader_ctx->gtxn); + gtxn->thread_pool_->AddTask(std::bind(&GlobalTxn::DoGetCellReaderCallback, + gtxn, static_cast(r))); + }); + reader->SetContext(new CellReaderContext(cell, ctx)); + table->Get(reader); +} + +void GlobalTxn::DoGetCellReaderCallback(RowReader* reader) { + ErrorCode status = reader->GetError(); + if (status.GetType() != ErrorCode::kOK) { + MergeCellToRow(reader, status); + return; + } + + RowReader::TRow row; + reader->ToMap(&row); + CellReaderContext* ctx = (CellReaderContext*)reader->GetContext(); + Cell* cell = ctx->cell; + if (row.find(cell->ColFamily()) == row.end()) { + status.SetFailed(ErrorCode::kNotFound, "columnfamily not found"); + MergeCellToRow(reader, status); + return; + } + // local check lock + if (gtxn_internal_->IsLockedByOthers(row, *cell)) { + // sync operate + status.SetFailed(ErrorCode::kOK); + InternalReaderContext* internal_reader_ctx = ctx->internal_reader_ctx; + bool do_clean = false; + // check clean lock before read cell next time, + // when read times >= limit - 1 do clean lock opreations + if (internal_reader_ctx->cell_map[cell] >= FLAGS_tera_gtxn_get_waited_times_limit - 1) { + do_clean = true; + } + BackoffAndMaybeCleanupLock(row, *cell, do_clean, &status); + if 
(status.GetType() == ErrorCode::kOK) { + // call Next time to async GetCell + // don't merge until next time ok or failed + ++ internal_reader_ctx->cell_map[cell]; + gtxn_read_retry_cnt.Inc(); + AsyncGetCell(cell, + static_cast(internal_reader_ctx->user_reader), + internal_reader_ctx); + return; + } + } else if (!FindValueFromResultRow(row, cell)) { + status.SetFailed(ErrorCode::kNotFound, "build data col from write col failed"); + } + MergeCellToRow(reader, status); +} + +void GlobalTxn::MergeCellToRow(RowReader* internal_reader, + const ErrorCode& status) { + CellReaderContext* ctx = (CellReaderContext*)internal_reader->GetContext(); + ctx->status = status; + VLOG(12) << "[gtxn][get][" << start_ts_ << "] " + << gtxn_internal_->DebugString(*(ctx->cell), status.ToString()); + GetCellCallback(ctx); + // next time internal read will new next RowReader + delete internal_reader; +} + +void GlobalTxn::GetCellCallback(CellReaderContext* ctx) { + InternalReaderContext* internal_reader_ctx = ctx->internal_reader_ctx; + Cell* cell = ctx->cell; + bool last_cell = false; + { + MutexLock lock(&mu_); + ++internal_reader_ctx->active_cell_cnt; + if (internal_reader_ctx->fail_cell_cnt == 0 && ctx->status.GetType() == ErrorCode::kOK) { + KeyValuePair* kv = internal_reader_ctx->results.add_key_values(); + kv->set_key(cell->RowKey()); + kv->set_column_family(cell->ColFamily()); + kv->set_qualifier(cell->Qualifier()); + kv->set_timestamp(cell->Timestamp()); + kv->set_value(cell->Value()); + } else if (ctx->status.GetType() != ErrorCode::kNotFound) { + ++internal_reader_ctx->fail_cell_cnt; + internal_reader_ctx->results.clear_key_values(); + internal_reader_ctx->last_err = ctx->status; + } else { + ++internal_reader_ctx->not_found_cnt; + } + last_cell = (internal_reader_ctx->active_cell_cnt == internal_reader_ctx->expected_cell_cnt); + } + if (last_cell) { + ErrorCode last_err = internal_reader_ctx->last_err; + RowReaderImpl* reader_impl = static_cast(internal_reader_ctx->user_reader); 
+ if (internal_reader_ctx->fail_cell_cnt > 0) { + gtxn_read_fail_cnt.Inc(); + } else if (internal_reader_ctx->not_found_cnt == internal_reader_ctx->expected_cell_cnt) { + // all cell not found + last_err.SetFailed(ErrorCode::kNotFound); + } else { + reader_impl->SetResult(internal_reader_ctx->results); + last_err.SetFailed(ErrorCode::kOK); + } + delete internal_reader_ctx; + SetReaderStatusAndRunCallback(reader_impl, &last_err); + } +} + +bool GlobalTxn::FindValueFromResultRow(RowReader::TRow& result_row, Cell* target_cell) { + + auto write_col_it = result_row[target_cell->ColFamily()].find(target_cell->WriteName()); + auto data_col_it = result_row[target_cell->ColFamily()].find(target_cell->Qualifier()); + + // check write col and data col exsit + if (write_col_it == result_row[target_cell->ColFamily()].end() + || data_col_it == result_row[target_cell->ColFamily()].end()) { + return false; + } + auto write_col = result_row[target_cell->ColFamily()][target_cell->WriteName()]; + auto data_col = result_row[target_cell->ColFamily()][target_cell->Qualifier()]; + + for (auto k1 = write_col.rbegin(); k1 != write_col.rend(); ++k1) { + int64_t write_ts = k1->first; + std::string write_value = k1->second; + VLOG(12) << "[gtxn][get][" << start_ts_ << "] found write col, ts=" + << write_ts << ", internal val = " << write_value; + int write_type; + int64_t data_ts; + // skip new version value or skip error write format version + if (write_ts > start_ts_ || !DecodeWriteValue(write_value, &write_type, &data_ts)) { + continue; + } + VLOG(12) << "[gtxn][get][" << start_ts_ << "] decode write col, ts=" + << write_ts << ", type=" << write_type << ", value=" << data_ts; + // get data col , ts == data_ts + for (auto k2 = data_col.rbegin(); k2 != data_col.rend(); ++k2) { + VLOG(12) << "[gtxn][get][" << start_ts_ << "] found data col, ts=" + << k2->first << ", internal val = " << k2->second; + if (k2->first == data_ts && write_type == RowMutation::kPut) { + 
target_cell->SetTimestamp(data_ts); + target_cell->SetValue(k2->second); + return true; + } else if (k2->first < data_ts) { + VLOG(12) << "[gtxn][get][" << start_ts_ + << "] data cell version not found, v=" << k2->first; + break; + } + } + VLOG(12) << "[gtxn][get][" << start_ts_ << "] check data col failed, no data"; + break; + } + VLOG(12) << "[gtxn][get][" << start_ts_ + << "] write col versions count" << write_col.size(); + return false; +} + +void GlobalTxn::BackoffAndMaybeCleanupLock(RowReader::TRow& row, const Cell& cell, + const bool try_clean, ErrorCode* status) { + VLOG(12) << gtxn_internal_->DebugString(cell, "[gtxn][get][" + + std::to_string(start_ts_) + " backoff or cleanup lock"); + // get lock ts + int64_t lock_ts = -1; + int lock_type = -1; + tera::PrimaryInfo primary_info; + for (auto k = row[cell.ColFamily()][cell.LockName()].rbegin(); + k != row[cell.ColFamily()][cell.LockName()].rend(); ++k) { + if (k->first < start_ts_) { + lock_ts = k->first; + VLOG(12) << "lock_ts=" << lock_ts << ", primary_str=" << k->second; + if (!DecodeLockValue(k->second, &lock_type, &primary_info)) { + status->SetFailed(ErrorCode::kGTxnPrimaryLost, "can't found primary"); + return; + } + break; + } + } + // get primary lock + const std::string& process = "[gtxn][get][" + std::to_string(start_ts_) + + "][check locked and writed]"; + bool ret = gtxn_internal_->PrimaryIsLocked(primary_info, lock_ts, status); + if (status->GetType() != ErrorCode::kOK && status->GetType() != ErrorCode::kNotFound) { + LOG(ERROR) << gtxn_internal_->DebugString(cell, process + " failed," + status->ToString()); + return; + } else if (ret) { + // NotFound means : other txn on prewrite process + // and this cell locked but primary unlocked(failed) + VLOG(12) << gtxn_internal_->DebugString(cell, process + " succeed"); + // primary at prewrite do (1) clean or (2) wait + if (try_clean) { + CleanLock(cell, primary_info, status); + } else if (gtxn_internal_->SuspectLive(primary_info)) { + // TODO add a 
better sleep strategy + ThisThread::Sleep(100); + } else { + CleanLock(cell, primary_info, status); + } + } else { + if (!gtxn_internal_->IsPrimary(cell, primary_info)) { + VLOG(12) << gtxn_internal_->DebugString(cell, process + ", will do rollforward"); + // primary maybe at commited do roll_forward + RollForward(cell, primary_info, lock_type, status); + if (status->GetType() == ErrorCode::kGTxnPrimaryLost) { + VLOG(12) << gtxn_internal_->DebugString(cell, process + ", rollforward failed, try clean lock"); + // primary prewrite failed + status->SetFailed(ErrorCode::kOK); + if (try_clean) { + CleanLock(cell, primary_info, status); + } else if (gtxn_internal_->SuspectLive(primary_info)) { + ThisThread::Sleep(100); + } else { + CleanLock(cell, primary_info, status); + } + } + } else { + VLOG(12) << gtxn_internal_->DebugString(cell, process + ", ignore(primary)"); + } + } +} + +void GlobalTxn::CleanLock(const Cell& cell, const tera::PrimaryInfo& primary, ErrorCode* status) { + gtxn_read_rollback_cnt.Inc(); + Table* primary_table = gtxn_internal_->FindTable(primary.table_name()); + assert(primary_table != NULL); + const Cell& primary_cell = Cell(primary_table, primary.row_key(), + primary.column_family(), primary.qualifier()); + // if now cell is primary + bool is_same = cell.Table()->GetName() == primary_table->GetName() + && cell.RowKey() == primary_cell.RowKey() + && cell.ColFamily() == primary_cell.ColFamily() + && cell.LockName() == primary_cell.LockName(); + if (!is_same) { + VLOG(12) << "[gtxn][get][" << start_ts_ << "] " + << gtxn_internal_->DebugString(primary_cell, "clean lock primary"); + RowMutation* pri_mu = primary_table->NewRowMutation(primary_cell.RowKey()); + // delete all info between [0, start_ts_] at lock col + pri_mu->DeleteColumns(primary_cell.ColFamily(), primary_cell.LockName(), start_ts_); + primary_table->ApplyMutation(pri_mu); + if (pri_mu->GetError().GetType() != tera::ErrorCode::kOK) { + LOG(WARNING) << pri_mu->GetError().ToString(); + 
*status = pri_mu->GetError();
+        }
+        delete pri_mu;
+    }
+    VLOG(12) << "[gtxn][get][" << start_ts_ << "] "
+             << gtxn_internal_->DebugString(cell, "clean lock this cell");
+    RowMutation* this_mu = (cell.Table())->NewRowMutation(cell.RowKey());
+    // delete all info between [0, start_ts_] at lock col
+    this_mu->DeleteColumns(cell.ColFamily(), cell.LockName(), start_ts_);
+    (cell.Table())->ApplyMutation(this_mu);
+    if (this_mu->GetError().GetType() != tera::ErrorCode::kOK) {
+        LOG(WARNING) << "[gtxn][get][" << start_ts_ << "] clean lock failed :"
+                     << this_mu->GetError().ToString();
+        *status = this_mu->GetError();
+    }
+    delete this_mu;
+}
+
+// Completes another transaction's commit for `cell` when its primary has
+// already been committed.
+// Reads every version of the primary cell's write column and looks for the
+// entry whose decoded data timestamp equals primary.gtxn_start_ts(). If found,
+// the same write record is put on `cell` at the primary's commit timestamp and
+// the lock column of `cell` is cleared up to that timestamp. If the primary has
+// no matching write record, *status is set to kGTxnPrimaryLost; read or
+// mutation errors are copied into *status.
+void GlobalTxn::RollForward(const Cell& cell, const tera::PrimaryInfo& primary,
+                            int lock_type, ErrorCode* status) {
+    gtxn_read_rollforward_cnt.Inc();
+    // find primary write col start_ts
+    Table* pri_table = gtxn_internal_->FindTable(primary.table_name());
+    assert(pri_table != NULL);
+    std::unique_ptr<Cell> primary_cell(new Cell(pri_table, primary.row_key(),
+                                                primary.column_family(),
+                                                primary.qualifier()));
+    // Owned by unique_ptr so that every exit path below frees the reader.
+    std::unique_ptr<RowReader> reader(pri_table->NewRowReader(primary_cell->RowKey()));
+    reader->AddColumn(primary_cell->ColFamily(), primary_cell->WriteName());
+    reader->SetTimeRange(0, kMaxTimeStamp);
+    reader->SetMaxVersions(UINT32_MAX);
+    pri_table->Get(reader.get());
+    if (reader->GetError().GetType() != ErrorCode::kOK) {
+        if (reader->GetError().GetType() == ErrorCode::kNotFound) {
+            status->SetFailed(ErrorCode::kGTxnPrimaryLost, "primary lost, not 'lock' and 'write'");
+        } else {
+            // Copy the read error first: logging before the assignment printed
+            // the caller's stale reason instead of the actual failure.
+            *status = reader->GetError();
+            LOG(WARNING) << status->GetReason();
+        }
+        return;
+    }
+    int64_t commit_ts = -1;
+    int write_type = -1;
+    int64_t data_ts = -1;
+    while (!reader->Done()) {
+        // decode primary cell write col value
+        std::string reader_value = reader->Value();
+        // Skip versions that cannot be decoded (as FindValueFromResultRow does)
+        // rather than comparing against values left over from an earlier pass.
+        if (!DecodeWriteValue(reader_value, &write_type, &data_ts)) {
+            reader->Next();
+            continue;
+        }
+        VLOG(12) << "[gtxn][get][ " << start_ts_ << "] decode primary 'write', ts=" << reader->Timestamp()
+                 << ", type=" << write_type << ", value=" << data_ts;
+        VLOG(12) << "[gtxn][get][ " << start_ts_ << "] primary start_ts=" << primary.gtxn_start_ts();
+        if (data_ts > 0 && data_ts < primary.gtxn_start_ts()) {
+            status->SetFailed(ErrorCode::kGTxnPrimaryLost, "primary lost, not 'lock' and 'write'");
+            return;
+        } else if (data_ts == primary.gtxn_start_ts()) {
+            commit_ts = reader->Timestamp();
+            break;
+        }
+        reader->Next();
+    }
+    reader.reset();
+
+    if (commit_ts > 0) {
+        RowMutation* this_mu = cell.Table()->NewRowMutation(cell.RowKey());
+        this_mu->Put(cell.ColFamily(),
+                     cell.WriteName(),
+                     EncodeWriteValue(lock_type, data_ts),
+                     commit_ts);
+        this_mu->DeleteColumns(cell.ColFamily(), cell.LockName(), commit_ts);
+        cell.Table()->ApplyMutation(this_mu);
+        if (this_mu->GetError().GetType() != tera::ErrorCode::kOK) {
+            LOG(WARNING) << this_mu->GetError().GetReason();
+            *status = this_mu->GetError();
+        }
+        delete this_mu;
+    } else {
+        status->SetFailed(ErrorCode::kGTxnPrimaryLost, "not found primary cell");
+    }
+}
+
+// Buffers one write under its (table, row) key. The first write for a row also
+// bumps writes_cnt_, which Commit() uses to detect an empty transaction.
+void GlobalTxn::SaveWrite(const std::string& tablename, const std::string& row_key,
+                          tera::Write& w) {
+    MutexLock lock(&mu_);
+    TableWithRowkey twr(tablename, row_key);
+    auto it = writes_.find(twr);
+    if (it != writes_.end()) {
+        it->second.push_back(w);
+    } else {
+        writes_[twr].push_back(w);
+        writes_cnt_.Inc();
+    }
+}
+
+// Records the transaction's final status; only the first status reported is
+// kept, later calls are ignored.
+void GlobalTxn::SetLastStatus(ErrorCode* status) {
+    MutexLock lock(&mu_);
+    if (!status_returned_) {
+        VLOG(12) << "[gtxn][commit][status][" << start_ts_ << "]" << status->ToString();
+        status_.SetFailed(status->GetType(), status->GetReason());
+        status_returned_ = true;
+    }
+}
+
+void GlobalTxn::RunUserCallback() {
+    if (status_.GetType() == ErrorCode::kOK) {
+        gtxn_commit_cnt.Inc();
+    } else {
+        gtxn_commit_fail_cnt.Inc();
+    }
+    gtxn_internal_->PerfCommitDelay(0, get_micros()); // finish_time
+    if (user_commit_callback_ != NULL) {
+        VLOG(12) <<
"[gtxn][commit][callback][" << start_ts_ << "]" << status_.ToString(); + user_commit_callback_(this); + } else { + MutexLock lock(&finish_mutex_); + VLOG(12) << "[gtxn][commit][finish][" << start_ts_ << "]" << status_.ToString(); + finish_ = true; + finish_cond_.Signal(); + } +} + +ErrorCode GlobalTxn::Commit() { + /// begin commit + gtxn_internal_->TEST_Sleep(); + gtxn_internal_->PerfCommitDelay(get_micros(), 0); // begin_time + ErrorCode status; + if (put_fail_cnt_.Get() > 0 || has_commited_) { + std::string reason("commit failed, has_commited[" + + std::to_string(has_commited_.load()) + + "], put_fail_cnt[" + std::to_string(put_fail_cnt_.Get()) + "]"); + VLOG(12) << reason; + status.SetFailed(ErrorCode::kGTxnOpAfterCommit, reason); + SetLastStatus(&status); + // Callback Point : put applyMutation failed or has commited + RunUserCallback(); + return status; + } + has_commited_ = true; + // don't have any writes + if (writes_cnt_.Get() == 0) { + status.SetFailed(ErrorCode::kOK, "No modification exists"); + SetLastStatus(&status); + // Callback Point + RunUserCallback(); + return status; + } + thread_pool_->AddTask(std::bind(&GlobalTxn::InternalCommit, this)); + + if (user_commit_callback_ == NULL) { + WaitForComplete(); + } + return status_; +} + +void GlobalTxn::InternalCommit() { + gtxn_internal_->SetCommitDuration(timeout_ms_); + + /// begin prewrite + gtxn_internal_->TEST_Sleep(); + + // on ReadCommitedSnapshot level will get new timestamp before prewrite + if (isolation_level_ == IsolationLevel::kReadCommitedSnapshot) { + if (FLAGS_tera_gtxn_test_opened) { + prewrite_start_ts_ = gtxn_internal_->TEST_GetPrewriteStartTimestamp(); + } else if (!FLAGS_tera_sdk_tso_client_enabled) { + start_ts_ = get_micros(); + } else { + timeoracle::TimeoracleClientImpl tsoc(thread_pool_, tso_cluster_); + prewrite_start_ts_ = tsoc.GetTimestamp(1); + } + if (prewrite_start_ts_ < start_ts_) { + ErrorCode status; + LOG(ERROR) << "[gtxn][prewrite][" << start_ts_ <<"] get prewrite 
new ts failed"; + status.SetFailed(ErrorCode::kGTxnTimestampLost, "get prewrite new ts failed"); + SetLastStatus(&status); + RunUserCallback(); + return; + } + gtxn_internal_->SetPrewriteStartTimestamp(prewrite_start_ts_); + } + VLOG(12) << "[gtxn][prewrite][" << start_ts_ << "]"; + gtxn_internal_->PerfPrewriteDelay(get_micros(), 0); // begin_time + gtxn_prewrite_cnt.Inc(); + + prewrite_iterator_ = writes_.begin(); + primary_write_ = &(prewrite_iterator_->second[0]); + primary_write_->Serialize(prewrite_start_ts_, + gtxn_internal_->GetClientSession(), + &serialized_primary_); + AsyncPrewrite(&prewrite_iterator_->second); +} + +// [prewrite] Step(1): +// read "lock", "write" column from tera +// +// aysnc prewrite one row use single_row_txn +// +void GlobalTxn::AsyncPrewrite(std::vector* ws) { + assert(ws->size() > 0); + // find table and rowkey to new reader and single row txn + Write w = *(ws->begin()); + Table* table = w.Table(); + Transaction* single_row_txn = table->StartRowTransaction(w.RowKey()); + RowReader* reader = table->NewRowReader(w.RowKey()); + // set internal reader timeout + gtxn_internal_->SetInternalSdkTaskTimeout(reader); + // set cf qu and timerange for reader + gtxn_internal_->BuildRowReaderForPrewrite(*ws, reader); + // set callback, context, single row txn for reader + reader->SetCallBack([](RowReader* r){ + GlobalTxn* gtxn = static_cast(((PrewriteContext*)r->GetContext())->gtxn); + gtxn->thread_pool_->AddTask(std::bind(&GlobalTxn::DoPrewriteReaderCallback, gtxn, r)); + }); + PrewriteContext* ctx = new PrewriteContext(ws, this, w.TableName(), w.RowKey()); + if (gtxn_internal_->IsTimeOut()) { + ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, "global transaction prewrite timeout"); + VLOG(12) << "[gtxn][prewrite][stxn_read] ignored : " << ctx->DebugString(); + RunAfterPrewriteFailed(ctx); + } else { + reader->SetContext(ctx); + // get async + VLOG(12) << "[gtxn][prewrite][stxn_read] invoked : " << ctx->DebugString(); + 
single_row_txn->Get(reader); + } +} + +// [prewrite] Step(2): +// a) verify [prewrite] step(1) read result status and no conflict +// b) write "lock" and "data" column to tera, through same single_row_txn in step(1) +// +// call by [prewrite] step(1),through reader callback +// +void GlobalTxn::DoPrewriteReaderCallback(RowReader* reader) { + PrewriteContext* ctx = (PrewriteContext*)reader->GetContext(); + if (reader->GetError().GetType() != ErrorCode::kNotFound + && reader->GetError().GetType() != ErrorCode::kOK) { + ctx->status = reader->GetError(); + VLOG(12) << "[gtxn][prewrite][stxn_read] failed : " << ctx->status.ToString(); + if (gtxn_internal_->IsTimeOut() || reader->GetError().GetType() == ErrorCode::kTimeout) { + ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, ctx->status.ToString()); + } + delete reader; + RunAfterPrewriteFailed(ctx); + } else if (gtxn_internal_->ConflictWithOtherWrite(ctx->ws, reader, &(ctx->status))) { + VLOG(12) << "[gtxn][prewrite][stxn_read] failed : " << ctx->status.ToString(); + delete reader; + RunAfterPrewriteFailed(ctx); + } else { + VLOG(12) << "[gtxn][prewrite][stxn_read] succeed, table=" << ctx->DebugString(); + Table* t = reader->GetTable(); + RowMutation* prewrite_mu = t->NewRowMutation(reader->RowKey()); + // set internal task timeout + gtxn_internal_->SetInternalSdkTaskTimeout(prewrite_mu); + gtxn_internal_->BuildRowMutationForPrewrite(ctx->ws, prewrite_mu, + serialized_primary_); + + // commit single_row_txn + SingleRowTxn* single_row_txn = static_cast(reader->GetTransaction()); + delete reader; + single_row_txn->SetContext(ctx); + single_row_txn->SetCommitCallback([](Transaction* single_txn) { + GlobalTxn* gtxn = static_cast(((PrewriteContext*)single_txn->GetContext())->gtxn); + SingleRowTxn* stxn = static_cast(single_txn); + gtxn->thread_pool_->AddTask(std::bind(&GlobalTxn::DoPrewriteCallback, gtxn, stxn)); + }); + if (gtxn_internal_->IsTimeOut()) { + ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, 
"global transaction prewrite timeout"); + VLOG(12) << "[gtxn][prewrite][stxn_commit] ignored : " << ctx->DebugString(); + delete single_row_txn; + delete prewrite_mu; + RunAfterPrewriteFailed(ctx); + } else { + single_row_txn->ApplyMutation(prewrite_mu); + VLOG(12) << "[gtxn][prewrite][stxn_commit] invoked : " << ctx->DebugString(); + t->CommitRowTransaction(single_row_txn); + delete prewrite_mu; + } + } +} + +// prewrite Step(3): +// verify [prewrite] step(2) single_row_txn commit status, +// if the last prewrite callback and status ok, will call [commit] +// +// call by [prewrite] step(2), through single_row_txn commit callback +// +void GlobalTxn::DoPrewriteCallback(SingleRowTxn* single_row_txn) { + ErrorCode status = single_row_txn->GetError(); + PrewriteContext* ctx = (PrewriteContext*)single_row_txn->GetContext(); + delete single_row_txn; + if (gtxn_internal_->IsTimeOut() || status.GetType() != ErrorCode::kOK) { + // wapper timeout status for global transaction + if (gtxn_internal_->IsTimeOut() || status.GetType() == ErrorCode::kTimeout) { + ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, status.ToString()); + } else { + ctx->status.SetFailed(status.GetType(), status.ToString()); + } + VLOG(12) << "[gtxn][prewrite][stxn_commit] failed : " << ctx->DebugString(); + RunAfterPrewriteFailed(ctx); + } else if (++prewrite_iterator_ != writes_.end()) { + thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncPrewrite, this, &(prewrite_iterator_->second))); + } else { + gtxn_internal_->PerfPrewriteDelay(0, get_micros()); // finish_time + VLOG(12) << "prewrite done, next step"; + InternalCommitPhase2(); + } +} + +void GlobalTxn::RunAfterPrewriteFailed(PrewriteContext* ctx) { + gtxn_internal_->PerfPrewriteDelay(0, get_micros()); // finish_time + gtxn_prewrite_fail_cnt.Inc(); + if (gtxn_internal_->IsTimeOut() || ctx->status.GetType() == ErrorCode::kTimeout) { + ctx->status.SetFailed(ErrorCode::kGTxnPrewriteTimeout, ctx->status.ToString()); + } + 
SetLastStatus(&ctx->status);
+    delete ctx;
+    RunUserCallback();
+}
+
+// commit phase2 Step(1):
+//   a) get the commit timestamp: from the test helper, from the local
+//      clock when the TSO client is disabled, or from the timeoracle
+//   b) reject a commit timestamp older than the prewrite timestamp with
+//      kGTxnTimestampLost (status recorded, user callback run)
+//   c) hand over to VerifyPrimaryLocked(), which starts the primary commit
+//
+// call by [prewrite] step(3)
+void GlobalTxn::InternalCommitPhase2() {
+    gtxn_internal_->PerfPrimaryCommitDelay(get_micros(), 0); // begin_time
+    gtxn_primary_cnt.Inc();
+    gtxn_internal_->TEST_Sleep(); // end prewrite
+    ErrorCode status;
+    status.SetFailed(ErrorCode::kOK);
+    gtxn_internal_->TEST_Sleep(); // wait to begin commit
+
+    if (FLAGS_tera_gtxn_test_opened) {
+        commit_ts_ = gtxn_internal_->TEST_GetCommitTimestamp();
+    } else if (!FLAGS_tera_sdk_tso_client_enabled) {
+        // Local-clock fallback. This must set commit_ts_, like the other two
+        // branches: assigning start_ts_ here left commit_ts_ without a fresh
+        // timestamp and overwrote the transaction's start timestamp.
+        commit_ts_ = get_micros();
+    } else {
+        timeoracle::TimeoracleClientImpl tsoc(thread_pool_, tso_cluster_);
+        commit_ts_ = tsoc.GetTimestamp(1);
+    }
+    if (commit_ts_ < prewrite_start_ts_) {
+        LOG(ERROR) << "[gtxn][commit] get commit ts failed";
+        status.SetFailed(ErrorCode::kGTxnTimestampLost, "get commit ts failed");
+        SetLastStatus(&status);
+        gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros());
+        gtxn_primary_fail_cnt.Inc();
+        RunUserCallback();
+        return;
+    }
+
+    VLOG(12) << "[gtxn][commit] commit_ts:" << commit_ts_;
+    gtxn_internal_->TEST_Sleep(); // wait to begin primary commit
+
+    /// begin to commit primary
+    VerifyPrimaryLocked();
+}
+
+void GlobalTxn::VerifyPrimaryLocked() {
+    Table* pri_t = primary_write_->Table();
+    tera::Transaction* pri_txn = pri_t->StartRowTransaction(primary_write_->RowKey());
+    RowReader* reader = pri_t->NewRowReader(primary_write_->RowKey());
+    // set internal task timeout
+    gtxn_internal_->SetInternalSdkTaskTimeout(reader);
+    reader->AddColumn(primary_write_->ColFamily(), primary_write_->LockName());
+    reader->SetTimeRange(prewrite_start_ts_, prewrite_start_ts_);
+    reader->SetCallBack([](RowReader* r) {
+
((GlobalTxn*)r->GetContext())->DoVerifyPrimaryLockedCallback(r);}); + reader->SetContext(this); + pri_txn->Get(reader); +} + +void GlobalTxn::DoVerifyPrimaryLockedCallback(RowReader* reader) { + ErrorCode status = reader->GetError(); + SingleRowTxn* pri_txn = static_cast(reader->GetTransaction()); + delete reader; + + if (status.GetType() == ErrorCode::kOK) { + CommitPrimary(pri_txn); + } else { + delete pri_txn; + if (status.GetType() == ErrorCode::kNotFound) { + status.SetFailed(ErrorCode::kGTxnPrimaryLost, "primary 'lock' lost before commit"); + } else if (status.GetType() == ErrorCode::kTimeout) { + status.SetFailed(ErrorCode::kGTxnPrimaryCommitTimeout, status.ToString()); + } + SetLastStatus(&status); + gtxn_primary_fail_cnt.Inc(); + gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros()); // finish_time + RunUserCallback(); + } +} + +void GlobalTxn::CommitPrimary(SingleRowTxn* pri_txn) { + Table* pri_t = primary_write_->Table(); + RowMutation* primary_mu = pri_t->NewRowMutation(primary_write_->RowKey()); + // set internal task timeout + gtxn_internal_->SetInternalSdkTaskTimeout(primary_mu); + primary_mu->Put(primary_write_->ColFamily(), primary_write_->WriteName(), + EncodeWriteValue(primary_write_->WriteType(), prewrite_start_ts_), commit_ts_); + primary_mu->DeleteColumns(primary_write_->ColFamily(), primary_write_->LockName(), commit_ts_); + pri_txn->ApplyMutation(primary_mu); + pri_txn->SetCommitCallback([] (Transaction* txn) { + ((GlobalTxn*)txn->GetContext())->CheckPrimaryStatusAndCommmitSecondaries(txn); + }); + pri_txn->SetContext(this); + pri_txn->Commit(); + delete primary_mu; +} + +void GlobalTxn::CheckPrimaryStatusAndCommmitSecondaries(Transaction* pri_txn) { + ErrorCode status = pri_txn->GetError(); + delete pri_txn; + gtxn_internal_->TEST_Sleep(); + // primary commit failed callback and return + if (status.GetType() != tera::ErrorCode::kOK) { + VLOG(12) << "[gtxn][commit] primary failed :[" << status.ToString() << "]"; + // Callback Point : 
primary commit failed + if (status.GetType() == ErrorCode::kTimeout) { + status.SetFailed(ErrorCode::kGTxnPrimaryCommitTimeout, status.ToString()); + } + SetLastStatus(&status); + gtxn_primary_fail_cnt.Inc(); + gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros()); // finish_time + RunUserCallback(); + return; + } + gtxn_internal_->PerfPrimaryCommitDelay(0, get_micros()); // finish_time + if (acks_cnt_.Get() == 0 && notifies_cnt_.Get() == 0) { + SetLastStatus(&status); + } + // wait primary commit done + VLOG(12) << "[gtxn][commit] succeed :[" << start_ts_ + << "," << prewrite_start_ts_ << "," << commit_ts_ << "]"; + + std::vector* ws = &(writes_.begin()->second); + if (ws->size() == 1) { + writes_.erase(writes_.begin()); + writes_cnt_.Dec(); + } else { + ws->erase(ws->begin()); + } + + all_task_pushed_ = false; + /// begin commit secondaries + for (auto &same_row_writes : writes_) { + thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncCommitSecondaries, + this, &(same_row_writes.second))); + } + + /// begin ack + for (auto &same_row_acks : acks_) { + thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncAck, + this, &(same_row_acks.second))); + } + /// begin notify + for (auto &same_row_notifies : notifies_) { + thread_pool_->AddTask(std::bind(&GlobalTxn::AsyncNotify, + this, &(same_row_notifies.second))); + } + bool should_callback = false; + { + MutexLock lock(&mu_); + all_task_pushed_ = true; + should_callback = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get() + && acks_cnt_.Get() == ack_done_cnt_.Get() + && notifies_cnt_.Get() == notify_done_cnt_.Get() + && all_task_pushed_ == true; + } + if (should_callback) { + RunUserCallback(); + } + +} + +void GlobalTxn::AsyncAck(std::vector* ws) { + gtxn_internal_->PerfAckDelay(get_micros(), 0); + gtxn_acks_cnt.Inc(); + assert(ws->size() > 0); + Write w = *(ws->begin()); + Table* table = w.Table(); + RowMutation* mu = table->NewRowMutation(w.RowKey()); + gtxn_internal_->SetInternalSdkTaskTimeout(mu); + 
gtxn_internal_->BuildRowMutationForAck(ws, mu);
+    mu->SetCallBack([](RowMutation* row_mu) {
+        ((GlobalTxn*)row_mu->GetContext())->DoAckCallback(row_mu);});
+    mu->SetContext(this);
+    table->ApplyMutation(mu);
+}
+
+// Completion callback of one ack mutation issued by AsyncAck().
+//
+// A failed ack is reported as kGTxnOKButAckFailed through SetLastStatus()
+// (first status wins). The mutation is deleted here. Under mu_ the ack
+// counter is advanced and the "everything finished" predicate evaluated;
+// the user callback runs outside the lock.
+void GlobalTxn::DoAckCallback(RowMutation* mutation) {
+    if (mutation->GetError().GetType() != tera::ErrorCode::kOK) {
+        LOG(WARNING) << "[gtxn][commit][ack], failed"
+                     << mutation->GetError().GetReason();
+        ErrorCode status;
+        status.SetFailed(ErrorCode::kGTxnOKButAckFailed, mutation->GetError().ToString());
+        SetLastStatus(&status);
+        gtxn_acks_fail_cnt.Inc();
+    }
+    delete mutation;
+    bool should_callback = false;
+    {
+        MutexLock lock(&mu_);
+        ack_done_cnt_.Inc();
+        gtxn_internal_->PerfAckDelay(0, get_micros());
+        // Same predicate as DoNotifyCallback / DoCommitSecondariesCallback.
+        // Without the all_task_pushed_ term, an ack finishing before the
+        // scheduling loop is done can fire the user callback here and again
+        // in CheckPrimaryStatusAndCommmitSecondaries().
+        should_callback = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get()
+                          && acks_cnt_.Get() == ack_done_cnt_.Get()
+                          && notifies_cnt_.Get() == notify_done_cnt_.Get()
+                          && all_task_pushed_ == true;
+    }
+
+    if (should_callback) {
+        RunUserCallback();
+    }
+}
+
+// commit phase2: sends the notify mutation for one row.
+//
+// ws holds the notify cells of a single (table, row key) and must not be
+// empty; the first element supplies the table and row key. The mutation is
+// applied with a callback, which ends up in DoNotifyCallback().
+void GlobalTxn::AsyncNotify(std::vector<Write>* ws) {
+    gtxn_internal_->PerfNotifyDelay(get_micros(), 0);
+    gtxn_notifies_cnt.Inc();
+    assert(ws->size() > 0);
+    Write w = *(ws->begin());
+    Table* table = w.Table();
+    RowMutation* mu = table->NewRowMutation(w.RowKey());
+    gtxn_internal_->SetInternalSdkTaskTimeout(mu);
+    gtxn_internal_->BuildRowMutationForNotify(ws, mu, commit_ts_);
+    mu->SetCallBack([](RowMutation* row_mu) {
+        ((GlobalTxn*)row_mu->GetContext())->DoNotifyCallback(row_mu);});
+    mu->SetContext(this);
+    table->ApplyMutation(mu);
+}
+
+void GlobalTxn::DoNotifyCallback(RowMutation* mutation) {
+    if (mutation->GetError().GetType() != tera::ErrorCode::kOK) {
+        LOG(WARNING) << "[gtxn][commit][notify], failed"
+                     << mutation->GetError().GetReason();
+        ErrorCode status;
+        status.SetFailed(ErrorCode::kGTxnOKButNotifyFailed, mutation->GetError().ToString());
+        gtxn_notifies_fail_cnt.Inc();
+        SetLastStatus(&status);
+    }
+    delete mutation;
+
+    bool should_callback = false;
+    {
+        MutexLock lock(&mu_);
+
notify_done_cnt_.Inc(); + gtxn_internal_->PerfNotifyDelay(0, get_micros()); + should_callback = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get() + && acks_cnt_.Get() == ack_done_cnt_.Get() + && notifies_cnt_.Get() == notify_done_cnt_.Get() + && all_task_pushed_ == true; + } + + if (should_callback) { + RunUserCallback(); + } +} + +void GlobalTxn::AsyncCommitSecondaries(std::vector* ws) { + gtxn_internal_->PerfSecondariesCommitDelay(get_micros(), 0); // begin time + gtxn_secondaries_cnt.Inc(); + assert(ws->size() > 0); + Write w = *(ws->begin()); + Table* table = w.Table(); + RowMutation* mu = table->NewRowMutation(w.RowKey()); + gtxn_internal_->SetInternalSdkTaskTimeout(mu); + gtxn_internal_->BuildRowMutationForCommit(ws, mu, commit_ts_); + mu->SetCallBack([](RowMutation* row_mu) { + ((GlobalTxn*)row_mu->GetContext())->DoCommitSecondariesCallback(row_mu);}); + mu->SetContext(this); + table->ApplyMutation(mu); +} + +void GlobalTxn::DoCommitSecondariesCallback(RowMutation* mutation) { + if (mutation->GetError().GetType() != tera::ErrorCode::kOK) { + LOG(WARNING) << "[gtxn][commit][secondaries], failed" + << mutation->GetError().GetReason(); + gtxn_secondaries_fail_cnt.Inc(); + } + delete mutation; + + bool should_callback = false; + { + MutexLock lock(&mu_); + commit_secondaries_done_cnt_.Inc(); + gtxn_internal_->PerfSecondariesCommitDelay(0, get_micros()); // finish time + should_callback = commit_secondaries_done_cnt_.Get() == writes_cnt_.Get() + && acks_cnt_.Get() == ack_done_cnt_.Get() + && notifies_cnt_.Get() == notify_done_cnt_.Get() + && all_task_pushed_ == true; + } + + if (should_callback) { + RunUserCallback(); + } +} + +void GlobalTxn::ApplyMutation(RowMutation* row_mu) { + assert(row_mu != NULL); + + RowMutationImpl* row_mu_impl = static_cast(row_mu); + row_mu_impl->SetTransaction(this); + row_mu_impl->SetError(ErrorCode::kOK); + + bool can_apply = false; + if (!has_commited_.load()) { + assert(put_fail_cnt_.Get() > -1); + put_fail_cnt_.Inc(); + // 
check writes_size_ over limit + MutexLock lock(&mu_); + can_apply = gtxn_internal_->VerifyWritesSize(row_mu, &writes_size_); + } else { + std::string reason = "ApplyMutation failed, txn has committed at [" + + std::to_string(commit_ts_) + "]"; + LOG(ERROR) << "[gtxn][apply_mutation][" << start_ts_ << "]" << reason; + row_mu_impl->SetError(ErrorCode::kGTxnOpAfterCommit, reason); + } + + size_t writes_cnt = 0; + + if (can_apply && gtxn_internal_->VerifyUserRowMutation(row_mu)) { + Table* table = row_mu->GetTable(); + const std::string& tablename = table->GetName(); + const std::string& row_key = row_mu->RowKey(); + for (size_t i = 0; i < row_mu->MutationNum(); ++i) { + const RowMutation::Mutation& mu = row_mu->GetMutation(i); + Cell cell(table, row_key, mu.family, mu.qualifier, start_ts_, mu.value); + Write w(cell, mu.type); + ++writes_cnt; + SaveWrite(tablename, row_key, w); + } + } + + bool is_async = row_mu_impl->IsAsync(); + ErrorCode mu_err = row_mu_impl->GetError(); + + if (mu_err.GetType() != ErrorCode::kOK || writes_cnt == 0) { + if (!status_returned_) { + status_.SetFailed(mu_err.GetType(), mu_err.GetReason()); + status_returned_ = true; + } + if (is_async) { + thread_pool_->AddTask(std::bind(&RowMutationImpl::RunCallback, row_mu_impl)); + } else { + // nothing to do + // sync mu_err != ok will return before put_fail_cnt -1 + } + return; + } + if (is_async) { + thread_pool_->AddTask(std::bind(&RowMutationImpl::RunCallback, row_mu_impl)); + } + // only succes put will -1 + assert(put_fail_cnt_.Get() > 0); + put_fail_cnt_.Dec(); +} + +// for wait commit +void GlobalTxn::WaitForComplete() { + MutexLock lock(&finish_mutex_); + while(!finish_) { + finish_cond_.Wait(); + } +} + +void GlobalTxn::Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) { + if (t == NULL) { + LOG(ERROR) << "set ack cell failed"; + return; + } + const std::string& tablename = t->GetName(); + Cell cell(t, row_key, column_family, 
qualifier);
+    Write w(cell);
+    TableWithRowkey twr(tablename, row_key);
+    MutexLock lock(&mu_);
+    auto it = acks_.find(twr);
+    if (it != acks_.end()) {
+        std::vector* acks_ptr = &(acks_[twr]);
+        acks_ptr->push_back(w);
+    } else {
+        std::vector acks;
+        acks.push_back(w);
+        acks_[twr] = acks;
+        acks_cnt_.Inc();
+    }
+}
+
+// Registers a cell to be notified when this transaction commits.
+//
+// The cell is stored in notifies_ under its (table, row key) entry.
+// notifies_cnt_ counts distinct entries, so it is only incremented when a
+// new entry is created. A NULL table is logged and ignored.
+// The map update is done under mu_.
+void GlobalTxn::Notify(Table* t,
+                       const std::string& row_key,
+                       const std::string& column_family,
+                       const std::string& qualifier) {
+    if (t == NULL) {
+        // message used to say "ack", copied from Ack()
+        LOG(ERROR) << "set notify cell failed";
+        return;
+    }
+    const std::string& tablename = t->GetName();
+    Cell cell(t, row_key, column_family, qualifier);
+    Write w(cell);
+    TableWithRowkey twr(tablename, row_key);
+    MutexLock lock(&mu_);
+    auto it = notifies_.find(twr);
+    if (it != notifies_.end()) {
+        // reuse the iterator instead of looking the key up a second time
+        it->second.push_back(w);
+    } else {
+        // operator[] default-constructs the per-row list for a new key
+        notifies_[twr].push_back(w);
+        notifies_cnt_.Inc();
+    }
+}
+
+} // namespace tera
+
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
diff --git a/src/sdk/global_txn.h b/src/sdk/global_txn.h
new file mode 100644
index 000000000..de5832166
--- /dev/null
+++ b/src/sdk/global_txn.h
@@ -0,0 +1,273 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+// +// Author: baorenyi@baidu.com + +#ifndef TERA_SDK_GLOBAL_TXN_H_ +#define TERA_SDK_GLOBAL_TXN_H_ + +#include +#include +#include +#include + +#include "common/mutex.h" +#include "io/coding.h" +#include "proto/table_meta.pb.h" +#include "sdk/global_txn_internal.h" +#include "sdk/single_row_txn.h" +#include "sdk/sdk_utils.h" +#include "sdk/table_impl.h" +#include "sdk/sdk_zk.h" +#include "tera.h" +#include "common/counter.h" +#include "common/timer.h" + +namespace tera { + +class Cell; +class Write; +class GlobalTxnInternal; +class CellReaderContext; +class InternalReaderContext; +class PrewriteContext; + +class GlobalTxn : public Transaction { +public: + static Transaction* NewGlobalTxn(tera::Client* client, + common::ThreadPool* thread_pool, + sdk::ClusterFinder* tso_cluster); + + virtual ~GlobalTxn(); + + virtual void ApplyMutation(RowMutation* row_mu); + virtual ErrorCode Get(RowReader* row_reader); + virtual ErrorCode Commit(); + + virtual int64_t GetStartTimestamp() { return start_ts_; } + virtual int64_t GetCommitTimestamp() { return commit_ts_; } + + virtual const ErrorCode& GetError() { return status_; } + + typedef void (*Callback)(Transaction* transaction); + + virtual void SetCommitCallback(Callback callback) { + user_commit_callback_ = callback; + } + + virtual Callback GetCommitCallback() { + return user_commit_callback_; + } + + virtual void SetContext(void* context) { + user_commit_context_ = context; + } + + virtual void* GetContext() { + return user_commit_context_; + } + + virtual void Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier); + + virtual void Notify(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier); + + virtual void SetIsolation(const IsolationLevel& isolation_level); + + virtual IsolationLevel Isolation() { return isolation_level_; } + + virtual void SetTimeout(int64_t timeout_ms); + + virtual int64_t Timeout(); + 
+private: + // ----------------------- begin get process --------------------------- // + // read one cell from db + // + // read "lock", "write", "data" columns result from db, + // use async interface of tera [RowReader] + void AsyncGetCell(Cell* cell, RowReaderImpl* user_reader_impl, InternalReaderContext* ctx); + + // check lock write and build cell result + // (1) check read result, if failed will call [MergeCellToRow] + // (2) maybe call [BackoffAndMaybeCleanupLock] and call [AsyncGetCell] retry + // (3) maybe call [FindValueFromResultRow] and call [MergeCellToRow] + void DoGetCellReaderCallback(RowReader* reader); + + // check "lock" and "write" columns, do like percolator + // maybe call CleanLock, RollForward or wait some times + // + // if try_clean == true will be CleanLock not wait + void BackoffAndMaybeCleanupLock(RowReader::TRow& row, + const Cell& cell, + const bool try_clean, + ErrorCode* status); + void CleanLock(const Cell& cell, const tera::PrimaryInfo& primary, + ErrorCode* status); + + void RollForward(const Cell& cell, + const tera::PrimaryInfo& primary, + int lock_type, + ErrorCode* status); + + // get result form "result_row" and set into "target_cell" + bool FindValueFromResultRow(RowReader::TRow& result_row, Cell* target_cell); + + // call GetCellCallback function @ other thread + void MergeCellToRow(RowReader* internal_reader, const ErrorCode& status); + + // set cell result, merge to value_list and call user_reader_callback + void GetCellCallback(CellReaderContext* ctx); + + void SetReaderStatusAndRunCallback(RowReaderImpl* reader_impl, ErrorCode* status); + + // ------------- begin commit prewrite (commit phase1) ----------------- // + void SaveWrite(const std::string& tablename, + const std::string& row_key, + tera::Write& w); + + // commit entry + // + // do [commit phase1], [commit phase2] will begin at callback + void InternalCommit(); + + // [prewrite] Step(1): + // read "data", "lock", "write" column from tera + // + // aysnc 
prewrite one row use single_row_txn + void AsyncPrewrite(std::vector* same_row_writes); + + // [prewrite] Step(2): + // a) verify [prewrite] step(1) read result status and no conflict + // b) write "lock" and "data" column to tera, + // through same single_row_txn in step(1) + // + // call by [prewrite] step(1),through reader callback + void DoPrewriteReaderCallback(RowReader* reader); + + // prewrite Step(3): + // verify [prewrite] step(2) single_row_txn commit status, + // if the last prewrite callback and status ok, will call [commit] + // + // call by [prewrite] step(2), through single_row_txn commit callback + void DoPrewriteCallback(SingleRowTxn* single_row_txn); + void RunAfterPrewriteFailed(PrewriteContext* ctx); + + // --------------------- begin commit phase2 ---------------------- // + + // commit phase2 Step(1): + // a) get timestamp from timeoracle for commit_ts + // b) sync commit primary write through single_row_txn + // (for this gtxn, on this step only one thread can work) + // c) call [commit phase2] step(2) in a loop + // + // call by [prewrite] step(3) + void InternalCommitPhase2(); + + void VerifyPrimaryLocked(); + + void DoVerifyPrimaryLockedCallback(RowReader* reader); + + void CommitPrimary(SingleRowTxn* primary_single_txn); + + void CheckPrimaryStatusAndCommmitSecondaries(Transaction* primary_single_txn); + + // commit phase2 Step(2): + // async commit secondaries writes through RowMutaion + // + // call by [commit phase2] step(1) + void AsyncCommitSecondaries(std::vector* same_row_writes); + + void DoCommitSecondariesCallback(RowMutation* mutation); + + // commit phase2 Step(3): + // async do ack through RowMutaion + // + // call by [commit phase2] step(1) + void AsyncAck(std::vector* same_row_acks); + + void DoAckCallback(RowMutation* mutation); + + // commit phase2 Step(4): + // async do notify through RowMutaion + // + // call by [commit phase2] step(1) + void AsyncNotify(std::vector* same_row_notifies); + + void 
DoNotifyCallback(RowMutation* mutation); + + /// if user want to delete this transaction, + /// before any async tasks of this transaction finished for failed + void WaitForComplete(); + + void SetLastStatus(ErrorCode* status); + + void RunUserCallback(); + + // -------------------- end commit phase1 and phase2 ------------------- // +private: + GlobalTxn(tera::Client* client, + common::ThreadPool* thread_pool, + sdk::ClusterFinder* tso_cluster); + + GlobalTxn(const GlobalTxn&) = delete; + void operator=(const GlobalTxn&) = delete; + + // + typedef std::pair TableWithRowkey; + // tableWithRowkey -> set(write) + typedef std::map> WriterMap; + + std::unique_ptr gtxn_internal_; + ErrorCode status_; + bool status_returned_; // if true gtxn will not change "status_" + + Write* primary_write_; + WriterMap writes_; + WriterMap::iterator prewrite_iterator_; + int64_t writes_size_; + + int64_t start_ts_; + int64_t prewrite_start_ts_; + int64_t commit_ts_; + IsolationLevel isolation_level_; + std::string serialized_primary_; + + WriterMap acks_; + WriterMap notifies_; + + mutable Mutex mu_; + std::atomic finish_; + mutable Mutex finish_mutex_; + common::CondVar finish_cond_; + + std::atomic has_commited_; + + Callback user_commit_callback_; + void* user_commit_context_; + + common::ThreadPool* thread_pool_; + sdk::ClusterFinder* tso_cluster_; + + int64_t timeout_ms_; + + Counter put_fail_cnt_; // put begin +1, done -1 + Counter commit_secondaries_done_cnt_; + Counter ack_done_cnt_; + Counter notify_done_cnt_; + + Counter writes_cnt_; + Counter acks_cnt_; + Counter notifies_cnt_; + std::atomic all_task_pushed_; +}; + +} // namespace tera + +#endif // TERA_SDK_GLOBAL_TXN_H_ diff --git a/src/sdk/global_txn_internal.cc b/src/sdk/global_txn_internal.cc new file mode 100644 index 000000000..8c69651ed --- /dev/null +++ b/src/sdk/global_txn_internal.cc @@ -0,0 +1,559 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. 
All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Author: baorenyi@baidu.com + +#include "sdk/global_txn_internal.h" + +#include "common/metric/metric_counter.h" +#include "common/this_thread.h" +#include "proto/table_meta.pb.h" +#include "proto/tabletnode_rpc.pb.h" +#include "sdk/global_txn.h" +#include "sdk/read_impl.h" +#include "sdk/sdk_metric_name.h" + +DECLARE_bool(tera_gtxn_test_opened); +DECLARE_string(tera_gtxn_test_flagfile); +DECLARE_int32(tera_gtxn_all_puts_size_limit); +DECLARE_int32(tera_sdk_timeout); + +namespace tera { + +// for record sdk all transactions perf +tera::MetricCounter gtxn_read_delay_us(kGTxnReadDelayMetric, kGTxnLabelRead); +tera::MetricCounter gtxn_read_cnt(kGTxnReadCountMetric, kGTxnLabelRead); +tera::MetricCounter gtxn_read_fail_cnt(kGTxnReadFailCountMetric, kGTxnLabelRead); +tera::MetricCounter gtxn_read_retry_cnt(kGTxnReadRetryCountMetric, kGTxnLabelRead); +tera::MetricCounter gtxn_read_rollback_cnt(kGTxnReadRollBackCountMetric, kGTxnLabelRead); +tera::MetricCounter gtxn_read_rollforward_cnt(kGTxnReadRollForwardCountMetric, kGTxnLabelRead); + +tera::MetricCounter gtxn_commit_delay_us(kGTxnCommitDelayMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_commit_cnt(kGTxnCommitCountMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_commit_fail_cnt(kGTxnCommitFailCountMetric, kGTxnLabelCommit); + +tera::MetricCounter gtxn_prewrite_delay_us(kGTxnPrewriteDelayMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_prewrite_cnt(kGTxnPrewriteCountMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_prewrite_fail_cnt(kGTxnPrewriteFailCountMetric, kGTxnLabelCommit); + +tera::MetricCounter gtxn_primary_delay_us(kGTxnPrimaryDelayMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_primary_cnt(kGTxnPrimaryCountMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_primary_fail_cnt(kGTxnPrimaryFailCountMetric, kGTxnLabelCommit); + +tera::MetricCounter 
gtxn_secondaries_delay_us(kGTxnSecondariesDelayMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_secondaries_cnt(kGTxnSecondariesCountMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_secondaries_fail_cnt(kGTxnSecondariesFailCountMetric, kGTxnLabelCommit); + +tera::MetricCounter gtxn_acks_delay_us(kGTxnAcksDelayMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_acks_cnt(kGTxnAcksCountMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_acks_fail_cnt(kGTxnAcksFailCountMetric, kGTxnLabelCommit); + +tera::MetricCounter gtxn_notifies_delay_us(kGTxnNotifiesDelayMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_notifies_cnt(kGTxnNotifiesCountMetric, kGTxnLabelCommit); +tera::MetricCounter gtxn_notifies_fail_cnt(kGTxnNotifiesFailCountMetric, kGTxnLabelCommit); + +tera::MetricCounter gtxn_tso_delay_us(kGTxnTsoDelayMetric, kGTxnLabelTso); +tera::MetricCounter gtxn_tso_req_cnt(kGTxnTsoRequestCountMetric, kGTxnLabelTso); + +GlobalTxnInternal::GlobalTxnInternal(tera::Client* client) + : TEST_GtxnTestHelper_(NULL), + start_ts_(0), + prewrite_start_ts_(0), + terminal_time_(0), + is_timeout_(false), + client_(client) {} + +GlobalTxnInternal::~GlobalTxnInternal() { + PerfReport(); +} + +void GlobalTxnInternal::SetStartTimestamp(int64_t ts) { + start_ts_ = ts; + prewrite_start_ts_ = ts; +} + +bool GlobalTxnInternal::CheckTable(Table* table, ErrorCode* status) { + assert(table != NULL); + MutexLock lock(&tables_mu_); + TableInfoMap::const_iterator tables_it = tables_.find(table->GetName()); + if (tables_it == tables_.end()) { + TableImpl* table_impl = static_cast(table); + TableSchema schema = table_impl->GetTableSchema(); + if (IsTransactionTable(schema)) { + std::set gtxn_cfs; + FindGlobalTransactionCfs(schema, >xn_cfs); + if (gtxn_cfs.size() > 0) { + tables_[table->GetName()] = std::pair >(table, gtxn_cfs); + return true; + } else { + status->SetFailed(ErrorCode::kBadParam, + "schema check fail: " + table->GetName() + " haven't gtxn cf"); + return false; + } + } else { 
+ status->SetFailed(ErrorCode::kBadParam, + "schema check fail: " + table->GetName() + " not txn table"); + return false; + } + } + return true; +} + + + +bool GlobalTxnInternal::IsLockedByOthers(RowReader::TRow& row, const Cell& cell) { + if (row[cell.ColFamily()].find(cell.LockName()) != row[cell.ColFamily()].end()) { + for (auto k = row[cell.ColFamily()][cell.LockName()].rbegin(); + k != row[cell.ColFamily()][cell.LockName()].rend(); ++k) { + if (k->first < start_ts_) { + return true; + } + } + } + return false; +} + +bool GlobalTxnInternal::SuspectLive(const tera::PrimaryInfo& primary_info) { + std::string session_str = primary_info.client_session(); + VLOG(12) << "suppect_live : " << session_str; + ClientImpl* client_impl = static_cast(client_); + return client_impl->IsClientAlive(session_str); +} + +bool GlobalTxnInternal::VerifyUserRowReader(RowReader* user_reader) { + RowReaderImpl* reader_impl = static_cast(user_reader); + const RowReader::ReadColumnList& read_col_list = user_reader->GetReadColumnList(); + ErrorCode status; + bool schema_valid = true; + std::string reason(""); + + Table* table = reader_impl->GetTable(); + if (!CheckTable(table, &status)) { + // table schema error for gtxn + reader_impl->SetError(status.GetType(), status.GetReason()); + return false; + } else if (read_col_list.size() == 0) { + // TODO support read full + reason = "not support read full line in global transaction"; + LOG(ERROR) << "[gtxn][get] " << reason; + reader_impl->SetError(ErrorCode::kBadParam, reason); + return false; + } else if (reader_impl->GetSnapshot() != 0) { + reason = "not support read a snapshot in global transaction"; + LOG(ERROR) << "[gtxn][get] " << reason; + reader_impl->SetError(ErrorCode::kBadParam, reason); + return false; + } + + // check schema valid + const std::string& tablename = table->GetName(); + + for (auto it = read_col_list.begin(); it != read_col_list.end(); ++it) { + const std::string& column_family = it->first; + const std::set& 
qualifier_set = it->second; + + if (qualifier_set.size() == 0) { + reason = "not set any qualifier"; + LOG(ERROR) << "[gtxn][get] " << reason; + reader_impl->SetError(ErrorCode::kBadParam, reason); + schema_valid = false; + break; + } + if (!IsGTxnColumnFamily(tablename, column_family)) { + reason = "table:" + tablename + ",cf:" + column_family + " not set gtxn=\"on\""; + LOG(ERROR) << "[gtxn][get] " << reason; + reader_impl->SetError(ErrorCode::kBadParam, reason); + schema_valid = false; + break; + } + for (auto q_it = qualifier_set.begin(); q_it != qualifier_set.end(); ++q_it) { + const std::string& qualifier = *q_it; + + if (BadQualifier(qualifier)) { + reason = "table:" + tablename + ",qu:" + qualifier + " can't end with \"_*_\""; + LOG(ERROR) << "[gtxn][get] " << reason; + reader_impl->SetError(ErrorCode::kBadParam, reason); + schema_valid = false; + break; + } + } + } + return schema_valid; +} + +bool GlobalTxnInternal::VerifyUserRowMutation(RowMutation* user_mu) { + RowMutationImpl* row_mu_impl = static_cast(user_mu); + Table* table = user_mu->GetTable(); + + ErrorCode status; + if (!CheckTable(table, &status)) { + // table schema error for gtxn; + row_mu_impl->SetError(status.GetType(), status.GetReason()); + return false; + } else if (row_mu_impl->MutationNum() <= 0) { + // nothing to mutation + row_mu_impl->SetError(ErrorCode::kBadParam, "nothing to mutation"); + return false; + } + + std::string reason(""); + const std::string& tablename = table->GetName(); + + for (size_t i = 0; i < user_mu->MutationNum(); ++i) { + const RowMutation::Mutation& mu = user_mu->GetMutation(i); + // check this qualifier is right + if (BadQualifier(mu.qualifier)) { + reason = "@table" + tablename + ",qu:" + mu.qualifier + + " can't end with \"_*_\""; + LOG(ERROR) << "[gtxn][apply_mutation] " << reason; + row_mu_impl->SetError(ErrorCode::kBadParam, reason); + return false; + } else if (!IsGTxnColumnFamily(tablename, mu.family)) { + // check column has set gtxn="on" + reason = 
"@table" + tablename + ",cf:" + mu.family + + " not set gtxn=\"on\""; + LOG(ERROR) << "[gtxn][apply_mutation] " << reason; + row_mu_impl->SetError(ErrorCode::kBadParam, reason); + return false; + } else if (mu.type != RowMutation::kPut && mu.type != RowMutation::kDeleteColumn + && mu.type != RowMutation::kDeleteColumns) { + + reason = "@table " + tablename + ",row mutation type is " + + std::to_string(mu.type); + LOG(ERROR) << "[gtxn][apply_mutation] " << reason; + row_mu_impl->SetError(ErrorCode::kGTxnNotSupport, reason); + return false; + } + } + return true; +} + +bool GlobalTxnInternal::VerifyWritesSize(RowMutation* user_mu, int64_t* size) { + RowMutationImpl* row_mu_impl = static_cast(user_mu); + *size += row_mu_impl->Size(); + if (*size > FLAGS_tera_gtxn_all_puts_size_limit) { + LOG(ERROR) << "[gtxn][apply_mutation][" << start_ts_ << "] failed, " + << "mutations size " << *size << " > limit (" + << FLAGS_tera_gtxn_all_puts_size_limit << ")"; + row_mu_impl->SetError(ErrorCode::kGTxnDataTooLarge); + return false; + } else if ( *size <= 0) { + LOG(ERROR) << "[gtxn][apply_mutation][" << start_ts_ << "] failed, " + << "mutaions size " << *size; + row_mu_impl->SetError(ErrorCode::kBadParam); + return false; + } + return true; +} + +bool GlobalTxnInternal::PrimaryIsLocked(const tera::PrimaryInfo& primary, + const int64_t lock_ts, + ErrorCode* status) { + Table* table = FindTable(primary.table_name()); + if (table == NULL) { + status->SetFailed(ErrorCode::kGTxnPrimaryLost, + "not found primary table and open failed"); + return false; + } + if (!CheckTable(table, status)) { + status->SetFailed(ErrorCode::kGTxnPrimaryLost, + "primary table check failed" + status->ToString()); + return false; + } + const Cell& cell = Cell(table, primary.row_key(), + primary.column_family(), primary.qualifier()); + + std::unique_ptr reader(table->NewRowReader(cell.RowKey())); + reader->AddColumn(cell.ColFamily(), cell.LockName()); + reader->SetTimeRange(lock_ts, lock_ts); + 
table->Get(reader.get()); + + if (reader->GetError().GetType() != tera::ErrorCode::kOK && + reader->GetError().GetType() != tera::ErrorCode::kNotFound) { + *status = reader->GetError(); + return false; + } + while (!reader->Done()) { + if (reader->Timestamp() == lock_ts) { + VLOG(12) << DebugString(cell, "other transaction on prewrite @" + std::to_string(lock_ts)); + return true; + } + reader->Next(); + } + return false; +} + +void GlobalTxnInternal::BuildRowReaderForPrewrite(const std::vector& ws, RowReader* reader) { + for (auto& w : ws){ + reader->AddColumn(w.ColFamily(), w.Qualifier()); + reader->AddColumn(w.ColFamily(), w.LockName()); + reader->AddColumn(w.ColFamily(), w.WriteName()); + reader->SetTimeRange(0, kMaxTimeStamp); + reader->SetMaxVersions(UINT32_MAX); + } +} + +void GlobalTxnInternal::BuildRowMutationForPrewrite(std::vector* ws, + RowMutation* prewrite_mu, + const std::string& primary_info) { + for (auto it = ws->begin(); it != ws->end(); ++it) { + const Write& w = *it; // one cell + prewrite_mu->Put(w.ColFamily(), + w.LockName(), + EncodeLockValue(w.WriteType(), primary_info), + (int64_t)prewrite_start_ts_); + prewrite_mu->Put(w.ColFamily(), + w.Qualifier(), + w.Value(), + (int64_t)prewrite_start_ts_); + } +} + +void GlobalTxnInternal::BuildRowMutationForCommit(std::vector* ws, + RowMutation* commit_mu, + const int64_t commit_ts) { + for (auto it = ws->begin(); it != ws->end(); ++it) { + const Write& w = *it; // one cell + // value = type + start_ts + commit_mu->Put(w.ColFamily(), w.WriteName(), + EncodeWriteValue(w.WriteType(), prewrite_start_ts_), + commit_ts); + commit_mu->DeleteColumns(w.ColFamily(), w.LockName(), commit_ts); + } +} + +void GlobalTxnInternal::BuildRowMutationForAck(std::vector* ws, + RowMutation* commit_mu) { + for (auto it = ws->begin(); it != ws->end(); ++it) { + const Write& w = *it; // one cell + commit_mu->DeleteColumns(kNotifyColumnFamily, w.NotifyName(), start_ts_); + } +} + +void 
GlobalTxnInternal::BuildRowMutationForNotify(std::vector* ws, + RowMutation* commit_mu, + const int64_t commit_ts) { + for (auto it = ws->begin(); it != ws->end(); ++it) { + const Write& w = *it; // one cell + commit_mu->Put(kNotifyColumnFamily, w.NotifyName(), + Int64ToEncodedString(commit_ts), commit_ts); + } +} + +void GlobalTxnInternal::SetCommitDuration(int64_t timeout_ms) { + terminal_time_ = timeout_ms + get_millis(); +} + +void GlobalTxnInternal::SetInternalSdkTaskTimeout(RowReader* reader) { + int64_t duration = terminal_time_ - get_millis(); + if (duration < 0) { + is_timeout_ = true; + duration = 1; + } + // duration should not larger than FLAGS_tera_sdk_timeout + duration = duration > FLAGS_tera_sdk_timeout ? FLAGS_tera_sdk_timeout : duration; + reader->SetTimeOut(duration); +} + +void GlobalTxnInternal::SetInternalSdkTaskTimeout(RowMutation* mutation) { + int64_t duration = terminal_time_ - get_millis(); + if (duration < 0) { + is_timeout_ = true; + duration = 1; + } + // duration should not larger than FLAGS_tera_sdk_timeout + duration = duration > FLAGS_tera_sdk_timeout ? 
FLAGS_tera_sdk_timeout : duration; + mutation->SetTimeOut(duration); +} + +bool GlobalTxnInternal::IsTimeOut() { + return is_timeout_; +} + +bool GlobalTxnInternal::IsPrimary(const tera::Cell& cell, + const tera::PrimaryInfo& primary_info) { + return primary_info.table_name() == cell.TableName() + && primary_info.row_key() == cell.RowKey() + && primary_info.column_family() == cell.ColFamily() + && primary_info.qualifier() == cell.Qualifier(); +} + +Table* GlobalTxnInternal::FindTable(const std::string& tablename) { + assert(!tablename.empty()); + MutexLock lock(&tables_mu_); + TableInfoMap::const_iterator it = tables_.find(tablename); + if (it == tables_.end()) { + ErrorCode status; + Table* t = client_->OpenTable(tablename, &status); + if (t == NULL || status.GetType() != ErrorCode::kOK) { + LOG(ERROR) << "[gtxn] can't create table :" << tablename << "," << status.ToString(); + return NULL; + } + return t; + } + return (it->second).first; +} + +bool GlobalTxnInternal::ConflictWithOtherWrite(const std::vector* ws, + RowReader* reader, + ErrorCode* status) { + RowReader::TRow row; + reader->ToMap(&row); + + // check every cell + for (auto it = ws->begin(); it != ws->end(); ++it) { + const Write& w = *it; + const std::string& w_cf = w.ColFamily(); + if (row.find(w_cf) == row.end()) { + VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() + << "not found [" << w_cf << "]"; + continue; + } else { + // check Write column + const std::string& w_write = w.WriteName(); + if (row[w_cf].find(w_write) != row[w_cf].end()) { + for (auto k = row[w_cf][w_write].rbegin(); k != row[w_cf][w_write].rend(); ++k) { + std::string write_value = k->second; + int write_type; + int64_t data_start_ts; + DecodeWriteValue(write_value, &write_type, &data_start_ts); + VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() + << " prewrite_start_ts:" << prewrite_start_ts_ + << " found _W_ :" << k->first + << " type: " << write_type + << " data_ts: " << data_start_ts; + if (k->first 
>= prewrite_start_ts_) { + status->SetFailed(ErrorCode::kGTxnWriteConflict, + "writing by others ts:" + std::to_string(k->first)); + return true; + } + } + } else { + VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() + << "not found _W_ col"; + } + // check Lock column + const std::string& w_lock = w.LockName(); + if (row[w_cf].find(w_lock) != row[w_cf].end()) { + auto k = row[w_cf][w_lock].rbegin(); + if (k != row[w_cf][w_lock].rend()) { + VLOG(12) << "[gtxn][prewrite][stxn_read]" << w.DebugString() + << "locked@: " << k->first; + status->SetFailed(ErrorCode::kGTxnLockConflict, + w.DebugString() + "locked@:" + std::to_string(k->first)); + return true; + } + } + } + } + return false; +} + +void GlobalTxnInternal::SetPrewriteStartTimestamp(const int64_t prewrite_start_ts) { + prewrite_start_ts_ = prewrite_start_ts; +} + +bool GlobalTxnInternal::IsGTxnColumnFamily(const std::string& tablename, + const std::string& column_family) { + MutexLock lock(&tables_mu_); + auto it = tables_.find(tablename); + if (it != tables_.end()) { + std::set& gtxn_cfs = (it->second).second; + auto cfs_it = gtxn_cfs.find(column_family); + if (cfs_it != gtxn_cfs.end()) { + return true; + } + } + return false; +} + +std::string GlobalTxnInternal::GetClientSession() { + ClientImpl* client_impl = static_cast(client_); + return client_impl->ClientSession(); +} + +std::string GlobalTxnInternal::DebugString(const Cell& cell, const std::string& msg) const { + std::stringstream ss; + ss << msg << " @ [" << cell.Table()->GetName() << ":" + << cell.RowKey() << ":" << cell.ColFamily() + << ":" << cell.Qualifier() << ":" << cell.Timestamp() << "]"; + return ss.str(); +} + +int64_t GlobalTxnInternal::TEST_Init(const std::string& conf_file) { + if (FLAGS_tera_gtxn_test_opened) { + TEST_GtxnTestHelper_ = new GlobalTxnTestHelper(conf_file); + TEST_GtxnTestHelper_->LoadTxnConf(); + start_ts_ = TEST_GtxnTestHelper_->GetStartTs(); + prewrite_start_ts_ = TEST_GtxnTestHelper_->GetPrewriteStartTs(); + 
} + return start_ts_; +} + +void GlobalTxnInternal::TEST_GetSleep() { + if (FLAGS_tera_gtxn_test_opened) { + TEST_GtxnTestHelper_->GetWait(start_ts_); + } +} + +void GlobalTxnInternal::TEST_Sleep() { + if (FLAGS_tera_gtxn_test_opened) { + TEST_GtxnTestHelper_->Wait(start_ts_); + } +} + +void GlobalTxnInternal::TEST_Destory() { + if (FLAGS_tera_gtxn_test_opened) { + delete TEST_GtxnTestHelper_; + } +} + +int64_t GlobalTxnInternal::TEST_GetCommitTimestamp() { + return TEST_GtxnTestHelper_->GetCommitTs(); +} + +int64_t GlobalTxnInternal::TEST_GetPrewriteStartTimestamp() { + return TEST_GtxnTestHelper_->GetPrewriteStartTs(); +} + +void GlobalTxnInternal::PerfReadDelay(int64_t begin_time, int64_t finish_time) { + read_cost_time_.Add(finish_time - begin_time); +} +void GlobalTxnInternal::PerfCommitDelay(int64_t begin_time, int64_t finish_time) { + commit_cost_time_.Add(finish_time - begin_time); +} + +void GlobalTxnInternal::PerfPrewriteDelay(int64_t begin_time, int64_t finish_time) { + prewrite_cost_time_.Add(finish_time - begin_time); +} + +void GlobalTxnInternal::PerfPrimaryCommitDelay(int64_t begin_time, int64_t finish_time) { + primary_cost_time_.Add(finish_time - begin_time); +} + +void GlobalTxnInternal::PerfSecondariesCommitDelay(int64_t begin_time, int64_t finish_time) { + secondaries_cost_time_.Add(finish_time - begin_time); +} + +void GlobalTxnInternal::PerfAckDelay(int64_t begin_time, int64_t finish_time) { + acks_cost_time_.Add(finish_time - begin_time); +} + +void GlobalTxnInternal::PerfNotifyDelay(int64_t begin_time, int64_t finish_time) { + notifies_cost_time_.Add(finish_time - begin_time); +} + +void GlobalTxnInternal::PerfReport() { + gtxn_read_delay_us.Add(read_cost_time_.Clear()); + gtxn_commit_delay_us.Add(commit_cost_time_.Clear()); + gtxn_prewrite_delay_us.Add(prewrite_cost_time_.Clear()); + gtxn_primary_delay_us.Add(primary_cost_time_.Clear()); + gtxn_secondaries_delay_us.Add(secondaries_cost_time_.Clear()); + 
gtxn_acks_delay_us.Add(acks_cost_time_.Clear()); + gtxn_notifies_delay_us.Add(notifies_cost_time_.Clear()); +} + +} // namespace tera + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/sdk/global_txn_internal.h b/src/sdk/global_txn_internal.h new file mode 100644 index 000000000..95eaae825 --- /dev/null +++ b/src/sdk/global_txn_internal.h @@ -0,0 +1,366 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// Author: baorenyi@baidu.com + +#ifndef TERA_SDK_GLOBAL_TXN_INTERNAL_H_ +#define TERA_SDK_GLOBAL_TXN_INTERNAL_H_ + +#include +#include +#include +#include +#include + +#include "common/mutex.h" +#include "io/coding.h" +#include "sdk/global_txn.h" +#include "sdk/test/global_txn_testutils.h" +#include "sdk/sdk_utils.h" +#include "sdk/single_row_txn.h" +#include "sdk/table_impl.h" +#include "sdk/timeoracle_client_impl.h" +#include "tera.h" +#include "common/timer.h" + +namespace tera { + +class Cell; +class GlobalTxnTestHelper; +class Write; + +inline void PrintCostTime(const std::string& msg, int64_t begin_time) { + VLOG(12) << msg <<" cost: " << get_micros() - begin_time; +} + +inline std::string Int64ToEncodedString(int64_t n) { + char buf[sizeof(int64_t)]; + io::EncodeBigEndian(buf, n); + std::string s (buf, sizeof(int64_t)); + return s; +} + +inline int64_t EncodedStringToInt64(const std::string& s) { + return io::DecodeBigEndain(s.c_str()); +} + +inline std::string PackLockName(const std::string& qualifier) { + return qualifier + "_L_"; +} + +inline std::string PackWriteName(const std::string& qualifier) { + return qualifier + "_W_"; +} + +inline std::string EncodeLockValue(int type, const std::string& primary_str) { + return (char)type + primary_str; +} + +inline bool DecodeLockValue(const std::string& value, + int* type, tera::PrimaryInfo* info) { + if (value.length() > 1) { + *type = (int)value[0]; + return 
info->ParseFromString(value.substr(1)); + } else { + *type = -1; + return false; + } +} + +inline std::string EncodeWriteValue(int type, int64_t timestamp) { + return (char)type + Int64ToEncodedString(timestamp); +} + +inline bool DecodeWriteValue(const std::string& value, int* type, int64_t* timestamp) { + if (value.length() > 1) { + *type = (int)value[0]; + *timestamp = EncodedStringToInt64(value.substr(1)); + return true; + } else { + *type = -1; + *timestamp = -1; + return false; + } +} + +inline std::string PackNotifyName(const std::string& column_family, + const std::string& qualifier) { + return column_family + ":" + qualifier; +} + +inline bool BadQualifier(const std::string& qualifier) { + size_t q_len = qualifier.length(); + return q_len >= 3 && qualifier[q_len - 1] == '_' && qualifier[q_len - 3] == '_'; +} + +struct PrewriteContext { + std::vector* ws; + Transaction* gtxn; + std::string table_name; + std::string row_key; + ErrorCode status; + PrewriteContext(std::vector* same_row_ws, + Transaction* txn, + const std::string& tablename, + const std::string& rowkey) : + ws(same_row_ws), + gtxn(txn), + table_name(tablename), + row_key(rowkey) { + status.SetFailed(ErrorCode::kOK); + } + const std::string DebugString() const { + return "[tablename=" + table_name + ",rowkey=" + row_key + "]" + status.ToString(); + } +}; +// one user reader will have one InternalReaderContext +struct InternalReaderContext { + int expected_cell_cnt; + int active_cell_cnt; + int fail_cell_cnt; + int not_found_cnt; + RowReader* user_reader; + Transaction* gtxn; + std::map cell_map; + RowResult results; + ErrorCode last_err; + + InternalReaderContext(int expected_cnt, RowReader* reader, Transaction* txn) + : expected_cell_cnt(expected_cnt), + active_cell_cnt(0), + fail_cell_cnt(0), + not_found_cnt(0), + user_reader(reader), + gtxn(txn) {} +}; +// one cell reader will have one CellReaderContext +struct CellReaderContext { + Cell* cell; + InternalReaderContext* internal_reader_ctx; + 
ErrorCode status; + CellReaderContext(Cell* c, InternalReaderContext* ctx) + : cell(c), + internal_reader_ctx(ctx) {} +}; + +class Cell { +public: + Cell(tera::Table* table, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier, + const int64_t timestamp = 0, + const std::string& value = "") : + table_(table), + row_key_(row_key), + column_family_(column_family), + qualifier_(qualifier), + timestamp_(timestamp), + value_(value), + tablename_("") { + + assert(table_ != NULL); + tablename_ = table_->GetName(); + } + + tera::Table* Table() const { return table_; } + + const std::string TableName() const { return tablename_; } + const std::string& RowKey() const { return row_key_; } + const std::string& ColFamily() const { return column_family_; } + const std::string& Qualifier() const { return qualifier_; } + const std::string LockName() const { return PackLockName(qualifier_); } + const std::string WriteName() const { return PackWriteName(qualifier_); } + const std::string NotifyName() const { return PackNotifyName(column_family_, qualifier_); } + const int64_t Timestamp() const { return timestamp_; } + void SetTimestamp(const int64_t timestamp) { + timestamp_ = timestamp; + } + const std::string& Value() const { return value_; } + void SetValue(const std::string& value) { + value_ = value; + } +private: + tera::Table* table_; + std::string row_key_; + std::string column_family_; + std::string qualifier_; + int64_t timestamp_; + std::string value_; + std::string tablename_; +}; + +class Write { +public: + Write(const Cell& cell, const int& type = 0) + : cell_(cell), + type_(type), + is_primary_(false) {} + + int WriteType() const { return type_; } + bool IsPrimary() const { return is_primary_; } + tera::Table* Table() const { return cell_.Table(); } + const std::string TableName() const { return cell_.TableName(); } + const std::string& RowKey() const { return cell_.RowKey(); } + const std::string& ColFamily() const { return 
cell_.ColFamily(); } + const std::string& Qualifier() const { return cell_.Qualifier(); } + const std::string LockName() const { return cell_.LockName(); } + const std::string WriteName() const { return cell_.WriteName(); } + const std::string NotifyName() const { return cell_.NotifyName(); } + const int64_t Timestamp() const { return cell_.Timestamp(); } + const std::string& Value() const { return cell_.Value(); } + const int64_t GetSize() { + return cell_.RowKey().length() + cell_.ColFamily().length() + + cell_.Qualifier().length() + cell_.Value().length(); + } + bool IsSameRow(Write* w) { + return RowKey() == w->RowKey() + && Table() == w->Table(); + } + + void Serialize(const int64_t start_ts, + const std::string& session, + std::string* primary_info) { + tera::PrimaryInfo primary; + primary.set_table_name(TableName()); + primary.set_row_key(RowKey()); + primary.set_column_family(ColFamily()); + primary.set_qualifier(Qualifier()); + primary.set_gtxn_start_ts(start_ts); + primary.set_client_session(session), + primary.SerializeToString(primary_info); + } + + const std::string DebugString() const { + std::stringstream ss; + ss <<"[" << TableName() << ":" << RowKey() << ":" << ColFamily() + << ":" << Qualifier() << "]"; + return ss.str(); + } + +private: + tera::Cell cell_; + int type_; + bool is_primary_; +}; + +class GlobalTxnInternal { +public: + friend class GlobalTxn; + GlobalTxnInternal(tera::Client* client); + + ~GlobalTxnInternal(); + // for common + void SetStartTimestamp(int64_t ts); + + bool CheckTable(Table* table, ErrorCode* status); + + Table* FindTable(const std::string& tablename); + + bool IsPrimary(const tera::Cell& cell, + const tera::PrimaryInfo& primary_info); + + bool IsGTxnColumnFamily(const std::string& tablename, + const std::string& column_family); + + // for get + bool VerifyUserRowReader(RowReader* user_reader); + + bool PrimaryIsLocked(const tera::PrimaryInfo& primary_info, + const int64_t lock_ts, + ErrorCode* status); + + bool 
IsLockedByOthers(RowReader::TRow& row, const tera::Cell& cell); + + bool SuspectLive(const tera::PrimaryInfo& primary_info); + + // for prewrite + void BuildRowReaderForPrewrite(const std::vector& ws, RowReader* reader); + + void BuildRowMutationForPrewrite(std::vector* ws, + RowMutation* txn_mu, + const std::string& primary_info); + + bool ConflictWithOtherWrite(const std::vector* ws, + RowReader* reader, + ErrorCode* status); + + // for applyMutation + bool VerifyUserRowMutation(RowMutation* user_mu); + bool VerifyWritesSize(RowMutation* user_mu, int64_t* size); + + // for commit + void BuildRowMutationForCommit(std::vector* ws, + RowMutation* txn_mu, + const int64_t commit_ts); + + void BuildRowMutationForAck(std::vector* ws, RowMutation* txn_mu); + + void BuildRowMutationForNotify(std::vector* ws, + RowMutation* txn_mu, + const int64_t commit_ts); + + void SetPrewriteStartTimestamp(const int64_t prewrite_start_ts); + + // for timeout + void SetCommitDuration(int64_t timeout_ms); + void SetInternalSdkTaskTimeout(RowMutation* mutation); + void SetInternalSdkTaskTimeout(RowReader* reader); + bool IsTimeOut(); + + // for other transaction alive + std::string GetClientSession(); +private: + // for pref + void UpdateTimerCounter(Counter* c) { + c->Set(get_micros() - c->Get()); + } + + // for debug and test + std::string DebugString(const tera::Cell& cell, const std::string& msg) const ; + int64_t TEST_Init(const std::string& conf_file); + void TEST_Sleep(); + void TEST_GetSleep(); + void TEST_Destory(); + int64_t TEST_GetCommitTimestamp(); + int64_t TEST_GetPrewriteStartTimestamp(); + + void PerfReadDelay(int64_t begin_time, int64_t finish_time); + void PerfCommitDelay(int64_t begin_time, int64_t finish_time); + void PerfPrewriteDelay(int64_t begin_time, int64_t finish_time); + void PerfPrimaryCommitDelay(int64_t begin_time, int64_t finish_time); + void PerfSecondariesCommitDelay(int64_t begin_time, int64_t finish_time); + void PerfAckDelay(int64_t begin_time, 
int64_t finish_time); + void PerfNotifyDelay(int64_t begin_time, int64_t finish_time); + + void PerfReport(); +private: + GlobalTxnInternal(const GlobalTxnInternal&) = delete; + GlobalTxnInternal& operator=(const GlobalTxnInternal&) = delete; + // for test + GlobalTxnTestHelper* TEST_GtxnTestHelper_; + // tablename-> (Table*, set(gtxn_cf_name)) + typedef std::map > > TableInfoMap; + TableInfoMap tables_; + mutable Mutex tables_mu_; + int64_t start_ts_; + int64_t prewrite_start_ts_; + + // for record this transaction perf + Counter read_cost_time_; + Counter commit_cost_time_; + Counter prewrite_cost_time_; + Counter primary_cost_time_; + Counter secondaries_cost_time_; + Counter acks_cost_time_; + Counter notifies_cost_time_; + + int64_t terminal_time_; + std::atomic is_timeout_; + tera::Client* client_; +}; + +} // namespace tera + +#endif // TERA_SDK_GLOBAL_TXN_INTERNAL_H_ diff --git a/src/sdk/http/http.cc b/src/sdk/http/http.cc index 562c647f3..da7b571c0 100644 --- a/src/sdk/http/http.cc +++ b/src/sdk/http/http.cc @@ -13,7 +13,7 @@ #include "proto/http.pb.h" #include "tera.h" -#include "utils/counter.h" +#include "common/counter.h" DECLARE_int32(tera_http_ctrl_thread_num); DECLARE_int32(tera_http_request_thread_num); diff --git a/src/sdk/multi_row_txn.cc b/src/sdk/multi_row_txn.cc deleted file mode 100644 index 7f9b1a8c8..000000000 --- a/src/sdk/multi_row_txn.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#include "common/thread_pool.h" - -#include "sdk/read_impl.h" -#include "sdk/single_row_txn.h" -#include "sdk/table_impl.h" -#include "sdk/multi_row_txn.h" - -namespace tera { - -Transaction* NewTransaction() { - return MultiRowTxn::NewMultiRowTxn(); -} - -Transaction* MultiRowTxn::NewMultiRowTxn() { - // int64_t start_ts = TimeOracle::GetTimestamp(); - int64_t start_ts = 42; - if (start_ts > 0) { - return new MultiRowTxn(start_ts); - } else { - return NULL; - } -} - -MultiRowTxn::MultiRowTxn(int64_t start_ts) - : start_ts_(start_ts) {} - -MultiRowTxn::~MultiRowTxn() {} - -std::string LockColumnName(const std::string& c) { - return c + "__l__"; // lock -} - -std::string WriteColumnName(const std::string& c) { - return c + "__w__"; // write -} - -bool MultiRowTxn::IsWritingByOthers(RowMutation* row_mu, RowReader* reader) { - return false; -} - -bool MultiRowTxn::IsLockedByOthers(RowMutation* row_mu, RowReader* reader) { - return false; -} - -ErrorCode MultiRowTxn::Prewrite(RowMutation* w, RowMutation* primary) { - ErrorCode status; - return status; -} - -bool MultiRowTxn::LockExists(tera::Transaction* single_row_txn, RowMutation* row_mu) { - return false; -} - -ErrorCode MultiRowTxn::Commit() { - assert(writes_.size() > 0); - - ErrorCode status; - return status; -} - -void MultiRowTxn::ApplyMutation(RowMutation* row_mu) { - assert(row_mu != NULL); - writes_.push_back(row_mu); -} - -ErrorCode MultiRowTxn::Get(RowReader* row_reader) { - assert(row_reader != NULL); - - ErrorCode status; - return status; -} - -} // namespace tera - -/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/sdk/multi_row_txn.h b/src/sdk/multi_row_txn.h deleted file mode 100644 index acc9998a6..000000000 --- a/src/sdk/multi_row_txn.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2016, Baidu.com, Inc. All Rights Reserved -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
- -#ifndef TERA_SDK_TXN_H_ -#define TERA_SDK_TXN_H_ - -#include -#include - -#include "tera.h" - -namespace tera { - -/// cross-row, cross-table transaction -/// 跨行,跨表事务 - -class MultiRowTxn: public Transaction { -public: - static Transaction* NewMultiRowTxn(); - virtual ~MultiRowTxn(); - - virtual ErrorCode Get(RowReader* row_reader); - virtual void ApplyMutation(RowMutation* row_mu); - /// 提交事务 - /// 同步模式下,Commit()的返回值代表了提交操作的结果(成功 或者 失败及其原因) - /// 异步模式下,通过GetError()获取提交结果 - virtual ErrorCode Commit(); - - typedef void (*Callback)(Transaction* transaction); - virtual void SetCommitCallback(Callback callback) {} - virtual Callback GetCommitCallback() { return NULL; } - virtual void SetContext(void* context) {} - virtual void* GetContext() { return NULL; } - virtual const ErrorCode& GetError() { return status_; } - virtual int64_t GetStartTimestamp() { return 0; } - -private: - MultiRowTxn(int64_t start_ts); - MultiRowTxn(const MultiRowTxn&); - void operator=(const MultiRowTxn&); - - bool IsWritingByOthers(RowMutation* row_mu, RowReader* reader); - bool IsLockedByOthers(RowMutation* row_mu, RowReader* reader); - bool LockExists(tera::Transaction* single_row_txn, RowMutation* row_mu); - ErrorCode Prewrite(RowMutation* w, RowMutation* primary); - -private: - int64_t start_ts_; - std::vector writes_; - ErrorCode status_; -}; - -} // namespace tera - -#endif // TERA_SDK_TXN_H_ diff --git a/src/sdk/mutate_impl.cc b/src/sdk/mutate_impl.cc index a90f850d8..634fb3817 100644 --- a/src/sdk/mutate_impl.cc +++ b/src/sdk/mutate_impl.cc @@ -5,7 +5,7 @@ #include "common/base/string_format.h" #include "io/coding.h" #include "sdk/mutate_impl.h" -#include "utils/timer.h" +#include "common/timer.h" namespace tera { diff --git a/src/sdk/mutate_impl.h b/src/sdk/mutate_impl.h index 9b22af41f..c86a98c8d 100644 --- a/src/sdk/mutate_impl.h +++ b/src/sdk/mutate_impl.h @@ -13,7 +13,7 @@ #include "sdk/sdk_task.h" #include "tera.h" #include "types.h" -#include "utils/timer.h" +#include 
"common/timer.h" namespace tera { diff --git a/src/sdk/read_impl.cc b/src/sdk/read_impl.cc index 352e645b0..35738cc53 100644 --- a/src/sdk/read_impl.cc +++ b/src/sdk/read_impl.cc @@ -19,6 +19,7 @@ RowReaderImpl::RowReaderImpl(TableImpl* table, const std::string& row_key) ts_start_(kOldestTs), ts_end_(kLatestTs), max_version_(1), + max_qualifiers_(std::numeric_limits::max()), snapshot_id_(0), timeout_ms_(0), retry_times_(0), @@ -78,6 +79,12 @@ uint32_t RowReaderImpl::GetMaxVersions() { return max_version_; } +void RowReaderImpl::SetMaxQualifiers(uint64_t max_qualifiers) { + max_qualifiers_ = max_qualifiers; +} +uint64_t RowReaderImpl::GetMaxQualifiers() { + return max_qualifiers_; +} /// 设置超时时间(只影响当前操作,不影响Table::SetReadTimeout设置的默认读超时) void RowReaderImpl::SetTimeOut(int64_t timeout_ms) { @@ -303,6 +310,7 @@ const RowReader::ReadColumnList& RowReaderImpl::GetReadColumnList() { void RowReaderImpl::ToProtoBuf(RowReaderInfo* info) { info->set_key(row_key_); info->set_max_version(max_version_); + info->set_max_qualifiers(max_qualifiers_); info->mutable_time_range()->set_ts_start(ts_start_); info->mutable_time_range()->set_ts_end(ts_end_); diff --git a/src/sdk/read_impl.h b/src/sdk/read_impl.h index cf88cd65c..23dabcda1 100644 --- a/src/sdk/read_impl.h +++ b/src/sdk/read_impl.h @@ -13,7 +13,7 @@ #include "sdk/sdk_task.h" #include "tera.h" #include "types.h" -#include "utils/timer.h" +#include "common/timer.h" namespace tera { @@ -44,6 +44,8 @@ class RowReaderImpl : public RowReader, public SdkTask { void SetMaxVersions(uint32_t max_version); /// 返回max_version uint32_t GetMaxVersions(); + void SetMaxQualifiers(uint64_t max_qualifiers); + uint64_t GetMaxQualifiers(); /// 设置超时时间(只影响当前操作,不影响Table::SetReadTimeout设置的默认读超时) void SetTimeOut(int64_t timeout_ms); /// 设置异步回调, 操作会异步返回 @@ -120,6 +122,8 @@ class RowReaderImpl : public RowReader, public SdkTask { Table* GetTable() { return (Table*)table_; } + uint32_t Size() { return 0; } + private: TableImpl* table_; std::string 
row_key_; @@ -137,6 +141,7 @@ class RowReaderImpl : public RowReader, public SdkTask { int64_t ts_start_; int64_t ts_end_; uint32_t max_version_; + uint64_t max_qualifiers_; uint64_t snapshot_id_; int64_t timeout_ms_; diff --git a/src/sdk/rowlock_client.cc b/src/sdk/rowlock_client.cc new file mode 100644 index 000000000..ff145eeb5 --- /dev/null +++ b/src/sdk/rowlock_client.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "sdk/rowlock_client.h" + +#include +#include + +#include "gflags/gflags.h" + +#include "observer/rowlocknode/ins_rowlock_client_zk_adapter.h" +#include "proto/rowlocknode_rpc.pb.h" +#include "types.h" +#include "utils/utils_cmd.h" + +DECLARE_string(rowlock_server_port); +DECLARE_string(tera_coord_type); +DECLARE_bool(rowlock_test); +DECLARE_int32(rowlock_client_max_fail_times); +DECLARE_bool(mock_rowlock_enable); + +namespace tera{ +namespace observer { + +ThreadPool* RowlockStub::thread_pool_ = NULL; + +void RowlockStub::SetThreadPool(ThreadPool* thread_pool) { + thread_pool_ = thread_pool; +} + +void RowlockStub::SetRpcOption(int32_t max_inflow, int32_t max_outflow, + int32_t pending_buffer_size, int32_t thread_num) { + tera::RpcClientBase::SetOption(max_inflow, max_outflow, + pending_buffer_size, thread_num); +} + +RowlockStub::RowlockStub(const std::string& server_addr, + int32_t rpc_timeout) + : tera::RpcClient(server_addr), + rpc_timeout_(rpc_timeout) { +} + +RowlockStub::~RowlockStub() {} + +bool RowlockStub::TryLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done) { + return SendMessageWithRetry(&RowlockService::Stub::Lock, + request, response, done, "TryLock", + rpc_timeout_, thread_pool_); +} + +bool RowlockStub::UnLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done) { + return 
SendMessageWithRetry(&RowlockService::Stub::UnLock, + request, response, done, "UnLock", + rpc_timeout_, thread_pool_); +} + +void RowlockClient::SetThreadPool(ThreadPool* thread_pool) { + RowlockStub::SetThreadPool(thread_pool); +} + +RowlockClient::RowlockClient(const std::string& addr, int32_t rpc_timeout) + : local_addr_(tera::utils::GetLocalHostName() + ":" + FLAGS_rowlock_server_port) { + srand((unsigned int)(time(NULL))); + + SetZkAdapter(); +} + +void RowlockClient::Update(const std::vector& addrs) { + std::string addr = addrs[rand() % addrs.size()]; + std::shared_ptr client(new RowlockStub(addr)); + + MutexLock locker(&client_mutex_); + client_.swap(client); +} + +bool RowlockClient::TryLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done) { + std::shared_ptr client; + { + MutexLock locker(&client_mutex_); + // COW ref +1 + client = client_; + } + for (int32_t i = 0; i < FLAGS_rowlock_client_max_fail_times; ++i) { + bool ret = client->TryLock(request, response, done); + if (ret) { + return true; + } + // rpc fail + SetZkAdapter(); + } + return false; +} + +bool RowlockClient::UnLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done) { + std::shared_ptr client; + { + MutexLock locker(&client_mutex_); + // copy the current stub pointer under the lock; Update() swaps client_ under the same mutex + client = client_; + } + for (int32_t i = 0; i < FLAGS_rowlock_client_max_fail_times; ++i) { + bool ret = client->UnLock(request, response, done); + if (ret) { + return true; + } + // rpc fail: re-run SetZkAdapter() before the next unlock attempt + SetZkAdapter(); + } + return false; +} + +void RowlockClient::SetZkAdapter() { + // mock rowlock, do not need a real zk adapter + if (FLAGS_mock_rowlock_enable == true) { + return; + } + + if (FLAGS_tera_coord_type == "zk") { + zk_adapter_.reset(new ZkRowlockClientZkAdapter(this, local_addr_)); + } else if (FLAGS_tera_coord_type == "ins") { + zk_adapter_.reset(new InsRowlockClientZkAdapter(this, local_addr_)); + } else { + LOG(ERROR) << "Unknow coord type for rowlock client"; +
return; + } + + zk_adapter_->Init(); +} + +} // namespace observer +} // namespace tera diff --git a/src/sdk/rowlock_client.h b/src/sdk/rowlock_client.h new file mode 100644 index 000000000..c475a180f --- /dev/null +++ b/src/sdk/rowlock_client.h @@ -0,0 +1,77 @@ +// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_SDK_ROWLOCK_CLIENT_H_ +#define TERA_SDK_ROWLOCK_CLIENT_H_ + +#include + +#include +#include + +#include "common/mutex.h" +#include "observer/rowlocknode/zk_rowlock_client_zk_adapter.h" +#include "proto/rpc_client.h" +#include "proto/rowlocknode_rpc.pb.h" + +namespace tera { +namespace observer { + +class RowlockClientZkAdapter; + +class RowlockStub : public tera::RpcClient { +public: + static void SetThreadPool(ThreadPool* thread_pool); + + static void SetRpcOption(int32_t max_inflow = -1, int32_t max_outflow = -1, + int32_t pending_buffer_size = -1, + int32_t thread_num = -1); + + RowlockStub(const std::string& addr = "", int32_t rpc_timeout = 60000); + ~RowlockStub(); + + virtual bool TryLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done = NULL); + + virtual bool UnLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done = NULL); + + +private: + int32_t rpc_timeout_; + static ThreadPool* thread_pool_; +}; + +class RowlockClient { +public: + static void SetThreadPool(ThreadPool* thread_pool); + + RowlockClient(const std::string& addr = "", int32_t rpc_timeout = 60000); + ~RowlockClient() {} + + virtual bool TryLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done = NULL); + + virtual bool UnLock(const RowlockRequest* request, + RowlockResponse* response, + std::function done = NULL); + + void Update(const std::vector& addrs); + +private: + void SetZkAdapter(); + +private: + mutable Mutex client_mutex_; + std::shared_ptr client_; + 
std::unique_ptr zk_adapter_; + std::string local_addr_; +}; + +} // namespace observer +} // namespace tera +#endif // TERA_SDK_ROWLOCK_CLIENT_H_ diff --git a/src/sdk/scan.cc b/src/sdk/scan.cc index 846cc7044..f4b630216 100644 --- a/src/sdk/scan.cc +++ b/src/sdk/scan.cc @@ -31,6 +31,10 @@ void ScanDescriptor::SetMaxVersions(int32_t versions) { impl_->SetMaxVersions(versions); } +void ScanDescriptor::SetMaxQualifiers(uint64_t max_qualifiers) { + impl_->SetMaxQualifiers(max_qualifiers); +} + void ScanDescriptor::SetPackInterval(int64_t interval) { impl_->SetPackInterval(interval); } diff --git a/src/sdk/scan_impl.cc b/src/sdk/scan_impl.cc index 786a05547..68049b017 100644 --- a/src/sdk/scan_impl.cc +++ b/src/sdk/scan_impl.cc @@ -5,6 +5,7 @@ #include "sdk/scan_impl.h" #include +#include #include "common/this_thread.h" #include "common/base/string_ext.h" @@ -14,16 +15,18 @@ #include "sdk/filter_utils.h" #include "sdk/sdk_utils.h" #include "sdk/table_impl.h" -#include "utils/atomic.h" -#include "utils/timer.h" +#include "common/atomic.h" +#include "common/timer.h" DECLARE_bool(tera_sdk_batch_scan_enabled); DECLARE_int64(tera_sdk_scan_number_limit); DECLARE_int64(tera_sdk_scan_buffer_size); DECLARE_int32(tera_sdk_max_batch_scan_req); DECLARE_int32(tera_sdk_batch_scan_max_retry); +DECLARE_int32(tera_sdk_sync_scan_max_retry); DECLARE_int64(tera_sdk_scan_timeout); DECLARE_int64(batch_scan_delay_retry_in_us); +DECLARE_int64(sync_scan_delay_retry_in_ms); namespace tera { @@ -374,6 +377,7 @@ ResultStreamSyncImpl::ResultStreamSyncImpl(TableImpl* table, response_(new tera::ScanTabletResponse), result_pos_(0), finish_cond_(&finish_mutex_), + retry_times_(0), finish_(false) { table_ptr_->ScanTabletSync(this); } @@ -392,13 +396,37 @@ bool ResultStreamSyncImpl::Done(ErrorCode* err) { while (1) { const string& scan_end_key = scan_desc_impl_->GetEndRowKey(); /// scan failed - if (response_->status() != kTabletNodeOk) { + while (response_->status() != kTabletNodeOk && + retry_times_ <=
FLAGS_tera_sdk_sync_scan_max_retry) { + LOG(WARNING) << "[RETRY " << ++retry_times_ << "] scan error: " + << StatusCodeToString(response_->status()); + + int64_t wait_time; + if(response_->status() == kKeyNotInRange) { + wait_time = FLAGS_sync_scan_delay_retry_in_ms; + } else { + /// Exponential backoff capped at 60 seconds; the shift is clamped and done in 64 bits so a large retry count cannot overflow + wait_time = std::min(static_cast(FLAGS_sync_scan_delay_retry_in_ms * (1LL << std::min(retry_times_ - 1, 16))), + static_cast(60000)); + } + + delete response_; + response_ = new tera::ScanTabletResponse; + result_pos_ = 0; + Reset(); + + ThisThread::Sleep(wait_time); + table_ptr_->ScanTabletSync(this); + } + + if(response_->status() != kTabletNodeOk) { if (err) { err->SetFailed(ErrorCode::kSystem, - StatusCodeToString(response_->status())); + StatusCodeToString(response_->status())); } return true; } + if (result_pos_ < response_->results().key_values_size()) { break; } @@ -542,6 +570,7 @@ ScanDescImpl::ScanDescImpl(const string& rowkey) number_limit_(FLAGS_tera_sdk_scan_number_limit), is_async_(FLAGS_tera_sdk_batch_scan_enabled), max_version_(1), + max_qualifiers_(std::numeric_limits::max()), pack_interval_(FLAGS_tera_sdk_scan_timeout), snapshot_(0), value_converter_(&DefaultValueConverter) { @@ -558,6 +587,7 @@ ScanDescImpl::ScanDescImpl(const ScanDescImpl& impl) number_limit_(impl.number_limit_), is_async_(impl.is_async_), max_version_(impl.max_version_), + max_qualifiers_(impl.max_qualifiers_), pack_interval_(impl.pack_interval_), snapshot_(impl.snapshot_), table_schema_(impl.table_schema_) { @@ -622,6 +652,10 @@ void ScanDescImpl::SetMaxVersions(int32_t versions) { max_version_ = versions; } +void ScanDescImpl::SetMaxQualifiers(int64_t max_qualifiers) { + max_qualifiers_ = max_qualifiers; +} + void ScanDescImpl::SetPackInterval(int64_t interval) { pack_interval_ = interval; } @@ -693,6 +727,10 @@ int32_t ScanDescImpl::GetMaxVersion() const { return max_version_; } +int64_t ScanDescImpl::GetMaxQualifiers() const { + return max_qualifiers_; +} + int64_t
ScanDescImpl::GetPackInterval() const { return pack_interval_; } diff --git a/src/sdk/scan_impl.h b/src/sdk/scan_impl.h index 2d808044f..32d647c8b 100644 --- a/src/sdk/scan_impl.h +++ b/src/sdk/scan_impl.h @@ -16,7 +16,7 @@ #include "sdk/sdk_task.h" #include "tera.h" #include "types.h" -#include "utils/timer.h" +#include "common/timer.h" namespace tera { @@ -162,21 +162,10 @@ class ResultStreamSyncImpl : public ResultStreamImpl { int32_t result_pos_; mutable Mutex finish_mutex_; common::CondVar finish_cond_; + int32_t retry_times_; bool finish_; }; -struct ScanTask : public SdkTask { - ResultStreamImpl* stream; - tera::ScanTabletRequest* request; - tera::ScanTabletResponse* response; - - uint32_t retry_times; - void IncRetryTimes() { retry_times++; } - uint32_t RetryTimes() { return retry_times; } - ScanTask() : SdkTask(SdkTask::SCAN), stream(NULL), request(NULL), - response(NULL), retry_times(0) {} -}; - typedef ScanDescriptor::ValueConverter ValueConverter; class ScanDescImpl { @@ -195,6 +184,8 @@ class ScanDescImpl { void SetMaxVersions(int32_t versions); + void SetMaxQualifiers(int64_t max_qualifiers); + void SetPackInterval(int64_t timeout); void SetTimeRange(int64_t ts_end, int64_t ts_start); @@ -238,6 +229,8 @@ class ScanDescImpl { int32_t GetMaxVersion() const; + int64_t GetMaxQualifiers() const; + int64_t GetPackInterval() const; uint64_t GetSnapshot() const; @@ -272,6 +265,7 @@ class ScanDescImpl { int64_t number_limit_; bool is_async_; int32_t max_version_; + int64_t max_qualifiers_; int64_t pack_interval_; uint64_t snapshot_; std::string filter_string_; @@ -280,6 +274,26 @@ class ScanDescImpl { TableSchema table_schema_; }; +struct ScanTask : public SdkTask { + ResultStreamImpl* stream; + tera::ScanTabletRequest* request; + tera::ScanTabletResponse* response; + + uint32_t retry_times; + void IncRetryTimes() { retry_times++; } + uint32_t RetryTimes() { return retry_times; } + ScanTask() : SdkTask(SdkTask::SCAN), stream(NULL), request(NULL), + 
response(NULL), retry_times(0) {} + + virtual bool IsAsync() { return false; } + virtual uint32_t Size() { return 0; } + virtual int64_t TimeOut() { return 0; } + virtual void Wait() {} + virtual void SetError(ErrorCode::ErrorCodeType err, + const std::string& reason) {} + virtual const std::string& RowKey() { return stream->GetScanDesc()->GetStartRowKey(); } +}; + } // namespace tera #endif // TERA_SDK_SCAN_IMPL_H_ diff --git a/src/sdk/schema_impl.cc b/src/sdk/schema_impl.cc index 7e9e3b264..bf8cc6f00 100644 --- a/src/sdk/schema_impl.cc +++ b/src/sdk/schema_impl.cc @@ -14,6 +14,7 @@ DECLARE_int64(tera_master_merge_tablet_size); namespace tera { const std::string TableDescImpl::DEFAULT_LG_NAME = "lg0"; +const std::string TableDescImpl::NOTIFY_LG_NAME = "notify"; const std::string TableDescImpl::DEFAULT_CF_NAME = ""; /// 列族名字仅允许使用字母、数字和下划线构造, 长度不超过256 @@ -29,7 +30,9 @@ CFDescImpl::CFDescImpl(const std::string& cf_name, acl_(0), owner_(0), disk_quota_(-1), - type_("") { + type_(""), + is_global_transaction_(false), + is_notify_enabled_(false) { } int32_t CFDescImpl::Id() const { @@ -88,6 +91,30 @@ ACL CFDescImpl::Acl() const { return ACL(); } +void CFDescImpl::EnableGlobalTransaction() { + is_global_transaction_ = true; +} + +void CFDescImpl::DisableGlobalTransaction() { + is_global_transaction_ = false; +} + +bool CFDescImpl::GlobalTransaction() const { + return is_global_transaction_; +} + +void CFDescImpl::EnableNotify() { + is_notify_enabled_ = true; +} + +void CFDescImpl::DisableNotify() { + is_notify_enabled_ = false; +} + +bool CFDescImpl::IsNotifyEnabled() const { + return is_notify_enabled_; +} + void CFDescImpl::SetType(const std::string& type) { type_ = type; } diff --git a/src/sdk/schema_impl.h b/src/sdk/schema_impl.h index a68a09f77..dcdb6c9a6 100644 --- a/src/sdk/schema_impl.h +++ b/src/sdk/schema_impl.h @@ -48,6 +48,18 @@ class CFDescImpl : public ColumnFamilyDescriptor { ACL Acl() const; + void EnableGlobalTransaction(); + + void 
DisableGlobalTransaction(); + + bool GlobalTransaction() const; + + void EnableNotify(); + + void DisableNotify(); + + bool IsNotifyEnabled() const; + void SetType(const std::string& type); const std::string& Type() const; @@ -63,6 +75,8 @@ class CFDescImpl : public ColumnFamilyDescriptor { int32_t owner_; int32_t disk_quota_; std::string type_; + bool is_global_transaction_; + bool is_notify_enabled_; }; /// 局部性群组描述 @@ -192,6 +206,7 @@ class TableDescImpl { std::string Alias() const; static const std::string DEFAULT_LG_NAME; + static const std::string NOTIFY_LG_NAME; static const std::string DEFAULT_CF_NAME; private: diff --git a/src/sdk/sdk_metric_name.h b/src/sdk/sdk_metric_name.h new file mode 100644 index 000000000..5b358e912 --- /dev/null +++ b/src/sdk/sdk_metric_name.h @@ -0,0 +1,58 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_SDK_METRIC_NAME_H_ +#define TERA_SDK_METRIC_NAME_H_ + +#include + +#include "common/metric/hardware_collectors.h" + +namespace tera { + +// global transaction labels +const char* const kGTxnLabelRead = "gtxn:read"; +const char* const kGTxnLabelCommit = "gtxn:commit"; +const char* const kGTxnLabelTso = "gtxn:tso"; + +// global transaction read metric names +const char* const kGTxnReadDelayMetric = "tera_sdk_gtxn_read_delay_us"; +const char* const kGTxnReadCountMetric = "tera_sdk_gtxn_read_count"; +const char* const kGTxnReadFailCountMetric = "tera_sdk_gtxn_read_fail_count"; +const char* const kGTxnReadRetryCountMetric = "tera_sdk_gtxn_read_retry_count"; +const char* const kGTxnReadRollBackCountMetric = "tera_sdk_gtxn_read_rollback_count"; +const char* const kGTxnReadRollForwardCountMetric = "tera_sdk_gtxn_read_rollforward_count"; + +// global transaction commit metric names +const char* const kGTxnCommitDelayMetric = "tera_sdk_gtxn_commit_delay_us"; +const char* const kGTxnCommitCountMetric =
"tera_sdk_gtxn_commit_count"; +const char* const kGTxnCommitFailCountMetric = "tera_sdk_gtxn_commit_fail_count"; + +const char* const kGTxnPrewriteDelayMetric = "tera_sdk_gtxn_prewrite_delay_us"; +const char* const kGTxnPrewriteCountMetric = "tera_sdk_gtxn_prewrite_count"; +const char* const kGTxnPrewriteFailCountMetric = "tera_sdk_gtxn_prewrite_fail_count"; + +const char* const kGTxnPrimaryDelayMetric = "tera_sdk_gtxn_primary_delay_us"; +const char* const kGTxnPrimaryCountMetric = "tera_sdk_gtxn_primary_count"; +const char* const kGTxnPrimaryFailCountMetric = "tera_sdk_gtxn_primary_fail_count"; + +const char* const kGTxnSecondariesDelayMetric = "tera_sdk_gtxn_secondaries_delay_us"; +const char* const kGTxnSecondariesCountMetric = "tera_sdk_gtxn_secondaries_count"; +const char* const kGTxnSecondariesFailCountMetric = "tera_sdk_gtxn_secondaries_fail_count"; + +const char* const kGTxnAcksDelayMetric = "tera_sdk_gtxn_acks_delay_us"; +const char* const kGTxnAcksCountMetric = "tera_sdk_gtxn_acks_count"; +const char* const kGTxnAcksFailCountMetric = "tera_sdk_gtxn_acks_fail_count"; + +const char* const kGTxnNotifiesDelayMetric = "tera_sdk_gtxn_notifies_delay_us"; +const char* const kGTxnNotifiesCountMetric = "tera_sdk_gtxn_notifies_count"; +const char* const kGTxnNotifiesFailCountMetric = "tera_sdk_gtxn_notifies_fail_count"; + +const char* const kGTxnTsoDelayMetric = "tera_sdk_gtxn_tso_delay_us"; +const char* const kGTxnTsoRequestCountMetric = "tera_sdk_gtxn_tso_request_count"; +} // end namespace tera + +#endif // TERA_SDK_METRIC_NAME_H_ + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/sdk/sdk_perf.cc b/src/sdk/sdk_perf.cc new file mode 100644 index 000000000..7cc5704d8 --- /dev/null +++ b/src/sdk/sdk_perf.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "sdk/sdk_perf.h" + +#include "gflags/gflags.h" + +#include "common/metric/metric_counter.h" +#include "sdk/sdk_metric_name.h" + +namespace tera { +namespace sdk { + +void PerfCollecter::DumpLog() { + std::shared_ptr latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport(); + int64_t interval = latest_report->interval_ms; + if (interval <= 0) { + // maybe happen at first report, the metric values must be 0 + // set to any non-zero value to avoid div 0 + VLOG(16) << "Metric Report interval is 0"; + interval = 1000; + } + int64_t read_delay = latest_report->FindMetricValue(kGTxnReadDelayMetric, kGTxnLabelRead); + int64_t read_cnt = latest_report->FindMetricValue(kGTxnReadCountMetric, kGTxnLabelRead); + read_delay = read_cnt > 0 ? read_delay / read_cnt : 0; + + LOG(INFO) << "[perf][gtxn] " + << "read_delay " << read_delay << " read_cnt " << read_cnt << " read_fail " + << latest_report->FindMetricValue(kGTxnReadFailCountMetric, kGTxnLabelRead) + << " read_retry_cnt " + << latest_report->FindMetricValue(kGTxnReadRetryCountMetric, kGTxnLabelRead) + << " read_rollback_cnt " + << latest_report->FindMetricValue(kGTxnReadRollBackCountMetric, kGTxnLabelRead) + << " read_rollforward_cnt " + << latest_report->FindMetricValue(kGTxnReadRollForwardCountMetric, kGTxnLabelRead); + + int64_t commit_delay = latest_report->FindMetricValue(kGTxnCommitDelayMetric, kGTxnLabelCommit); + int64_t commit_cnt = latest_report->FindMetricValue(kGTxnCommitCountMetric, kGTxnLabelCommit); + commit_delay = commit_cnt > 0 ? commit_delay / commit_cnt : 0; + + int64_t prewrite_delay = latest_report->FindMetricValue(kGTxnPrewriteDelayMetric, kGTxnLabelCommit); + int64_t prewrite_cnt = latest_report->FindMetricValue(kGTxnPrewriteCountMetric, kGTxnLabelCommit); + prewrite_delay = prewrite_cnt > 0 ? 
prewrite_delay / prewrite_cnt : 0; + + int64_t primary_delay = latest_report->FindMetricValue(kGTxnPrimaryDelayMetric, kGTxnLabelCommit); + int64_t primary_cnt = latest_report->FindMetricValue(kGTxnPrimaryCountMetric, kGTxnLabelCommit); + primary_delay = primary_cnt > 0 ? primary_delay / primary_cnt : 0; + + int64_t secondaries_delay = latest_report->FindMetricValue(kGTxnSecondariesDelayMetric, kGTxnLabelCommit); + int64_t secondaries_cnt = latest_report->FindMetricValue(kGTxnSecondariesCountMetric, kGTxnLabelCommit); + secondaries_delay = secondaries_cnt > 0 ? secondaries_delay / secondaries_cnt : 0; + + LOG(INFO) << "[perf][gtxn] " + << "commit_delay " << commit_delay << " commit_cnt " << commit_cnt << " commit_fail " + << latest_report->FindMetricValue(kGTxnCommitFailCountMetric, kGTxnLabelCommit) + << " prew_delay " << prewrite_delay << " prew_cnt " << prewrite_cnt << " prew_fail " + << latest_report->FindMetricValue(kGTxnPrewriteFailCountMetric, kGTxnLabelCommit) + << " pri_delay " << primary_delay << " pri_cnt " << primary_cnt << " pri_fail " + << latest_report->FindMetricValue(kGTxnPrimaryFailCountMetric, kGTxnLabelCommit) + << " se_delay " << secondaries_delay << " se_cnt " << secondaries_cnt << " se_fail " + << latest_report->FindMetricValue(kGTxnSecondariesFailCountMetric, kGTxnLabelCommit); + + int64_t tso_delay = latest_report->FindMetricValue(kGTxnTsoDelayMetric, kGTxnLabelTso); + int64_t tso_cnt = latest_report->FindMetricValue(kGTxnTsoRequestCountMetric, kGTxnLabelTso); + tso_delay = tso_cnt > 0 ? tso_delay / tso_cnt : 0; + LOG(INFO) << "[perf][gtxn] tso_delay " << tso_delay << " tso_cnt " << tso_cnt; + + int64_t notify_delay = latest_report->FindMetricValue(kGTxnNotifiesDelayMetric, kGTxnLabelCommit); + int64_t notify_cnt = latest_report->FindMetricValue(kGTxnNotifiesCountMetric, kGTxnLabelCommit); + notify_delay = notify_cnt > 0 ? 
notify_delay / notify_cnt : 0; + + int64_t ack_delay = latest_report->FindMetricValue(kGTxnAcksDelayMetric, kGTxnLabelCommit); + int64_t ack_cnt = latest_report->FindMetricValue(kGTxnAcksCountMetric, kGTxnLabelCommit); + ack_delay = ack_cnt > 0 ? ack_delay / ack_cnt : 0; + + LOG(INFO) << "[perf][gtxn] " + << "notify_delay " << notify_delay << " notify_cnt " << notify_cnt << " notify_fail " + << latest_report->FindMetricValue(kGTxnNotifiesFailCountMetric, kGTxnLabelCommit) + << " ack_delay " << ack_delay << " ack_cnt " << ack_cnt << " ack_fail " + << latest_report->FindMetricValue(kGTxnAcksFailCountMetric, kGTxnLabelCommit); +} + +} // namespace sdk +} // namespace tera diff --git a/src/sdk/sdk_perf.h b/src/sdk/sdk_perf.h new file mode 100644 index 000000000..d6b756a9e --- /dev/null +++ b/src/sdk/sdk_perf.h @@ -0,0 +1,54 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_SDK_SDK_PERF_H_ +#define TERA_SDK_SDK_PERF_H_ + +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "common/metric/metric_counter.h" +#include "common/metric/collector_report.h" +#include "common/thread.h" +#include "common/this_thread.h" +#include "tera.h" + +DECLARE_int32(tera_sdk_perf_collect_interval); + +namespace tera { +namespace sdk { + +class PerfCollecter { +public: + PerfCollecter() : stopped_(false){} + ~PerfCollecter() {} + + void Run() { + thread_.Start(std::bind(&PerfCollecter::ScheduleCollect, this)); + } + + void Stop() { + stopped_ = true; + thread_.Join(); + } + +private: + void ScheduleCollect() { + while (!stopped_) { + CollectorReportPublisher::GetInstance().Refresh(); + DumpLog(); + ThisThread::Sleep(FLAGS_tera_sdk_perf_collect_interval); + } + } + + void DumpLog(); +private: + common::Thread thread_; + bool stopped_; +}; + +} // namespace sdk +} // namespace tera + +#endif // TERA_SDK_SDK_PERF_H_ diff --git a/src/sdk/sdk_task.cc 
b/src/sdk/sdk_task.cc index ce1d64e2d..834bb4a97 100644 --- a/src/sdk/sdk_task.cc +++ b/src/sdk/sdk_task.cc @@ -6,7 +6,7 @@ #include -#include "utils/timer.h" +#include "common/timer.h" DECLARE_int32(tera_sdk_timeout_precision); @@ -124,7 +124,8 @@ SdkTask* SdkTimeoutManager::PopTask(int64_t task_id) { SdkTask* task = it->second; CHECK_EQ(task->GetId(), task_id); map.id_hash_map.erase(it); - map.due_time_map.erase(task); + // the erase must run in every build (assert() is compiled out under NDEBUG); exactly one element must be removed + CHECK_EQ(map.due_time_map.erase(task), 1u); return task; } else { return NULL; diff --git a/src/sdk/sdk_task.h b/src/sdk/sdk_task.h index 58f61f65a..34ec25b1d 100644 --- a/src/sdk/sdk_task.h +++ b/src/sdk/sdk_task.h @@ -24,7 +24,8 @@ class SdkTask { enum TYPE { READ, MUTATION, - SCAN + SCAN, + TASKBATCH, }; TYPE Type() { return type_; } @@ -48,6 +49,14 @@ class SdkTask { void DecRef(); void ExcludeOtherRef(); + virtual bool IsAsync() = 0; + virtual uint32_t Size() = 0; + virtual int64_t TimeOut() = 0; + virtual void Wait() = 0; + virtual void SetError(ErrorCode::ErrorCodeType err, + const std::string& reason) = 0; + virtual const std::string& RowKey() = 0; + protected: SdkTask(TYPE type) : type_(type), @@ -76,7 +85,10 @@ typedef void (*StatCallback)(Table* table, SdkTask* task); struct SdkTaskDueTimeComp { bool operator() (SdkTask* lhs, SdkTask* rhs) { - return lhs->DueTime() < rhs->DueTime(); + if (lhs->DueTime() != rhs->DueTime()) { + return lhs->DueTime() < rhs->DueTime(); + } + return lhs->GetId() < rhs->GetId(); } }; diff --git a/src/sdk/sdk_utils.cc b/src/sdk/sdk_utils.cc index 175bc7245..b135b99ed 100644 --- a/src/sdk/sdk_utils.cc +++ b/src/sdk/sdk_utils.cc @@ -18,6 +18,7 @@ #include "sdk/schema_impl.h" #include "sdk/filter_utils.h" +#include "types.h" DECLARE_int64(tera_tablet_write_block_size); DECLARE_int64(tera_tablet_ldb_sst_size); @@ -184,6 +185,12 @@ void ShowTableSchema(const TableSchema& s, bool is_x) { cf_ss << "type=bytes" << ","; } } + if (is_x || (cf_schema.gtxn() != 
false)) { + cf_ss << "gtxn=" << Switch2Str(cf_schema.gtxn()) << ","; + } + if (is_x || (cf_schema.notify() != false)) { + cf_ss << "notify=" << Switch2Str(cf_schema.notify()) << ","; + } cf_ss << "\b>"; if (cf_ss.str().size() > 5) { ss << cf_ss.str(); @@ -281,6 +288,8 @@ void TableDescToSchema(const TableDescriptor& desc, TableSchema* schema) { cf->set_max_versions(cf_desc->MaxVersions()); cf->set_min_versions(cf_desc->MinVersions()); cf->set_type(cf_desc->Type()); + cf->set_gtxn(cf_desc->GlobalTransaction()); + cf->set_notify(cf_desc->IsNotifyEnabled()); } } @@ -365,6 +374,16 @@ void TableSchemaToDesc(const TableSchema& schema, TableDescriptor* desc) { cfd->SetMinVersions(cf.min_versions()); cfd->SetTimeToLive(cf.time_to_live()); cfd->SetType(cf.type()); + if (cf.gtxn()) { + cfd->EnableGlobalTransaction(); + } else { + cfd->DisableGlobalTransaction(); + } + if (cf.notify()) { + cfd->EnableNotify(); + } else { + cfd->DisableNotify(); + } } } @@ -402,6 +421,22 @@ bool SetCfProperties(const string& name, const string& value, return false; } desc->SetType(value); + } else if (name == "gtxn") { + if (value == "on") { + desc->EnableGlobalTransaction(); + } else if (value == "off") { + desc->DisableGlobalTransaction(); + } else { + return false; + } + } else if (name == "notify") { + if (value == "on") { + desc->EnableNotify(); + } else if (value == "off") { + desc->DisableNotify(); + } else { + return false; + } }else { return false; } @@ -556,6 +591,13 @@ bool CheckTableDescrptor(const TableDescriptor& desc, ErrorCode* err) { } return false; } + if (!desc.IsTxnEnabled() && desc.ColumnFamily(i)->GlobalTransaction() == true) { + ss << " columnfamily property: gtxn is valid only when table set 'txn=on') "; + if (err != NULL) { + err->SetFailed(ErrorCode::kBadParam, ss.str()); + } + return false; + } } if (desc.IsTxnEnabled() && (desc.RawKey() == kGeneralKv || desc.RawKey() == kTTLKv)) { ss << "kv and ttlkv don't support txn"; @@ -806,6 +848,8 @@ bool 
FillTableDescriptor(PropTree& schema_tree, TableDescriptor* table_desc) { return false; } } + // extend notify locality group and _N_ columnfamily + return ExtendNotifyLgToDescriptor(table_desc); } else if (schema_tree.MaxDepth() == 3) { // full mode, all elements are user-defined // e.g. table1{ @@ -860,6 +904,8 @@ bool FillTableDescriptor(PropTree& schema_tree, TableDescriptor* table_desc) { return false; } } + // extend notify locality group and _N_ columnfamily + return ExtendNotifyLgToDescriptor(table_desc); } else { LOG(FATAL) << "never here."; } @@ -975,4 +1021,56 @@ bool IsKvTable(const TableSchema& schema) { schema.raw_key() == TTLKv); } +bool IsTransactionTable(const TableSchema& schema) { + return schema.enable_txn(); +} + +void FindGlobalTransactionCfs(const TableSchema& schema, + std::set* column_families) { + size_t cf_num = schema.column_families_size(); + for (size_t cf_no = 0; cf_no < cf_num; ++cf_no) { + const ColumnFamilySchema& cf_schema = schema.column_families(cf_no); + if (cf_schema.gtxn()) { + column_families->insert(cf_schema.name()); + } + } +} + +bool ExtendNotifyLgToDescriptor(TableDescriptor* desc) { + bool do_extend = false; + bool have_n_cf = false; + for (int32_t i = 0; i < desc->ColumnFamilyNum(); ++i) { + if (desc->ColumnFamily(i)->Name() == kNotifyColumnFamily) { + have_n_cf = true; + } + if (desc->ColumnFamily(i)->IsNotifyEnabled()) { + do_extend = true; + } + } + if (!do_extend) { + return true; + } else if (do_extend && have_n_cf) { + return false; + } + if (desc->LocalityGroup(TableDescImpl::NOTIFY_LG_NAME) != NULL) { + LOG(ERROR) << "already exists locality group: " + << TableDescImpl::NOTIFY_LG_NAME; + return false; + } + LocalityGroupDescriptor* lg_desc + = desc->AddLocalityGroup(TableDescImpl::NOTIFY_LG_NAME); + if (lg_desc == NULL) { + LOG(ERROR) << "fail to add locality group: " + << TableDescImpl::NOTIFY_LG_NAME; + return false; + } + ColumnFamilyDescriptor* cf_desc + = desc->AddColumnFamily(kNotifyColumnFamily, 
TableDescImpl::NOTIFY_LG_NAME); + if (cf_desc == NULL) { + LOG(ERROR) << "fail to add column family: " << kNotifyColumnFamily; + return false; + } + return true; +} + } // namespace tera diff --git a/src/sdk/sdk_utils.h b/src/sdk/sdk_utils.h index 4974575af..0e8ddad54 100644 --- a/src/sdk/sdk_utils.h +++ b/src/sdk/sdk_utils.h @@ -50,5 +50,11 @@ bool ParseDelimiterFile(const string& filename, std::vector* delims); bool IsKvTable(const TableSchema& schema); +bool ExtendNotifyLgToDescriptor(TableDescriptor* desc); + +bool IsTransactionTable(const TableSchema& schema); + +void FindGlobalTransactionCfs(const TableSchema& schema, std::set* column_families); + } // namespace tera #endif // TERA_SDK_SDK_UTILS_H_ diff --git a/src/sdk/sdk_zk.cc b/src/sdk/sdk_zk.cc index e08bb6c9b..874b4912c 100644 --- a/src/sdk/sdk_zk.cc +++ b/src/sdk/sdk_zk.cc @@ -5,11 +5,15 @@ #include "sdk/sdk_zk.h" #include +#include +#include +#include #include +#include "common/this_thread.h" #include "ins_sdk.h" - #include "types.h" +#include "utils/utils_cmd.h" #include "zk/zk_adapter.h" DECLARE_string(tera_zk_lib_log_path); @@ -18,14 +22,174 @@ DECLARE_bool(tera_zk_enabled); DECLARE_bool(tera_mock_zk_enabled); DECLARE_string(tera_zk_addr_list); DECLARE_string(tera_zk_root_path); +DECLARE_int32(tera_zk_timeout); +DECLARE_int32(tera_zk_retry_max_times); +DECLARE_int64(tera_zk_retry_period); DECLARE_bool(tera_ins_enabled); DECLARE_string(tera_ins_root_path); DECLARE_string(tera_ins_addr_list); +DECLARE_int64(tera_sdk_ins_session_timeout); DECLARE_bool(tera_mock_ins_enabled); +DECLARE_bool(tera_timeoracle_mock_enabled); +DECLARE_string(tera_timeoracle_mock_root_path); +DECLARE_string(tera_coord_type); namespace tera { namespace sdk { +static pthread_once_t zk_init_once = PTHREAD_ONCE_INIT; + +static void InitZkLogOnce() { + zk::ZooKeeperLightAdapter::SetLibraryLogOutput(FLAGS_tera_zk_lib_log_path); +} + +bool ClientZkAdapter::Init() { + pthread_once(&zk_init_once, InitZkLogOnce); + MutexLock 
lock(&mutex_); + LOG(INFO) << "try init zk ..."; + int zk_errno = zk::ZE_OK; + int32_t retry_cnt = 0; + int wait_time = 60000; + while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list, + FLAGS_tera_zk_root_path, + FLAGS_tera_zk_timeout, + "", &zk_errno, wait_time)) { + if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to init zk: " << zk::ZkErrnoToString(zk_errno); + return false; + } + LOG(ERROR) << "init zk fail: " << zk::ZkErrnoToString(zk_errno) + << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: " + << retry_cnt; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + LOG(INFO) << "init zk success"; + return true; +} + +bool ClientZkAdapter::RegisterClient(std::string* path) { + int64_t session_id = 0; + int zk_errno = zk::ZE_OK; + int32_t retry_cnt = 0; + LOG(INFO) << "try get client sesssion"; + while (!GetSessionId(&session_id, &zk_errno)) { + if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to get client session : " + << zk::ZkErrnoToString(zk_errno); + return false; + } + LOG(ERROR) << "get client session fail: " << zk::ZkErrnoToString(zk_errno) + << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: " + << retry_cnt; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + std::string internal_path = utils::GetLocalHostAddr() + + "-" + std::to_string(getpid()) + + "-" + std::to_string(session_id); + LOG(INFO) << "get client session success : " << internal_path; + zk_errno = zk::ZE_OK; + retry_cnt = 0; + LOG(INFO) << "try create client node : " << internal_path; + while (!CreateEphemeralNode(kClientsNodePath + "/" + internal_path, + "", + &zk_errno)) { + if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to create client node : " + << zk::ZkErrnoToString(zk_errno); + return false; + } + LOG(ERROR) << "create client node fail: " << zk::ZkErrnoToString(zk_errno) + << ". 
retry in " << FLAGS_tera_zk_retry_period << " ms, retry: " + << retry_cnt; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + LOG(INFO) << "create client node success"; + *path = internal_path; + return true; +} + +bool ClientZkAdapter::IsClientAlive(const std::string& path) { + VLOG(12) << "try check client alive : " << path; + int32_t retry_cnt = 0; + int zk_errno = zk::ZE_OK; + bool ret = true; + while (!CheckExist(kClientsNodePath + "/" + path, &ret, &zk_errno)) { + if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to check client alive : " + << zk::ZkErrnoToString(zk_errno); + // when zk server error, client should think other client is alive + return true; + } + LOG(ERROR) << "check client alive fail: " << zk::ZkErrnoToString(zk_errno) + << ". retry in " << FLAGS_tera_zk_retry_period << " ms, retry: " + << retry_cnt; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + VLOG(12) << "check client alive success"; + return ret; +} + +bool ClientZkAdapter::ReadNode(const std::string& path, std::string* value) { + VLOG(12) << "try read node : " << path; + int32_t retry_cnt = 0; + int zk_errno = zk::ZE_OK; + while (!ZooKeeperAdapter::ReadNode(path, value, &zk_errno)) { + if (retry_cnt++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to read node : " + << zk::ZkErrnoToString(zk_errno); + return false; + } + LOG(ERROR) << "read node fail: " << zk::ZkErrnoToString(zk_errno) + << ". 
retry in " << FLAGS_tera_zk_retry_period << " ms, retry: " + << retry_cnt; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + VLOG(12) << "read node success"; + return true; +} + +bool InsClientZkAdapter::Init() { + ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list); + ins_sdk_->SetTimeoutTime(FLAGS_tera_sdk_ins_session_timeout); + return true; +} + +bool InsClientZkAdapter::RegisterClient(std::string* path) { + std::string internal_path = utils::GetLocalHostAddr() + + "-" + std::to_string(getpid()) + + "-" + ins_sdk_->GetSessionID(); + LOG(INFO) << "get client session success : " << internal_path; + std::string client_path = FLAGS_tera_ins_root_path + kClientsNodePath + + "/" + internal_path; + galaxy::ins::sdk::SDKError err; + bool ret = ins_sdk_->Put(client_path, "", &err); + if (ret) { + *path = internal_path; + } + return ret; +} + +bool InsClientZkAdapter::IsClientAlive(const std::string& path) { + std::string client_path = kClientsNodePath + "/" + path; + std::string value; + return ReadNode(client_path, &value); +} + +bool InsClientZkAdapter::ReadNode(const std::string& path, std::string* value) { + std::string target_path = FLAGS_tera_ins_root_path + path; + galaxy::ins::sdk::SDKError err; + if (!ins_sdk_->Get(target_path, value, &err)) { + LOG(ERROR) << "ins read " << target_path << " fail: " << err; + return false; + } + return true; +} + std::string ClusterFinder::MasterAddr(bool update) { std::string master_addr; if (update || master_addr_ == "") { @@ -41,6 +205,21 @@ std::string ClusterFinder::MasterAddr(bool update) { return master_addr_; } +std::string ClusterFinder::TimeoracleAddr(bool update) { + std::string timeoracle_addr; + if (update || timeoracle_addr_ == "") { + if (!ReadNode(kTimeoracleNodePath, &timeoracle_addr)) { + timeoracle_addr = ""; + } + } + if (!timeoracle_addr.empty()) { + MutexLock lock(&mutex_); + timeoracle_addr_ = timeoracle_addr; + LOG(INFO) << "timeoracle addr: " << 
timeoracle_addr_; + } + return timeoracle_addr_; +} + std::string ClusterFinder::RootTableAddr(bool update) { std::string root_table_addr; if (update || root_table_addr_ == "") { @@ -72,46 +251,54 @@ std::string ClusterFinder::ClusterId() { } ZkClusterFinder::ZkClusterFinder(const std::string& zk_root_path, - const std::string& zk_addr_list) - : zk_root_path_(zk_root_path), zk_addr_list_(zk_addr_list) { -} - -static pthread_once_t zk_init_once = PTHREAD_ONCE_INIT; - -static void InitZkLogOnce() { - zk::ZooKeeperLightAdapter::SetLibraryLogOutput(FLAGS_tera_zk_lib_log_path); + const std::string& zk_addr_list, + ClientZkAdapterBase* zk_adapter) + : zk_root_path_(zk_root_path), + zk_addr_list_(zk_addr_list), + zk_adapter_(zk_adapter) { } bool ZkClusterFinder::ReadNode(const std::string& name, std::string* value) { - pthread_once(&zk_init_once, InitZkLogOnce); + if (zk_adapter_ == NULL) { + pthread_once(&zk_init_once, InitZkLogOnce); - int zk_errno = tera::zk::ZE_OK; - zk::ZooKeeperLightAdapter zk_adapter; - if (!zk_adapter.Init(zk_addr_list_, zk_root_path_, 1000 * 15, "", &zk_errno)) { - LOG(ERROR) << "Init zookeeper fail: " << tera::zk::ZkErrnoToString(zk_errno); - return false; - } + int zk_errno = tera::zk::ZE_OK; + zk::ZooKeeperLightAdapter zk_adapter; + if (!zk_adapter.Init(zk_addr_list_, zk_root_path_, 1000 * 15, "", &zk_errno)) { + LOG(ERROR) << "Init zookeeper fail: " << tera::zk::ZkErrnoToString(zk_errno); + return false; + } - if (!zk_adapter.ReadNode(name, value, &zk_errno)) { - LOG(ERROR) << "zk read " << name << " fail: " << zk::ZkErrnoToString(zk_errno); - return false; + if (!zk_adapter.ReadNode(name, value, &zk_errno)) { + LOG(ERROR) << "zk read " << name << " fail: " << zk::ZkErrnoToString(zk_errno); + return false; + } + return true; + } else { + return zk_adapter_->ReadNode(name, value); } - return true; } InsClusterFinder::InsClusterFinder(const std::string& ins_root_path, - const std::string& ins_addr_list) - : ins_root_path_(ins_root_path), 
ins_addr_list_(ins_addr_list) { + const std::string& ins_addr_list, + ClientZkAdapterBase* zk_adapter) + : ins_root_path_(ins_root_path), + ins_addr_list_(ins_addr_list), + zk_adapter_(zk_adapter) { } bool InsClusterFinder::ReadNode(const std::string& name, std::string* value) { - galaxy::ins::sdk::InsSDK ins_sdk(ins_addr_list_); - galaxy::ins::sdk::SDKError err; - if (!ins_sdk.Get(ins_root_path_ + name, value, &err)) { - LOG(ERROR) << "ins read " << name << " fail: " << err; - return false; + if (zk_adapter_ == NULL) { + galaxy::ins::sdk::InsSDK ins_sdk(ins_addr_list_); + galaxy::ins::sdk::SDKError err; + if (!ins_sdk.Get(ins_root_path_ + name, value, &err)) { + LOG(ERROR) << "ins read " << name << " fail: " << err; + return false; + } + return true; + } else { + return zk_adapter_->ReadNode(name, value); } - return true; } FakeZkClusterFinder::FakeZkClusterFinder(const std::string& fake_zk_path_prefix) @@ -122,18 +309,84 @@ bool FakeZkClusterFinder::ReadNode(const std::string& name, std::string* value) return zk::FakeZkUtil::ReadNode(fake_zk_path_prefix_ + name, value); } -ClusterFinder* NewClusterFinder() { - if (FLAGS_tera_zk_enabled) { - return new sdk::ZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list); - } else if (FLAGS_tera_ins_enabled) { - return new sdk::InsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list); - } else if (FLAGS_tera_mock_zk_enabled) { +MockTimeoracleClusterFinder::MockTimeoracleClusterFinder(const std::string& mock_root_path) { + mock_root_path_ = mock_root_path; +} + +bool MockTimeoracleClusterFinder::ReadNode(const std::string& kpath, std::string* value) { /* Reads up to 1024 bytes from the file mock_root_path_ + kpath into *value; returns false if open or pread fails. */ + std::string path = mock_root_path_ + kpath; + int fd = ::open(path.c_str(), O_RDWR); + if (fd < 0) { + return false; + } + + value->resize(1024); + char *buf = &(*value)[0]; + ssize_t len = ::pread(fd, buf, value->size(), 0); /* use the buffer length: sizeof(buf) is only the size of the char* pointer */ + ::close(fd); + if (len < 0) { + return false; + } + value->resize(len); + return true; +} + +ClientZkAdapterBase* 
NewClientZkAdapter() { + if (FLAGS_tera_coord_type.empty()) { + LOG(ERROR) << "Note: We don't recommend that use '--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord" + << " replace by '--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]' flag is usually recommended."; + } + + if (FLAGS_tera_coord_type == "zk" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) { + return new sdk::ClientZkAdapter(); + } else if (FLAGS_tera_coord_type == "ins" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) { + return new sdk::InsClientZkAdapter(); + } else if (FLAGS_tera_coord_type == "mock_zk" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) { + return new sdk::MockClientZkAdapter(); + } else if (FLAGS_tera_coord_type == "mock_ins" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) { + return new sdk::MockInsClientZkAdapter(); + } + return NULL; +} + +ClusterFinder* NewClusterFinder(ClientZkAdapterBase* zk_adapter) { + if (FLAGS_tera_coord_type.empty()) { + LOG(ERROR) << "Note: We don't recommend that use '--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord" + << " replace by '--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]' flag is usually recommended."; + } + if (FLAGS_tera_coord_type == "zk" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) { + return new sdk::ZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list, zk_adapter); + } else if (FLAGS_tera_coord_type == "ins" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) { + return new sdk::InsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list, zk_adapter); + } else if (FLAGS_tera_coord_type == "mock_zk" + || (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) { return new sdk::MockZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list); - } else if (FLAGS_tera_mock_ins_enabled) { + } else if (FLAGS_tera_coord_type == "mock_ins" + || 
(FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) { return new sdk::MockInsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list); - } else { + } else if (FLAGS_tera_coord_type == "fake_zk" + || FLAGS_tera_coord_type.empty()) { return new sdk::FakeZkClusterFinder(FLAGS_tera_fake_zk_path_prefix); } + return nullptr; +} + +ClusterFinder* NewTimeoracleClusterFinder() { + if (FLAGS_tera_timeoracle_mock_enabled) { + return new sdk::MockTimeoracleClusterFinder(FLAGS_tera_timeoracle_mock_root_path); + } else if (FLAGS_tera_coord_type == "zk") { + return new sdk::ZkClusterFinder(FLAGS_tera_zk_root_path, FLAGS_tera_zk_addr_list); + } else if (FLAGS_tera_coord_type == "ins") { + return new sdk::InsClusterFinder(FLAGS_tera_ins_root_path, FLAGS_tera_ins_addr_list); + } + + return nullptr; } } // namespace sdk diff --git a/src/sdk/sdk_zk.h b/src/sdk/sdk_zk.h index dc199abe6..8ad026ebd 100644 --- a/src/sdk/sdk_zk.h +++ b/src/sdk/sdk_zk.h @@ -9,9 +9,95 @@ #include #include +#include "ins_sdk.h" +#include "zk/zk_adapter.h" + +namespace galaxy{ +namespace ins{ +namespace sdk { + class InsSDK; +} +} +} + namespace tera { namespace sdk { +class ClientZkAdapterBase : public zk::ZooKeeperLightAdapter { +public: + virtual ~ClientZkAdapterBase() {}; + virtual bool Init() = 0; + virtual bool RegisterClient(std::string* session_str) = 0; + virtual bool IsClientAlive(const std::string& path) = 0; + virtual bool ReadNode(const std::string& path, std::string* value) = 0; +}; + +class ClientZkAdapter : public ClientZkAdapterBase { +public: + ClientZkAdapter() {} + virtual ~ClientZkAdapter() {} + virtual bool Init(); + virtual bool RegisterClient(std::string* session_str); + virtual bool IsClientAlive(const std::string& path); + virtual bool ReadNode(const std::string& path, std::string* value); +private: + mutable Mutex mutex_; +}; + +class MockClientZkAdapter : public ClientZkAdapter { +public: + MockClientZkAdapter(): ClientZkAdapter() {} + virtual 
~MockClientZkAdapter() {} + virtual bool Init() { return true; } + virtual bool RegisterClient(std::string* session_str) { + *session_str = "localhost"; + return true; + } + virtual bool IsClientAlive(const std::string& path) { + return true; + } + virtual bool ReadNode(const std::string& path, std::string* value) { + *value = "mock_zk_value"; + return true; + } +}; + +class InsClientZkAdapter : public ClientZkAdapterBase { +public: + InsClientZkAdapter() : ins_sdk_(NULL) {} + virtual ~InsClientZkAdapter() { + if (ins_sdk_ != NULL) { + delete ins_sdk_; + } + } + virtual bool Init (); + virtual bool RegisterClient(std::string* session_str); + virtual bool IsClientAlive(const std::string& path); + virtual bool ReadNode(const std::string& path, std::string* value); +private: + galaxy::ins::sdk::InsSDK* ins_sdk_; +}; + +class MockInsClientZkAdapter : public InsClientZkAdapter { +public: + MockInsClientZkAdapter() : InsClientZkAdapter() {} + virtual ~MockInsClientZkAdapter() {} + virtual bool Init() { return true; } + virtual bool RegisterClient(std::string* session_str) { + *session_str = "localhost"; + return true; + } + virtual bool IsClientAlive(const std::string& path) { + return true; + } + virtual bool ReadNode(const std::string& path, std::string* value) { + *value = "mock_ins_value"; + return true; + } +}; + +ClientZkAdapterBase* NewClientZkAdapter(); + class ClusterFinder { public: @@ -19,6 +105,7 @@ class ClusterFinder virtual ~ClusterFinder() {} std::string MasterAddr(bool update = false); std::string RootTableAddr(bool update = false); + std::string TimeoracleAddr(bool update = false); std::string ClusterId(); // cluster URI: :/// protected: @@ -30,12 +117,15 @@ class ClusterFinder private: mutable Mutex mutex_; std::string master_addr_; + std::string timeoracle_addr_; std::string root_table_addr_; }; class ZkClusterFinder : public ClusterFinder { public: - ZkClusterFinder(const std::string& zk_root_path, const std::string& zk_addr_list); + 
ZkClusterFinder(const std::string& zk_root_path, + const std::string& zk_addr_list, + ClientZkAdapterBase* zk_adapter = NULL); protected: virtual bool ReadNode(const std::string& path, std::string* value); virtual std::string Name() { return "zk"; }; @@ -44,6 +134,7 @@ class ZkClusterFinder : public ClusterFinder { private: std::string zk_root_path_; std::string zk_addr_list_; + ClientZkAdapterBase* zk_adapter_; }; class MockZkClusterFinder : public ZkClusterFinder { @@ -56,7 +147,9 @@ class MockZkClusterFinder : public ZkClusterFinder { class InsClusterFinder : public ClusterFinder { public: - InsClusterFinder(const std::string& ins_root_path, const std::string& ins_addr_list); + InsClusterFinder(const std::string& ins_root_path, + const std::string& ins_addr_list, + ClientZkAdapterBase* zk_adapter = NULL); protected: virtual bool ReadNode(const std::string& path, std::string* value); virtual std::string Name() { return "ins"; } @@ -65,6 +158,7 @@ class InsClusterFinder : public ClusterFinder { private: std::string ins_root_path_; std::string ins_addr_list_; + ClientZkAdapterBase* zk_adapter_; }; class MockInsClusterFinder : public InsClusterFinder { @@ -87,7 +181,24 @@ class FakeZkClusterFinder : public ClusterFinder { std::string fake_zk_path_prefix_; }; -ClusterFinder* NewClusterFinder(); +class MockTimeoracleClusterFinder : public ClusterFinder { +public: + MockTimeoracleClusterFinder(const std::string& mock_root_path); + +protected: + virtual bool ReadNode(const std::string& path, std::string* value); + + virtual std::string Name() { return "fakezk"; }; + + virtual std::string Authority() { return "localhost"; } + + virtual std::string Path() { return mock_root_path_; } +private: + std::string mock_root_path_; +}; + +ClusterFinder* NewTimeoracleClusterFinder(); +ClusterFinder* NewClusterFinder(ClientZkAdapterBase* zk_adapter = NULL); } // namespace sdk } // namespace tera diff --git a/src/sdk/single_row_txn.cc b/src/sdk/single_row_txn.cc index 
0d63563e1..d55c31889 100644 --- a/src/sdk/single_row_txn.cc +++ b/src/sdk/single_row_txn.cc @@ -3,16 +3,18 @@ // found in the LICENSE file. #include +#include #include "common/thread_pool.h" #include "common/base/string_format.h" #include "io/coding.h" +#include "sdk/global_txn_internal.h" #include "sdk/read_impl.h" #include "sdk/single_row_txn.h" #include "sdk/table_impl.h" #include "types.h" -#include "utils/timer.h" +#include "common/timer.h" namespace tera { @@ -27,9 +29,12 @@ SingleRowTxn::SingleRowTxn(Table* table, const std::string& row_key, reader_max_versions_(1), reader_start_timestamp_(kOldestTs), reader_end_timestamp_(kLatestTs), + start_timestamp_(0), + commit_timestamp_(0), mutation_buffer_(table, row_key), user_commit_callback_(NULL), user_commit_context_(NULL) { + start_timestamp_ = get_micros(); } SingleRowTxn::~SingleRowTxn() { @@ -185,6 +190,8 @@ void CommitCallbackWrapper(RowMutation* row_mu) { /// 提交事务 ErrorCode SingleRowTxn::Commit() { + commit_timestamp_ = get_micros(); + InternalNotify(); if (mutation_buffer_.MutationNum() > 0) { if (user_commit_callback_ != NULL) { // use our callback wrapper @@ -266,6 +273,34 @@ void SingleRowTxn::Serialize(RowMutationSequence* mu_seq) { } } +void SingleRowTxn::Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) { + std::unique_ptr mutation(t->NewRowMutation(row_key)); + std::string notify_qulifier = PackNotifyName(column_family, qualifier); + mutation->DeleteColumns(kNotifyColumnFamily, notify_qulifier, start_timestamp_); + this->ApplyMutation(mutation.get()); +} + +void SingleRowTxn::Notify(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) { + Cell cell(t, row_key, column_family, qualifier); + notify_cells_.push_back(cell); +} + +void SingleRowTxn::InternalNotify() { + for (auto cell : notify_cells_) { + std::unique_ptr mutation(cell.Table()->NewRowMutation(cell.RowKey())); + std::string 
notify_qulifier = PackNotifyName(cell.ColFamily(), cell.Qualifier()); + mutation->Put(kNotifyColumnFamily, notify_qulifier, commit_timestamp_); + // single row transaction may notify different rows + cell.Table()->ApplyMutation(mutation.get()); + } +} + } // namespace tera /* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/src/sdk/single_row_txn.h b/src/sdk/single_row_txn.h index 3a57ea143..96b0fd104 100644 --- a/src/sdk/single_row_txn.h +++ b/src/sdk/single_row_txn.h @@ -17,6 +17,7 @@ class ThreadPool; namespace tera { class TableImpl; +class Cell; class SingleRowTxn : public Transaction { public: @@ -45,8 +46,33 @@ class SingleRowTxn : public Transaction { /// 提交事务 virtual ErrorCode Commit(); - /// 请忽略此接口 - virtual int64_t GetStartTimestamp() { abort(); } + virtual int64_t GetStartTimestamp() { return start_timestamp_; } + + virtual int64_t GetCommitTimestamp() { return commit_timestamp_; } + + virtual void Ack(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier); + + virtual void Notify(Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier); + + // not support + virtual void SetIsolation(const IsolationLevel& isolation_level) { abort(); } + + // use default isolation level snapshot + virtual IsolationLevel Isolation() { return IsolationLevel::kSnapshot; } + + virtual void SetTimeout(int64_t timeout_ms) { + mutation_buffer_.SetTimeOut(timeout_ms); + } + + virtual int64_t Timeout() { + return mutation_buffer_.TimeOut(); + } public: /// 内部读操作回调 @@ -61,6 +87,8 @@ class SingleRowTxn : public Transaction { bool MarkHasRead(); void MarkNoRead(); + + void InternalNotify(); private: Table* table_; const std::string row_key_; @@ -77,10 +105,15 @@ class SingleRowTxn : public Transaction { int64_t reader_start_timestamp_; int64_t reader_end_timestamp_; + int64_t start_timestamp_; + int64_t commit_timestamp_; + RowMutationImpl mutation_buffer_; Callback 
user_commit_callback_; void* user_commit_context_; + std::vector notify_cells_; + mutable Mutex mu_; }; diff --git a/src/sdk/table_impl.cc b/src/sdk/table_impl.cc index fc153676a..c87567abd 100644 --- a/src/sdk/table_impl.cc +++ b/src/sdk/table_impl.cc @@ -34,7 +34,7 @@ #include "tera.h" #include "utils/crypt.h" #include "utils/string_util.h" -#include "utils/timer.h" +#include "common/timer.h" DECLARE_string(tera_master_meta_table_name); DECLARE_int32(tera_sdk_delay_send_internal); @@ -73,8 +73,6 @@ TableImpl::TableImpl(const std::string& table_name, commit_size_(FLAGS_tera_sdk_batch_size), write_commit_timeout_(FLAGS_tera_sdk_write_send_interval), read_commit_timeout_(FLAGS_tera_sdk_read_send_interval), - mutation_batch_seq_(0), - reader_batch_seq_(0), max_commit_pending_num_(FLAGS_tera_sdk_max_mutation_pending_num), max_reader_pending_num_(FLAGS_tera_sdk_max_reader_pending_num), meta_cond_(&meta_mutex_), @@ -126,11 +124,11 @@ void OpStatCallback(Table* table, SdkTask* task) { if (task->Type() == SdkTask::MUTATION) { ((TableImpl*)table)->StatUserPerfCounter(task->Type(), ((RowMutationImpl*)task)->GetError().GetType(), - common::timer::get_micros() - ((RowMutationImpl*)task)->GetStartTime()); + get_micros() - ((RowMutationImpl*)task)->GetStartTime()); } else if (task->Type() == SdkTask::READ) { ((TableImpl*)table)->StatUserPerfCounter(task->Type(), ((RowReaderImpl*)task)->GetError().GetType(), - common::timer::get_micros() - ((RowReaderImpl*)task)->GetStartTime()); + get_micros() - ((RowReaderImpl*)task)->GetStartTime()); } } @@ -148,13 +146,15 @@ void TableImpl::ApplyMutation(RowMutation* row_mu) { thread_pool_->AddTask(task); return; } - std::vector mu_list; - mu_list.push_back(static_cast(row_mu)); - DistributeMutations(mu_list, true); + std::vector task_list; + task_list.push_back(static_cast((RowMutationImpl*)row_mu)); + int64_t ts = get_micros(); + DistributeTasks(task_list, true, SdkTask::MUTATION); + perf_counter_.hist_async_cost.Add(get_micros() - ts); } 
void TableImpl::ApplyMutation(const std::vector& row_mutations) { - std::vector mu_list; + std::vector task_list; for (uint32_t i = 0; i < row_mutations.size(); i++) { perf_counter_.user_mu_cnt.Add(1); ((RowMutationImpl*)row_mutations[i])->Prepare(OpStatCallback); @@ -169,9 +169,11 @@ void TableImpl::ApplyMutation(const std::vector& row_mutations) { thread_pool_->AddTask(task); continue; } - mu_list.push_back(static_cast(row_mutations[i])); + task_list.push_back(static_cast((RowMutationImpl*)row_mutations[i])); } - DistributeMutations(mu_list, true); + int64_t ts = get_micros(); + DistributeTasks(task_list, true, SdkTask::MUTATION); + perf_counter_.hist_async_cost.Add(get_micros() - ts); } bool TableImpl::Put(const std::string& row_key, const std::string& family, @@ -427,6 +429,7 @@ void TableImpl::CommitScan(ScanTask* scan_task, if (impl->GetMaxVersion() != 0) { request->set_max_version(impl->GetMaxVersion()); } + request->set_max_qualifiers(impl->GetMaxQualifiers()); if (impl->GetBufferSize() != 0) { request->set_buffer_limit(impl->GetBufferSize()); } @@ -450,7 +453,7 @@ void TableImpl::CommitScan(ScanTask* scan_task, << ", start_key " << request->start() << ", end_key " << request->end() << ", scan to " << server_addr; - request->set_timestamp(common::timer::get_micros()); + request->set_timestamp(get_micros()); std::function done = std::bind(&TableImpl::ScanCallBack, this, scan_task, _1, _2, _3, _4); tabletnode_client.ScanTablet(request, response, done); @@ -460,7 +463,7 @@ void TableImpl::ScanCallBack(ScanTask* scan_task, ScanTabletRequest* request, ScanTabletResponse* response, bool failed, int error_code) { - perf_counter_.rpc_s.Add(common::timer::get_micros() - request->timestamp()); + perf_counter_.rpc_s.Add(get_micros() - request->timestamp()); perf_counter_.rpc_s_cnt.Inc(); ResultStreamImpl* stream = scan_task->stream; @@ -548,202 +551,111 @@ bool TableImpl::OpenInternal(ErrorCode* err) { return true; } -void TableImpl::DistributeMutations(const 
std::vector& mu_list, - bool called_by_user) { - typedef std::map > TsMuMap; - TsMuMap ts_mu_list; +void TableImpl::DistributeTasks(const std::vector& task_list, + bool called_by_user, + SdkTask::TYPE task_type) { + typedef std::map > TsTaskMap; + TsTaskMap ts_task_list; int64_t sync_min_timeout = -1; - std::vector sync_mu_list; + std::vector sync_task_list; + + int64_t max_pending_counter; + Counter* task_cnt = NULL; + Counter* pending_counter = NULL; + SdkTask::TimeoutFunc timeout_task; + std::string err_reason; + if (task_type == SdkTask::MUTATION) { + task_cnt = &(perf_counter_.mutate_cnt); + pending_counter = &(cur_commit_pending_counter_); + max_pending_counter = max_commit_pending_num_; + err_reason = "pending too much mutations, try it later."; + timeout_task = std::bind(&TableImpl::MutationTimeout, this, _1); + } else if (task_type == SdkTask::READ) { + task_cnt = &(perf_counter_.reader_cnt); + pending_counter = &(cur_reader_pending_counter_); + max_pending_counter = max_reader_pending_num_; + err_reason = "pending too much readers, try it later."; + timeout_task = std::bind(&TableImpl::ReaderTimeout, this, _1); + } else { + assert(0); + } - // evaluate minimum timeout of sync requests - if (called_by_user) { - for (uint32_t i = 0; i < mu_list.size(); i++) { - RowMutationImpl* row_mutation = (RowMutationImpl*)mu_list[i]; - if (!row_mutation->IsAsync()) { - sync_mu_list.push_back(row_mutation); - int64_t row_timeout = row_mutation->TimeOut() > 0 ? row_mutation->TimeOut() : timeout_; - if (row_timeout > 0 && (sync_min_timeout <= 0 || sync_min_timeout > row_timeout)) { - sync_min_timeout = row_timeout; - } + for (uint32_t i = 0; called_by_user && i < task_list.size(); i++) { + SdkTask* task = (SdkTask*)task_list[i]; + if (!task->IsAsync()) { + sync_task_list.push_back(task); + int64_t task_timeout = task->TimeOut() > 0 ? 
task->TimeOut() : timeout_; + if (task_timeout > 0 && (sync_min_timeout <= 0 || sync_min_timeout > task_timeout)) { + sync_min_timeout = task_timeout; } } } - for (uint32_t i = 0; i < mu_list.size(); i++) { - RowMutationImpl* row_mutation = (RowMutationImpl*)mu_list[i]; - perf_counter_.mutate_cnt.Inc(); + for (uint32_t i = 0; i < task_list.size(); i++) { + SdkTask* task = (SdkTask*)task_list[i]; + task_cnt->Inc(); if (called_by_user) { - row_mutation->SetId(next_task_id_.Inc()); + task->SetId(next_task_id_.Inc()); - int64_t row_timeout = -1; - if (!row_mutation->IsAsync()) { - row_timeout = sync_min_timeout; + int64_t task_timeout = -1; + if (!task->IsAsync()) { + task_timeout = sync_min_timeout; } else { - row_timeout = row_mutation->TimeOut() > 0 ? row_mutation->TimeOut() : timeout_; + task_timeout = task->TimeOut() > 0 ? task->TimeOut() : timeout_; } - SdkTask::TimeoutFunc task = std::bind(&TableImpl::MutationTimeout, this, _1); - task_pool_.PutTask(row_mutation, row_timeout, task); + perf_counter_.total_task_cnt.Inc(); + task_pool_.PutTask(task, task_timeout, timeout_task); } // flow control if (called_by_user - && cur_commit_pending_counter_.Add(row_mutation->MutationNum()) > max_commit_pending_num_ - && row_mutation->IsAsync()) { + && pending_counter->Inc() > max_pending_counter + && task->IsAsync()) { if (FLAGS_tera_sdk_async_blocking_enabled) { - while (cur_commit_pending_counter_.Get() > max_commit_pending_num_) { + while (pending_counter->Get() > max_pending_counter) { usleep(100000); } } else { - cur_commit_pending_counter_.Sub(row_mutation->MutationNum()); - row_mutation->SetError(ErrorCode::kBusy, "pending too much mutations, try it later."); - ThreadPool::Task task = - std::bind(&TableImpl::BreakRequest, this, row_mutation->GetId()); - row_mutation->DecRef(); - thread_pool_->AddTask(task); + pending_counter->Dec(); + task->SetError(ErrorCode::kBusy, err_reason); + ThreadPool::Task break_task = + std::bind(&TableImpl::BreakRequest, this, 
task->GetId()); + task->DecRef(); + thread_pool_->AddTask(break_task); continue; } } std::string server_addr; - if (!GetTabletAddrOrScheduleUpdateMeta(row_mutation->RowKey(), - row_mutation, &server_addr)) { + if (!GetTabletAddrOrScheduleUpdateMeta(task->RowKey(), + task, &server_addr)) { + perf_counter_.meta_sched_cnt.Inc(); continue; } - - ts_mu_list[server_addr].push_back(row_mutation); + ts_task_list[server_addr].push_back(task); } - TsMuMap::iterator it = ts_mu_list.begin(); - for (; it != ts_mu_list.end(); ++it) { - PackMutations(it->first, it->second); + TsTaskMap::iterator it = ts_task_list.begin(); + for (; it != ts_task_list.end(); ++it) { + PackSdkTasks(it->first, it->second, task_type); } - // 从现在开始,所有异步的row_mutation都不可以再操作了,因为随时会被用户释放 + // 从现在开始,所有异步的row_mutation都不可以再操作了,因为随时会被用户释放 // 不是用户调用的,立即返回 if (!called_by_user) { return; } // 等待同步操作返回或超时 - for (uint32_t i = 0; i < sync_mu_list.size(); i++) { - while (cur_commit_pending_counter_.Get() > max_commit_pending_num_) { + for (uint32_t i = 0; i < sync_task_list.size(); i++) { + while (pending_counter->Get() > max_pending_counter) { usleep(100000); } - - RowMutationImpl* row_mutation = (RowMutationImpl*)sync_mu_list[i]; - row_mutation->Wait(); - } -} - -void TableImpl::DistributeMutationsById(std::vector* mu_id_list) { - std::vector mu_list; - for (uint32_t i = 0; i < mu_id_list->size(); ++i) { - int64_t mu_id = (*mu_id_list)[i]; - SdkTask* task = task_pool_.GetTask(mu_id); - if (task == NULL) { - VLOG(10) << "mutation " << mu_id << " timeout when retry mutate";; - continue; - } - CHECK_EQ(task->Type(), SdkTask::MUTATION); - RowMutationImpl* row_mutation = (RowMutationImpl*)task; - mu_list.push_back(row_mutation); - } - DistributeMutations(mu_list, false); - delete mu_id_list; -} - -void TableImpl::PackMutations(const std::string& server_addr, - std::vector& mu_list) { - MutexLock lock(&mutation_batch_mutex_); - TaskBatch* mutation_batch = NULL; - bool is_instant = false; - for (size_t i = 0; i < 
mu_list.size(); ++i) { - // find existing batch or create a new batch - if (mutation_batch == NULL) { - std::map::iterator it = mutation_batch_map_.find(server_addr); - if (it != mutation_batch_map_.end()) { - mutation_batch = &it->second; - } else { - mutation_batch = &mutation_batch_map_[server_addr]; - mutation_batch->sequence_num = mutation_batch_seq_++; - mutation_batch->row_id_list = new std::vector; - ThreadPool::Task task = std::bind(&TableImpl::MutationBatchTimeout, this, - server_addr, mutation_batch->sequence_num); - int64_t timer_id = thread_pool_->DelayTask(write_commit_timeout_, task); - mutation_batch->timer_id = timer_id; - mutation_batch->byte_size = 0; - } - } - - // put mutation into the batch - RowMutationImpl* row_mutation = mu_list[i]; - mutation_batch->row_id_list->push_back(row_mutation->GetId()); - mutation_batch->byte_size += row_mutation->Size(); - is_instant |= !row_mutation->IsAsync(); - row_mutation->DecRef(); - - // commit the batch if: - // 1) batch_byte_size >= max_rpc_byte_size - // for the *LAST* batch, commit it if: - // 2) any mutation is sync (flush == true) - // 3) batch_row_num >= min_batch_row_num - if (mutation_batch->byte_size >= kMaxRpcSize || - (i == mu_list.size() - 1 && - (is_instant || mutation_batch->row_id_list->size() >= commit_size_))) { - std::vector* mu_id_list = mutation_batch->row_id_list; - uint64_t timer_id = mutation_batch->timer_id; - const bool non_block_cancel = true; - bool is_running = false; - if (!thread_pool_->CancelTask(timer_id, non_block_cancel, &is_running)) { - CHECK(is_running); // this delay task must be waiting for mutation_batch_mutex_ - } - mutation_batch_map_.erase(server_addr); - mutation_batch_mutex_.Unlock(); - CommitMutationsById(server_addr, *mu_id_list); - delete mu_id_list; - mutation_batch = NULL; - is_instant = false; - mutation_batch_mutex_.Lock(); - } + SdkTask* task = (SdkTask*)sync_task_list[i]; + task->Wait(); } } -void TableImpl::MutationBatchTimeout(std::string 
server_addr, uint64_t batch_seq) { - std::vector* mu_id_list = NULL; - { - MutexLock lock(&mutation_batch_mutex_); - std::map::iterator it = - mutation_batch_map_.find(server_addr); - if (it == mutation_batch_map_.end()) { - return; - } - TaskBatch* mutation_batch = &it->second; - if (mutation_batch->sequence_num != batch_seq) { - return; - } - mu_id_list = mutation_batch->row_id_list; - mutation_batch_map_.erase(it); - } - CommitMutationsById(server_addr, *mu_id_list); - delete mu_id_list; -} - -void TableImpl::CommitMutationsById(const std::string& server_addr, - std::vector& mu_id_list) { - std::vector mu_list; - for (size_t i = 0; i < mu_id_list.size(); i++) { - int64_t mu_id = mu_id_list[i]; - SdkTask* task = task_pool_.GetTask(mu_id); - if (task == NULL) { - VLOG(10) << "mutation " << mu_id << " timeout"; - continue; - } - CHECK_EQ(task->Type(), SdkTask::MUTATION); - mu_list.push_back((RowMutationImpl*)task); - } - CommitMutations(server_addr, mu_list); -} - void TableImpl::CommitMutations(const std::string& server_addr, std::vector& mu_list) { tabletnode::TabletNodeClient tabletnode_client_async(server_addr); @@ -776,7 +688,7 @@ void TableImpl::CommitMutations(const std::string& server_addr, request->set_is_instant(is_instant); VLOG(20) << "commit " << mu_list.size() << " mutations to " << server_addr; - request->set_timestamp(common::timer::get_micros()); + request->set_timestamp(get_micros()); std::function done = std::bind(&TableImpl::MutateCallBack, this, mu_id_list, _1, _2, _3, _4); tabletnode_client_async.WriteTablet(request, response, done); @@ -786,7 +698,7 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, WriteTabletRequest* request, WriteTabletResponse* response, bool failed, int error_code) { - perf_counter_.rpc_w.Add(common::timer::get_micros() - request->timestamp()); + perf_counter_.rpc_w.Add(get_micros() - request->timestamp()); perf_counter_.rpc_w_cnt.Inc(); if (failed) { if (error_code == sofa::pbrpc::RPC_ERROR_SERVER_SHUTDOWN || @@ 
-807,7 +719,7 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, } std::map* > retry_times_list; - std::vector not_in_range_list; + std::vector not_in_range_list; for (uint32_t i = 0; i < mu_id_list->size(); ++i) { const std::string& row = request->row_list(i).row_key(); int64_t mu_id = (*mu_id_list)[i]; @@ -835,10 +747,10 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, } // only for flow control - cur_commit_pending_counter_.Sub(row_mutation->MutationNum()); - int64_t perf_time = common::timer::get_micros(); + cur_commit_pending_counter_.Dec(); + int64_t perf_time = get_micros(); row_mutation->RunCallback(); - perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time); + perf_counter_.user_callback.Add(get_micros() - perf_time); perf_counter_.user_callback_cnt.Inc(); continue; } @@ -860,7 +772,7 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, if (err == kKeyNotInRange) { perf_counter_.mutate_range_cnt.Inc(); row_mutation->IncRetryTimes(); - not_in_range_list.push_back(row_mutation); + not_in_range_list.push_back(task); } else { row_mutation->IncRetryTimes(); std::vector* retry_mu_id_list = NULL; @@ -878,7 +790,7 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, } if (not_in_range_list.size() > 0) { - DistributeMutations(not_in_range_list, false); + DistributeTasks(not_in_range_list, false, SdkTask::MUTATION); } std::map* >::iterator it; for (it = retry_times_list.begin(); it != retry_times_list.end(); ++it) { @@ -894,6 +806,22 @@ void TableImpl::MutateCallBack(std::vector* mu_id_list, delete mu_id_list; } +void TableImpl::DistributeMutationsById(std::vector* mu_id_list) { + std::vector task_list; + for (uint32_t i = 0; i < mu_id_list->size(); ++i) { + int64_t mu_id = (*mu_id_list)[i]; + SdkTask* task = task_pool_.GetTask(mu_id); + if (task == NULL) { + VLOG(10) << "mutation " << mu_id << " timeout when retry mutate";; + continue; + } + CHECK_EQ(task->Type(), SdkTask::MUTATION); + task_list.push_back(task); + } + 
DistributeTasks(task_list, false, SdkTask::MUTATION); + delete mu_id_list; +} + void TableImpl::MutationTimeout(SdkTask* task) { perf_counter_.mutate_timeout_cnt.Inc(); CHECK_NOTNULL(task); @@ -907,199 +835,33 @@ void TableImpl::MutationTimeout(SdkTask* task) { ScheduleUpdateMeta(row_mutation->RowKey(), row_mutation->GetMetaTimeStamp()); } + + std::string err_reason; if (row_mutation->RetryTimes() == 0) { perf_counter_.mutate_queue_timeout_cnt.Inc(); - std::string err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.", - row_mutation->GetCommitTimes(), timeout_); - row_mutation->SetError(ErrorCode::kTimeout, err_reason); + err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.", + row_mutation->GetCommitTimes(), timeout_); } else { - std::string err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. last error: %s", - row_mutation->GetCommitTimes(), row_mutation->RetryTimes(), - timeout_, StatusCodeToString(err).c_str()); - row_mutation->SetError(ErrorCode::kSystem, err_reason); + err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. 
last error: %s", + row_mutation->GetCommitTimes(), row_mutation->RetryTimes(), + timeout_, StatusCodeToString(err).c_str()); } + row_mutation->SetError(ErrorCode::kTimeout, err_reason); // only for flow control - cur_commit_pending_counter_.Sub(row_mutation->MutationNum()); - int64_t perf_time = common::timer::get_micros(); + cur_commit_pending_counter_.Dec(); + int64_t perf_time = get_micros(); row_mutation->RunCallback(); - perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time); + perf_counter_.user_callback.Add(get_micros() - perf_time); perf_counter_.user_callback_cnt.Inc(); } -bool TableImpl::GetTabletLocation(std::vector* tablets, - ErrorCode* err) { - return false; -} - -bool TableImpl::GetDescriptor(TableDescriptor* desc, ErrorCode* err) { - return false; -} - void TableImpl::DistributeReaders(const std::vector& row_reader_list, bool called_by_user) { - typedef std::map > TsReaderMap; - TsReaderMap ts_reader_list; - - int64_t sync_min_timeout = -1; - std::vector sync_reader_list; - - if (called_by_user) { - for (uint32_t i = 0; i < row_reader_list.size(); i++) { - RowReaderImpl* row_reader = (RowReaderImpl*)row_reader_list[i]; - if (row_reader->IsAsync()) { - continue; - } - sync_reader_list.push_back(row_reader); - int64_t row_timeout = row_reader->TimeOut() > 0 ? row_reader->TimeOut() : timeout_; - if (row_timeout > 0 && (sync_min_timeout <= 0 || sync_min_timeout > row_timeout)) { - sync_min_timeout = row_timeout; - } - } - } - - for (uint32_t i = 0; i < row_reader_list.size(); i++) { - perf_counter_.reader_cnt.Inc(); - RowReaderImpl* row_reader = (RowReaderImpl*)row_reader_list[i]; - if (called_by_user) { - row_reader->SetId(next_task_id_.Inc()); - - int64_t row_timeout = sync_min_timeout; - if (row_reader->IsAsync()) { - row_timeout = row_reader->TimeOut() > 0 ? 
row_reader->TimeOut() : timeout_; - } - SdkTask::TimeoutFunc task = std::bind(&TableImpl::ReaderTimeout, this, _1); - task_pool_.PutTask(row_reader, row_timeout, task); - } - - // flow control - if (called_by_user - && cur_reader_pending_counter_.Inc() > max_reader_pending_num_ - && row_reader->IsAsync()) { - if (FLAGS_tera_sdk_async_blocking_enabled) { - while (cur_reader_pending_counter_.Get() > max_reader_pending_num_) { - usleep(100000); - } - } else { - cur_reader_pending_counter_.Dec(); - row_reader->SetError(ErrorCode::kBusy, "pending too much readers, try it later."); - ThreadPool::Task task = - std::bind(&TableImpl::BreakRequest, this, row_reader->GetId()); - row_reader->DecRef(); - thread_pool_->AddTask(task); - continue; - } - } - - std::string server_addr; - if (!GetTabletAddrOrScheduleUpdateMeta(row_reader->RowName(), row_reader, - &server_addr)) { - continue; - } - - std::vector& ts_row_readers = ts_reader_list[server_addr]; - ts_row_readers.push_back(row_reader); - } - - TsReaderMap::iterator it = ts_reader_list.begin(); - for (; it != ts_reader_list.end(); ++it) { - std::vector& reader_list = it->second; - PackReaders(it->first, reader_list); - } - // 从现在开始,所有异步的row_reader都不可以再操作了,因为随时会被用户释放 - - // 不是用户调用的,立即返回 - if (!called_by_user) { - return; - } - - // 等待同步操作返回或超时 - for (uint32_t i = 0; i < sync_reader_list.size(); i++) { - while (cur_reader_pending_counter_.Get() > max_reader_pending_num_) { - usleep(100000); - } - - RowReaderImpl* row_reader = (RowReaderImpl*)sync_reader_list[i]; - row_reader->Wait(); + std::vector task_list; + for (size_t i = 0; i < row_reader_list.size(); ++i) { + task_list.push_back((SdkTask*)(row_reader_list[i])); } -} - -void TableImpl::PackReaders(const std::string& server_addr, - std::vector& reader_list) { - MutexLock lock(&reader_batch_mutex_); - TaskBatch* reader_buffer = NULL; - std::map::iterator it = reader_batch_map_.find(server_addr); - if (it != reader_batch_map_.end()) { - reader_buffer = &it->second; - } else 
{ - reader_buffer = &reader_batch_map_[server_addr]; - reader_buffer->sequence_num = reader_batch_seq_++; - reader_buffer->row_id_list = new std::vector; - ThreadPool::Task task = std::bind(&TableImpl::ReaderBatchTimeout, this, - server_addr, reader_buffer->sequence_num); - uint64_t timer_id = thread_pool_->DelayTask(read_commit_timeout_, task); - reader_buffer->timer_id = timer_id; - } - - bool is_instant = false; - for (size_t i = 0; i < reader_list.size(); ++i) { - RowReaderImpl* reader = reader_list[i]; - reader_buffer->row_id_list->push_back(reader->GetId()); - is_instant |= !reader->IsAsync(); - reader->DecRef(); - } - - if (reader_buffer->row_id_list->size() >= commit_size_ || is_instant) { - std::vector* reader_id_list = reader_buffer->row_id_list; - uint64_t timer_id = reader_buffer->timer_id; - const bool non_block_cancel = true; - bool is_running = false; - if (!thread_pool_->CancelTask(timer_id, non_block_cancel, &is_running)) { - CHECK(is_running); // this delay task must be waiting for reader_batch_map_ - } - reader_batch_map_.erase(server_addr); - reader_batch_mutex_.Unlock(); - CommitReadersById(server_addr, *reader_id_list); - delete reader_id_list; - reader_buffer = NULL; - reader_batch_mutex_.Lock(); - } -} - -void TableImpl::ReaderBatchTimeout(std::string server_addr, uint64_t batch_seq) { - std::vector* reader_id_list = NULL; - { - MutexLock lock(&reader_batch_mutex_); - std::map::iterator it = - reader_batch_map_.find(server_addr); - if (it == reader_batch_map_.end()) { - return; - } - TaskBatch* reader_buffer = &it->second; - if (reader_buffer->sequence_num != batch_seq) { - return; - } - reader_id_list = reader_buffer->row_id_list; - reader_batch_map_.erase(it); - } - CommitReadersById(server_addr, *reader_id_list); - delete reader_id_list; -} - -void TableImpl::CommitReadersById(const std::string server_addr, - std::vector& reader_id_list) { - std::vector reader_list; - for (size_t i = 0; i < reader_id_list.size(); ++i) { - int64_t 
reader_id = reader_id_list[i]; - SdkTask* task = task_pool_.GetTask(reader_id); - if (task == NULL) { - VLOG(10) << "reader " << reader_id << " timeout when commit read";; - continue; - } - CHECK_EQ(task->Type(), SdkTask::READ); - RowReaderImpl* reader = (RowReaderImpl*)task; - reader_list.push_back(reader); - } - CommitReaders(server_addr, reader_list); + DistributeTasks(task_list, called_by_user, SdkTask::READ); } void TableImpl::CommitReaders(const std::string server_addr, @@ -1122,7 +884,7 @@ void TableImpl::CommitReaders(const std::string server_addr, row_reader->DecRef(); } VLOG(20) << "commit " << reader_list.size() << " reads to " << server_addr; - request->set_timestamp(common::timer::get_micros()); + request->set_timestamp(get_micros()); std::function done = std::bind(&TableImpl::ReaderCallBack, this, reader_id_list, _1, _2, _3, _4); tabletnode_client_async.ReadTablet(request, response, done); @@ -1132,7 +894,7 @@ void TableImpl::ReaderCallBack(std::vector* reader_id_list, ReadTabletRequest* request, ReadTabletResponse* response, bool failed, int error_code) { - perf_counter_.rpc_r.Add(common::timer::get_micros() - request->timestamp()); + perf_counter_.rpc_r.Add(get_micros() - request->timestamp()); perf_counter_.rpc_r_cnt.Inc(); if (failed) { if (error_code == sofa::pbrpc::RPC_ERROR_SERVER_SHUTDOWN || @@ -1185,9 +947,9 @@ void TableImpl::ReaderCallBack(std::vector* reader_id_list, } else { // err == kSnapshotNotExist row_reader->SetError(ErrorCode::kNotFound, "snapshot not found"); } - int64_t perf_time = common::timer::get_micros(); + int64_t perf_time = get_micros(); row_reader->RunCallback(); - perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time); + perf_counter_.user_callback.Add(get_micros() - perf_time); perf_counter_.user_callback_cnt.Inc(); // only for flow control cur_reader_pending_counter_.Dec(); @@ -1273,25 +1035,161 @@ void TableImpl::ReaderTimeout(SdkTask* task) { ScheduleUpdateMeta(row_reader->RowName(), 
row_reader->GetMetaTimeStamp()); } + + std::string err_reason; if (row_reader->RetryTimes() == 0) { perf_counter_.reader_queue_timeout_cnt.Inc(); - std::string err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.", - row_reader->GetCommitTimes(), timeout_); - row_reader->SetError(ErrorCode::kTimeout, err_reason); + err_reason = StringFormat("commit %lld times, retry 0 times, in %u ms.", + row_reader->GetCommitTimes(), timeout_); } else { - std::string err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. last error: %s", - row_reader->GetCommitTimes(), row_reader->RetryTimes(), - timeout_, StatusCodeToString(err).c_str()); - row_reader->SetError(ErrorCode::kSystem, err_reason); + err_reason = StringFormat("commit %lld times, retry %u times, in %u ms. last error: %s", + row_reader->GetCommitTimes(), row_reader->RetryTimes(), + timeout_, StatusCodeToString(err).c_str()); } - int64_t perf_time = common::timer::get_micros(); + row_reader->SetError(ErrorCode::kTimeout, err_reason); + int64_t perf_time = get_micros(); row_reader->RunCallback(); - perf_counter_.user_callback.Add(common::timer::get_micros() - perf_time); + perf_counter_.user_callback.Add(get_micros() - perf_time); perf_counter_.user_callback_cnt.Inc(); // only for flow control cur_reader_pending_counter_.Dec(); } +void TableImpl::PackSdkTasks(const std::string& server_addr, + std::vector& task_list, + SdkTask::TYPE task_type) { + Mutex* mutex = NULL; + std::map* task_batch_map = NULL; + SdkTask::TimeoutFunc task; + uint64_t commit_timeout = 10000; + uint32_t commit_size = commit_size_; + if (task_type == SdkTask::MUTATION) { + mutex = &mutation_batch_mutex_; + task_batch_map = &mutation_batch_map_; + commit_timeout = write_commit_timeout_; + } else if (task_type == SdkTask::READ) { + mutex = &reader_batch_mutex_; + task_batch_map = &reader_batch_map_; + commit_timeout = read_commit_timeout_; + } else { + assert(0); + } + + TaskBatch* task_batch = NULL; + bool is_instant 
= false; + MutexLock lock(mutex); + for (size_t i = 0; i < task_list.size(); ++i) { + // find existing batch or create a new batch + if (task_batch == NULL) { + std::map::iterator it = task_batch_map->find(server_addr); + if (it != task_batch_map->end()) { + task_batch = it->second; + } else { + task_batch = new TaskBatch; + task_batch->type = task_type; + task_batch->mutex = mutex; + task_batch->task_batch_map = task_batch_map; + task_batch->byte_size = 0; + task_batch->server_addr = server_addr; + task_batch->row_id_list = new std::vector; + + task_batch->SetId(next_task_id_.Inc()); + (*task_batch_map)[server_addr] = task_batch; + SdkTask::TimeoutFunc task = std::bind(&TableImpl::TaskBatchTimeout, this, _1); + task_pool_.PutTask(task_batch, commit_timeout, task); + task_batch->DecRef(); + } + } + + // put task into the batch + SdkTask* sdk_task = task_list[i]; + task_batch->row_id_list->push_back(sdk_task->GetId()); + task_batch->byte_size += sdk_task->Size(); + is_instant |= !sdk_task->IsAsync(); + sdk_task->DecRef(); + + // commit the batch if: + // 1) batch_byte_size >= max_rpc_byte_size + // for the *LAST* batch, commit it if: + // 2) any mutation is sync (flush == true) + // 3) batch_row_num >= min_batch_row_num + // 4) commit timeout + if (task_batch->byte_size >= kMaxRpcSize || + ((i == task_list.size() - 1) && + (is_instant || + (task_batch->row_id_list->size() >= commit_size)))) { + std::vector* task_id_list = task_batch->row_id_list; + task_batch->row_id_list = NULL; + task_batch_map->erase(server_addr); + mutex->Unlock(); + + CommitTasksById(server_addr, *task_id_list, task_type); + delete task_id_list; + task_batch = NULL; + is_instant = false; + mutex->Lock(); + } + } +} + +void TableImpl::TaskBatchTimeout(SdkTask* task) { + std::vector* task_id_list = NULL; + CHECK_NOTNULL(task); + CHECK_EQ(task->Type(), SdkTask::TASKBATCH); + TaskBatch* task_batch = (TaskBatch*)task; + task_batch->ExcludeOtherRef(); + + const std::string& server_addr = 
task_batch->server_addr; + SdkTask::TYPE task_type = task_batch->type; + Mutex* mutex = task_batch->mutex; + std::map* task_batch_map = task_batch->task_batch_map; + { + MutexLock lock(mutex); + std::map::iterator it = + task_batch_map->find(server_addr); + if (it != task_batch_map->end() && + task_batch->GetId() == it->second->GetId()) { + task_id_list = task_batch->row_id_list; + task_batch->row_id_list = NULL; + task_batch_map->erase(it); + } + } + + if (task_id_list != NULL) { + CommitTasksById(server_addr, *task_id_list, task_type); + delete task_id_list; + } + delete task_batch; +} + +void TableImpl::CommitTasksById(const std::string& server_addr, + std::vector& task_id_list, + SdkTask::TYPE task_type) { + std::vector mutation_list; + std::vector reader_list; + + for (size_t i = 0; i < task_id_list.size(); i++) { + int64_t task_id = task_id_list[i]; + SdkTask* task = task_pool_.GetTask(task_id); + if (task == NULL) { + VLOG(10) << "commit task, type " << task_type << ", id " << task_id << " timeout"; + continue; + } + perf_counter_.total_commit_cnt.Inc(); + CHECK_EQ(task->Type(), task_type); + if (task_type == SdkTask::MUTATION) { + mutation_list.push_back((RowMutationImpl*)task); + } else if (task_type == SdkTask::READ) { + reader_list.push_back((RowReaderImpl*)task); + } + } + if (task_type == SdkTask::MUTATION) { + CommitMutations(server_addr, mutation_list); + } else if (task_type == SdkTask::READ) { + CommitReaders(server_addr, reader_list); + } +} + bool TableImpl::GetTabletMetaForKey(const std::string& key, TabletMeta* meta) { MutexLock lock(&meta_mutex_); TabletMetaNode* node = GetTabletMetaNodeForKey(key); @@ -1486,7 +1384,7 @@ void TableImpl::ScanMetaTableAsync(const std::string& key_start, const std::stri std::function done = std::bind(&TableImpl::ScanMetaTableCallBack, this, key_start, key_end, - expand_key_end, ::common::timer::get_micros(), _1, _2, _3, _4); + expand_key_end, get_micros(), _1, _2, _3, _4); 
tabletnode_client_async.ScanTablet(request, response, done); } @@ -1497,7 +1395,7 @@ void TableImpl::ScanMetaTableCallBack(std::string key_start, ScanTabletRequest* request, ScanTabletResponse* response, bool failed, int error_code) { - perf_counter_.get_meta.Add(::common::timer::get_micros() - start_time); + perf_counter_.get_meta.Add(get_micros() - start_time); perf_counter_.get_meta_cnt.Inc(); if (failed) { if (error_code == sofa::pbrpc::RPC_ERROR_SERVER_SHUTDOWN || @@ -1699,8 +1597,8 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) { const std::string& server_addr = node.meta.server_addr(); int64_t meta_timestamp = node.update_time; - std::vector mutation_list; - std::vector reader_list; + std::vector mutation_list; + std::vector reader_list; std::map >::iterator it = pending_task_id_list_.lower_bound(start_key); @@ -1711,6 +1609,7 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) { std::list& task_id_list = it->second; for (std::list::iterator itask = task_id_list.begin(); itask != task_id_list.end(); ++itask) { + perf_counter_.meta_update_cnt.Inc(); int64_t task_id = *itask; SdkTask* task = task_pool_.GetTask(task_id); if (task == NULL) { @@ -1721,12 +1620,10 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) { switch (task->Type()) { case SdkTask::READ: { - RowReaderImpl* reader = (RowReaderImpl*)task; - reader_list.push_back(reader); + reader_list.push_back(task); } break; case SdkTask::MUTATION: { - RowMutationImpl* mutation = (RowMutationImpl*)task; - mutation_list.push_back(mutation); + mutation_list.push_back(task); } break; case SdkTask::SCAN: { ScanTask* scan_task = (ScanTask*)task; @@ -1743,10 +1640,10 @@ void TableImpl::WakeUpPendingRequest(const TabletMetaNode& node) { } if (mutation_list.size() > 0) { - PackMutations(server_addr, mutation_list); + PackSdkTasks(server_addr, mutation_list, SdkTask::MUTATION); } if (reader_list.size() > 0) { - PackReaders(server_addr, reader_list); + 
PackSdkTasks(server_addr, reader_list, SdkTask::READ); } } @@ -2068,6 +1965,19 @@ void TableImpl::PerfCounter::DoDumpPerfCounterLog(const std::string& log_prefix) << " cost_90: " << hist_read_cost.Percentile(90) << " cost_99: " << hist_read_cost.Percentile(99); hist_read_cost.Clear(); + + LOG(INFO) << log_prefix << "[hist_async_cost]" + << " cost_ave: " << hist_async_cost.Average() + << " cost_50: " << hist_async_cost.Percentile(50) + << " cost_90: " << hist_async_cost.Percentile(90) + << " cost_99: " << hist_async_cost.Percentile(99); + hist_async_cost.Clear(); + + LOG(INFO) << log_prefix << "[total]" + << " meta_sched_cnt: " << meta_sched_cnt.Get() + << " meta_update_cnt: " << meta_update_cnt.Get() + << " total_task_cnt: " << total_task_cnt.Get() + << " total_commit_cnt: " << total_commit_cnt.Get(); } void TableImpl::DelayTaskWrapper(ThreadPool::Task task, int64_t task_id) { @@ -2148,6 +2058,15 @@ void TableImpl::StatUserPerfCounter(enum SdkTask::TYPE op, ErrorCode::ErrorCodeT } } +bool TableImpl::GetTabletLocation(std::vector* tablets, + ErrorCode* err) { + return false; +} + +bool TableImpl::GetDescriptor(TableDescriptor* desc, ErrorCode* err) { + return false; +} + /// 创建事务 Transaction* TableImpl::StartRowTransaction(const std::string& row_key) { return new SingleRowTxn((Table*)this, row_key, thread_pool_); diff --git a/src/sdk/table_impl.h b/src/sdk/table_impl.h index 088a2c206..6e0986b62 100644 --- a/src/sdk/table_impl.h +++ b/src/sdk/table_impl.h @@ -16,7 +16,7 @@ #include "sdk/sdk_task.h" #include "sdk/sdk_zk.h" #include "tera.h" -#include "utils/counter.h" +#include "common/counter.h" namespace tera { @@ -261,10 +261,16 @@ class TableImpl : public Table { Counter user_read_fail; ::leveldb::Histogram hist_read_cost; + ::leveldb::Histogram hist_async_cost; + Counter meta_sched_cnt; + Counter meta_update_cnt; + Counter total_task_cnt; + Counter total_commit_cnt; + void DoDumpPerfCounterLog(const std::string& log_prefix); PerfCounter() { - start_time = 
common::timer::get_micros(); + start_time = get_micros(); } }; private: @@ -274,22 +280,13 @@ class TableImpl : public Table { std::vector* kv_list, ErrorCode* err); - // 将一批mutation根据rowkey分配给各个TS - void DistributeMutations(const std::vector& mu_list, - bool called_by_user); + void DistributeTasks(const std::vector& task_list, + bool called_by_user, + SdkTask::TYPE task_type); void DistributeMutationsById(std::vector* retry_mu_id_list); - // 分配完成后将mutation打包 - void PackMutations(const std::string& server_addr, - std::vector& mu_list); - - // mutation打包不满但到达最大等待时间 - void MutationBatchTimeout(std::string server_addr, uint64_t batch_seq); - // 通过异步RPC将mutation提交至TS - void CommitMutationsById(const std::string& server_addr, - std::vector& mu_id_list); void CommitMutations(const std::string& server_addr, std::vector& mu_list); @@ -306,21 +303,12 @@ class TableImpl : public Table { void DistributeReaders(const std::vector& row_reader_list, bool called_by_user); - void DistributeReadersById(std::vector* reader_id_list); - - // 分配完成后将reader打包 - void PackReaders(const std::string& server_addr, - std::vector& reader_list); - - // reader打包不满但到达最大等待时间 - void ReaderBatchTimeout(std::string server_addr, uint64_t batch_seq); - // 通过异步RPC将reader提交至TS - void CommitReadersById(const std::string server_addr, - std::vector& reader_id_list); void CommitReaders(const std::string server_addr, std::vector& reader_list); + void DistributeReadersById(std::vector* reader_id_list); + // reader RPC回调 void ReaderCallBack(std::vector* reader_id_list, ReadTabletRequest* request, @@ -330,6 +318,14 @@ class TableImpl : public Table { // reader到达用户设置的超时时间但尚未处理完 void ReaderTimeout(SdkTask* sdk_task); + void PackSdkTasks(const std::string& server_addr, + std::vector& task_list, + SdkTask::TYPE task_type); + void TaskBatchTimeout(SdkTask* task); + void CommitTasksById(const std::string& server_addr, + std::vector& task_id_list, + SdkTask::TYPE task_type); + void ScanTabletAsync(ScanTask* scan_task, 
bool called_by_user); void CommitScan(ScanTask* scan_task, const std::string& server_addr); @@ -415,11 +411,22 @@ class TableImpl : public Table { TableImpl(const TableImpl&); void operator=(const TableImpl&); - struct TaskBatch { - uint64_t sequence_num; - uint64_t timer_id; + struct TaskBatch : public SdkTask { uint64_t byte_size; + std::string server_addr; + SdkTask::TYPE type; + Mutex* mutex; + std::map* task_batch_map; std::vector* row_id_list; + + TaskBatch() : SdkTask(SdkTask::TASKBATCH) {} + virtual bool IsAsync() { return false; } + virtual uint32_t Size() { return 0; } + virtual int64_t TimeOut() { return 0; } + virtual void Wait() {} + virtual void SetError(ErrorCode::ErrorCodeType err, + const std::string& reason) {} + virtual const std::string& RowKey() { return server_addr; } }; std::string name_; @@ -432,10 +439,8 @@ class TableImpl : public Table { uint32_t commit_size_; uint64_t write_commit_timeout_; uint64_t read_commit_timeout_; - std::map mutation_batch_map_; - std::map reader_batch_map_; - uint64_t mutation_batch_seq_; - uint64_t reader_batch_seq_; + std::map mutation_batch_map_; + std::map reader_batch_map_; Counter cur_commit_pending_counter_; Counter cur_reader_pending_counter_; int64_t max_commit_pending_num_; diff --git a/src/sdk/tera.cc b/src/sdk/tera.cc index 0003f9a5f..d01bce0fe 100644 --- a/src/sdk/tera.cc +++ b/src/sdk/tera.cc @@ -41,6 +41,42 @@ static const char* strerr(ErrorCode::ErrorCodeType type) { case ErrorCode::kTxnFail: ret = "TransactionFail"; break; + case ErrorCode::kGTxnDataTooLarge: + ret = "GlobalTransactionDataTooLarge"; + break; + case ErrorCode::kGTxnNotSupport: + ret = "GlobalTransactionNotSupport"; + break; + case ErrorCode::kGTxnSchemaError: + ret = "GlobalTransactionSchemaError"; + break; + case ErrorCode::kGTxnOpAfterCommit: + ret = "GlobalTransactionOpAfterCommit"; + break; + case ErrorCode::kGTxnPrimaryLost: + ret = "GlobalTransactionPrimaryLost"; + break; + case ErrorCode::kGTxnWriteConflict: + ret = 
"GlobalTransactionWriteConflict"; + break; + case ErrorCode::kGTxnLockConflict: + ret = "GlobalTransactionLockConflict"; + break; + case ErrorCode::kGTxnOKButAckFailed: + ret = "GlobalTransactionOkButAckFailed"; + break; + case ErrorCode::kGTxnOKButNotifyFailed: + ret = "GlobalTransactionOKButNotifyFailed"; + break; + case ErrorCode::kGTxnPrewriteTimeout: + ret = "GlobalTransactionPrewriteTimeout"; + break; + case ErrorCode::kGTxnPrimaryCommitTimeout: + ret = "GlobalTransactionPrimaryCommitTimeout"; + break; + case ErrorCode::kGTxnTimestampLost: + ret = "GlobalTransactionTimestampLost"; + break; default: ret = "UnkownError"; } diff --git a/src/sdk/tera_easy.cc b/src/sdk/tera_easy.cc index c0758eb1d..6978ad9e5 100644 --- a/src/sdk/tera_easy.cc +++ b/src/sdk/tera_easy.cc @@ -13,8 +13,8 @@ #include "common/thread_pool.h" #include "tera.h" -#include "utils/atomic.h" -#include "utils/counter.h" +#include "common/atomic.h" +#include "common/counter.h" DEFINE_int32(tera_easy_ttl, 90 * 24 * 3600, "ttl(s) of key-value writed by tera_easy"); DEFINE_int32(tera_sdk_rpc_max_pending_num, 1024 * 1024, "max num of pending kv"); diff --git a/src/sdk/test/filter_utils_test.cc b/src/sdk/test/filter_utils_test.cc index 19051ce6c..456d406e7 100644 --- a/src/sdk/test/filter_utils_test.cc +++ b/src/sdk/test/filter_utils_test.cc @@ -40,27 +40,21 @@ TEST(FilterUtils, DefaultValueConverter) { EXPECT_FALSE(DefaultValueConverter("", "", NULL)); in = "8"; - out_p = string("\x80\x0\x0\x0\x0\x0\x0\x7", 8); + out_p = string("\x08\x0\x0\x0\x0\x0\x0\x0", 8); type = "int64"; + EXPECT_TRUE(DefaultValueConverter(in, type, &out)); EXPECT_EQ(out, out_p); in = "-8"; - out_p = string("\x7F\xFF\xFF\xFF\xFF\xFF\xFF\xF7", 8); + out_p = string("\xF8\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8); type = "int64"; EXPECT_TRUE(DefaultValueConverter(in, type, &out)); EXPECT_EQ(out, out_p); - in = "8"; - out_p = string("\x0\x0\x0\x0\x0\x0\x0\x8", 8); - type = "uint64"; - EXPECT_TRUE(DefaultValueConverter(in, type, &out)); - 
EXPECT_EQ(out, out_p); - in = "-8"; type = "string"; - EXPECT_TRUE(DefaultValueConverter(in, type, &out)); - EXPECT_TRUE(out == "-8"); + EXPECT_FALSE(DefaultValueConverter(in, type, &out)); type = "illegal"; EXPECT_FALSE(DefaultValueConverter(in, type, &out)); diff --git a/src/sdk/test/global_txn_batch_op.cc b/src/sdk/test/global_txn_batch_op.cc new file mode 100644 index 000000000..3e1d14af6 --- /dev/null +++ b/src/sdk/test/global_txn_batch_op.cc @@ -0,0 +1,440 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tera.h" +#include "version.h" + +DECLARE_string(flagfile); + +using std::vector; +using std::string; +using std::cout; +using std::endl; +using std::pair; +using std::shared_ptr; +using std::unique_ptr; +using std::unordered_map; +using std::function; + +using TxnPtr = shared_ptr; +using RowMutationPtr = shared_ptr; +using ClientPtr = shared_ptr; +using TablePtr = shared_ptr; + +struct RowkeyCfQu{ + RowkeyCfQu()=default; + RowkeyCfQu(string rowkey, string cf, string qu): + rowkey_(rowkey), + cf_(cf), + qu_(qu) + {} + + string rowkey_, cf_, qu_; +}; +//Used for parsing operator string +using OperatorStructure = vector + vector>>; //vector of rowkey-cf-qus in a table + +static unordered_map& GetHelpCommand() { + static unordered_map help_commands; + return help_commands; +} + +static void InitHelpCommand() { + auto& help_commands = GetHelpCommand(); + help_commands["cas"] = "Compare and set old_vals to new_vals across different Tables, Rows, and Columns atomically, usage: \n" + " cas "; + help_commands["get"] = "Get values across different Tables, Rows, and Columns atomically, usage: \n" + " get "; + help_commands["put"] = "Put values across different Tables, Rows, and Columns atomically, usage: \n" + " put "; +} + +static void PrintHelp(const string& str = "") { + auto& help_commands = GetHelpCommand(); + if (str == "" || help_commands.find(str) == help_commands.end()) { + 
for (auto& help_info : help_commands) { + cout << help_info.first << " " << help_info.second << endl; + } + } else { + cout << str << ": " << help_commands[str] << endl; + } +} + +static vector split(const string& str, const char delimiter) { + vector res; + string::size_type pos = 0; + while (pos < str.size()) { + string::size_type new_pos = str.find(delimiter, pos); + if (new_pos == string::npos) { + res.emplace_back(str.begin() + pos, str.end()); + break; + } else { + res.emplace_back(str.begin() + pos, str.begin() + new_pos); + } + pos = new_pos + 1; + } + return res; +} + +static int64_t ParseOperatorStructure(const string& str, OperatorStructure& opst, size_t& num) { + opst.clear(); + num = 0; + vector table_operations = split(str, '#'); + for (auto& table_op : table_operations) { + vector table_rowkey = split(table_op, '-'); + if (table_rowkey.size() != 2) { + return -1; + } + + opst.emplace_back(table_rowkey[0], vector()); + vector row_operations = split(table_rowkey[1], ':'); + for (auto& row_op : row_operations) { + vector rowkey_cf_qu = split(row_op, '.'); + if (rowkey_cf_qu.size() < 2 || + rowkey_cf_qu.size() > 3) { + return -1; + } + + if (rowkey_cf_qu.size() == 3) { + opst.back().second.emplace_back(rowkey_cf_qu[0], rowkey_cf_qu[1], rowkey_cf_qu[2]); + } else { + opst.back().second.emplace_back(rowkey_cf_qu[0], rowkey_cf_qu[1], ""); + } + ++num; + } + } + return 0; +} + +static int64_t OpenTables(ClientPtr client, + const OperatorStructure& opst, + unordered_map& tables) { + tables.clear(); + tera::ErrorCode ec; + for (auto& table : opst) { + string tablename = table.first; + if (tables.find(table.first) == tables.end()) { + tables.emplace(table.first, TablePtr(client->OpenTable(table.first, &ec))); + if (!tables[table.first]) { + cout << "open table: " << table.first << " failed" << endl; + cout << ec.ToString() << endl; + return -1; + } + } + } + return 0; +} + +static int64_t PutOp(ClientPtr client, const vector& args) { + if (args.size() != 4) { + 
cout << "Arguments Error: " << args.size() << ", need 4" << endl; + PrintHelp(args[1]); + return -1; + } + + OperatorStructure opst; + size_t op_num = 0; + if (ParseOperatorStructure(args[2], opst, op_num) != 0) { + cout << "Parse Arguments Error" << endl; + PrintHelp(args[1]); + return -1; + } + + vector val = split(args[3], ':'); + if (op_num != val.size()) { + cout << "op size is not equal to val size" << endl; + return -1; + } + + unordered_map tables; + if (OpenTables(client, opst, tables) != 0) { + return -1; + } + + TxnPtr g_txn(client->NewGlobalTransaction()); + if (!g_txn) { + cout << "open txn failed" << endl; + return -1; + } + + string result; + for (auto& table : opst) { + const string& tablename = table.first; + const auto& row_cf_qu_list = table.second; + for (auto& row_cf_qu : row_cf_qu_list) { + const string& rowkey = row_cf_qu.rowkey_; + const string& cf = row_cf_qu.cf_ ; + const string& qu = row_cf_qu.qu_ ; + + unique_ptr reader(tables[tablename]->NewRowReader(rowkey)); + reader->AddColumn(cf, qu); + g_txn->Get(reader.get()); + if (reader->GetError().GetType() != tera::ErrorCode::kOK && + reader->GetError().GetType() != tera::ErrorCode::kNotFound) { + std::cout << reader->GetError().ToString() << std::endl; + return -1; + } + + if (reader->Done()) { + result += ":"; + } else { + result += reader->Value() + ":"; + } + } + } + + if (!result.empty()) result.pop_back(); + + auto val_iter = val.begin(); + for (auto& table : opst) { + const string& tablename = table.first; + const auto& row_cf_qu_list = table.second; + unordered_map row_mutations; + + for (auto& row_cf_qu : row_cf_qu_list) { + const string& rowkey = row_cf_qu.rowkey_; + const string& cf = row_cf_qu.cf_ ; + const string& qu = row_cf_qu.qu_ ; + + if (row_mutations.find(rowkey) == row_mutations.end()) { + RowMutationPtr row_mutation(tables[tablename]->NewRowMutation(rowkey)); + row_mutations[rowkey] = row_mutation; + } + row_mutations[rowkey]->Put(cf, qu, *(val_iter++)); + } + + for (auto 
mutation : row_mutations) { + g_txn->ApplyMutation(mutation.second.get()); + } + } + + + g_txn->Commit(); + if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << "commit failed: " << g_txn->GetError().ToString() << std::endl; + cout << result << endl; + return -1; + } + std::cout << "commit success" << std::endl; + + return 0; +} + +static int64_t GetOp(ClientPtr client, const vector& args) { + if (args.size() != 3) { + cout << "Arguments Error: " << args.size() << ", need 3" << endl; + PrintHelp(args[1]); + return -1; + } + + OperatorStructure opst; + size_t op_num = 0; + if (ParseOperatorStructure(args[2], opst, op_num) != 0) { + cout << "Parse Arguments Error" << endl; + PrintHelp(args[1]); + return -1; + } + + unordered_map tables; + if (OpenTables(client, opst, tables) != 0) { + return -1; + } + + TxnPtr g_txn(client->NewGlobalTransaction()); + if (!g_txn) { + cout << "open txn failed" << endl; + return -1; + } + + string result; + for (auto& table : opst) { + const string& tablename = table.first; + const auto& row_cf_qu_list = table.second; + for (auto& row_cf_qu : row_cf_qu_list) { + const string& rowkey = row_cf_qu.rowkey_; + const string& cf = row_cf_qu.cf_ ; + const string& qu = row_cf_qu.qu_ ; + + unique_ptr reader(tables[tablename]->NewRowReader(rowkey)); + reader->AddColumn(cf, qu); + g_txn->Get(reader.get()); + if (reader->GetError().GetType() != tera::ErrorCode::kOK && + reader->GetError().GetType() != tera::ErrorCode::kNotFound) { + std::cout << reader->GetError().ToString() << std::endl; + return -1; + } + + if (reader->Done()) { + result += ":"; + } else { + result += reader->Value() + ":"; + } + } + } + + if (!result.empty()) result.pop_back(); + cout << result << endl; + return 0; +} + +static int64_t CasOp(ClientPtr client, const vector& args) { + if (args.size() != 5) { + cout << "Arguments Error: " << args.size() << ", need 5" << endl; + PrintHelp(args[1]); + return -1; + } + + OperatorStructure opst; + size_t op_num = 0; 
+ if (ParseOperatorStructure(args[2], opst, op_num) != 0) { + cout << "Parse Arguments Error" << endl; + PrintHelp(args[1]); + return -1; + } + + unordered_map tables; + if (OpenTables(client, opst, tables) != 0) { + return -1; + } + + TxnPtr g_txn(client->NewGlobalTransaction()); + if (!g_txn) { + cout << "open txn failed" << endl; + return -1; + } + + string cur_val; + const string& old_val = args[3]; + const string& new_val = args[4]; + for (auto& table : opst) { + const string& tablename = table.first; + const auto& row_cf_qu_list = table.second; + for (auto& row_cf_qu : row_cf_qu_list) { + const string& rowkey = row_cf_qu.rowkey_; + const string& cf = row_cf_qu.cf_ ; + const string& qu = row_cf_qu.qu_ ; + + unique_ptr reader(tables[tablename]->NewRowReader(rowkey)); + reader->AddColumn(cf, qu); + g_txn->Get(reader.get()); + if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << g_txn->GetError().ToString() << std::endl; + return -1; + } + + if (reader->Done()) { + cur_val += ":"; + } else { + cur_val += reader->Value() + ":"; + } + } + } + + if (!cur_val.empty()) cur_val.pop_back(); + + if (old_val != cur_val) { + cout << "cas failed: NotEqual" << endl; + return -1; + } + + vector new_val_list = split(new_val, ':'); + if (op_num != new_val_list.size()) { + cout << "op size is not equal to val size" << endl; + return -1; + } + + auto val_iter = new_val_list.begin(); + for (auto& table : opst) { + const string& tablename = table.first; + const auto& row_cf_qu_list = table.second; + unordered_map row_mutations; + + for (auto& row_cf_qu : row_cf_qu_list) { + const string& rowkey = row_cf_qu.rowkey_; + const string& cf = row_cf_qu.cf_ ; + const string& qu = row_cf_qu.qu_ ; + + if (row_mutations.find(rowkey) == row_mutations.end()) { + RowMutationPtr row_mutation(tables[tablename]->NewRowMutation(rowkey)); + row_mutations[rowkey] = row_mutation; + } + + row_mutations[rowkey]->Put(cf, qu, *(val_iter++)); + } + + for (auto mutation : row_mutations) 
{ + g_txn->ApplyMutation(mutation.second.get()); + } + } + + g_txn->Commit(); + if (g_txn->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << "cas failed: " << g_txn->GetError().ToString() << std::endl; + return -1; + } else { + std::cout << "cas success" << endl; + } + + return 0; +} + +static void SignalHandler(int){ + _exit(0); +} + +int main(int argc, char *argv[]) { + signal(SIGINT, SignalHandler); + signal(SIGTERM, SignalHandler); + ::google::ParseCommandLineFlags(&argc, &argv, true); + + vector args(argv, argv + argc); + InitHelpCommand(); + + if (args.size() < 2) { + PrintHelp(); + return 0; + } else if (args[1] == "help") { + if (args.size() > 2) { + PrintHelp(args[2]); + return 0; + } else { + PrintHelp(); + return 0; + } + } else if (args[1] == "version") { + PrintSystemVersion(); + return 0; + } + + unordered_map& args)>> command_table; + command_table["put"] = PutOp; + command_table["get"] = GetOp; + command_table["cas"] = CasOp; + + if (command_table.find(args[1]) == command_table.end()) { + cout << "Wrong Command" << endl; + PrintHelp(); + return -1; + } + + tera::ErrorCode ec; + ClientPtr client(tera::Client::NewClient(FLAGS_flagfile, args[1], &ec)); + if (!client) { + cout << "Create Client Failed: " << ec.ToString() << endl; + return -1; + } + + return command_table[args[1]](client, args); +} diff --git a/src/sdk/test/global_txn_internal_test.cc b/src/sdk/test/global_txn_internal_test.cc new file mode 100644 index 000000000..e3310aa3a --- /dev/null +++ b/src/sdk/test/global_txn_internal_test.cc @@ -0,0 +1,789 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "sdk/global_txn_internal.h" +#include "sdk/read_impl.h" +#include "sdk/sdk_zk.h" +#include "sdk/sdk_utils.h" +#include "sdk/table_impl.h" +#include "sdk/test/mock_table.h" +#include "tera.h" + +DECLARE_string(tera_coord_type); +DECLARE_int32(tera_sdk_timeout); +DECLARE_int32(tera_gtxn_all_puts_size_limit); + +namespace tera { + +class GlobalTxnInternalTest : public ::testing::Test { +public: + GlobalTxnInternalTest() + : start_ts_(100), thread_pool_(2), gtxn_internal_(Client::NewClient()) { + gtxn_internal_.SetStartTimestamp(start_ts_); + } + + ~GlobalTxnInternalTest() {} + + Table* OpenTable(const std::string& tablename) { + FLAGS_tera_coord_type = "fake_zk"; + return static_cast(new MockTable(tablename, &thread_pool_)); + } + + void MakeKvPair(const std::string& row, + const std::string& cf, + const std::string& qu, + int64_t ts, + const std::string& val, + RowResult* value_list) { + + value_list->clear_key_values(); + KeyValuePair* kv = value_list->add_key_values(); + kv->set_key(row); + kv->set_column_family(cf); + kv->set_qualifier(qu); + kv->set_timestamp(ts); + kv->set_value(val); + } + + void SetSchema(Table* table, const TableSchema& table_schema) { + TableImpl* table_impl = static_cast(table); + table_impl->table_schema_ = table_schema; + } + + void BuildResult(RowReaderImpl* reader_impl, + const RowResult& value_list, + RowReader::TRow *row) { + + reader_impl->result_.clear_key_values(); + reader_impl->SetResult(value_list); + row->clear(); + reader_impl->ToMap(row); + } + +private: + int64_t start_ts_; + common::ThreadPool thread_pool_; + GlobalTxnInternal gtxn_internal_; +}; + +TEST_F(GlobalTxnInternalTest, CheckTable) { + ErrorCode status; + Table* t1 = OpenTable("t1"); + Table* t2 = OpenTable("t2"); + Table* t3 = OpenTable("t3"); + Table* t4 = OpenTable("t4"); + EXPECT_FALSE(t1 == NULL); + EXPECT_FALSE(t2 == 
NULL); + EXPECT_FALSE(t3 == NULL); + EXPECT_FALSE(t4 == NULL); + + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + // table and not exist cf + TableDescriptor desc1("t1"); + desc1.EnableTxn(); + desc1.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd11 = desc1.AddColumnFamily("cf1"); + cfd11->DisableGlobalTransaction(); + + TableSchema schema1; + TableDescToSchema(desc1, &schema1); + SetSchema(t2, schema1); + EXPECT_FALSE(gtxn_internal_.CheckTable(t2, &status)); + + // table and exist cf + TableDescriptor desc2("t1"); + desc2.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd12 = desc2.AddColumnFamily("cf1"); + cfd12->EnableGlobalTransaction(); + + TableSchema schema2; + TableDescToSchema(desc2, &schema2); + SetSchema(t3, schema2); + EXPECT_FALSE(gtxn_internal_.CheckTable(t3, &status)); + + // table and not exist cf + TableDescriptor desc3("t1"); + desc3.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd13 = desc3.AddColumnFamily("cf1"); + cfd13->DisableGlobalTransaction(); + + TableSchema schema3; + TableDescToSchema(desc3, &schema3); + SetSchema(t4, schema3); + EXPECT_FALSE(gtxn_internal_.CheckTable(t4, &status)); + + delete t1; + delete t2; + delete t3; + delete t4; +} + +TEST_F(GlobalTxnInternalTest, IsLockedByOthers) { + Table* t1_ptr = OpenTable("t1"); + + Cell cell1(t1_ptr, "row1", "cf1", "qu1", start_ts_, "val"); + + RowReader* reader = t1_ptr->NewRowReader("row1"); + RowReaderImpl* reader_impl = (RowReaderImpl*)reader; + RowResult value_list; + // exist lock col && ts < start_ts_ + // 12 < 100 less than start_ts + MakeKvPair("row1", "cf1", PackLockName("qu1"), 12, "", &value_list); + RowReader::TRow row; + BuildResult(reader_impl, value_list, &row); 
+ EXPECT_TRUE(gtxn_internal_.IsLockedByOthers(row, cell1)); + + // not exist lock col + value_list.clear_key_values(); + MakeKvPair("row1", "cf1", "qu1", 120, "", &value_list); + BuildResult(reader_impl, value_list, &row); + EXPECT_FALSE(gtxn_internal_.IsLockedByOthers(row, cell1)); + + // exist lock col && ts > start_ts_ + value_list.clear_key_values(); + // 120 > 100 + MakeKvPair("row1", "cf1", PackLockName("qu1"), 120, "", &value_list); + BuildResult(reader_impl, value_list, &row); + + EXPECT_FALSE(gtxn_internal_.IsLockedByOthers(row, cell1)); + delete t1_ptr; +} + +TEST_F(GlobalTxnInternalTest, IsPrimary) { + const std::string t1 = "t1", t2 = "t2", cf2 = "cf2"; + Table* t1_ptr = OpenTable(t1); + EXPECT_FALSE(t1_ptr == NULL); + Cell cell1(t1_ptr, "row1", "cf1", "qu1", start_ts_, "val"); + Cell cell2(t1_ptr, "row1", "cf2", "qu1", start_ts_, "val"); + + PrimaryInfo info2; + info2.set_table_name("t1"); + info2.set_row_key("row1"); + info2.set_column_family("cf1"); + info2.set_qualifier("qu1"); + info2.set_gtxn_start_ts(200); + + EXPECT_TRUE(gtxn_internal_.IsPrimary(cell1, info2)); + EXPECT_FALSE(gtxn_internal_.IsPrimary(cell2, info2)); + + delete t1_ptr; +} + +TEST_F(GlobalTxnInternalTest, FindTable) { + const std::string t1 = "t1", t2 = "t2", cf2 = "cf2"; + + Table* t1_ptr = OpenTable(t1); + EXPECT_FALSE(t1_ptr == NULL); + + TableDescriptor desc(t1); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd = desc.AddColumnFamily(cf2); + cfd->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1_ptr, schema); + + // call CheckTable(t1) + ErrorCode status; + EXPECT_TRUE(gtxn_internal_.CheckTable(t1_ptr, &status)); + + // t1 in tables_ + Table* t1_ptr1 = gtxn_internal_.FindTable(t1); + EXPECT_TRUE(t1_ptr1->GetName() == t1_ptr->GetName()); + + delete t1_ptr; +} + +TEST_F(GlobalTxnInternalTest, ConflictWithOtherWrite) { + Table* t1_ptr = OpenTable("t1"); + RowReader* reader = 
t1_ptr->NewRowReader("row1"); + RowReaderImpl* reader_impl = (RowReaderImpl*)reader; + RowResult value_list; + // 12 < 100 less than start_ts + MakeKvPair("row1", "cf1", "qu1", 12, "", &value_list); + reader_impl->SetResult(value_list); + ErrorCode status; + std::vector ws; + // ws is empty + EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status)); + + // different row writes + for(int i = 0; i < 3; ++i) { + Cell cell(t1_ptr, "row2", "cf" + std::to_string(i), + "qu" + std::to_string(i), start_ts_, "val"); + Write w(cell); + ws.push_back(w); + } + EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status)); + + // same row, but not exist target cf + ws.clear(); + for(int i = 0; i < 3; ++i) { + Cell cell(t1_ptr, "row1", "cf0", "qu" + std::to_string(i), start_ts_, "val"); + Write w(cell); + ws.push_back(w); + } + EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status)); + + // same row,cf, but not exist write_col, lock_col + ws.clear(); + for(int i = 0; i < 3; ++i) { + Cell cell(t1_ptr, "row1", "cf1", "qu" + std::to_string(i), start_ts_, "val"); + Write w(cell); + ws.push_back(w); + } + EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status)); + + // same row, cf && exist write_col(latest_ts >= start_ts_) + value_list.clear_key_values(); + // 120 > 100 + MakeKvPair("row1", "cf1", PackWriteName("qu1"), 120, "", &value_list); + reader_impl->result_.clear_key_values(); + reader_impl->SetResult(value_list); + + EXPECT_TRUE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status)); + EXPECT_TRUE(status.GetType() == ErrorCode::kGTxnWriteConflict); + + // same row, cf && exist write_col(latest_ts < start_ts_) + // not exist lock_col + value_list.clear_key_values(); + // 20 < 100 less than start_ts + MakeKvPair("row1", "cf1", PackWriteName("qu1"), 20, "", &value_list); + reader_impl->result_.clear_key_values(); + reader_impl->SetResult(value_list); + + EXPECT_FALSE(gtxn_internal_.ConflictWithOtherWrite(&ws, 
reader, &status)); + + // same row, cf && exist write_col(latest_ts < start_ts_) + // not exist lock_col + value_list.clear_key_values(); + // 20 < 100 less than start_ts + MakeKvPair("row1", "cf1", PackWriteName("qu1"), 20, "", &value_list); + MakeKvPair("row1", "cf1", PackLockName("qu1"), 20, "", &value_list); + reader_impl->result_.clear_key_values(); + reader_impl->SetResult(value_list); + + EXPECT_TRUE(gtxn_internal_.ConflictWithOtherWrite(&ws, reader, &status)); + EXPECT_TRUE(status.GetType() == ErrorCode::kGTxnLockConflict); + + delete t1_ptr; +} + +TEST_F(GlobalTxnInternalTest, IsGTxnColumnFamily) { + const std::string t1 = "t1", t2 = "t2", cf1 = "cf1", cf2 = "cf2"; + + Table* t1_ptr = OpenTable(t1); + EXPECT_FALSE(t1_ptr == NULL); + + TableDescriptor desc(t1); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd = desc.AddColumnFamily(cf1); + cfd->DisableGlobalTransaction(); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily(cf2); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1_ptr, schema); + + // IsGTxnColumnFamily(t1, xxx) must be call after CheckTable(t1) + EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t1, cf1)); + EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t1, cf2)); + EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t2, cf1)); + // call CheckTable(t1) + ErrorCode status; + EXPECT_TRUE(gtxn_internal_.CheckTable(t1_ptr, &status)); + + // call IsGTxnColumnFamily(t1, xxx) cf1 is gtxn=false + EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t1, cf1)); + + // call IsGTxnColumnFamily(t1, xxx) cf2 is gtxn=true + EXPECT_TRUE(gtxn_internal_.IsGTxnColumnFamily(t1, cf2)); + + // call IsGTxnColumnFamily(t2, xxx) + EXPECT_FALSE(gtxn_internal_.IsGTxnColumnFamily(t2, cf1)); + delete t1_ptr; +} + +TEST_F(GlobalTxnInternalTest, SetInternalSdkTaskTimeout) { + Table* t1_ptr = OpenTable("t1"); + RowReader* reader = t1_ptr->NewRowReader("row1"); + RowReaderImpl* reader_impl = 
(RowReaderImpl*)reader; + + EXPECT_TRUE(gtxn_internal_.terminal_time_ == 0); + gtxn_internal_.SetCommitDuration(1000); + EXPECT_TRUE(gtxn_internal_.terminal_time_ > 1000); + + gtxn_internal_.SetInternalSdkTaskTimeout(reader); + EXPECT_TRUE(reader_impl->TimeOut() == 1000); + + sleep(2); + gtxn_internal_.SetInternalSdkTaskTimeout(reader); + EXPECT_TRUE(reader_impl->TimeOut() == 1); + EXPECT_TRUE(gtxn_internal_.IsTimeOut() == true); + + gtxn_internal_.is_timeout_ = false; + EXPECT_FALSE(gtxn_internal_.terminal_time_ == 0); + gtxn_internal_.SetCommitDuration(1000000); + EXPECT_TRUE(gtxn_internal_.terminal_time_ > 1000000); + + gtxn_internal_.SetInternalSdkTaskTimeout(reader); + EXPECT_TRUE(reader_impl->TimeOut() == FLAGS_tera_sdk_timeout); + EXPECT_TRUE(gtxn_internal_.IsTimeOut() == false); +} + +TEST_F(GlobalTxnInternalTest, VerifyWritesSize0) { + Table* t1_ptr = OpenTable("t1"); + RowMutation* mu = t1_ptr->NewRowMutation("r1"); + int64_t writes_size = 0; + bool ret = gtxn_internal_.VerifyWritesSize(mu, &writes_size); + EXPECT_TRUE(writes_size == 0); + EXPECT_FALSE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kBadParam); + delete t1_ptr; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, VerifyWritesSize1) { + Table* t1_ptr = OpenTable("t1"); + RowMutation* mu = t1_ptr->NewRowMutation("r1"); + mu->Put("cf0", "qu1", "value", (int64_t)(5)); + mu->Put("cf0", "qu2", "value", (int64_t)(5)); + mu->Put("cf0", "qu3", "value", (int64_t)(5)); + mu->Put("cf0", "qu4", "value", (int64_t)(5)); + mu->DeleteColumns("cf1", "qu5", (int64_t)(5)); + mu->DeleteColumns("cf1", "qu6", (int64_t)(5)); + mu->DeleteColumns("cf1", "qu7", (int64_t)(5)); + + int64_t writes_size = 0; + FLAGS_tera_gtxn_all_puts_size_limit = 10; + bool ret = gtxn_internal_.VerifyWritesSize(mu, &writes_size); + RowMutationImpl* row_mu_impl = static_cast(mu); + EXPECT_TRUE(row_mu_impl->Size() == writes_size); + EXPECT_FALSE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kGTxnDataTooLarge); + 
delete t1_ptr; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, VerifyWritesSize2) { + Table* t1_ptr = OpenTable("t1"); + RowMutation* mu = t1_ptr->NewRowMutation("r1"); + mu->Put("cf0", "qu1", "value", (int64_t)(5)); + + int64_t writes_size = 0; + FLAGS_tera_gtxn_all_puts_size_limit = 100000; + bool ret = gtxn_internal_.VerifyWritesSize(mu, &writes_size); + RowMutationImpl* row_mu_impl = static_cast(mu); + EXPECT_TRUE(row_mu_impl->Size() == writes_size); + EXPECT_TRUE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kOK); + delete t1_ptr; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, BadQualifier) { + bool ret = BadQualifier(""); + EXPECT_FALSE(ret); + ret = BadQualifier("aaaaaaaaaaaaaaa"); + EXPECT_FALSE(ret); + ret = BadQualifier("_*_"); + EXPECT_TRUE(ret); + ret = BadQualifier("____*_"); + EXPECT_TRUE(ret); + ret = BadQualifier("______"); + EXPECT_TRUE(ret); + ret = BadQualifier("____NN_"); + EXPECT_FALSE(ret); + ret = BadQualifier("NN_"); + EXPECT_FALSE(ret); +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation0) { + Table* t1_ptr = OpenTable("t1"); + RowMutation* mu = t1_ptr->NewRowMutation("r1"); + bool ret = gtxn_internal_.VerifyUserRowMutation(mu); + EXPECT_FALSE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kBadParam); + delete t1_ptr; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation1) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowMutation* mu = t1->NewRowMutation("r1"); + mu->Put("cf1", "qu1", "value", (int64_t)(5)); + mu->Put("cf1", "qu1_N_", "value", (int64_t)(5)); + mu->Put("cf1", "qu2", "value", (int64_t)(5)); + bool 
ret = gtxn_internal_.VerifyUserRowMutation(mu); + EXPECT_FALSE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kBadParam); + delete t1; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation2) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowMutation* mu = t1->NewRowMutation("r1"); + mu->Put("cf0", "qu1", "value", (int64_t)(5)); + mu->Put("cf1", "qu1_N_", "value", (int64_t)(5)); + mu->Put("cf1", "qu2", "value", (int64_t)(5)); + bool ret = gtxn_internal_.VerifyUserRowMutation(mu); + EXPECT_FALSE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kBadParam); + delete t1; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation3) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowMutation* mu = t1->NewRowMutation("r1"); + mu->Put("cf1", "qu1", "value", (int64_t)(5)); + mu->DeleteColumns("cf1", "qu1", (int64_t)(5)); + mu->DeleteColumn("cf1", "qu2", (int64_t)(5)); + mu->DeleteFamily("cf1", (int64_t)(5)); + bool ret = gtxn_internal_.VerifyUserRowMutation(mu); + EXPECT_FALSE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kGTxnNotSupport); + delete t1; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowMutation4) { + // set a table to tables_ + 
ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowMutation* mu = t1->NewRowMutation("r1"); + mu->Put("cf1", "qu1", "value", (int64_t)(5)); + mu->DeleteColumns("cf1", "qu1", (int64_t)(5)); + mu->DeleteColumn("cf1", "qu2", (int64_t)(5)); + bool ret = gtxn_internal_.VerifyUserRowMutation(mu); + EXPECT_TRUE(ret); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kOK); + delete t1; + delete mu; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowReader0) { + Table* t1_ptr = OpenTable("t1"); + RowReader* r = t1_ptr->NewRowReader("r1"); + bool ret = gtxn_internal_.VerifyUserRowReader(r); + EXPECT_FALSE(ret); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam); + delete t1_ptr; + delete r; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowReader1) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + //cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_FALSE(gtxn_internal_.CheckTable(t1, &status)); + + RowReader* r = t1->NewRowReader("r1"); + r->AddColumn("cf1", "qu"); + bool ret = gtxn_internal_.VerifyUserRowReader(r); + EXPECT_FALSE(ret); + EXPECT_TRUE(r->GetError().GetType() == status.GetType()); + delete t1; + delete r; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowReader2) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + 
desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowReader* r = t1->NewRowReader("r1"); + r->AddColumn("cf1", "qu"); + r->SetSnapshot(10); + bool ret = gtxn_internal_.VerifyUserRowReader(r); + EXPECT_FALSE(ret); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam); + delete t1; + delete r; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowReader3) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowReader* r = t1->NewRowReader("r1"); + r->AddColumnFamily("cf1"); + bool ret = gtxn_internal_.VerifyUserRowReader(r); + EXPECT_FALSE(ret); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam); + delete t1; + delete r; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowReader4) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowReader* r = t1->NewRowReader("r1"); + r->AddColumn("cf0", "qu"); + bool ret = gtxn_internal_.VerifyUserRowReader(r); + EXPECT_FALSE(ret); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam); + delete t1; + delete r; +} + +TEST_F(GlobalTxnInternalTest, 
VerifyUserRowReader5) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowReader* r = t1->NewRowReader("r1"); + r->AddColumn("cf1", "qu_*_"); + bool ret = gtxn_internal_.VerifyUserRowReader(r); + EXPECT_FALSE(ret); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam); + delete t1; + delete r; +} + +TEST_F(GlobalTxnInternalTest, VerifyUserRowReader6) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + RowReader* r = t1->NewRowReader("r1"); + r->AddColumn("cf1", "qu"); + r->AddColumn("cf1", "q1"); + r->AddColumn("cf1", "q2"); + bool ret = gtxn_internal_.VerifyUserRowReader(r); + EXPECT_TRUE(ret); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kOK); + delete t1; + delete r; +} + +TEST_F(GlobalTxnInternalTest, PrimaryIsLocked1) { + // bad case b. 
read primary lock failed + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + tera::PrimaryInfo info2; + std::string info2_str; + info2.set_table_name("t1"); + info2.set_row_key("row1"); + info2.set_column_family("cf1"); + info2.set_qualifier("qu1"); + info2.set_gtxn_start_ts(100); + info2.SerializeToString(&info2_str); + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + ErrorCode mock_status; + mock_status.SetFailed(ErrorCode::kSystem,""); + std::vector reader_errs; + reader_errs.push_back(mock_status); + (static_cast(t1))->AddReaderErrors(reader_errs); + + EXPECT_FALSE(gtxn_internal_.PrimaryIsLocked(info2, 12, &status)); + EXPECT_TRUE(status.GetType() == ErrorCode::kSystem); + delete t1; +} + +TEST_F(GlobalTxnInternalTest, PrimaryIsLocked2) { + // bad case a. 
read primary lock notfound + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + tera::PrimaryInfo info2; + std::string info2_str; + info2.set_table_name("t1"); + info2.set_row_key("row1"); + info2.set_column_family("cf1"); + info2.set_qualifier("qu1"); + info2.set_gtxn_start_ts(100); + info2.SerializeToString(&info2_str); + EXPECT_TRUE(gtxn_internal_.CheckTable(t1, &status)); + + ErrorCode mock_status; + mock_status.SetFailed(ErrorCode::kNotFound,""); + std::vector reader_errs; + reader_errs.push_back(mock_status); + (static_cast(t1))->AddReaderErrors(reader_errs); + + EXPECT_FALSE(gtxn_internal_.PrimaryIsLocked(info2, 12, &status)); + delete t1; +} + +} // namespace tera diff --git a/src/sdk/test/global_txn_test.cc b/src/sdk/test/global_txn_test.cc new file mode 100644 index 000000000..c68e0cd2e --- /dev/null +++ b/src/sdk/test/global_txn_test.cc @@ -0,0 +1,1265 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "sdk/global_txn.h" +#include "sdk/global_txn_internal.h" +#include "sdk/read_impl.h" +#include "sdk/table_impl.h" +#include "sdk/sdk_zk.h" +#include "sdk/test/mock_table.h" +#include "tera.h" + +DECLARE_string(tera_coord_type); + +namespace tera { + +class GlobalTxnTest : public ::testing::Test { +public: + GlobalTxnTest() : + thread_pool_(2), + gtxn_(Client::NewClient(), &thread_pool_, (new sdk::MockTimeoracleClusterFinder(""))) { + gtxn_.status_.SetFailed(ErrorCode::kOK); + gtxn_.status_returned_ = false; + } + + ~GlobalTxnTest() {} + + void SetSchema(Table* table, const TableSchema& table_schema) { + TableImpl* table_impl = static_cast(table); + table_impl->table_schema_ = table_schema; + } + + Table* OpenTable(const std::string& tablename) { + FLAGS_tera_coord_type = "fake_zk"; + return static_cast(new MockTable(tablename, &thread_pool_)); + } + +private: + common::ThreadPool thread_pool_; + GlobalTxn gtxn_; +}; + +TEST_F(GlobalTxnTest, Commit) { + + // sync commit ut + gtxn_.user_commit_callback_ = NULL; + // mutation haven't apply + gtxn_.finish_ = false; + gtxn_.status_returned_ = false; + gtxn_.put_fail_cnt_.Set(10); + gtxn_.has_commited_ = false; + EXPECT_TRUE(gtxn_.Commit().GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.has_commited_ == false); + + // have commited + gtxn_.finish_ = false; + gtxn_.status_returned_ = false; + gtxn_.put_fail_cnt_.Set(0); + gtxn_.has_commited_ = true; + EXPECT_TRUE(gtxn_.Commit().GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.has_commited_ == true); + + // run commit in the legal state + gtxn_.finish_ = 
false; + gtxn_.status_returned_ = false; + gtxn_.writes_.clear(); + gtxn_.put_fail_cnt_.Set(0); + gtxn_.has_commited_ = false; + EXPECT_TRUE(gtxn_.Commit().GetType() == ErrorCode::kOK); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK); + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.has_commited_ == true); +} + +TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback) { + RowReaderImpl* reader_impl = new RowReaderImpl(NULL, "rowkey"); + SingleRowTxn* txn = new SingleRowTxn(NULL, "rowkey", NULL); + reader_impl->txn_ = txn; + + // not found primary + reader_impl->error_code_.SetFailed(ErrorCode::kNotFound, ""); + + RowReader* reader = static_cast(reader_impl); + gtxn_.DoVerifyPrimaryLockedCallback(reader); + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrimaryLost); +} + +TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback1) { + RowReaderImpl* reader_impl = new RowReaderImpl(NULL, "rowkey"); + SingleRowTxn* txn = new SingleRowTxn(NULL, "rowkey", NULL); + reader_impl->txn_ = txn; + + // reader timeout + reader_impl->error_code_.SetFailed(ErrorCode::kTimeout, ""); + RowReader* reader = static_cast(reader_impl); + gtxn_.DoVerifyPrimaryLockedCallback(reader); + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrimaryCommitTimeout); +} + +TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback2) { + RowReaderImpl* reader_impl = new RowReaderImpl(NULL, "rowkey"); + SingleRowTxn* txn = new SingleRowTxn(NULL, "rowkey", NULL); + reader_impl->txn_ = txn; + // reader other error + reader_impl->error_code_.SetFailed(ErrorCode::kSystem, ""); + RowReader* reader = static_cast(reader_impl); + gtxn_.DoVerifyPrimaryLockedCallback(reader); + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kSystem); +} + +TEST_F(GlobalTxnTest, CheckPrimaryStatusAndCommmitSecondaries) { + SingleRowTxn* txn = new SingleRowTxn(NULL, "rowkey", NULL); + + // primary 
commit timeout + gtxn_.finish_ = false; + gtxn_.status_returned_ = false; + txn->mutation_buffer_.SetError(ErrorCode::kTimeout,""); + gtxn_.CheckPrimaryStatusAndCommmitSecondaries(txn); + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrimaryCommitTimeout); + + // primary commit other error + gtxn_.finish_ = false; + gtxn_.status_returned_ = false; + txn = new SingleRowTxn(NULL, "rowkey", NULL); + txn->mutation_buffer_.SetError(ErrorCode::kSystem, ""); + gtxn_.CheckPrimaryStatusAndCommmitSecondaries(txn); + + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kSystem); + + // primary done run next step + gtxn_.finish_ = false; + gtxn_.status_returned_ = false; + txn = new SingleRowTxn(NULL, "rowkey", NULL); + txn->mutation_buffer_.SetError(ErrorCode::kOK, ""); + gtxn_.writes_.clear(); + const std::string tablename = "test_t"; + Table* t = OpenTable(tablename); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + // insert a 'Write' + gtxn_.SaveWrite(tablename, "r1", w); + + gtxn_.acks_.clear(); + gtxn_.notifies_.clear(); + gtxn_.CheckPrimaryStatusAndCommmitSecondaries(txn); + + EXPECT_TRUE(gtxn_.finish_ == true); + EXPECT_TRUE(gtxn_.status_returned_ == true); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK); +} + +TEST_F(GlobalTxnTest, SaveWrite) { + const std::string tablename = "test_t", tablename4 = "test_t4"; + Table* t = OpenTable(tablename); + const std::string row_key = "r1", row_key4 = "r2"; + Cell cell(t, row_key, "cf", "qu", 1, "val"); + Write w(cell); + gtxn_.writes_.clear(); + // insert a 'Write' + gtxn_.SaveWrite(tablename, row_key, w); + GlobalTxn::TableWithRowkey twr(tablename, row_key); + auto w1 = gtxn_.writes_.find(twr); + EXPECT_TRUE(w1 != gtxn_.writes_.end()); + + // insert a same 'Write' + gtxn_.SaveWrite(tablename, row_key, w); + EXPECT_TRUE(gtxn_.writes_.size() == 1); + + // insert a delete type 'Write' at same Cell + Cell cell2(t, row_key, 
"cf", "qu", 1); + Write w2(cell2); + gtxn_.SaveWrite(tablename, row_key, w2); + EXPECT_TRUE(gtxn_.writes_.size() == 1); + + delete t; +} + +TEST_F(GlobalTxnTest, DoAckCallback) { + const std::string tablename = "test_t1", tablename5 = "test_t5"; + Table* t1 = OpenTable(tablename); + Table* t5 = OpenTable(tablename5); + + // test acks cnt = 2 && not notify + RowMutation* mu1 = t1->NewRowMutation("r1"); + RowMutation* mu5 = t5->NewRowMutation("r1"); + gtxn_.finish_ = false; + gtxn_.ack_done_cnt_.Set(0); + gtxn_.acks_cnt_.Set(2); + gtxn_.notifies_cnt_.Set(0); + gtxn_.DoAckCallback(mu1); + EXPECT_TRUE(gtxn_.finish_ == false); + gtxn_.DoAckCallback(mu5); + EXPECT_TRUE(gtxn_.finish_ == true); + + // test acks cnt = 2 && notify cnt > 0 + RowMutation* mu11 = t1->NewRowMutation("r1"); + RowMutation* mu55 = t5->NewRowMutation("r1"); + gtxn_.finish_ = false; + gtxn_.ack_done_cnt_.Set(0); + gtxn_.acks_cnt_.Set(2); + gtxn_.notifies_cnt_.Set(1); + + gtxn_.DoAckCallback(mu11); + EXPECT_TRUE(gtxn_.finish_ == false); + gtxn_.DoAckCallback(mu55); + EXPECT_TRUE(gtxn_.finish_ == false); + + delete t1; + delete t5; +} + +TEST_F(GlobalTxnTest, DoNotifyCallback) { + const std::string tablename = "test_t11", tablename5 = "test_t55"; + Table* t11 = OpenTable(tablename); + Table* t55 = OpenTable(tablename5); + + // test notifies cnt = 2 + RowMutation* mu1 = t11->NewRowMutation("r1"); + RowMutation* mu5 = t55->NewRowMutation("r1"); + gtxn_.finish_ = false; + gtxn_.notify_done_cnt_.Set(0); + gtxn_.notifies_cnt_.Set(2); + gtxn_.all_task_pushed_ = true; + gtxn_.DoNotifyCallback(mu1); + EXPECT_TRUE(gtxn_.finish_ == false); + gtxn_.DoNotifyCallback(mu5); + EXPECT_TRUE(gtxn_.finish_ == true); + delete t11; + delete t55; +} + +void NotifyWarpper(GlobalTxn* gtxn, + Table* t, + const std::string& row_key, + const std::string& column_family, + const std::string& qualifier) { + gtxn->Notify(t, row_key, column_family, qualifier); +} + +TEST_F(GlobalTxnTest, Notify) { + size_t notify_thread_cnt = 30; + 
std::vector threads; + // all Table* is NULL + gtxn_.notifies_.clear(); + gtxn_.notifies_cnt_.Set(0); + EXPECT_TRUE(0 == gtxn_.notifies_.size()); + EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0); + threads.reserve(notify_thread_cnt); + Table* t0 = NULL; + for (int i = 0; i < notify_thread_cnt; ++i) { + threads.emplace_back(std::thread(NotifyWarpper, >xn_, t0, "", "", "")); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(0 == gtxn_.notifies_.size()); + EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0); + + // same table and same row + gtxn_.notifies_.clear(); + gtxn_.notifies_cnt_.Set(0); + EXPECT_TRUE(0 == gtxn_.notifies_.size()); + EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0); + Table* t1 = OpenTable("t1"); + threads.reserve(30); + for (int i = 0; i < notify_thread_cnt; ++i) { + threads.emplace_back(std::thread(NotifyWarpper, >xn_, t1, "r1", "", "")); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(1 == gtxn_.notifies_.size()); + EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 1); + GlobalTxn::TableWithRowkey twr("t1", "r1"); + EXPECT_TRUE(gtxn_.notifies_[twr].size() == notify_thread_cnt); + + // same table and diff row + gtxn_.notifies_.clear(); + gtxn_.notifies_cnt_.Set(0); + EXPECT_TRUE(0 == gtxn_.notifies_.size()); + EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == 0); + for (int i = 0; i < notify_thread_cnt; ++i) { + threads.emplace_back(std::thread(NotifyWarpper, >xn_, t1, "r" + std::to_string(i), "", "")); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(notify_thread_cnt == gtxn_.notifies_.size()); + EXPECT_TRUE(gtxn_.notifies_cnt_.Get() == notify_thread_cnt); + + for (int i = 0; i < notify_thread_cnt; ++i) { + GlobalTxn::TableWithRowkey twr1("t1", "r" + std::to_string(i)); + EXPECT_TRUE(gtxn_.notifies_[twr1].size() == 1); + } +} + +void AckWarpper(GlobalTxn* gtxn, Table* t, + const std::string& row_key, + 
const std::string& column_family, + const std::string& qualifier) { + gtxn->Ack(t, row_key, column_family, qualifier); +} + +TEST_F(GlobalTxnTest, Ack) { + size_t ack_thread_cnt = 30; + std::vector threads; + // all Table* is NULL + gtxn_.acks_.clear(); + gtxn_.acks_cnt_.Set(0); + EXPECT_TRUE(0 == gtxn_.acks_.size()); + EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0); + threads.reserve(ack_thread_cnt); + Table* t0 = NULL; + for (int i = 0; i < ack_thread_cnt; ++i) { + threads.emplace_back(std::thread(AckWarpper, >xn_, t0, "", "", "")); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(0 == gtxn_.acks_.size()); + EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0); + + // same table and same row + gtxn_.acks_.clear(); + gtxn_.acks_cnt_.Set(0); + EXPECT_TRUE(0 == gtxn_.acks_.size()); + EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0); + Table* t1 = OpenTable("t1"); + threads.reserve(30); + for (int i = 0; i < ack_thread_cnt; ++i) { + threads.emplace_back(std::thread(AckWarpper, >xn_, t1, "r1", "", "")); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(1 == gtxn_.acks_.size()); + EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 1); + GlobalTxn::TableWithRowkey twr("t1", "r1"); + EXPECT_TRUE(gtxn_.acks_[twr].size() == ack_thread_cnt); + + // same table and diff row + gtxn_.acks_.clear(); + gtxn_.acks_cnt_.Set(0); + EXPECT_TRUE(0 == gtxn_.acks_.size()); + EXPECT_TRUE(gtxn_.acks_cnt_.Get() == 0); + for (int i = 0; i < ack_thread_cnt; ++i) { + threads.emplace_back(std::thread(AckWarpper, >xn_, t1, "r" + std::to_string(i), "", "")); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(ack_thread_cnt == gtxn_.acks_.size()); + EXPECT_TRUE(gtxn_.acks_cnt_.Get() == ack_thread_cnt); + + for (int i = 0; i < ack_thread_cnt; ++i) { + GlobalTxn::TableWithRowkey twr1("t1", "r" + std::to_string(i)); + EXPECT_TRUE(gtxn_.acks_[twr1].size() == 1); + } +} + 
+TEST_F(GlobalTxnTest, DoCommitSecondariesCallback0) { + // mutation error is kOK will finish + std::vector threads; + size_t secondaries_thread_cnt = 10; + gtxn_.all_task_pushed_ = true; + gtxn_.status_.SetFailed(ErrorCode::kOK); + gtxn_.acks_cnt_.Set(0); + gtxn_.ack_done_cnt_.Set(0); + gtxn_.notifies_cnt_.Set(0); + gtxn_.notify_done_cnt_.Set(0); + gtxn_.writes_cnt_.Set(secondaries_thread_cnt); + for (int i = 0; i < secondaries_thread_cnt; ++i) { + RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey"); + mu_impl->error_code_.SetFailed(ErrorCode::kOK, ""); + RowMutation* mu = static_cast(mu_impl); + auto func = std::bind(&GlobalTxn::DoCommitSecondariesCallback, >xn_, mu); + threads.emplace_back(std::thread(func)); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(gtxn_.finish_ == true); +} + +TEST_F(GlobalTxnTest, DoCommitSecondariesCallback1) { + // mutation error is kOK not last one + size_t secondaries_thread_cnt = 50; + std::vector threads; + threads.reserve(secondaries_thread_cnt); + gtxn_.status_.SetFailed(ErrorCode::kOK); + gtxn_.acks_cnt_.Set(0); + gtxn_.ack_done_cnt_.Set(0); + gtxn_.notifies_cnt_.Set(0); + gtxn_.notify_done_cnt_.Set(0); + gtxn_.writes_cnt_.Set(secondaries_thread_cnt + 1); + for (int i = 0; i < secondaries_thread_cnt; ++i) { + RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey"); + mu_impl->error_code_.SetFailed(ErrorCode::kOK, ""); + RowMutation* mu = static_cast(mu_impl); + auto func = std::bind(&GlobalTxn::DoCommitSecondariesCallback, >xn_, mu); + threads.emplace_back(std::thread(func)); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(gtxn_.finish_ == false); +} + +TEST_F(GlobalTxnTest, DoCommitSecondariesCallback2) { + // mutation error is not kOK but status_ is not changed + size_t secondaries_thread_cnt = 10; + std::vector threads; + threads.reserve(secondaries_thread_cnt); + gtxn_.all_task_pushed_ = 
true; + gtxn_.status_.SetFailed(ErrorCode::kOK); + gtxn_.acks_cnt_.Set(0); + gtxn_.ack_done_cnt_.Set(0); + gtxn_.notifies_cnt_.Set(0); + gtxn_.notify_done_cnt_.Set(0); + gtxn_.writes_cnt_.Set(secondaries_thread_cnt); + for (int i = 0; i < secondaries_thread_cnt; ++i) { + RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey"); + mu_impl->error_code_.SetFailed(ErrorCode::kSystem, ""); + RowMutation* mu = static_cast(mu_impl); + auto func = std::bind(&GlobalTxn::DoCommitSecondariesCallback, >xn_, mu); + threads.emplace_back(std::thread(func)); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK); + EXPECT_TRUE(gtxn_.finish_ == true); +} + +TEST_F(GlobalTxnTest, DoVerifyPrimaryLockedCallback3) { + // mutation error is not kOK but status_ is not changed + size_t secondaries_thread_cnt = 30; + std::vector threads; + + threads.reserve(secondaries_thread_cnt); + gtxn_.status_.SetFailed(ErrorCode::kOK); + gtxn_.acks_cnt_.Set(10); + gtxn_.ack_done_cnt_.Set(9); + gtxn_.notifies_cnt_.Set(10); + gtxn_.notify_done_cnt_.Set(10); + gtxn_.writes_cnt_.Set(secondaries_thread_cnt); + for (int i = 0; i < secondaries_thread_cnt; ++i) { + RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey"); + mu_impl->error_code_.SetFailed(ErrorCode::kOK, ""); + RowMutation* mu = static_cast(mu_impl); + auto func = std::bind(&GlobalTxn::DoCommitSecondariesCallback, >xn_, mu); + threads.emplace_back(std::thread(func)); + } + for (int i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK); + EXPECT_TRUE(gtxn_.finish_ == false); + +} + +std::atomic g_callback_run_cnt(0); + +static void EmptyMutationCallback(RowMutation* mu) { + LOG(INFO) << "run empty mutation callback"; + ++g_callback_run_cnt; +} + +// has_commited == true && status_returned_ == false && set mutation callback +TEST_F(GlobalTxnTest, ApplyMutation0) 
{ + g_callback_run_cnt = 0; + gtxn_.has_commited_ = true; + gtxn_.status_returned_ = false; + + RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey"); + RowMutation* mu = static_cast(mu_impl); + mu->SetCallBack(EmptyMutationCallback); + gtxn_.ApplyMutation(mu); + thread_pool_.Stop(true); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(gtxn_.status_returned_ == true); + EXPECT_TRUE(gtxn_.put_fail_cnt_.Get() == 0); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(g_callback_run_cnt == 1); +} + +// has_commited == true && status_returned_ == false && don't set mutation callback +TEST_F(GlobalTxnTest, ApplyMutation1) { + g_callback_run_cnt = 0; + gtxn_.has_commited_ = true; + gtxn_.status_returned_ = false; + + RowMutationImpl* mu_impl = new RowMutationImpl(NULL, "rowkey"); + RowMutation* mu = static_cast(mu_impl); + gtxn_.ApplyMutation(mu); + thread_pool_.Stop(true); + EXPECT_TRUE(mu->GetError().GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(gtxn_.status_returned_ == true); + EXPECT_TRUE(gtxn_.put_fail_cnt_.Get() == 0); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(g_callback_run_cnt == 0); +} + +TEST_F(GlobalTxnTest, SetReaderStatusAndRunCallback0) { + RowReaderImpl* reader_impl = new RowReaderImpl(NULL, "rowkey"); + ErrorCode status; + status.SetFailed(ErrorCode::kSystem, ""); + gtxn_.SetReaderStatusAndRunCallback(reader_impl,&status); + RowReader* r = static_cast(reader_impl); + thread_pool_.Stop(true); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kSystem); + EXPECT_TRUE(r->IsFinished()); + delete r; +} + +TEST_F(GlobalTxnTest, SetReaderStatusAndRunCallback1) { + RowReaderImpl* reader_impl = new RowReaderImpl(NULL, "rowkey"); + reader_impl->SetCallBack([](RowReader* r) { + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kSystem); + delete r; + }); + ErrorCode status; + status.SetFailed(ErrorCode::kSystem, ""); + 
gtxn_.SetReaderStatusAndRunCallback(reader_impl,&status); + thread_pool_.Stop(true); +} + +TEST_F(GlobalTxnTest, Get0) { + gtxn_.has_commited_ = true; + RowReaderImpl* reader_impl = new RowReaderImpl(NULL, "rowkey"); + RowReader* r = static_cast(reader_impl); + EXPECT_TRUE(gtxn_.Get(r).GetType() == ErrorCode::kGTxnOpAfterCommit); + thread_pool_.Stop(true); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kGTxnOpAfterCommit); + EXPECT_TRUE(r->IsFinished()); + delete r; +} + +TEST_F(GlobalTxnTest, Get1) { + // set a table to tables_ + ErrorCode status; + Table* t1 = OpenTable("t1"); + // table and exist cf + TableDescriptor desc("t1"); + desc.EnableTxn(); + desc.AddLocalityGroup("lg0"); + ColumnFamilyDescriptor* cfd1 = desc.AddColumnFamily("cf1"); + cfd1->EnableGlobalTransaction(); + + TableSchema schema; + TableDescToSchema(desc, &schema); + SetSchema(t1, schema); + + EXPECT_TRUE(gtxn_.gtxn_internal_->CheckTable(t1, &status)); + + RowReader* r = t1->NewRowReader("r1"); + bool ret = gtxn_.gtxn_internal_->VerifyUserRowReader(r); + EXPECT_FALSE(ret); + + gtxn_.has_commited_ = false; + EXPECT_TRUE(gtxn_.Get(r).GetType() == ErrorCode::kBadParam); + thread_pool_.Stop(true); + EXPECT_TRUE(r->GetError().GetType() == ErrorCode::kBadParam); + EXPECT_TRUE(r->IsFinished()); + delete r; + delete t1; +} + +TEST_F(GlobalTxnTest, DoGetCellReaderCallback0) { + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + InternalReaderContext* ctx = new InternalReaderContext(2, r_impl, >xn_); + r->SetContext(ctx); + std::vector cells; + cells.push_back(new Cell(t1, "r1", "cf1", "qu")); + cells.push_back(new Cell(t1, "r1", "cf2", "qu")); + for(auto& cell : cells) { + ctx->cell_map[cell] = 0; + } + RowReader* inter_r = t1->NewRowReader("r1"); + inter_r->SetContext(new CellReaderContext(cells[0], ctx)); + RowReaderImpl* inter_r_impl = static_cast(inter_r); + inter_r_impl->error_code_.SetFailed(ErrorCode::kNotFound, ""); + 
gtxn_.DoGetCellReaderCallback(inter_r); + EXPECT_TRUE(ctx->not_found_cnt == 1); + EXPECT_TRUE(ctx->fail_cell_cnt == 0); + EXPECT_TRUE(ctx->active_cell_cnt == 1); + thread_pool_.Stop(true); + EXPECT_FALSE(r_impl->IsFinished()); +} + +TEST_F(GlobalTxnTest, DoGetCellReaderCallback1) { + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + InternalReaderContext* ctx = new InternalReaderContext(2, r_impl, >xn_); + r->SetContext(ctx); + std::vector cells; + cells.push_back(new Cell(t1, "r1", "cf1", "qu")); + cells.push_back(new Cell(t1, "r1", "cf2", "qu")); + for(auto& cell : cells) { + ctx->cell_map[cell] = 0; + } + RowReader* inter_r = t1->NewRowReader("r1"); + inter_r->SetContext(new CellReaderContext(cells[0], ctx)); + RowReaderImpl* inter_r_impl = static_cast(inter_r); + inter_r_impl->error_code_.SetFailed(ErrorCode::kOK, ""); + gtxn_.DoGetCellReaderCallback(inter_r); + EXPECT_TRUE(ctx->fail_cell_cnt == 0); + EXPECT_TRUE(ctx->not_found_cnt == 1); + EXPECT_TRUE(ctx->active_cell_cnt == 1); + thread_pool_.Stop(true); + EXPECT_FALSE(r_impl->IsFinished()); +} + +TEST_F(GlobalTxnTest, DoGetCellReaderCallback2) { + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + InternalReaderContext* ctx = new InternalReaderContext(2, r_impl, >xn_); + r->SetContext(ctx); + std::vector cells; + cells.push_back(new Cell(t1, "r1", "cf1", "qu")); + cells.push_back(new Cell(t1, "r1", "cf2", "qu")); + for(auto& cell : cells) { + ctx->cell_map[cell] = 0; + } + RowReader* inter_r = t1->NewRowReader("r1"); + inter_r->SetContext(new CellReaderContext(cells[0], ctx)); + RowReaderImpl* inter_r_impl = static_cast(inter_r); + inter_r_impl->error_code_.SetFailed(ErrorCode::kSystem, ""); + gtxn_.DoGetCellReaderCallback(inter_r); + EXPECT_TRUE(ctx->fail_cell_cnt == 1); + EXPECT_TRUE(ctx->not_found_cnt == 0); + EXPECT_TRUE(ctx->active_cell_cnt == 1); + thread_pool_.Stop(true); + 
EXPECT_FALSE(r_impl->IsFinished()); +} + +TEST_F(GlobalTxnTest, DoGetCellReaderCallback3) { + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + InternalReaderContext* ctx = new InternalReaderContext(1, r_impl, >xn_); + r->SetContext(ctx); + std::vector cells; + cells.push_back(new Cell(t1, "r1", "cf1", "qu")); + for(auto& cell : cells) { + ctx->cell_map[cell] = 0; + } + RowReader* inter_r = t1->NewRowReader("r1"); + inter_r->SetContext(new CellReaderContext(cells[0], ctx)); + RowReaderImpl* inter_r_impl = static_cast(inter_r); + inter_r_impl->error_code_.SetFailed(ErrorCode::kSystem, ""); + gtxn_.DoGetCellReaderCallback(inter_r); + thread_pool_.Stop(true); + EXPECT_TRUE(r_impl->IsFinished()); +} + +TEST_F(GlobalTxnTest, MergeCellToRow) { + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + InternalReaderContext* ctx = new InternalReaderContext(1, r_impl, >xn_); + r->SetContext(ctx); + std::vector cells; + cells.push_back(new Cell(t1, "r1", "cf1", "qu")); + for(auto& cell : cells) { + ctx->cell_map[cell] = 0; + } + RowReader* inter_r = t1->NewRowReader("r1"); + inter_r->SetContext(new CellReaderContext(cells[0], ctx)); + ErrorCode status; + status.SetFailed(ErrorCode::kSystem, ""); + gtxn_.MergeCellToRow(inter_r, status); + thread_pool_.Stop(true); + EXPECT_TRUE(r_impl->IsFinished()); +} + +TEST_F(GlobalTxnTest, GetCellCallback) { + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + InternalReaderContext* ctx = new InternalReaderContext(1, r_impl, >xn_); + r->SetContext(ctx); + std::vector cells; + cells.push_back(new Cell(t1, "r1", "cf1", "qu")); + for(auto& cell : cells) { + ctx->cell_map[cell] = 0; + } + RowReader* inter_r = t1->NewRowReader("r1"); + inter_r->SetContext(new CellReaderContext(cells[0], ctx)); + RowReaderImpl* inter_r_impl = static_cast(inter_r); + 
inter_r_impl->error_code_.SetFailed(ErrorCode::kSystem, ""); + gtxn_.GetCellCallback((CellReaderContext*)inter_r->GetContext()); + thread_pool_.Stop(true); + EXPECT_TRUE(r_impl->IsFinished()); +} + +TEST_F(GlobalTxnTest, RollForward) { + // can't find primary write cell + Table* t1 = OpenTable("t1"); + Cell cell(t1, "r1", "cf1", "qu"); + tera::PrimaryInfo primary; + primary.set_table_name("t1"); + primary.set_row_key("r1"); + primary.set_column_family("cf1"); + primary.set_qualifier("qu"); + primary.set_gtxn_start_ts(12); + ErrorCode status; + + std::set gtxn_cfs; + gtxn_.gtxn_internal_->tables_["t1"] = + std::pair>(t1, gtxn_cfs); + ErrorCode mock_status; + mock_status.SetFailed(ErrorCode::kNotFound,""); + std::vector reader_errs; + reader_errs.push_back(mock_status); + (static_cast(t1))->AddReaderErrors(reader_errs); + gtxn_.RollForward(cell, primary, 0, &status); + EXPECT_TRUE(ErrorCode::kGTxnPrimaryLost == status.GetType()); +} + +TEST_F(GlobalTxnTest, CleanLock0) { + // cell same as primary + Table* t1 = OpenTable("t1"); + Cell cell(t1, "r1", "cf1", "qu"); + tera::PrimaryInfo primary; + primary.set_table_name("t1"); + primary.set_row_key("r1"); + primary.set_column_family("cf1"); + primary.set_qualifier("qu"); + primary.set_gtxn_start_ts(12); + // init status is OK + ErrorCode status; + status.SetFailed(ErrorCode::kOK); + std::set gtxn_cfs; + gtxn_.gtxn_internal_->tables_["t1"] = + std::pair>(t1, gtxn_cfs); + // only this cell will call mutation + ErrorCode mock_status1; + mock_status1.SetFailed(ErrorCode::kSystem,""); + std::vector mu_errs; + mu_errs.push_back(mock_status1); + (static_cast(t1))->AddMutationErrors(mu_errs); + // run test + gtxn_.CleanLock(cell, primary, &status); + EXPECT_TRUE(mock_status1.GetType() == status.GetType()); +} + +TEST_F(GlobalTxnTest, CleanLock1) { + // cell diff with primary + Table* t1 = OpenTable("t1"); + Cell cell(t1, "r1", "cf1", "qu"); + tera::PrimaryInfo primary; + primary.set_table_name("t1"); + primary.set_row_key("r2"); 
// diff row + primary.set_column_family("cf1"); + primary.set_qualifier("qu"); + primary.set_gtxn_start_ts(12); + // init status is OK + ErrorCode status; + status.SetFailed(ErrorCode::kOK); + std::set gtxn_cfs; + gtxn_.gtxn_internal_->tables_["t1"] = + std::pair>(t1, gtxn_cfs); + // mock primary return kSystem but cell kOK + // will get kSystem + ErrorCode mock_status1; + ErrorCode mock_status2; + mock_status1.SetFailed(ErrorCode::kSystem,""); + mock_status2.SetFailed(ErrorCode::kOK,""); + std::vector mu_errs; + mu_errs.push_back(mock_status1); + mu_errs.push_back(mock_status2); + (static_cast(t1))->AddMutationErrors(mu_errs); + // run test + gtxn_.CleanLock(cell, primary, &status); + EXPECT_TRUE(mock_status1.GetType() == status.GetType()); + EXPECT_TRUE(mock_status2.GetType() != status.GetType()); +} + +TEST_F(GlobalTxnTest, CleanLock2) { + // cell diff with primary + Table* t1 = OpenTable("t1"); + Cell cell(t1, "r1", "cf1", "qu"); + tera::PrimaryInfo primary; + primary.set_table_name("t1"); + primary.set_row_key("r2"); // diff row + primary.set_column_family("cf1"); + primary.set_qualifier("qu"); + primary.set_gtxn_start_ts(12); + // init status is OK + ErrorCode status; + status.SetFailed(ErrorCode::kOK); + std::set gtxn_cfs; + gtxn_.gtxn_internal_->tables_["t1"] = + std::pair>(t1, gtxn_cfs); + // mock primary return kOk but cell kSystem + // will get kSystem + ErrorCode mock_status1; + ErrorCode mock_status2; + mock_status1.SetFailed(ErrorCode::kOK,""); + mock_status2.SetFailed(ErrorCode::kSystem,""); + std::vector mu_errs; + mu_errs.push_back(mock_status1); + mu_errs.push_back(mock_status2); + (static_cast(t1))->AddMutationErrors(mu_errs); + // run test + gtxn_.CleanLock(cell, primary, &status); + EXPECT_TRUE(mock_status1.GetType() != status.GetType()); + EXPECT_TRUE(mock_status2.GetType() == status.GetType()); +} + +TEST_F(GlobalTxnTest, CleanLock3) { + // cell diff with primary + Table* t1 = OpenTable("t1"); + Cell cell(t1, "r1", "cf1", "qu"); + 
tera::PrimaryInfo primary; + primary.set_table_name("t1"); + primary.set_row_key("r2"); // diff row + primary.set_column_family("cf1"); + primary.set_qualifier("qu"); + primary.set_gtxn_start_ts(12); + // init status is OK + ErrorCode status; + status.SetFailed(ErrorCode::kOK); + std::set gtxn_cfs; + gtxn_.gtxn_internal_->tables_["t1"] = + std::pair>(t1, gtxn_cfs); + // mock primary return kTimeout but cell kSystem + // will get kSystem, the latest error will return + ErrorCode mock_status1; + ErrorCode mock_status2; + mock_status1.SetFailed(ErrorCode::kTimeout,""); + mock_status2.SetFailed(ErrorCode::kSystem,""); + std::vector mu_errs; + mu_errs.push_back(mock_status1); + mu_errs.push_back(mock_status2); + (static_cast(t1))->AddMutationErrors(mu_errs); + // run test + gtxn_.CleanLock(cell, primary, &status); + EXPECT_TRUE(mock_status1.GetType() != status.GetType()); + EXPECT_TRUE(mock_status2.GetType() == status.GetType()); +} + +void AddKeyValueToResult(const std::string& key, const std::string& cf, + const std::string& qu, int64_t timestamp, + const std::string& value, RowResult* result) { + KeyValuePair* kv = result->add_key_values(); + kv->set_key(key); + kv->set_column_family(cf); + kv->set_qualifier(qu); + kv->set_timestamp(timestamp); + kv->set_value(value); +} + +TEST_F(GlobalTxnTest, EncodeWriteValue) { + std::string ret = EncodeWriteValue(1, 100); + int type; + int64_t ts; + DecodeWriteValue(ret, &type, &ts); + + EXPECT_TRUE(type == 1); + EXPECT_TRUE(ts == 100); +} + +TEST_F(GlobalTxnTest, DecodeWriteValue) { + // a int bigger than mutaion type + std::string ret = EncodeWriteValue(99, 1000000); + int type; + int64_t ts; + DecodeWriteValue(ret, &type, &ts); + + EXPECT_TRUE(type == 99); + EXPECT_TRUE(ts == 1000000); +} + +TEST_F(GlobalTxnTest, FindValueFromResultRow0) { + // the success case + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + + // build RowReader::TRow + // cf must exist 
before call FindValueFromResultRow + RowResult result; + gtxn_.start_ts_ = 14; + AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result); + AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result); + + AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result); + AddKeyValueToResult("r1", "cf1", "qu1_W_", 12, EncodeWriteValue(0, 9), &result); + r_impl->SetResult(result); + RowReader::TRow row; + r->ToMap(&row); + + for (auto& cf : row) { + std::cout << cf.first << "\n"; + for (auto& qu : cf.second) { + std::cout << "\t" << qu.first << "\n"; + for (auto& v : qu.second) { + std::cout << "\t\tts=" << v.first << ",v=" << v.second << "\n"; + } + } + } + + // build target_cell + Cell target_cell(t1, "r1", "cf1", "qu1"); + + // run test + EXPECT_TRUE(gtxn_.FindValueFromResultRow(row, &target_cell)); + EXPECT_TRUE(target_cell.Timestamp() == 9); + EXPECT_TRUE(target_cell.Value() == "v1"); + + delete t1; + delete r; +} + +TEST_F(GlobalTxnTest, FindValueFromResultRow1) { + // the not found + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + + // build RowReader::TRow + // cf must exist before call FindValueFromResultRow + RowResult result; + r_impl->SetResult(result); + gtxn_.start_ts_ = 11; + RowReader::TRow row; + r->ToMap(&row); + + // build target_cell + Cell target_cell(t1, "r1", "cf1", "qu1"); + + // run test + EXPECT_FALSE(gtxn_.FindValueFromResultRow(row, &target_cell)); + + delete t1; + delete r; +} + +TEST_F(GlobalTxnTest, FindValueFromResultRow2) { + // the not found write col + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + + // build RowReader::TRow + // cf must exist before call FindValueFromResultRow + RowResult result; + gtxn_.start_ts_ = 11; + + AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result); + AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result); + r_impl->SetResult(result); + + 
RowReader::TRow row; + r->ToMap(&row); + + // build target_cell + Cell target_cell(t1, "r1", "cf1", "qu1"); + + // run test + EXPECT_FALSE(gtxn_.FindValueFromResultRow(row, &target_cell)); + + delete t1; + delete r; +} + +TEST_F(GlobalTxnTest, FindValueFromResultRow3) { + // the not found rigth version + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + + // build RowReader::TRow + // cf must exist before call FindValueFromResultRow + RowResult result; + gtxn_.start_ts_ = 11; + + AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result); + AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result); + + AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result); + // make ts = 9 v1 is deleted before this function called + AddKeyValueToResult("r1", "cf1", "qu1_W_", 12, EncodeWriteValue(1, 9), &result); + r_impl->SetResult(result); + RowReader::TRow row; + r->ToMap(&row); + + // build target_cell + Cell target_cell(t1, "r1", "cf1", "qu1"); + // run test + EXPECT_FALSE(gtxn_.FindValueFromResultRow(row, &target_cell)); + + delete t1; + delete r; +} + +TEST_F(GlobalTxnTest, FindValueFromResultRow4) { + // the not found rigth version + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + + // build RowReader::TRow + // cf must exist before call FindValueFromResultRow + RowResult result; + gtxn_.start_ts_ = 11; + + AddKeyValueToResult("r1", "cf1", "qu1", 9, "v1", &result); + AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result); + + // maybe other older version clean by gc, before this function called + AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result); + r_impl->SetResult(result); + RowReader::TRow row; + r->ToMap(&row); + + // build target_cell + Cell target_cell(t1, "r1", "cf1", "qu1"); + // run test + EXPECT_FALSE(gtxn_.FindValueFromResultRow(row, &target_cell)); + + delete t1; + delete r; 
+} + +TEST_F(GlobalTxnTest, FindValueFromResultRow5) { + // the not found rigth version + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + + // build RowReader::TRow + // cf must exist before call FindValueFromResultRow + RowResult result; + gtxn_.start_ts_ = 11; + + // maybe version 1 was clean by gc, before this function called + AddKeyValueToResult("r1", "cf1", "qu1", 13, "v2", &result); + + AddKeyValueToResult("r1", "cf1", "qu1_W_", 15, EncodeWriteValue(0, 13), &result); + AddKeyValueToResult("r1", "cf1", "qu1_W_", 12, EncodeWriteValue(0, 9), &result); + r_impl->SetResult(result); + RowReader::TRow row; + r->ToMap(&row); + + // build target_cell + Cell target_cell(t1, "r1", "cf1", "qu1"); + // run test + EXPECT_FALSE(gtxn_.FindValueFromResultRow(row, &target_cell)); + + delete t1; + delete r; +} + +TEST_F(GlobalTxnTest, SetLastStatus) { + ErrorCode status; + status.SetFailed(ErrorCode::kOK, ""); + gtxn_.status_returned_ = false; + gtxn_.SetLastStatus(&status); + EXPECT_TRUE(gtxn_.status_returned_); + EXPECT_TRUE(gtxn_.status_.GetType() == status.GetType()); + + status.SetFailed(ErrorCode::kTimeout, ""); + gtxn_.status_returned_ = true; + gtxn_.SetLastStatus(&status); + EXPECT_TRUE(gtxn_.status_returned_); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK); +} + +static bool g_callback_run_flag = false; + +TEST_F(GlobalTxnTest, RunUserCallback0) { + g_callback_run_flag = false; + gtxn_.SetCommitCallback([](Transaction* t) {g_callback_run_flag = true;}); + gtxn_.RunUserCallback(); + EXPECT_TRUE(g_callback_run_flag); +} + +static void WaitWapper(GlobalTxn* gtxn) { + gtxn->WaitForComplete(); + g_callback_run_flag = true; +} + +TEST_F(GlobalTxnTest, RunUserCallback1) { + g_callback_run_flag = false; + thread_pool_.AddTask(std::bind(&WaitWapper, >xn_)); + gtxn_.RunUserCallback(); + EXPECT_TRUE(gtxn_.finish_); + thread_pool_.Stop(true); + EXPECT_TRUE(g_callback_run_flag); +} + 
+TEST_F(GlobalTxnTest, BackoffAndMaybeCleanupLock0) { + bool try_clean = false; + ErrorCode status; + // make sure have lock_ts < start_ts + // can't found primary + Table* t1 = OpenTable("t1"); + RowReader* r = t1->NewRowReader("r1"); + RowReaderImpl* r_impl = static_cast(r); + + // build RowReader::TRow + // cf must exist before call FindValueFromResultRow + RowResult result; + gtxn_.start_ts_ = 11; + + // start_ts > lock ts and primary info is bad for parse + AddKeyValueToResult("r1", "cf1", "qu1_L_", 9, "primary info", &result); + r_impl->SetResult(result); + RowReader::TRow row; + r->ToMap(&row); + + // build target_cell + Cell target_cell(t1, "r1", "cf1", "qu1"); + // run test + gtxn_.BackoffAndMaybeCleanupLock(row, target_cell, try_clean, &status); + EXPECT_TRUE(status.GetType() == ErrorCode::kGTxnPrimaryLost); + delete t1; + delete r; +} + +TEST_F(GlobalTxnTest, RunAfterPrewriteFailed0) { + Table* t = OpenTable("t1"); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + std::vector ws; + ws.push_back(w); + PrewriteContext* ctx = new PrewriteContext(&ws, >xn_, w.TableName(), w.RowKey()); + ctx->status.SetFailed(ErrorCode::kOK, ""); + gtxn_.RunAfterPrewriteFailed(ctx); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kOK); +} + +TEST_F(GlobalTxnTest, RunAfterPrewriteFailed1) { + Table* t = OpenTable("t1"); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + std::vector ws; + ws.push_back(w); + PrewriteContext* ctx = new PrewriteContext(&ws, >xn_, w.TableName(), w.RowKey()); + ctx->status.SetFailed(ErrorCode::kTimeout, ""); + gtxn_.RunAfterPrewriteFailed(ctx); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrewriteTimeout); +} + +TEST_F(GlobalTxnTest, RunAfterPrewriteFailed2) { + Table* t = OpenTable("t1"); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + std::vector ws; + ws.push_back(w); + PrewriteContext* ctx = new PrewriteContext(&ws, >xn_, w.TableName(), w.RowKey()); + gtxn_.gtxn_internal_->is_timeout_ = 
true; + gtxn_.RunAfterPrewriteFailed(ctx); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrewriteTimeout); + delete t; +} + +TEST_F(GlobalTxnTest, DoPrewriteCallback0) { + // case a. global timeout + Table* t = OpenTable("t1"); + Transaction* txn = t->StartRowTransaction("r1"); + SingleRowTxn* stxn = static_cast(txn); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + std::vector ws; + ws.push_back(w); + PrewriteContext* ctx = new PrewriteContext(&ws, >xn_, w.TableName(), w.RowKey()); + stxn->SetContext(ctx); + gtxn_.gtxn_internal_->is_timeout_ = true; + gtxn_.DoPrewriteCallback(stxn); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrewriteTimeout); + delete t; +} + +TEST_F(GlobalTxnTest, DoPrewriteCallback1) { + // case b. this operator timeout + Table* t = OpenTable("t1"); + Transaction* txn = t->StartRowTransaction("r1"); + SingleRowTxn* stxn = static_cast(txn); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + std::vector ws; + ws.push_back(w); + PrewriteContext* ctx = new PrewriteContext(&ws, >xn_, w.TableName(), w.RowKey()); + stxn->SetContext(ctx); + stxn->mutation_buffer_.SetError(ErrorCode::kTimeout,""); + gtxn_.gtxn_internal_->is_timeout_ = false; + gtxn_.DoPrewriteCallback(stxn); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrewriteTimeout); + delete t; +} + +TEST_F(GlobalTxnTest, DoPrewriteCallback2) { + // case b. 
this operator error + Table* t = OpenTable("t1"); + Transaction* txn = t->StartRowTransaction("r1"); + SingleRowTxn* stxn = static_cast(txn); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + std::vector ws; + ws.push_back(w); + PrewriteContext* ctx = new PrewriteContext(&ws, >xn_, w.TableName(), w.RowKey()); + stxn->SetContext(ctx); + stxn->mutation_buffer_.SetError(ErrorCode::kSystem,""); + gtxn_.gtxn_internal_->is_timeout_ = false; + gtxn_.DoPrewriteCallback(stxn); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kSystem); + delete t; +} + +TEST_F(GlobalTxnTest, VerifyPrimaryLocked) { + Table* t = OpenTable("t1"); + Cell cell(t, "r1", "cf", "qu", 1, "val"); + Write w(cell); + gtxn_.primary_write_ = &w; + + ErrorCode mock_status; + mock_status.SetFailed(ErrorCode::kNotFound,""); + std::vector reader_errs; + reader_errs.push_back(mock_status); + (static_cast(t))->AddReaderErrors(reader_errs); + + gtxn_.VerifyPrimaryLocked(); + EXPECT_TRUE(gtxn_.status_.GetType() == ErrorCode::kGTxnPrimaryLost); +} + + +} // namespace tera diff --git a/src/sdk/test/global_txn_test_tool.cc b/src/sdk/test/global_txn_test_tool.cc new file mode 100644 index 000000000..889e442fa --- /dev/null +++ b/src/sdk/test/global_txn_test_tool.cc @@ -0,0 +1,754 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// +// Author: baorenyi@baidu.com + +#include "sdk/test/global_txn_test_tool.h" + +#include +#include +#include +#include + +#include +#include + +#include "common/base/string_ext.h" +#include "common/file/file_path.h" +#include "sdk/sdk_utils.h" +#include "sdk/client_impl.h" +#include "utils/config_utils.h" +#include "common/timer.h" +#include "version.h" + +DECLARE_string(tera_gtxn_test_flagfile); +DEFINE_string(gtxn_test_conf_dir, "../conf/", "gtxn test conf dir"); +DEFINE_string(gtxn_test_case_dir, "../cases/", "gtxn test cases dir"); +DEFINE_string(case_number, "", "gtxn test case number"); +DEFINE_bool(ignore_bad_case, false, "gtxn test ignore bad case"); +DEFINE_bool(gtxn_test_async_mode, false, "gtxn test async mode"); +DEFINE_bool(gtxn_test_debug_opened, false, "gtxn test debug opened"); +DEFINE_int32(gtxn_test_thread_pool_size, 20, "gtxn test thread pool size"); +DEFINE_bool(gtxn_test_drop_table_before, true, "gtxn test set drop tables before test"); + +namespace tera { +/** + * cases/ directory format + * + * CONF_ROOR/cases/1/schemas/table_1 [table schema file] + * .... 
+ * CONF_ROOR/cases/1/schemas/table_x + * + * CONF_ROOR/cases/1/T_1/op_list [operations list] + * Format of op_list: + * + * TABLES:table_1,table_2,table_3 + * GET table_1 r1 cf1 qu1 + * PUT table_2 r2 cf2 qu2 valuex + * DEL table_3 r3 cf3 qu3 + * + * CONF_ROOR/cases/1/T_1/gtxn.flag [option] + * CONF_ROOR/cases/1/T_1/result_list [set result list] + * + * CONF_ROOR/cases/1/T_2/op_list + * CONF_ROOR/cases/1/T_1/gtxn.flag [option] + * CONF_ROOR/cases/1/T_2/result_list + * + **/ +bool GlobalTxnTestTool::LoadTestConf() { + // list cases + const std::string case_dir = FLAGS_gtxn_test_case_dir; + std::vector file_list; + if (IsEmpty(case_dir) || !ListCurrentDir(case_dir, &file_list)) { + LOG(ERROR) << "list cases failed, dir:" << case_dir; + return false; + } + for (auto it = file_list.begin(); it != file_list.end(); ++it) { + if (FLAGS_case_number != "" && (*it) != FLAGS_case_number) { + continue; + } + const std::string& dir_name = case_dir + (*it); + + if (!IsDir(dir_name)) { + continue; + } + + int case_num = atoi((*it).c_str()); + if (case_num <= 0) { + LOG(ERROR) << "load case failed, dir:" << dir_name; + return false; + } + // list cases/x/schemas/ + std::vector schema_files; + const std::string& schema_dir = dir_name + "/schemas/"; + if (IsEmpty(schema_dir) || !ListCurrentDir(schema_dir, &schema_files)) { + LOG(ERROR) << "list case(" << dir_name << ") schemas failed"; + return false; + } + int schema_cnt = 0; + for (auto sit = schema_files.begin(); sit != schema_files.end(); ++sit) { + const std::string& schema_file = schema_dir + (*sit); + if (IsDir(schema_file)) { + continue; + } + // load schemas + TableDescriptor* desc = new TableDescriptor(); + if (LoadDescriptor(schema_file, desc)) { + if (case_desc_map_.find(case_num) == case_desc_map_.end()) { + case_desc_map_[case_num] = std::vector(); + } + case_desc_map_[case_num].push_back(desc); + ++schema_cnt; + } else { + delete desc; + LOG(ERROR) << "load schema failed, schema_file:" << schema_file; + break; + } + 
} + if (schema_cnt == 0) { + LOG(ERROR) << "schemafile not found"; + return false; + } + + // mark cases/x/T_xx/ + std::vector txn_list; + if (!ListCurrentDir(dir_name, &txn_list)) { + LOG(ERROR) << "find txn dir failed, dir:" << dir_name; + return false; + } + int reg_cnt = 0; + for(auto it = txn_list.begin(); it != txn_list.end(); ++it) { + if (!IsDir(dir_name + "/" + (*it)) || *it == "schemas") { + continue; + } + if ((*it).find("T_") != std::string::npos) { + // find transaction + int gtxn_id = atoi(((*it).substr(2)).c_str()); + if (gtxn_id <= 0) { + LOG(ERROR) << "mark gtxn conf failed, dir:" + << case_dir << "/" << dir_name; + return false; + } else { + CaseRegister(case_num, gtxn_id); + ++reg_cnt; + } + } + } + if (reg_cnt == 0) { + LOG(ERROR) << "transaction not found"; + return false; + } + } + return true; +} + +void GlobalTxnTestTool::CaseRegister(const int case_num, const int gtxn_id) { + CasePair case_pair(case_num, gtxn_id); + case_list_.push_back(case_pair); +} + +bool GlobalTxnTestTool::LoadDescriptor(const std::string& schema_file, + TableDescriptor* table_desc) { + ErrorCode err; + if (!ParseTableSchemaFile(schema_file, table_desc, &err)) { + LOG(ERROR) << "fail to parse input table schema." 
<< schema_file; + return false; + } + //ShowTableDescriptor(*table_desc, true); + return true; +} + +GlobalTxnTestTool::GlobalTxnTestTool(Client* client): + thread_pool_(FLAGS_gtxn_test_thread_pool_size), + client_(client) { +} + +void GlobalTxnTestTool::RunTest(tera::Client* client, int case_number) { + do_cnt_.Set(0); + done_cnt_.Set(0); + done_fail_cnt_.Set(0); + for (auto it = case_list_.begin(); it != case_list_.end(); ++it) { + CasePair case_pair = *it; + int case_num = case_pair.first; + if (case_number != -1 && case_num != case_number) { + continue; + } + int gtxn_id = case_pair.second; + + const std::string case_dir = FLAGS_gtxn_test_case_dir; + const std::string conf_dir = case_dir + std::to_string(case_num) + + "/T_" + std::to_string(gtxn_id); + const std::string& op_list_file = conf_dir + "/op_list"; + std::vector op_list; + std::ifstream ifile(op_list_file); + std::string line; + int cnt = 0; + while (std::getline(ifile, line)) { + if (cnt == 0) { + std::size_t found = line.find("TABLES:"); + if (found!=std::string::npos) { + std::vector tables; + SplitString(line.substr(found + 7), ",", &tables); + if (!OpenTestTables(tables)) { + return; + } + } + } else { + op_list.push_back(line); + } + ++cnt; + } + ifile.close(); + if (cnt < 1) { + LOG(ERROR) << "no operations in op_list"; + } + do_cnt_.Inc(); + ThreadPool::Task task = std::bind(&GlobalTxnTestTool::RunTestInternal, + this, client, case_num, gtxn_id, op_list); + thread_pool_.AddTask(task); + } +} + +void GlobalTxnTestTool::RunTestInternal(tera::Client* client, const int case_num, const int gtxn_id, + const std::vector& op_list) { + const std::string case_dir = FLAGS_gtxn_test_case_dir; + const std::string conf_dir = case_dir + std::to_string(case_num) + + "/T_" + std::to_string(gtxn_id); + + // make sure flagfile only service for this transaction + tera::Transaction* gtxn = nullptr; + { + MutexLock lock(&mu_); + FLAGS_tera_gtxn_test_flagfile = conf_dir + "/gtxn.flag"; + gtxn = 
client->NewGlobalTransaction(); + } + + if (!FLAGS_gtxn_test_async_mode) { + std::vector result; + for (auto it = op_list.begin(); it != op_list.end(); ++it) { + const std::string& op_str = *it; + VLOG(12) << "OPERATION:" << op_str; + OpType op_type; + std::vector op_args; + if (!ParseOp(op_str, &op_type, &op_args) + || !DoOp(gtxn, op_type, op_args, &result)) { + LOG(ERROR) << gtxn->GetError().ToString(); + delete gtxn; + done_cnt_.Inc(); + return; + } + } + gtxn->Commit(); + result.push_back(std::to_string(gtxn->GetError().GetType())); + if(!CheckResult(case_num, gtxn_id, result)) { + done_fail_cnt_.Inc(); + } + delete gtxn; + done_cnt_.Inc(); + } else { + if (op_list.size() > 0) { + GTxnTestContext* ctx = new GTxnTestContext(); + ctx->tool = this; + ctx->gtxn = gtxn; + ctx->op_list = op_list; + ctx->case_num = case_num; + ctx->gtxn_id = gtxn_id; + ctx->it = ctx->op_list.begin(); + const std::string& op_str = *(ctx->it); + VLOG(12) << "OPERATION:" << op_str; + OpType op_type; + std::vector op_args; + if (!ParseOp(op_str, &op_type, &op_args)) { + LOG(ERROR) << "parse op failed"; + delete ctx->gtxn; + delete ctx; + done_cnt_.Inc(); + return; + } + DoOpAsync(ctx, op_type, op_args); + } else { + LOG(ERROR) << "not set operators"; + delete gtxn; + done_cnt_.Inc(); + } + } +} + +bool GlobalTxnTestTool::OpenTestTables(const std::vector& tables) { + ErrorCode err; + MutexLock lock(&mu_); + for(auto it = tables.begin(); it != tables.end(); ++it) { + const std::string tablename = *it; + if (tables_.find(tablename) == tables_.end()) { + Table* table = client_->OpenTable(tablename, &err); + if (table == NULL) { + return false; + } + tables_[tablename] = table; + } + } + return true; +} + +void GlobalTxnTestTool::DoOpAsync(GTxnTestContext* ctx, + const OpType& op_type, + const std::vector& op_args) { + if (op_args.size() < 4) { + return; + } + Table* table = nullptr; + const std::string tablename = op_args[0]; + auto table_it = tables_.find(tablename); + if (table_it != 
tables_.end()) { + table = table_it->second; + } else { + return; + } + const std::string row = op_args[1]; + const std::string cf = op_args[2]; + const std::string qu = op_args[3]; + if (op_type == OpType::PUT && op_args.size() == 5) { + const std::string value = op_args[4]; + tera::RowMutation* m = table->NewRowMutation(row); + m->Put(cf, qu, value); + ctx->gtxn->ApplyMutation(m); + ctx->result.push_back("PUT: " + std::to_string(ctx->gtxn->GetError().GetType())); + delete m; + } else if (op_type == OpType::GET && op_args.size() == 4) { + tera::RowReader* r = table->NewRowReader(row); + r->AddColumn(cf, qu); + r->SetCallBack([] (RowReader* r) { + ((GTxnTestContext*)r->GetContext())->tool->DoOpAsyncCallback(r); + }); + r->SetContext(ctx); + ctx->gtxn->Get(r); + return; + } else if (op_type == OpType::DEL && op_args.size() == 4) { + tera::RowMutation* m = table->NewRowMutation(row); + m->DeleteColumns(cf, qu); + ctx->gtxn->ApplyMutation(m); + ctx->result.push_back("DEL: " + std::to_string(ctx->gtxn->GetError().GetType())); + delete m; + } + + // this operation is muation , run next operation + if (op_type == OpType::PUT || op_type == OpType::DEL) { + if (++ctx->it != ctx->op_list.end()) { + const std::string& op_str = *(ctx->it); + VLOG(12) << "OPERATION:" << op_str; + OpType next_op_type; + std::vector next_op_args; + if (!ParseOp(op_str, &next_op_type, &next_op_args)) { + LOG(ERROR) << "parse op failed"; + delete ctx->gtxn; + delete ctx; + done_cnt_.Inc(); + return; + } + DoOpAsync(ctx, next_op_type, next_op_args); + } else { + ctx->gtxn->SetCommitCallback([] (Transaction* t) { + ((GTxnTestContext*)t->GetContext())->tool->DoCommitCallback(t); + }); + ctx->gtxn->SetContext(ctx); + ctx->gtxn->Commit(); + } + } +} + +void GlobalTxnTestTool::DoOpAsyncCallback(RowReader* r) { + GTxnTestContext* ctx = (GTxnTestContext*)r->GetContext(); + if (r->GetError().GetType() == ErrorCode::kOK) { + while (!r->Done()) { + const std::string& result_item = "GET: " + + 
std::to_string(r->GetError().GetType()) + " " + + std::to_string(r->Timestamp()) + ":" + r->Value(); + ctx->result.push_back(result_item); + r->Next(); + } + } else if (r->GetError().GetType() == ErrorCode::kNotFound) { + ctx->result.push_back("GET: " + std::to_string(r->GetError().GetType())); + } else { + ctx->result.push_back("GET: " + std::to_string(r->GetError().GetType())); + } + delete r; + // if not last, call next operation + if (++ctx->it != ctx->op_list.end()) { + const std::string& op_str = *(ctx->it); + VLOG(12) << "OPERATION:" << op_str; + OpType next_op_type; + std::vector next_op_args; + if (!ParseOp(op_str, &next_op_type, &next_op_args)) { + LOG(ERROR) << "parse op failed"; + delete ctx->gtxn; + delete ctx; + done_cnt_.Inc(); + return; + } + DoOpAsync(ctx, next_op_type, next_op_args); + } else { + ctx->gtxn->SetCommitCallback([] (Transaction* t) { + ((GTxnTestContext*)t->GetContext())->tool->DoCommitCallback(t); + }); + ctx->gtxn->SetContext(ctx); + ctx->gtxn->Commit(); + } +} + +void GlobalTxnTestTool::DoCommitCallback(Transaction* t) { + GTxnTestContext* ctx = (GTxnTestContext*)t->GetContext(); + + ctx->result.push_back(std::to_string(t->GetError().GetType())); + if (!CheckResult(ctx->case_num, ctx->gtxn_id, ctx->result)) { + done_fail_cnt_.Inc(); + } + delete ctx; + delete t; + done_cnt_.Inc(); +} + +bool GlobalTxnTestTool::DoOp(tera::Transaction* gtxn, + const OpType& op_type, + const std::vector& op_args, + std::vector* result) { + if (op_args.size() < 4) { + return false; + } + Table* table = nullptr; + const std::string tablename = op_args[0]; + auto table_it = tables_.find(tablename); + if (table_it != tables_.end()) { + table = table_it->second; + } else { + return false; + } + const std::string row = op_args[1]; + const std::string cf = op_args[2]; + const std::string qu = op_args[3]; + if (op_type == OpType::PUT && op_args.size() == 5) { + const std::string value = op_args[4]; + std::unique_ptr m(table->NewRowMutation(row)); + m->Put(cf, 
qu, value); + gtxn->ApplyMutation(m.get()); + result->push_back("PUT: " + std::to_string(gtxn->GetError().GetType())); + return true; + } else if (op_type == OpType::GET && op_args.size() == 4) { + std::unique_ptr r(table->NewRowReader(row)); + r->AddColumn(cf, qu); + gtxn->Get(r.get()); + if (r->GetError().GetType() == ErrorCode::kOK) { + while (!r->Done()) { + const std::string& result_item = "GET: " + + std::to_string(r->GetError().GetType()) + " " + + std::to_string(r->Timestamp()) + ":" + r->Value(); + result->push_back(result_item); + r->Next(); + } + return true; + } else if (r->GetError().GetType() == ErrorCode::kNotFound) { + result->push_back("GET: " + std::to_string(r->GetError().GetType())); + return true; + } else { + result->push_back("GET: " + std::to_string(r->GetError().GetType())); + } + } else if (op_type == OpType::DEL && op_args.size() == 4) { + std::unique_ptr m(table->NewRowMutation(row)); + m->DeleteColumns(cf, qu); + gtxn->ApplyMutation(m.get()); + result->push_back("DEL: " + std::to_string(gtxn->GetError().GetType())); + return true; + } + return false; +} + +bool GlobalTxnTestTool::ParseOp(const std::string& op_str, + OpType* op_type, std::vector* op_args) { + std::vector args; + SplitString(op_str, " ", &args); + if (TrimString(args[0]) == "PUT") { + *op_type = OpType::PUT; + } else if (TrimString(args[0]) == "GET") { + *op_type = OpType::GET; + } else if (TrimString(args[0]) == "DEL") { + *op_type = OpType::DEL; + } else { + LOG(ERROR) << "operation type not support :[" << TrimString(args[0]) << "]"; + return false; + } + for (size_t i = 1; i < args.size(); ++i) { + op_args->push_back(TrimString(args[i])); + } + return true; +} + +void GlobalTxnTestTool::DebugOpList(const std::string& op_list_file) { + std::vector op_list; + std::ifstream ofile(op_list_file); + std::string line; + int cnt = 0; + while (std::getline(ofile, line)) { + op_list.push_back(line); + ++cnt; + } + ofile.close(); + if (cnt < 1) { + LOG(ERROR) << "no operators in 
op_list"; + } + std::cout << "OpList:" << std::endl; + for (auto l : op_list) { + std::cout << l < flag_list; + std::ifstream ofile(flag_file); + std::string line; + int cnt = 0; + while (std::getline(ofile, line)) { + flag_list.push_back(line); + ++cnt; + } + ofile.close(); + if (cnt < 1) { + LOG(ERROR) << "no flags in gtxn.flag"; + } + std::cout << "FLAGS:" << std::endl; + for (auto f : flag_list) { + std::string flag = TrimString(f); + if (flag.length() > 0 && flag[0] == '#') { + continue; + } + std::cout << flag <& result) { + MutexLock lock(&mu_); + const std::string case_dir = FLAGS_gtxn_test_case_dir; + const std::string conf_dir = case_dir + std::to_string(case_num) + + "/T_" + std::to_string(gtxn_id); + std::cout << "===========================================" << std::endl; + std::cout << "CASE:" << case_num << " GTXN_ID:" << gtxn_id << std::endl; + if (FLAGS_gtxn_test_debug_opened) { + const std::string& op_list_file = conf_dir + "/op_list"; + const std::string& flag_file = conf_dir + "/gtxn.flag"; + DebugOpList(op_list_file); + DebugFlagFile(flag_file); + std::cout << "Result Printing:" << std::endl; + for (auto it = result.begin(); it != result.end(); ++it) { + std::cout << "RESULT:" << *it << std::endl; + } + std::cout << "-------------------------------------------" << std::endl; + } + + VLOG(12) << "case:" << case_num + << " gtxn_id:" << gtxn_id << " Printing"; + for (auto it = result.begin(); it != result.end(); ++it) { + VLOG(12) << "RESULT:" << *it; + } + + const std::string& result_list_file = conf_dir + "/result_list"; + std::vector result_list; + std::ifstream ofile(result_list_file); + std::string line; + int cnt = 0; + while (std::getline(ofile, line)) { + result_list.push_back(line); + ++cnt; + } + ofile.close(); + if (cnt < 1) { + LOG(ERROR) << "no results in result_list"; + return false; + } + + if (result_list.size() != result.size()) { + std::cout << "\tERROR[expect_line_count: " << result_list.size() << " actual_line_count: " << 
result.size() << "]\n"; + return false; + } else { + int have_diff = 0; + for (size_t i = 0; i < result.size(); ++i) { + const std::string& ret = result[i]; + const std::string& default_ret = result_list[i]; + if (TrimString(ret) != TrimString(default_ret)) { + std::cout << "\tERROR[expect: (" << default_ret << ") actual: (" << ret << ")]\n"; + ++have_diff; + } + } + if (have_diff > 0) { + std::cout << "FAILED :" << have_diff << std::endl; + return false; + } + } + std::cout << "SUCCEED" << std::endl; + return true; +} + +bool GlobalTxnTestTool::InitTestTables(int case_num) { + ErrorCode err; + std::unordered_map table_map; + for (auto it = case_desc_map_.begin(); it != case_desc_map_.end(); ++it) { + if (case_num != -1 && case_num != it->first) { + continue; + } + std::vector& desc_list = it->second; + for (auto dit = desc_list.begin(); dit != desc_list.end(); ++dit) { + TableDescriptor* desc = (*dit); + const std::string& tablename = desc->TableName(); + if (table_map.find(tablename) == table_map.end()) { + table_map[tablename] = desc; + } + } + } + + for (auto& table : table_map) { + if (client_->CreateTable(*(table.second), &err) && err.GetType() == ErrorCode::kOK) { + VLOG(12) << "create table " << table.first << " ok"; + } else { + LOG(ERROR) << "create table " << table.first << " failed"; + return false; + } + } + return true; +} + +bool GlobalTxnTestTool::DropTestTables(int case_num) { + ErrorCode err; + std::unordered_map table_map; + for (auto it = case_desc_map_.begin(); it != case_desc_map_.end(); ++it) { + if (case_num != -1 && case_num != it->first) { + continue; + } + std::vector& desc_list = it->second; + for (auto dit = desc_list.begin(); dit != desc_list.end(); ++dit) { + TableDescriptor* desc = (*dit); + const std::string& tablename = desc->TableName(); + if (table_map.find(tablename) == table_map.end()) { + table_map[tablename] = desc; + } + } + } + + for (auto& table : table_map) { + const std::string& tablename = table.first; + if 
(!client_->DisableTable(tablename, &err)) { + LOG(ERROR) << "disable table failed, table: " << tablename; + return false; + } + TableMeta table_meta; + TabletMetaList tablet_list; + tera::ClientImpl* client_impl = static_cast(client_); + if (!client_impl->ShowTablesInfo(tablename, &table_meta, &tablet_list, &err)) { + LOG(ERROR) << "table not exist: " << tablename; + return false; + } + + uint64_t tablet_num = tablet_list.meta_size(); + while (true) { + if (!client_impl->ShowTablesInfo(tablename, &table_meta, &tablet_list, &err)) { + LOG(ERROR) << "table not exist: " << tablename; + return false; + } + uint64_t tablet_cnt = 0; + for (int32_t i = 0; i < tablet_list.meta_size(); ++i) { + const TabletMeta& tablet = tablet_list.meta(i); + if (tablet.status() == kTabletDisable || tablet.status() == kTableOffLine) { + tablet_cnt++; + } + } + if (tablet_cnt == tablet_num) { + // disable finish + break; + } + sleep(1); + } + + if (!client_->DropTable(tablename, &err)) { + LOG(ERROR) << "drop table " << tablename << " failed"; + return false; + } + } + return true; +} + +void GlobalTxnTestTool::Wait() { + while(do_cnt_.Get() > done_cnt_.Get()) { + sleep(1); + } +} + +void GlobalTxnTestTool::RunCaseOneByOne() { + std::set cases; + for (auto it = case_list_.begin(); it != case_list_.end(); ++it) { + CasePair case_pair = *it; + int case_num = case_pair.first; + cases.insert(case_num); + } + for (auto& case_num : cases) { + LOG(INFO) << "GlobalTxnTest Case " << case_num << " Begin"; + // drop table + if (FLAGS_gtxn_test_drop_table_before) { + DropTestTables(case_num); + } + + if (!InitTestTables(case_num)) { + LOG(ERROR) << "GlobalTxnTest Case " << case_num + << " InitTestTables Failed"; + if (FLAGS_ignore_bad_case == true) { + continue; + } else { + break; + } + } + RunTest(client_, case_num); + Wait(); + LOG(INFO) << "GlobalTxnTest Case " << case_num << " Finish"; + if (done_fail_cnt_.Get() > 0) { + if (FLAGS_ignore_bad_case == true) { + continue; + } else { + break; + } + } 
+ } +} + +} // namespace tera + + +int main(int argc, char *argv[]){ + ::google::ParseCommandLineFlags(&argc, &argv, true); + + if (argc > 1 && std::string(argv[1]) == "version") { + PrintSystemVersion(); + return 0; + } + if (FLAGS_gtxn_test_conf_dir == "") { + LOG(ERROR) << "not set \"--gtxn_test_conf_dir\""; + return -1; + } + if (FLAGS_gtxn_test_case_dir == "") { + LOG(ERROR) << "not set \"--gtxn_test_case_dir\""; + return -1; + } + + tera::ErrorCode error_code; + tera::Client* client = tera::Client::NewClient(FLAGS_gtxn_test_conf_dir + "/tera.flag", + &error_code); + if (client == NULL) { + return -1; + } + + tera::GlobalTxnTestTool gtxn_test_tool(client); + // init table + if (!gtxn_test_tool.LoadTestConf()) { + return -1; + } + gtxn_test_tool.RunCaseOneByOne(); + return 0; +} diff --git a/src/sdk/test/global_txn_test_tool.h b/src/sdk/test/global_txn_test_tool.h new file mode 100644 index 000000000..7acf12644 --- /dev/null +++ b/src/sdk/test/global_txn_test_tool.h @@ -0,0 +1,95 @@ +// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_SDK_TEST_GLOBAL_TXN_TEST_TOOL_H_ +#define TERA_SDK_TEST_GLOBAL_TXN_TEST_TOOL_H_ + +#include + +#include "common/thread_pool.h" +#include "common/counter.h" +#include "tera.h" + +namespace tera { + +enum OpType { GET, PUT, DEL }; + +class GlobalTxnTestTool; + + +struct GTxnTestContext { + GlobalTxnTestTool* tool; + tera::Transaction* gtxn; + std::vector op_list; + std::vector result; + std::vector::iterator it; + int case_num; + int gtxn_id; +}; + +class GlobalTxnTestTool { +public: + GlobalTxnTestTool(Client* client); + ~GlobalTxnTestTool(){} + + bool LoadTestConf(); + + bool InitTestTables(int case_num = -1); + + bool DropTestTables(int case_num = -1); + + void RunTest(tera::Client* client, int case_num = -1); + + void Wait(); + + void RunCaseOneByOne(); +private: + void RunTestInternal(tera::Client* client, const int case_num, const int gtxn_id, + const std::vector& op_list); + + void CaseRegister(const int case_num, const int gtxn_id); + + bool LoadDescriptor(const std::string& schema_file, TableDescriptor* schema); + + void DebugOpList(const std::string& op_list_file); + + void DebugFlagFile(const std::string& flag_file); + + bool CheckResult(const int case_num, const int gtxn_id, + const std::vector& result); + + bool ParseOp(const std::string& op_str, + OpType* op_type, std::vector* op_args); + + bool DoOp(tera::Transaction* gtxn, + const OpType& op_type, + const std::vector& op_args, + std::vector* result); + + void DoOpAsync(GTxnTestContext* ctx, const OpType& op_type, + const std::vector& op_args); + + void DoOpAsyncCallback(tera::RowReader* r); + + void DoCommitCallback(tera::Transaction* t); + + bool OpenTestTables(const std::vector& tables); + +private: + typedef std::pair CasePair; + std::vector case_list_; + typedef std::map> CaseDescMap; + CaseDescMap case_desc_map_; + std::map tables_; + mutable Mutex mu_; + common::ThreadPool thread_pool_; + Client* client_; + Counter do_cnt_; + Counter done_cnt_; + Counter done_fail_cnt_; +}; + +} 
// namespace tera
+
+#endif // TERA_SDK_TEST_GLOBAL_TXN_TEST_TOOL_H_
diff --git a/src/sdk/test/global_txn_testutils.cc b/src/sdk/test/global_txn_testutils.cc
new file mode 100644
index 000000000..c615489d7
--- /dev/null
+++ b/src/sdk/test/global_txn_testutils.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include
+#include
+#include "common/base/string_ext.h"
+#include "common/this_thread.h"
+#include "sdk/test/global_txn_testutils.h"
+#include "utils/config_utils.h"
+#include "common/timer.h"
+
+DEFINE_bool(tera_gtxn_test_opened, false, "for test gtxn opened");
+DEFINE_bool(tera_gtxn_test_isolation_snapshot, true, "true means Snapshot, false means ReadCommitedSnapshot");
+DEFINE_string(tera_gtxn_test_flagfile, "", "gtxn test flagfile");
+// The eight *_ts flags below are copied, in this order, into ts_[0..7] by
+// LoadTxnConf(). Wait() treats ts_[1..7] as seconds to wait after the previous
+// checkpoint; a value of -1 makes Wait() terminate the process there.
+DEFINE_int64(start_ts, 1, "start ts");
+DEFINE_int64(begin_commit_ts, 0, "time to wait before begin commit");
+DEFINE_int64(begin_prewrite_ts, 0, "time to wait before prewrite");
+DEFINE_int64(end_prewrite_ts, 0, "time to wait from before prewrite");
+DEFINE_int64(commit_ts, 1, "time to wait from end prewrite");
+DEFINE_int64(begin_primary_commit_ts, 0, "time to wait before primary commit");
+DEFINE_int64(end_primary_commit_ts, 0, "time to wait from primary commit");
+DEFINE_int64(begin_other_commit_ts, 0, "time to wait before other commit");
+// Comma-separated list; GetWait() parses each item with stol() as seconds.
+DEFINE_string(get_wait_ts_list, "", "timestamp list for wait to get");
+
+namespace tera {
+
+constexpr int64_t kMillisPerSecond = 1000L;
+
+// Stores the flag-file path and the creation time in milliseconds, which
+// GetWait() uses as the origin for its offsets. Flags are not read here;
+// call LoadTxnConf() for that.
+GlobalTxnTestHelper::GlobalTxnTestHelper(const std::string& conffile) :
+    pos_(0), get_pos_(0), conf_file_(conffile),
+    start_ts_(0), prewrite_start_ts_(0), commit_ts_(0),
+    helper_create_time_(get_millis()) {
+}
+
+// Loads conf_file_ via utils::LoadFlagFile(), copies the step flags into ts_,
+// derives prewrite_start_ts_ and commit_ts_, then passes checkpoint 0 by
+// calling Wait().
+void GlobalTxnTestHelper::LoadTxnConf() {
+    utils::LoadFlagFile(conf_file_);
+    ts_[0] = FLAGS_start_ts;
+    start_ts_ = FLAGS_start_ts;
+    ts_[1] = FLAGS_begin_commit_ts;
+    ts_[2] = FLAGS_begin_prewrite_ts;
+    ts_[3] = FLAGS_end_prewrite_ts;
+    ts_[4] = FLAGS_commit_ts;
+    ts_[5] = FLAGS_begin_primary_commit_ts;
+    ts_[6] = FLAGS_end_primary_commit_ts;
+    ts_[7] = FLAGS_begin_other_commit_ts;
+    VLOG(13) << "split get wait ts list begin...";
+    SplitString(FLAGS_get_wait_ts_list, ",", &get_ts_list_);
+    for (auto item : get_ts_list_) {
+        VLOG(13) << item;
+    }
+    VLOG(13) << "split get wait ts list done";
+    // if isolation_level == ReadCommitedSnapshot
+    // (flag is false): prewrite starts after the begin-commit and
+    // begin-prewrite waits; otherwise it equals the start timestamp.
+    if (!FLAGS_tera_gtxn_test_isolation_snapshot) {
+        prewrite_start_ts_ = FLAGS_start_ts + FLAGS_begin_commit_ts + FLAGS_begin_prewrite_ts;
+    } else {
+        prewrite_start_ts_ = start_ts_;
+    }
+    commit_ts_ = FLAGS_start_ts + FLAGS_begin_commit_ts + FLAGS_begin_prewrite_ts
+                 + FLAGS_end_prewrite_ts + FLAGS_commit_ts;
+    // keep the commit timestamp strictly greater than the prewrite start
+    if (commit_ts_ <= prewrite_start_ts_) {
+        commit_ts_ = prewrite_start_ts_ + 1;
+    }
+    Wait(ts_[0]);
+}
+
+int64_t GlobalTxnTestHelper::GetStartTs() {
+    return start_ts_;
+}
+
+int64_t GlobalTxnTestHelper::GetPrewriteStartTs() {
+    return prewrite_start_ts_;
+}
+
+int64_t GlobalTxnTestHelper::GetCommitTs() {
+    return commit_ts_;
+}
+
+// Delays the next GET according to get_ts_list_. start_ts is only used to tag
+// the log lines. Note that stol() throws if a list item is not a number.
+void GlobalTxnTestHelper::GetWait(int64_t start_ts) {
+    if (get_ts_list_.size() == 0) {
+        // don't wait
+        VLOG(13) << "[gtxn_helper] [" << start_ts << "] will do get operater immediate";
+    } else {
+        // The first get_ts_list_.size() GET calls each sleep until their
+        // configured offset (seconds since the helper was created) has passed;
+        // any later GET call runs immediately.
+        if (get_pos_ < get_ts_list_.size()) {
+            int64_t now_millis = tera::get_millis();
+            int64_t def_wait_time = stol(get_ts_list_[get_pos_]) * kMillisPerSecond;
+            int64_t wait_time = helper_create_time_ + def_wait_time - now_millis;
+            VLOG(13) << "get_pos_:" << get_pos_
+                     << " now_millis:" << now_millis
+                     << " def_wait_time:" << def_wait_time
+                     << " size:" << get_ts_list_.size()
+                     << " wait_time:" << wait_time;
+            if (wait_time > 0) {
+                VLOG(13) << "[gtxn_helper] [" << start_ts << "] will do get operater("
+                         << (get_pos_ + 1)
<< ") after" << wait_time << " ms.";
+                ThisThread::Sleep(wait_time);
+            } else {
+                VLOG(13) << "[gtxn_helper] [" << start_ts << "] will do get operater("
+                         << (get_pos_ + 1) << ") immediate";
+            }
+        } else {
+            VLOG(13) << "[gtxn_helper] [" << start_ts << "] will do get operater("
+                     << (get_pos_ + 1) << ") immediate";
+        }
+        get_pos_++;
+    }
+}
+
+// Passes the next transaction checkpoint (0 = begin txn ... 7 = begin other
+// commit). For checkpoints 1..7 the configured value ts_[i] is the number of
+// seconds to wait after the time recorded for checkpoint i-1; -1 terminates
+// the process at this checkpoint. On return ts_[i] holds the wall-clock time
+// (microseconds) at which the checkpoint was passed.
+// start_ts is only used to tag log lines.
+void GlobalTxnTestHelper::Wait(int64_t start_ts) {
+    int wait_position = pos_++;
+    int64_t* info = ts_;
+    // Validate the slot index before touching ts_: the old code indexed
+    // ts_[wait_position] first and only noticed the overflow in the switch
+    // default, i.e. after reading past the end of the array.
+    const int kCheckpointNum = sizeof(ts_) / sizeof(ts_[0]);
+    if (wait_position < 0 || wait_position >= kCheckpointNum) {
+        LOG(ERROR) << "overflow position";
+        _Exit(0);
+    }
+    int64_t now_micros = tera::get_micros();
+    if (wait_position == 0) {
+        PrintLog(start_ts, "begin txn", info[wait_position + 1]);
+    } else {
+        if (info[wait_position] == -1) {
+            ExitNow(start_ts, wait_position);
+        }
+        // deadline = configured seconds (as micros) + time of previous checkpoint
+        int64_t should_wait = info[wait_position] * 1000000L + info[wait_position - 1];
+        if (should_wait - now_micros > 10) {
+            ThisThread::Sleep((should_wait - now_micros) / 1000L);
+        } else if (info[wait_position] == 0) {
+            // nothing to do
+        } else if (should_wait < now_micros) {
+            // a non-zero wait was configured but its deadline already passed
+            LOG(ERROR) << "[gtxn_helper] [" << start_ts << "] txn run timeout, exited";
+            _Exit(0);
+        }
+        switch (wait_position) {
+        case 1:
+            PrintLog(start_ts, "begin commit", info[wait_position + 1]);
+            break;
+        case 2:
+            PrintLog(start_ts, "begin prewrite", info[wait_position + 1]);
+            break;
+        case 3:
+            PrintLog(start_ts, "end prewrite", info[wait_position + 1]);
+            break;
+        case 4:
+            PrintLog(start_ts, "begin real commit", info[wait_position + 1]);
+            break;
+        case 5:
+            PrintLog(start_ts, "begin primary commit", info[wait_position + 1]);
+            break;
+        case 6:
+            PrintLog(start_ts, "end primary commit", info[wait_position + 1]);
+            break;
+        case 7:
+            PrintLog(start_ts, "begin other commit");
+            break;
+        default:
+            // unreachable after the range check above; kept as a safety net
+            LOG(ERROR) << "overflow position";
+            _Exit(0);
+        }
+    }
+    // replace the configured wait with the actual pass time for the next step
+    info[wait_position] = tera::get_micros();
+    return;
+}
+
+// Logs the checkpoint and terminates the process immediately with status 0.
+void GlobalTxnTestHelper::ExitNow(int64_t start_ts, int position) {
+    VLOG(13) << "[gtxn_helper] [" << start_ts << "] exit @ position=" << position;
+    _Exit(0); // for simulate test gtxn stop at anywhere
+}
+
+void
GlobalTxnTestHelper::PrintLog(int64_t start_ts,
+                                   const std::string& log_str,
+                                   int64_t next_wait_time) {
+    // -1 (also the declared default) is logged as "txn will be done";
+    // any other value is logged as the wait in seconds before the next step.
+    if (next_wait_time == -1) {
+        VLOG(13) << "[gtxn_helper] [" << start_ts << "] " << log_str << ", txn will be done.";
+    } else {
+        VLOG(13) << "[gtxn_helper] [" << start_ts << "] " << log_str
+                 << ", next step will begin after [" << next_wait_time << "s]";
+    }
+}
+
+} // namespace tera
+
diff --git a/src/sdk/test/global_txn_testutils.h b/src/sdk/test/global_txn_testutils.h
new file mode 100644
index 000000000..278ef8e68
--- /dev/null
+++ b/src/sdk/test/global_txn_testutils.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_SDK_TEST_GLOBAL_TXN_TESTUTILS_H_
+#define TERA_SDK_TEST_GLOBAL_TXN_TESTUTILS_H_
+
+#include
+
+namespace tera {
+
+// Test-only helper that paces a global transaction through fixed checkpoints
+// using waits read from a flag file, so tests can delay or kill a transaction
+// at a chosen step.
+class GlobalTxnTestHelper {
+public:
+    // Remembers the flag-file path; nothing is loaded until LoadTxnConf().
+    GlobalTxnTestHelper(const std::string& conffile);
+    ~GlobalTxnTestHelper(){}
+    int64_t GetStartTs();
+    int64_t GetPrewriteStartTs();
+    int64_t GetCommitTs();
+    // Passes the next checkpoint; each call advances pos_ by one.
+    void Wait(int64_t start_ts);
+    // Delays the next GET according to get_ts_list_; advances get_pos_.
+    void GetWait(int64_t start_ts);
+    // Loads the flag file and fills ts_ / get_ts_list_ / the derived timestamps.
+    void LoadTxnConf();
+private:
+
+    void ExitNow(int64_t start_ts, int position);
+    void PrintLog(int64_t start_ts,
+                  const std::string& log_str,
+                  int64_t next_wait_time = -1);
+    int pos_;                    // index of the next checkpoint for Wait()
+    size_t get_pos_;             // index of the next entry of get_ts_list_
+    std::string conf_file_;
+    int64_t start_ts_;
+    int64_t prewrite_start_ts_;
+    int64_t commit_ts_;
+    // One slot per checkpoint: holds the configured wait (seconds) until
+    // Wait() passes that checkpoint, then the pass time in microseconds.
+    int64_t ts_[8];
+    // NOTE(review): element type is missing in this copy; the .cc passes the
+    // items to stol(), so presumably strings.
+    std::vector get_ts_list_;
+    int64_t helper_create_time_; // get_millis() at construction
+};
+
+} // namespace tera
+
+#endif // TERA_SDK_TEST_GLOBAL_TXN_TESTUTILS_H_
diff --git a/src/sdk/test/mock_table.h b/src/sdk/test/mock_table.h
new file mode 100644
index 000000000..5d1a75e3b
--- /dev/null
+++ b/src/sdk/test/mock_table.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2015-2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// Author: baorenyi@baidu.com
+
+#ifndef TERA_SDK_TEST_MOCK_TABLE_H_
+#define TERA_SDK_TEST_MOCK_TABLE_H_
+
+#include
+#include
+#include
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "sdk/read_impl.h"
+#include "sdk/mutate_impl.h"
+#include "sdk/table_impl.h"
+
+namespace tera {
+
+// One canned outcome for MockTable::Get(): the row result plus its status.
+struct MockReaderResult {
+    RowResult result;
+    ErrorCode status;
+};
+
+// TableImpl stand-in for unit tests. ApplyMutation() and Get() do no real
+// work: they set the next canned error/result on the operation and run its
+// callback right away. Canned values are consumed in the order they were added.
+// There is no bounds checking, so a test must add at least as many canned
+// values as operations it issues.
+class MockTable: public TableImpl {
+public:
+    MockTable(const std::string& table_name,
+              common::ThreadPool* thread_pool,
+              sdk::ClusterFinder* cluster = NULL)
+        : TableImpl(table_name, thread_pool, cluster) {
+        reader_err_.clear();
+        mu_err_.clear();
+        reader_pos_ = 0;
+        mu_pos_ = 0;
+    }
+    // Completes the mutation with the next error from AddMutationErrors().
+    // NOTE(review): the static_cast target type is missing in this copy.
+    void ApplyMutation(RowMutation* row_mu) {
+        RowMutationImpl* mu = static_cast(row_mu);
+        mu->SetError(mu_err_[mu_pos_++].GetType(),"");
+        mu->RunCallback();
+    }
+
+    // Completes the read. If any reader results were added they take priority
+    // and supply both result and status; otherwise only the next reader error
+    // is used. Both paths advance the same cursor, reader_pos_.
+    void Get(RowReader* reader) {
+        RowReaderImpl* r = static_cast(reader);
+        if (reader_result_.size() > 0) {
+            r->SetResult(reader_result_[reader_pos_].result);
+            r->SetError(reader_result_[reader_pos_++].status.GetType(), "");
+        } else {
+            r->SetError(reader_err_[reader_pos_++].GetType(), "");
+        }
+        r->RunCallback();
+    }
+
+    // Appends canned results for Get().
+    void AddReaderResult(const std::vector& results) {
+        reader_result_.insert(reader_result_.end(),
+                              results.begin(), results.end());
+    }
+
+    // Appends canned errors for Get() (used only while no results were added).
+    void AddReaderErrors(const std::vector& errs) {
+        reader_err_.insert(reader_err_.end(), errs.begin(), errs.end());
+    }
+
+    // Appends canned errors for ApplyMutation().
+    void AddMutationErrors(const std::vector& errs) {
+        mu_err_.insert(mu_err_.end(), errs.begin(), errs.end());
+    }
+private:
+    // NOTE(review): vector element types are missing in this copy.
+    std::vector reader_err_;
+    std::vector mu_err_;
+    std::vector reader_result_;
+    int reader_pos_;   // next index for Get()
+    int mu_pos_;       // next index for ApplyMutation()
+};
+
+} // namespace tera
+
+#endif // TERA_SDK_TEST_MOCK_TABLE_H_
diff --git a/src/sdk/test/scan_impl_test.cc b/src/sdk/test/scan_impl_test.cc
index abef2d305..475e2ff1c 100644
--- a/src/sdk/test/scan_impl_test.cc
+++ b/src/sdk/test/scan_impl_test.cc
@@ -49,21 +49,6 @@ class ScanDescImplTest : public ::testing::Test, public
ScanDescImpl { TableSchema table_schema_; }; -TEST_F(ScanDescImplTest, GetCfType) { - string cf_name, type; - - cf_name = "cf0"; - EXPECT_TRUE(GetCfType(cf_name, &type)); - EXPECT_EQ(type, "int32"); - - cf_name = "cf2"; - EXPECT_TRUE(GetCfType(cf_name, &type)); - EXPECT_EQ(type, "binary"); - - cf_name = "cf100"; - EXPECT_FALSE(GetCfType(cf_name, &type)); -} - TEST_F(ScanDescImplTest, ParseValueCompareFilter) { string filter_str; Filter filter; @@ -76,21 +61,19 @@ TEST_F(ScanDescImplTest, ParseValueCompareFilter) { filter_str = "qualifier10"; EXPECT_FALSE(ParseValueCompareFilter(filter_str, &filter)); - filter_str = "cf0==-10"; + filter_str = "int64cf0==-10"; EXPECT_TRUE(ParseValueCompareFilter(filter_str, &filter)); EXPECT_EQ(filter.type(), BinComp); EXPECT_EQ(filter.bin_comp_op(), EQ); EXPECT_EQ(filter.field(), ValueFilter); EXPECT_EQ(filter.content(), "cf0"); - filter_str = "cf1>1"; + filter_str = "int64cf1>1"; EXPECT_TRUE(ParseValueCompareFilter(filter_str, &filter)); EXPECT_EQ(filter.bin_comp_op(), GT); filter_str = "cf2==hello"; - EXPECT_TRUE(ParseValueCompareFilter(filter_str, &filter)); - EXPECT_EQ(filter.bin_comp_op(), EQ); - EXPECT_EQ(filter.ref_value(), "hello"); + EXPECT_FALSE(ParseValueCompareFilter(filter_str, &filter)); } TEST_F(ScanDescImplTest, ParseSubFilterString) { @@ -104,33 +87,15 @@ TEST_F(ScanDescImplTest, ParseSubFilterString) { filter_str = "qual@ifier10"; EXPECT_FALSE(ParseSubFilterString(filter_str, &filter)); - filter_str = "cf0 == -10"; + filter_str = "int64cf0 == -10"; EXPECT_TRUE(ParseSubFilterString(filter_str, &filter)); EXPECT_EQ(filter.type(), BinComp); EXPECT_EQ(filter.bin_comp_op(), EQ); EXPECT_EQ(filter.field(), ValueFilter); EXPECT_EQ(filter.content(), "cf0"); - filter_str = "cf1 > 1"; + filter_str = "int64cf1 > 1"; EXPECT_TRUE(ParseSubFilterString(filter_str, &filter)); EXPECT_EQ(filter.bin_comp_op(), GT); } - -TEST_F(ScanDescImplTest, ParseFilterString) { - string filter_str; - - filter_str = "cf0 < 10 AND cf1 >100 AND cf2 
== world"; - SetFilterString(filter_str); - EXPECT_TRUE(ParseFilterString()); - EXPECT_EQ(filter_list_.filter_size(), 3); - - filter_str = "cf < 10 AND cf1 >100 AND cf2 == world"; - SetFilterString(filter_str); - EXPECT_FALSE(ParseFilterString()); - - filter_str = "cf0 < 10 OR cf1 >100 AND cf2 == world"; - SetFilterString(filter_str); - EXPECT_FALSE(ParseFilterString()); -} - } // namespace tera diff --git a/src/sdk/test/sdk_test.cc b/src/sdk/test/sdk_test.cc new file mode 100644 index 000000000..7177bdc3a --- /dev/null +++ b/src/sdk/test/sdk_test.cc @@ -0,0 +1,16 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" +DECLARE_bool(tera_sdk_tso_client_enabled); +DECLARE_bool(tera_sdk_client_for_gtxn); + +int main(int argc, char* argv[]) { + FLAGS_tera_sdk_client_for_gtxn = true; + FLAGS_tera_sdk_tso_client_enabled = false; + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/sdk/test/sdk_timeout_manager_test.cc b/src/sdk/test/sdk_timeout_manager_test.cc new file mode 100644 index 000000000..84ea5a4c1 --- /dev/null +++ b/src/sdk/test/sdk_timeout_manager_test.cc @@ -0,0 +1,244 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+
+#include
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+#include "sdk_task.h"
+#include "common/counter.h"
+#include "common/timer.h"
+
+using std::string;
+using namespace std::placeholders;
+DEFINE_int32(thread_num, 10, "thread number for TimeoutManager ThreadPool");
+DEFINE_int32(perf_test_thead_num, 10, "thread number of put/pop");
+DEFINE_int32(perf_test_duration, 2, "seconds for performance test");
+namespace tera {
+
+#define YELLOW "\033[33m" /* Yellow */
+
+// Number of times TimeoutFunc() ran; reset in SetUp().
+static Counter callback_called_times = Counter();
+// Source of task ids for AddTaskFunc()/PopTaskFunc(); reset in SetUp().
+static Counter task_counter = Counter();
+
+// Minimal concrete SdkTask: every override is a stub returning a fixed value.
+class TestTask : public SdkTask {
+public:
+    std::string dummy_key;
+
+    TestTask() : SdkTask(SdkTask::READ) {}
+    virtual ~TestTask() {}
+
+    bool IsAsync() { return false; }
+    uint32_t Size() { return 0; }
+    int64_t TimeOut() { return 0; }
+    void Wait() {}
+    void SetError(ErrorCode::ErrorCodeType err,
+                  const std::string& reason) {}
+    const std::string& RowKey() { return dummy_key; }
+};
+
+// Fixture: each test gets a fresh SdkTimeoutManager on a shared thread pool
+// and starts with both counters cleared.
+// NOTE(review): the TEST_F bodies read the private members below and the
+// manager's internals (map_shard_, timeout_precision_); presumably the test
+// build disables access control -- confirm in the build flags.
+class SdkTimeoutManagerTest : public ::testing::Test {
+public:
+    SdkTimeoutManagerTest() : thread_pool_(FLAGS_thread_num), timeout_manager_(NULL) {}
+
+    virtual void SetUp() {
+        timeout_manager_ = new SdkTimeoutManager(&thread_pool_);
+        ASSERT_TRUE(timeout_manager_ != NULL);
+        callback_called_times.Clear();
+        task_counter.Clear();
+    }
+    virtual void TearDown() {
+        delete timeout_manager_;
+    }
+
+private:
+    common::ThreadPool thread_pool_;
+    SdkTimeoutManager* timeout_manager_ = NULL;
+};
+
+// Timeout callback used by every test: only counts invocations.
+static void TimeoutFunc(SdkTask* task) {
+    callback_called_times.Add(1);
+}
+
+static SdkTask::TimeoutFunc timeout_func = std::bind(TimeoutFunc, _1);
+
+// Puts LOOP_CNT tasks with distinct ids, checks the per-shard bookkeeping,
+// pops every task again and prints single-thread put/pop throughput.
+TEST_F(SdkTimeoutManagerTest, PutTaskPopTaskTest) {
+    const int32_t LOOP_CNT = 10000;
+    int64_t put_start_time = get_micros();
+    bool succ = true;
+    for (int32_t i = 0; i < LOOP_CNT; ++i) {
+        TestTask* sdk_task = new TestTask();
+        sdk_task->SetId(LOOP_CNT - i);
+        succ &= timeout_manager_->PutTask(sdk_task, 5000, timeout_func);
+    }
+    EXPECT_TRUE(succ);
+    int64_t put_done_time
= get_micros();
+
+    uint32_t task_cnt = 0;
+    for (uint32_t i = 0; i < SdkTimeoutManager::kShardNum; ++i) {
+        uint32_t shard_due_cnt = timeout_manager_->map_shard_[i].due_time_map.size();
+        EXPECT_EQ(shard_due_cnt, timeout_manager_->map_shard_[i].id_hash_map.size());
+        task_cnt += shard_due_cnt;
+    }
+    EXPECT_EQ(task_cnt, LOOP_CNT);
+
+    int64_t pop_start_time = get_micros();
+    for (uint32_t shard_idx = 0; shard_idx < SdkTimeoutManager::kShardNum; ++shard_idx) {
+        SdkTimeoutManager::DueTimeMap& due_time_map =
+            timeout_manager_->map_shard_[shard_idx].due_time_map;
+        uint32_t shard_task_cnt = due_time_map.size();
+        uint32_t shard_pop_cnt = 0;
+        while (!due_time_map.empty()) {
+            SdkTask* task = timeout_manager_->PopTask((*due_time_map.begin())->GetId());
+            EXPECT_TRUE(task != NULL);
+            shard_pop_cnt += 1;
+            delete static_cast(task);
+        }
+        EXPECT_EQ(shard_pop_cnt, shard_task_cnt);
+    }
+    int64_t pop_done_time = get_micros();
+
+    std::cout << YELLOW << "SdkTimeoutManager performance(single thread): "
+        << "\n\t\tPutTask: " << int(LOOP_CNT / ((put_done_time - put_start_time + 1) / 1000000.0))
+        << "\n\t\tPopTask: " << int(LOOP_CNT / ((pop_done_time - pop_start_time + 1) / 1000000.0))
+        << std::endl;
+}
+
+// Puts LOOP_CNT tasks with distinct ids and a short timeout, waits until the
+// manager has expired all of them, and checks that the timeout callback ran
+// exactly once per task. Then checks that a second PutTask() with an id that
+// is still pending is rejected and adds no extra callback.
+TEST_F(SdkTimeoutManagerTest, CheckTimeout) {
+    const int32_t LOOP_CNT = 10000;
+    std::vector tasks;
+    tasks.reserve(LOOP_CNT);
+    bool succ = true;
+    for (int32_t i = 0; i < LOOP_CNT; ++i) {
+        TestTask* sdk_task = new TestTask();
+        sdk_task->SetId(i + 1);
+        succ &= timeout_manager_->PutTask(sdk_task, 500, timeout_func);
+        tasks.push_back(sdk_task);
+    }
+    // every PutTask() above used a fresh id, so all of them must have succeeded
+    EXPECT_TRUE(succ);
+    // wait until every SdkTask has been checked for timeout and its TimeoutFunc
+    // has been handed to the thread pool
+    for (uint32_t shard = 0; shard < SdkTimeoutManager::kShardNum; ++shard) {
+        while (!timeout_manager_->map_shard_[shard].due_time_map.empty()){
+            usleep(timeout_manager_->timeout_precision_);
+        }
+    }
+    // wait another 250ms so that all TimeoutFunc calls queued in thread_pool finish
+    usleep(250000);
+    EXPECT_EQ(callback_called_times.Get(), LOOP_CNT);
+
+    // id 100 has already expired, so it can be registered again ...
+    TestTask* sdk_task = new TestTask();
+    sdk_task->SetId(100);
+    EXPECT_TRUE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+    tasks.push_back(sdk_task);
+    // ... but not a second time while it is still pending
+    EXPECT_FALSE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+
+    sdk_task = new TestTask();
+    sdk_task->SetId(100);
+    EXPECT_FALSE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+    tasks.push_back(sdk_task);
+
+    usleep(1000);
+    sdk_task = new TestTask();
+    sdk_task->SetId(100);
+    EXPECT_FALSE(timeout_manager_->PutTask(sdk_task, 500, timeout_func));
+    tasks.push_back(sdk_task);
+    // wait until every SdkTask has been checked for timeout and its TimeoutFunc
+    // has been handed to the thread pool
+    for (uint32_t shard = 0; shard < SdkTimeoutManager::kShardNum; ++shard) {
+        while (!timeout_manager_->map_shard_[shard].due_time_map.empty()){
+            usleep(timeout_manager_->timeout_precision_);
+        }
+    }
+    // wait another 250ms so that all TimeoutFunc calls queued in thread_pool finish
+    usleep(250000);
+    // only the one accepted re-registration adds a callback
+    EXPECT_EQ(callback_called_times.Get(), 1 + LOOP_CNT);
+    for (std::size_t i = 0; i < tasks.size(); ++i) {
+        delete tasks[i];
+    }
+}
+
+// NOTE(review): plain bool written by the test thread and read by the worker
+// threads without synchronization; consider making it an atomic flag.
+static bool add_task_run = true;
+static void AddTaskFunc(SdkTimeoutManager* mgr, int64_t timeout) {
+    while (add_task_run) {
+        SdkTask* task = new TestTask();
+        task->SetId(task_counter.Add(1));
+        mgr->PutTask(task, timeout, timeout_func);
+    }
+}
+
+static void PopTaskFunc(SdkTimeoutManager* mgr) {
+    int64_t task_id;
+    while ((task_id = task_counter.Sub(1) + 1) > 0) {
+        SdkTask* task = mgr->PopTask(task_id);
+        delete static_cast(task);
+    }
+}
+
+TEST_F(SdkTimeoutManagerTest, PutPopPerformance) {
+    std::vector threads;
+    threads.reserve(FLAGS_perf_test_thead_num);
+    add_task_run = true;
+    int64_t timeout = FLAGS_perf_test_duration * 1000 + 1000;
+    for (int32_t i = 0; i < FLAGS_perf_test_thead_num; ++i) {
+        threads.emplace_back(std::thread(std::bind(&AddTaskFunc, timeout_manager_, timeout)));
+    }
+
sleep(FLAGS_perf_test_duration); + add_task_run = false; + for (std::size_t i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + int64_t task_cnt = task_counter.Get(); + + int64_t pop_start_time = get_micros(); + for (int i = 0; i < FLAGS_perf_test_thead_num; ++i) { + threads.emplace_back(std::thread(std::bind(PopTaskFunc, timeout_manager_))); + } + for (std::size_t i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + int64_t pop_end_time = get_micros(); + std::cout << YELLOW + << "SdkTimeoutManager performance(" << FLAGS_perf_test_thead_num <<" put/pop threads): " + << "\n\t\tPutTask: " << task_cnt / FLAGS_perf_test_duration + << "\n\t\tPopTask: " << int(task_cnt / ((pop_end_time - pop_start_time) / 1000000.0)) + << std::endl; +} + +TEST_F(SdkTimeoutManagerTest, CheckTimeoutPerformance) { + common::ThreadPool thread_pool(FLAGS_thread_num); + SdkTimeoutManager* timeout_mgr = new SdkTimeoutManager(&thread_pool); + + std::vector threads; + threads.reserve(FLAGS_perf_test_thead_num); + add_task_run = true; + // timeout set to 1us + int64_t timeout = 1; + int64_t start_time = get_micros(); + for (int32_t i = 0; i < FLAGS_perf_test_thead_num; ++i) { + threads.emplace_back(std::thread(std::bind(&AddTaskFunc, timeout_mgr, timeout))); + } + sleep(FLAGS_perf_test_duration); + add_task_run = false; + int64_t end_time = get_micros(); + for (std::size_t i = 0; i < threads.size(); ++i) { + threads[i].join(); + } + threads.clear(); + int64_t callback_run_cnt = callback_called_times.Get(); + int64_t pending_cnt = task_counter.Get() - callback_run_cnt; + delete timeout_mgr; + + std::cout << YELLOW + << "SdkTimeoutManager performance@CheckTimeout(" + << FLAGS_perf_test_thead_num <<" put threads, " + << FLAGS_thread_num << "TimeoutFunc run threads): " + << "\n\t\tPutTask: " << task_counter.Get() / FLAGS_perf_test_duration + << "\n\t\tPending: " << pending_cnt / FLAGS_perf_test_duration + << "\n\t\tCheckTimeout: " <DisableNotify(); + 
tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + cfd2->DisableNotify(); + auto before_num = schema.LocalityGroupNum(); + EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num); } -TEST(SdkUtils, ParseCfNameType) { - string in, name, type; - - in = "cf"; - ASSERT_TRUE(ParseCfNameType(in, &name, &type)); - ASSERT_TRUE(name == "cf"); - ASSERT_TRUE(type == ""); - ASSERT_TRUE(ParseCfNameType(in, NULL, &type)); - - in = ""; - ASSERT_TRUE(ParseCfNameType(in, &name, &type)); - ASSERT_TRUE(name == ""); - ASSERT_TRUE(type == ""); - ASSERT_TRUE(ParseCfNameType(in, &name, NULL)); - - in = "cf"; - ASSERT_TRUE(ParseCfNameType(in, &name, &type)); - ASSERT_TRUE(name == "cf"); - ASSERT_TRUE(type == "int"); - ASSERT_TRUE(ParseCfNameType(in, NULL, NULL)); - - in = ""; - ASSERT_TRUE(ParseCfNameType(in, &name, &type)); - ASSERT_TRUE(name == ""); - ASSERT_TRUE(type == "int"); - - in = "cf<"; - ASSERT_FALSE(ParseCfNameType(in, &name, &type)); - - in = "cf1int>"; - ASSERT_FALSE(ParseCfNameType(in, &name, &type)); - - in = "<>"; - ASSERT_FALSE(ParseCfNameType(in, &name, &type)); -} - -TEST(SdkUtils, CommaInBracket) { - string test; - - test = "0123,{67,90,23},6,89,1{3,567,}01{345}789,12"; - EXPECT_TRUE(CommaInBracket(test, 8)); - EXPECT_TRUE(CommaInBracket(test, 13)); - EXPECT_TRUE(CommaInBracket(test, 23)); - EXPECT_TRUE(CommaInBracket(test, 27)); - EXPECT_TRUE(CommaInBracket(test, 34)); - - EXPECT_FALSE(CommaInBracket(test, 2)); - EXPECT_FALSE(CommaInBracket(test, 4)); - EXPECT_FALSE(CommaInBracket(test, 15)); - EXPECT_FALSE(CommaInBracket(test, 20)); - EXPECT_FALSE(CommaInBracket(test, 37)); -} - -TEST(SdkUtils, SplitCfSchema) { - string schema; - std::vector cfs; - - schema = "cf1"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 1); - - schema = "cf1,cf2,cf3"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 3); - - schema = "cf2{prop1,prop2}"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 1); 
- - schema = "cf1,cf2{prop1,prop2},cf3{prop2}"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 3); - - schema = "cf1{prop1,prop2,prop3},cf2,cf3{prop1,prop2,prop3}"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 3); - - schema = "cf1{prop1,prop2,prop3},cf2{prop1,prop2,prop3},cf3"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 3); - - schema = "cf1,cf2{prop1,prop2,prop3},cf3{prop1,prop2,prop3}"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 3); - - schema = "cf1{prop1,prop2,prop3},cf2{prop1,prop2,prop3},cf3{prop1,prop2,prop3}"; - SplitCfSchema(schema, &cfs); - EXPECT_EQ(cfs.size(), 3); +TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor1) { + // some disable notify + tera::TableDescriptor schema("t1"); + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1"); + cfd1->EnableNotify(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + cfd2->DisableNotify(); + auto before_num = schema.LocalityGroupNum(); + EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num + 1); } -TEST(SdkUtils, ParseProperty) { - string schema; - PropertyList prop_list; - string name; - - schema = "name{prop1,prop2=value2,prop3=value3}"; - ASSERT_TRUE(ParseProperty(schema, &name, &prop_list)); - ASSERT_TRUE(name == "name"); - ASSERT_EQ(prop_list.size(), 3); - ASSERT_TRUE(prop_list[0].first == "prop1"); - ASSERT_TRUE(prop_list[0].second == ""); - ASSERT_TRUE(prop_list[1].first == "prop2"); - ASSERT_TRUE(prop_list[1].second == "value2"); - ASSERT_TRUE(prop_list[2].first == "prop3"); - ASSERT_TRUE(prop_list[2].second == "value3"); - - schema = "{prop1,prop2=value2}"; - ASSERT_TRUE(ParseProperty(schema, &name, &prop_list)); - ASSERT_TRUE(name == ""); - ASSERT_EQ(prop_list.size(), 2); - ASSERT_TRUE(prop_list[0].first == "prop1"); - ASSERT_TRUE(prop_list[0].second == ""); - ASSERT_TRUE(prop_list[1].first == "prop2"); - ASSERT_TRUE(prop_list[1].second == 
"value2"); - - schema = "name"; - ASSERT_TRUE(ParseProperty(schema, &name, &prop_list)); - ASSERT_TRUE(name == "name"); - ASSERT_EQ(prop_list.size(), 0); - - schema = ""; - ASSERT_TRUE(ParseProperty(schema, &name, &prop_list)); - ASSERT_TRUE(name == ""); - ASSERT_EQ(prop_list.size(), 0); - - schema = "nameprop1,prop2=value2,prop3=value3}"; - ASSERT_FALSE(ParseProperty(schema, &name, &prop_list)); - - schema = "name{prop1,pr'op2=value2,prop3=value3}"; - ASSERT_FALSE(ParseProperty(schema, &name, &prop_list)); - - schema = "name{0prop1,prop2=value2,prop3=value3}"; - ASSERT_FALSE(ParseProperty(schema, &name, &prop_list)); +TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor2) { + // some disable notify + tera::TableDescriptor schema("t1"); + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1"); + cfd1->DisableNotify(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + cfd2->EnableNotify(); + auto before_num = schema.LocalityGroupNum(); + EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num + 1); } -TEST(SdkUtils, ParseScanSchema) { - ScanDescriptor desc("row1"); - ScanDescImpl* impl; - string schema; - - schema = "SELECT cf0,cf1:qu2"; - ASSERT_TRUE(ParseScanSchema(schema, &desc)); - impl = desc.GetImpl(); - ASSERT_EQ(impl->GetSizeofColumnFamilyList(), 2); - ASSERT_TRUE(impl->GetFilterString() == ""); - - schema = "SELECT cf0,cf1:qu2 WHERE cf0 < 10 AND cf1 > 23"; - ASSERT_TRUE(ParseScanSchema(schema, &desc)); - impl = desc.GetImpl(); - ASSERT_EQ(impl->GetSizeofColumnFamilyList(), 2); - ASSERT_TRUE(impl->GetFilterString() == "cf0 < 10 AND cf1 > 23"); +TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor3) { + // all enable notify + tera::TableDescriptor schema("t1"); + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1"); + cfd1->EnableNotify(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + 
cfd2->EnableNotify(); + auto before_num = schema.LocalityGroupNum(); + EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num + 1); } -TEST(SdkUtils, BuildSchema) { - string schema = "lg0:cf1,cf2|lg3:cf3,cf4,cf5"; - - TableDescriptor table_desc("unittest"); - ParseSchema(schema, &table_desc); - - string schema_t; - BuildSchema(&table_desc, &schema_t); - EXPECT_TRUE(schema == schema_t); +TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor4) { + // have lg named 'notify' but not set any cf 'notify=on' + tera::TableDescriptor schema("t1"); + schema.AddLocalityGroup("notify"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1", "notify"); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2", "notify"); + auto before_num = schema.LocalityGroupNum(); + EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num); } -TEST(SdkUtils, HasInvalidCharInSchema) { - EXPECT_FALSE(HasInvalidCharInSchema("")); - EXPECT_FALSE(HasInvalidCharInSchema("table:splitsize=3,lg0:compress=none")); - - EXPECT_TRUE(HasInvalidCharInSchema("\n \t`~!@#$%^&*()-+{}[]\\|;\"'.<>?/")); - EXPECT_TRUE(HasInvalidCharInSchema("table:splitsize=3;lg0:compress=none")); +TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor5) { + // have lg named 'notify' and set some cf 'notify=on' + tera::TableDescriptor schema("t1"); + schema.AddLocalityGroup("notify"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("cf1", "notify"); + cfd1->EnableNotify(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2", "notify"); + auto before_num = schema.LocalityGroupNum(); + EXPECT_FALSE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num); } -TEST(SdkUtils, PrefixType) { - EXPECT_TRUE(PrefixType("compress") == "lg"); - EXPECT_TRUE(PrefixType("storage") == "lg"); - EXPECT_TRUE(PrefixType("blocksize") == "lg"); - EXPECT_TRUE(PrefixType("ttl") == "cf"); 
- EXPECT_TRUE(PrefixType("maxversions") == "cf"); - EXPECT_TRUE(PrefixType("minversions") == "cf"); - EXPECT_TRUE(PrefixType("diskquota") == "cf"); - EXPECT_TRUE(PrefixType("splitsize") == "unknown"); // only support lg && cf - EXPECT_TRUE(PrefixType("anythingother") == "unknown"); +TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor6) { + // have cf named '_N_' but not set any cf 'notify=on' + tera::TableDescriptor schema("t1"); + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("_N_"); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + auto before_num = schema.LocalityGroupNum(); + EXPECT_TRUE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num); } -TEST(SdkUtils, ParsePrefixPropertyValue) { - string prefix; - string property; - string value; - EXPECT_TRUE(ParsePrefixPropertyValue("lg123:compress=none", prefix, property, value)); - - EXPECT_FALSE(ParsePrefixPropertyValue(":ttl=3", prefix, property, value)); - EXPECT_FALSE(ParsePrefixPropertyValue("cf123:=3", prefix, property, value)); - EXPECT_FALSE(ParsePrefixPropertyValue("cf123:ttl=", prefix, property, value)); - EXPECT_FALSE(ParsePrefixPropertyValue("ttl", prefix, property, value)); - EXPECT_FALSE(ParsePrefixPropertyValue("cf123:ttl", prefix, property, value)); - EXPECT_FALSE(ParsePrefixPropertyValue("cf123:ttl:3", prefix, property, value)); +TEST(SdkUtilsTest, ExtendNotifyLgToDescriptor7) { + // have cf named '_N_' but some set cf 'notify=on' + tera::TableDescriptor schema("t1"); + schema.AddLocalityGroup("lg0"); + tera::ColumnFamilyDescriptor* cfd1 = schema.AddColumnFamily("_N_"); + cfd1->EnableNotify(); + tera::ColumnFamilyDescriptor* cfd2 = schema.AddColumnFamily("cf2"); + auto before_num = schema.LocalityGroupNum(); + EXPECT_FALSE(ExtendNotifyLgToDescriptor(&schema)); + EXPECT_TRUE(schema.LocalityGroupNum() == before_num); } -} // namespace sdk } // namespace tera diff --git 
a/src/sdk/timeoracle_client_impl.cc b/src/sdk/timeoracle_client_impl.cc
new file mode 100644
index 000000000..7f0e16b6e
--- /dev/null
+++ b/src/sdk/timeoracle_client_impl.cc
@@ -0,0 +1,118 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "sdk/timeoracle_client_impl.h"
+#include
+#include
+
+#include "common/timer.h"
+
+namespace tera {
+namespace timeoracle {
+
+// Connects the RPC client to the address currently reported by the cluster
+// finder. The thread pool and cluster finder are stored, not owned.
+TimeoracleClientImpl::TimeoracleClientImpl(ThreadPool* thread_pool,
+                                           sdk::ClusterFinder* cluster_finder,
+                                           int32_t rpc_timeout) :
+    RpcClient(cluster_finder->TimeoracleAddr()),
+    thread_pool_(thread_pool),
+    rpc_timeout_(rpc_timeout),
+    update_timestamp_(0),
+    cluster_finder_(cluster_finder) {}
+
+// Re-queries the cluster finder for the timeoracle address and resets the RPC
+// client to it. last_timestamp is the start time (microseconds) of the request
+// that failed: if a refresh has already happened after that time the call does
+// nothing, so failures that predate the last refresh do not trigger another
+// one. Passing 0 always refreshes. Calls are serialized by mutex_.
+void TimeoracleClientImpl::refresh_timeoracle_address(int64_t last_timestamp) {
+    std::unique_lock lock_guard(mutex_);
+    if (last_timestamp > 0 && last_timestamp < update_timestamp_) {
+        return;
+    }
+
+    LOG(INFO) << "TimeoracleClientImpl try to update cluster, before is " << GetConnectAddr();
+    // presumably `true` forces the finder to bypass its cached address -- TODO confirm
+    std::string addr = cluster_finder_->TimeoracleAddr(true);
+    ResetClient(addr);
+    LOG(INFO) << "TimeoracleClientImpl update cluster, current is " << GetConnectAddr();
+    update_timestamp_ = get_micros();
+}
+
+// Blocking variant: requests `count` timestamps and returns
+// response.start_timestamp(). Returns 0 when the server answers with a
+// non-OK status or when the RPC itself fails; an RPC failure also refreshes
+// the timeoracle address.
+int64_t TimeoracleClientImpl::GetTimestamp(uint32_t count) {
+    GetTimestampRequest request;
+    GetTimestampResponse response;
+
+    request.set_count(count);
+
+    // left empty -- presumably this selects the synchronous path of
+    // SendMessageWithRetry(); confirm in RpcClient
+    std::function done;
+
+    if (SendMessageWithRetry(&TimeoracleServer::Stub::GetTimestamp,
+                             &request,
+                             &response,
+                             done,
+                             "GetTimestamp",
+                             rpc_timeout_,
+                             thread_pool_)) {
+        int code = response.status();
+        if (code != kTimeoracleOk) {
+            // Internal Error
+            return 0;
+        }
+        return response.start_timestamp();
+    }
+
+    // Rpc Failed
+    refresh_timeoracle_address(0);
+    return 0;
+}
+
+// Callback variant: returns true when the request was handed to the RPC layer;
+// the timestamp (or 0 on any failure) is then delivered through `callback`
+// from OnRpcFinished(). Request and response are heap-allocated here and
+// released by OnRpcFinished().
+// NOTE(review): if SendMessageWithRetry() can return false without ever
+// invoking `done`, request/response are leaked on that path -- confirm.
+bool TimeoracleClientImpl::GetTimestamp(uint32_t count, std::function callback) {
+    auto request = new GetTimestampRequest();
+    auto response = new GetTimestampResponse();
+    request->set_count(count);
+    int64_t start_time = get_micros();
+
+    std::function done
+        = std::bind(&TimeoracleClientImpl::OnRpcFinished, this, start_time, callback,
+                    std::placeholders::_1, std::placeholders::_2,
+                    std::placeholders::_3, std::placeholders::_4);
+
+    if (SendMessageWithRetry(&TimeoracleServer::Stub::GetTimestamp,
+                             request,
+                             response,
+                             done,
+                             "GetTimestamp",
+                             rpc_timeout_,
+                             thread_pool_)) {
+        return true;
+    }
+
+    // Rpc Failed
+    refresh_timeoracle_address(0);
+    return false;
+}
+
+// Completion handler for the callback variant. Takes ownership of request and
+// response, reports 0 to the caller on RPC error or non-OK status, and on RPC
+// error refreshes the address unless it was already refreshed after
+// start_time.
+void TimeoracleClientImpl::OnRpcFinished(int64_t start_time,
+                                         std::function callback,
+                                         const GetTimestampRequest* request,
+                                         GetTimestampResponse* response,
+                                         bool rpc_error,
+                                         int error_code){
+    std::unique_ptr req_hold(request);
+    std::unique_ptr res_hold(response);
+
+    if (rpc_error) {
+        LOG(ERROR) << "RpcRequest failed for GetTimestamp, errno=" << error_code;
+        callback(0);
+        refresh_timeoracle_address(start_time);
+        return ;
+    }
+
+    int64_t ts = response->start_timestamp();
+
+    int code = response->status();
+
+    if (code != kTimeoracleOk) {
+        ts = 0;
+    }
+
+    callback(ts);
+}
+
+} // namespace timeoracle
+} // namespace tera
diff --git a/src/sdk/timeoracle_client_impl.h b/src/sdk/timeoracle_client_impl.h
new file mode 100644
index 000000000..e47fe9995
--- /dev/null
+++ b/src/sdk/timeoracle_client_impl.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#ifndef TERA_SDK_TIMEORACLE_CLIENT_IMPL_H_ +#define TERA_SDK_TIMEORACLE_CLIENT_IMPL_H_ + +#include +#include +#include +#include + +#include "proto/timeoracle_rpc.pb.h" +#include "proto/rpc_client.h" +#include "sdk/sdk_zk.h" + +DECLARE_int32(tera_rpc_timeout_period); + +namespace tera { +namespace timeoracle { + +class TimeoracleClientImpl : public RpcClient { +public: + TimeoracleClientImpl(ThreadPool* thread_pool, + sdk::ClusterFinder* cluster_finder, + int32_t rpc_timeout = FLAGS_tera_rpc_timeout_period); + + ~TimeoracleClientImpl() {} + + int64_t GetTimestamp(uint32_t count); + + bool GetTimestamp(uint32_t count, std::function callback); + +private: + void refresh_timeoracle_address(int64_t last_timestamp); + + void OnRpcFinished(int64_t start_time, + std::function callback, + const GetTimestampRequest* request, + GetTimestampResponse* response, + bool rpc_error, + int error_code); + +private: + ThreadPool* thread_pool_; + int32_t rpc_timeout_; + + std::mutex mutex_; + int64_t update_timestamp_; + sdk::ClusterFinder* cluster_finder_; +}; + +} // namespace timeoracle +} // namespace tera + +#endif // TERA_SDK_TIMEORACLE_CLIENT_IMPL_H_ diff --git a/src/tabletnode/remote_tabletnode.cc b/src/tabletnode/remote_tabletnode.cc index 2d95a0e5a..87f1a71de 100644 --- a/src/tabletnode/remote_tabletnode.cc +++ b/src/tabletnode/remote_tabletnode.cc @@ -5,14 +5,18 @@ #include "tabletnode/remote_tabletnode.h" #include +#include #include "gflags/gflags.h" #include "glog/logging.h" +#include "common/metric/metric_counter.h" +#include "common/metric/ratio_subscriber.h" +#include "common/metric/prometheus_subscriber.h" #include "tabletnode/tabletnode_impl.h" -#include "utils/counter.h" +#include "tabletnode/tabletnode_metric_name.h" #include "utils/network_utils.h" -#include "utils/timer.h" +#include "common/timer.h" DECLARE_int32(tera_tabletnode_ctrl_thread_num); DECLARE_int32(tera_tabletnode_write_thread_num); @@ -22,14 +26,93 @@ 
DECLARE_int32(tera_tabletnode_manual_compact_thread_num); DECLARE_int32(tera_request_pending_limit); DECLARE_int32(tera_scan_request_pending_limit); -extern tera::Counter read_pending_counter; -extern tera::Counter write_pending_counter; -extern tera::Counter scan_pending_counter; -extern tera::Counter compact_pending_counter; - namespace tera { namespace tabletnode { +//Add SubscriberType::SUM for caculating SLA +tera::MetricCounter read_request_counter(kRequestCountMetric, kApiLabelRead, + {SubscriberType::QPS, SubscriberType::SUM}); +tera::MetricCounter write_request_counter(kRequestCountMetric, kApiLabelWrite, + {SubscriberType::QPS, SubscriberType::SUM}); +tera::MetricCounter scan_request_counter(kRequestCountMetric, kApiLabelScan, {SubscriberType::QPS}); + +tera::MetricCounter read_pending_counter(kPendingCountMetric, kApiLabelRead, {SubscriberType::LATEST}, false); +tera::MetricCounter write_pending_counter(kPendingCountMetric, kApiLabelWrite, {SubscriberType::LATEST}, false); +tera::MetricCounter scan_pending_counter(kPendingCountMetric, kApiLabelScan, {SubscriberType::LATEST}, false); +tera::MetricCounter compact_pending_counter(kPendingCountMetric, kApiLabelCompact, {SubscriberType::LATEST}, false); + +//Add SubscriberType::SUM for caculating SLA +tera::MetricCounter read_reject_counter(kRejectCountMetric, kApiLabelRead, + {SubscriberType::QPS, SubscriberType::SUM}); +tera::MetricCounter write_reject_counter(kRejectCountMetric, kApiLabelWrite, + {SubscriberType::QPS, SubscriberType::SUM}); +tera::MetricCounter scan_reject_counter(kRejectCountMetric, kApiLabelScan, {SubscriberType::QPS}); + +tera::MetricCounter finished_read_request_counter(kFinishedRequestCountMetric, kApiLabelRead, {SubscriberType::QPS}); +tera::MetricCounter finished_write_request_counter(kFinishedRequestCountMetric, kApiLabelWrite, {SubscriberType::QPS}); +tera::MetricCounter finished_scan_request_counter(kFinishedRequestCountMetric, kApiLabelScan, {SubscriberType::QPS}); + +//These 
three metrics are not auto registered with a subscriber, they are used for ratio subscriber. +tera::MetricCounter read_delay(kRequestDelayMetric, kApiLabelRead, {}); +tera::MetricCounter write_delay(kRequestDelayMetric, kApiLabelWrite, {}); +tera::MetricCounter scan_delay(kRequestDelayMetric, kApiLabelScan, {}); + +tera::AutoSubscriberRegister rand_read_delay_per_request(std::unique_ptr(new tera::RatioSubscriber( + MetricId("tera_ts_read_delay_us_per_request"), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRequestDelayMetric, kApiLabelRead), SubscriberType::SUM)), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kFinishedRequestCountMetric, kApiLabelRead), SubscriberType::SUM))))); + +tera::AutoSubscriberRegister write_delay_per_request(std::unique_ptr(new tera::RatioSubscriber( + MetricId("tera_ts_write_delay_us_per_request"), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRequestDelayMetric, kApiLabelWrite), SubscriberType::SUM)), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kFinishedRequestCountMetric, kApiLabelWrite), SubscriberType::SUM))))); + +tera::AutoSubscriberRegister scan_delay_per_request(std::unique_ptr(new tera::RatioSubscriber( + MetricId("tera_ts_scan_delay_us_per_request"), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kRequestDelayMetric, kApiLabelScan), SubscriberType::SUM)), + std::unique_ptr(new tera::PrometheusSubscriber(MetricId(kFinishedRequestCountMetric, kApiLabelScan), SubscriberType::SUM))))); + +void ReadDoneWrapper::Run() { + if (response_->has_detail()) { + int64_t now_us = get_micros(); + int64_t used_us = now_us - start_micros_; + if (used_us <= 0) { + LOG(ERROR) << "now us: "<< now_us << " start_us: "<< start_micros_; + } + finished_read_request_counter.Add(response_->detail().status_size()); + read_delay.Add(used_us); + } + delete this; +} + +void WriteDoneWrapper::Run() { + if (response_->row_status_list_size() != 0) { + int64_t now_us = get_micros(); + int64_t used_us = 
now_us - start_micros_; + if (used_us <= 0) { + LOG(ERROR) << "now us: "<< now_us << " start_us: "<< start_micros_; + } + + finished_write_request_counter.Add(response_->row_status_list_size()); + write_delay.Add(used_us); + } + delete this; +} + +void ScanDoneWrapper::Run() { + if (response_->has_results()) { + int64_t now_us = get_micros(); + int64_t used_us = now_us - start_micros_; + if (used_us <= 0) { + LOG(ERROR) << "now us: "<< now_us << " start_us: "<< start_micros_; + } + + finished_scan_request_counter.Add(response_->results().key_values_size()); + scan_delay.Add(used_us); + } + delete this; +} + enum RpcType { RPC_READ = 1, RPC_SCAN = 2 @@ -105,11 +188,16 @@ void RemoteTabletNode::ReadTablet(google::protobuf::RpcController* controller, const ReadTabletRequest* request, ReadTabletResponse* response, google::protobuf::Closure* done) { + int64_t start_micros = get_micros(); + done = ReadDoneWrapper::NewInstance(start_micros, response, done); VLOG(8) << "accept RPC (ReadTablet): [" << request->tablet_name() << "] " << tera::utils::GetRemoteAddress(controller); static uint32_t last_print = time(NULL); + int32_t row_num = request->row_info_list_size(); + read_request_counter.Add(row_num); if (read_pending_counter.Get() > FLAGS_tera_request_pending_limit) { response->set_sequence_id(request->sequence_id()); response->set_status(kTabletNodeIsBusy); + read_reject_counter.Add(row_num); done->Run(); uint32_t now_time = time(NULL); if (now_time > last_print) { @@ -118,9 +206,7 @@ void RemoteTabletNode::ReadTablet(google::protobuf::RpcController* controller, } VLOG(8) << "finish RPC (ReadTablet)"; } else { - int32_t row_num = request->row_info_list_size(); read_pending_counter.Add(row_num); - int64_t start_micros = get_micros(); ReadRpcTimer* timer = new ReadRpcTimer(request, response, done, start_micros); RpcTimerList::Instance()->Push(timer); @@ -136,11 +222,16 @@ void RemoteTabletNode::WriteTablet(google::protobuf::RpcController* controller, const 
WriteTabletRequest* request, WriteTabletResponse* response, google::protobuf::Closure* done) { + int64_t start_micros = get_micros(); + done = WriteDoneWrapper::NewInstance(start_micros, response, done); VLOG(8) << "accept RPC (WriteTablet): [" << request->tablet_name() << "] " << tera::utils::GetRemoteAddress(controller); static uint32_t last_print = time(NULL); + int32_t row_num = request->row_list_size(); + write_request_counter.Add(row_num); if (write_pending_counter.Get() > FLAGS_tera_request_pending_limit) { response->set_sequence_id(request->sequence_id()); response->set_status(kTabletNodeIsBusy); + write_reject_counter.Add(row_num); done->Run(); uint32_t now_time = time(NULL); if (now_time > last_print) { @@ -149,9 +240,7 @@ void RemoteTabletNode::WriteTablet(google::protobuf::RpcController* controller, } VLOG(8) << "finish RPC (WriteTablet)"; } else { - int32_t row_num = request->row_list_size(); write_pending_counter.Add(row_num); - int64_t start_micros = get_micros(); WriteRpcTimer* timer = new WriteRpcTimer(request, response, done, start_micros); RpcTimerList::Instance()->Push(timer); ThreadPool::Task callback = @@ -165,10 +254,13 @@ void RemoteTabletNode::ScanTablet(google::protobuf::RpcController* controller, const ScanTabletRequest* request, ScanTabletResponse* response, google::protobuf::Closure* done) { + done = ScanDoneWrapper::NewInstance(get_micros(), response, done); VLOG(8) << "accept RPC (ScanTablet): [" << request->table_name() << "] " << tera::utils::GetRemoteAddress(controller); + scan_request_counter.Inc(); if (scan_pending_counter.Get() > FLAGS_tera_scan_request_pending_limit) { response->set_sequence_id(request->sequence_id()); response->set_status(kTabletNodeIsBusy); + scan_reject_counter.Inc(); done->Run(); VLOG(8) << "finish RPC (ScanTablet)"; } else { @@ -254,6 +346,18 @@ void RemoteTabletNode::SplitTablet(google::protobuf::RpcController* controller, ctrl_thread_pool_->AddTask(callback); } +void 
RemoteTabletNode::ComputeSplitKey(google::protobuf::RpcController* controller, + const SplitTabletRequest* request, + SplitTabletResponse* response, + google::protobuf::Closure* done) { + uint64_t id = request->sequence_id(); + LOG(INFO) << "accept RPC (ComputeSplitKey) id: " << id << ", src: " << tera::utils::GetRemoteAddress(controller); + ThreadPool::Task callback = + std::bind(&RemoteTabletNode::DoComputeSplitKey, this, controller, + request, response, done); + ctrl_thread_pool_->AddTask(callback); +} + void RemoteTabletNode::CompactTablet(google::protobuf::RpcController* controller, const CompactTabletRequest* request, CompactTabletResponse* response, @@ -322,7 +426,7 @@ void RemoteTabletNode::DoReadTablet(google::protobuf::RpcController* controller, int64_t read_timeout = request->client_timeout_ms() * 1000; // ms -> us int64_t detal = get_micros() - start_micros; if (detal > read_timeout) { - VLOG(5) << "timeout, drop read request for:" << request->tablet_name() + LOG(WARNING) << "timeout, drop read request for:" << request->tablet_name() << ", detal(in us):" << detal << ", read_timeout(in us):" << read_timeout; is_read_timeout = true; @@ -335,6 +439,7 @@ void RemoteTabletNode::DoReadTablet(google::protobuf::RpcController* controller, response->set_sequence_id(request->sequence_id()); response->set_success_num(0); response->set_status(kTableIsBusy); + read_reject_counter.Inc(); done->Run(); } @@ -431,6 +536,16 @@ void RemoteTabletNode::DoSplitTablet(google::protobuf::RpcController* controller LOG(INFO) << "finish RPC (SplitTablet) id: " << id; } +void RemoteTabletNode::DoComputeSplitKey(google::protobuf::RpcController* controller, + const SplitTabletRequest* request, + SplitTabletResponse* response, + google::protobuf::Closure* done) { + uint64_t id = request->sequence_id(); + LOG(INFO) << "run RPC (ComputeSplitKey) id: " << id; + tabletnode_impl_->ComputeSplitKey(request, response, done); + LOG(INFO) << "finish RPC (ComputeSplitKey) id: " << id; +} + void 
RemoteTabletNode::DoCompactTablet(google::protobuf::RpcController* controller, const CompactTabletRequest* request, CompactTabletResponse* response, diff --git a/src/tabletnode/remote_tabletnode.h b/src/tabletnode/remote_tabletnode.h index 93e692121..936a3ff12 100644 --- a/src/tabletnode/remote_tabletnode.h +++ b/src/tabletnode/remote_tabletnode.h @@ -7,6 +7,7 @@ #include "common/base/scoped_ptr.h" #include "common/thread_pool.h" +#include "common/request_done_wrapper.h" #include "proto/tabletnode_rpc.pb.h" #include "tabletnode/rpc_schedule.h" @@ -17,6 +18,82 @@ namespace tabletnode { class TabletNodeImpl; + +class ReadDoneWrapper final : public RequestDoneWrapper { +public: + static google::protobuf::Closure* NewInstance(int64_t start_micros, + ReadTabletResponse* response, + google::protobuf::Closure* done) { + return new ReadDoneWrapper(start_micros, response, done); + } + + virtual void Run() override; + + virtual ~ReadDoneWrapper() {} + +protected: + //Just Can Create on Heap; + ReadDoneWrapper(int64_t start_micros, + ReadTabletResponse* response, + google::protobuf::Closure* done): + RequestDoneWrapper(done), + start_micros_(start_micros), + response_(response) { } + + int64_t start_micros_; + ReadTabletResponse* response_; +}; + +class WriteDoneWrapper final : public RequestDoneWrapper { +public: + static google::protobuf::Closure* NewInstance(int64_t start_micros, + WriteTabletResponse* response, + google::protobuf::Closure* done) { + return new WriteDoneWrapper(start_micros, response, done); + } + + virtual void Run() override; + + virtual ~WriteDoneWrapper() {} + +protected: + //Just Can Create on Heap; + WriteDoneWrapper(int64_t start_micros, + WriteTabletResponse* response, + google::protobuf::Closure* done): + RequestDoneWrapper(done), + start_micros_(start_micros), + response_(response) { } + + int64_t start_micros_; + WriteTabletResponse* response_; +}; + +class ScanDoneWrapper final : public RequestDoneWrapper { +public: + static 
google::protobuf::Closure* NewInstance(int64_t start_micros, + ScanTabletResponse* response, + google::protobuf::Closure* done) { + return new ScanDoneWrapper(start_micros, response, done); + } + + virtual void Run() override; + + virtual ~ScanDoneWrapper() {} + +protected: + //Just Can Create on Heap; + ScanDoneWrapper(int64_t start_micros, + ScanTabletResponse* response, + google::protobuf::Closure* done): + RequestDoneWrapper(done), + start_micros_(start_micros), + response_(response) { } + + int64_t start_micros_; + ScanTabletResponse* response_; +}; + class RemoteTabletNode : public TabletNodeServer { public: explicit RemoteTabletNode(TabletNodeImpl* tabletnode_impl); @@ -72,6 +149,11 @@ class RemoteTabletNode : public TabletNodeServer { SplitTabletResponse* response, google::protobuf::Closure* done); + void ComputeSplitKey(google::protobuf::RpcController* controller, + const SplitTabletRequest* request, + SplitTabletResponse* response, + google::protobuf::Closure* done); + void CompactTablet(google::protobuf::RpcController* controller, const CompactTabletRequest* request, CompactTabletResponse* response, @@ -139,6 +221,10 @@ class RemoteTabletNode : public TabletNodeServer { const SplitTabletRequest* request, SplitTabletResponse* response, google::protobuf::Closure* done); + void DoComputeSplitKey(google::protobuf::RpcController* controller, + const SplitTabletRequest* request, + SplitTabletResponse* response, + google::protobuf::Closure* done); void DoMergeTablet(google::protobuf::RpcController* controller, const MergeTabletRequest* request, diff --git a/src/tabletnode/rpc_schedule_policy.cc b/src/tabletnode/rpc_schedule_policy.cc index 2c43156ab..99a897dee 100644 --- a/src/tabletnode/rpc_schedule_policy.cc +++ b/src/tabletnode/rpc_schedule_policy.cc @@ -8,7 +8,7 @@ #include "glog/logging.h" -#include "utils/timer.h" +#include "common/timer.h" namespace tera { namespace tabletnode { diff --git a/src/tabletnode/tabletnode_entry.cc 
b/src/tabletnode/tabletnode_entry.cc index 37ac8409f..a81628b14 100644 --- a/src/tabletnode/tabletnode_entry.cc +++ b/src/tabletnode/tabletnode_entry.cc @@ -9,6 +9,7 @@ #include "common/base/string_ext.h" #include "common/base/string_number.h" +#include "common/metric/collector_report.h" #include "common/net/ip_address.h" #include "common/this_thread.h" #include "common/thread_attributes.h" @@ -19,20 +20,21 @@ #include "proto/tabletnode.pb.h" #include "tabletnode/remote_tabletnode.h" #include "tabletnode/tabletnode_impl.h" -#include "utils/counter.h" +#include "common/counter.h" #include "utils/rpc_timer_list.h" -#include "utils/timer.h" +#include "common/timer.h" #include "utils/utils_cmd.h" DECLARE_string(tera_tabletnode_port); DECLARE_int32(tera_garbage_collect_period); -DECLARE_bool(tera_zk_enabled); DECLARE_bool(tera_tabletnode_cpu_affinity_enabled); DECLARE_string(tera_tabletnode_cpu_affinity_set); DECLARE_bool(tera_tabletnode_hang_detect_enabled); DECLARE_int32(tera_tabletnode_hang_detect_threshold); DECLARE_int32(tera_tabletnode_rpc_server_max_inflow); DECLARE_int32(tera_tabletnode_rpc_server_max_outflow); +DECLARE_bool(tera_metric_http_server_enable); +DECLARE_int32(tera_metric_http_server_listen_port); std::string GetTeraEntryName() { return "tabletnode"; @@ -47,7 +49,8 @@ namespace tabletnode { TabletNodeEntry::TabletNodeEntry() : tabletnode_impl_(NULL), - remote_tabletnode_(NULL) { + remote_tabletnode_(NULL), + metric_http_server_(new tera::MetricHttpServer()) { sofa::pbrpc::RpcServerOptions rpc_options; rpc_options.max_throughput_in = FLAGS_tera_tabletnode_rpc_server_max_inflow; rpc_options.max_throughput_out = FLAGS_tera_tabletnode_rpc_server_max_outflow; @@ -78,14 +81,23 @@ bool TabletNodeEntry::StartServer() { return false; } LOG(INFO) << "finish starting RPC server"; + + // start metric http server + if (FLAGS_tera_metric_http_server_enable) { + if(!metric_http_server_->Start(FLAGS_tera_metric_http_server_listen_port)) { + LOG(WARNING) << "Start 
metric http server failed. Ignore"; + } + } else { + LOG(INFO) << "Metric http server is disabled."; + } return true; } void TabletNodeEntry::ShutdownServer() { + metric_http_server_->Stop(); tabletnode_impl_->Exit(); - LOG(INFO) << "shut down server"; - rpc_server_->Stop(); LOG(INFO) << "TabletNodeEntry stop done!"; + _exit(0); } bool TabletNodeEntry::Run() { @@ -99,20 +111,17 @@ bool TabletNodeEntry::Run() { tabletnode_impl_->GarbageCollect(); } + CollectorReportPublisher::GetInstance().Refresh(); tabletnode_impl_->RefreshSysInfo(); tabletnode_impl_->GetSysInfo().DumpLog(); LOG(INFO) << "[ThreadPool schd/task/cnt] " << remote_tabletnode_->ProfilingLog(); - LOG(INFO) << "[Cache HitRate/Cnt/Size] table_cache " - << tabletnode_impl_->TableCacheProfileInfo() - << ", block_cache " << tabletnode_impl_->BlockCacheProfileInfo(); - int64_t now_time = get_micros(); int64_t earliest_rpc_time = now_time; RpcTimerList::Instance()->TopTime(&earliest_rpc_time); double max_delay = (now_time - earliest_rpc_time) / 1000.0; - VLOG(5) << "pending rpc max delay: " + LOG(INFO) << "pending rpc max delay: " << std::fixed<< std::setprecision(2) << max_delay; if (FLAGS_tera_tabletnode_hang_detect_enabled && max_delay > FLAGS_tera_tabletnode_hang_detect_threshold) { diff --git a/src/tabletnode/tabletnode_entry.h b/src/tabletnode/tabletnode_entry.h index a27a89747..ec87acc2b 100644 --- a/src/tabletnode/tabletnode_entry.h +++ b/src/tabletnode/tabletnode_entry.h @@ -10,6 +10,7 @@ #include #include "common/base/scoped_ptr.h" +#include "common/metric/metric_http_server.h" #include "tera_entry.h" namespace tera { @@ -37,6 +38,7 @@ class TabletNodeEntry : public TeraEntry { scoped_ptr tabletnode_impl_; RemoteTabletNode* remote_tabletnode_; scoped_ptr rpc_server_; + scoped_ptr metric_http_server_; }; } // namespace tabletnode diff --git a/src/tabletnode/tabletnode_impl.cc b/src/tabletnode/tabletnode_impl.cc index aed9d27f8..42f720723 100644 --- a/src/tabletnode/tabletnode_impl.cc +++ 
b/src/tabletnode/tabletnode_impl.cc @@ -14,6 +14,10 @@ #include "db/filename.h" #include "db/table_cache.h" +#include "common/metric/cache_collector.h" +#include "common/metric/prometheus_subscriber.h" +#include "common/metric/ratio_collector.h" +#include "common/metric/metric_counter.h" #include "common/thread.h" #include "io/io_utils.h" #include "io/utils_leveldb.h" @@ -28,12 +32,13 @@ #include "proto/proto_helper.h" #include "proto/tabletnode_client.h" #include "tabletnode/tablet_manager.h" +#include "tabletnode/tabletnode_metric_name.h" #include "tabletnode/tabletnode_zk_adapter.h" #include "types.h" #include "utils/config_utils.h" -#include "utils/counter.h" +#include "common/counter.h" #include "utils/string_util.h" -#include "utils/timer.h" +#include "common/timer.h" #include "utils/utils_cmd.h" DECLARE_string(tera_tabletnode_port); @@ -84,6 +89,7 @@ DECLARE_string(tera_leveldb_env_type); DECLARE_string(tera_local_addr); DECLARE_bool(tera_ins_enabled); DECLARE_bool(tera_mock_ins_enabled); +DECLARE_string(tera_coord_type); DECLARE_bool(tera_io_cache_path_vanish_allowed); DECLARE_int64(tera_tabletnode_tcm_cache_size); @@ -92,20 +98,49 @@ DECLARE_string(flagfile); using namespace std::placeholders; -extern tera::Counter range_error_counter; -extern tera::Counter rand_read_delay; - static const int GC_LOG_LEVEL = FLAGS_tera_tabletnode_gc_log_level; +namespace leveldb { +extern tera::Counter snappy_before_size_counter; +extern tera::Counter snappy_after_size_counter; +} + namespace tera { namespace tabletnode { +using tera::SubscriberType; + +tera::MetricCounter read_error_counter(kErrorCountMetric, kApiLabelRead, + {SubscriberType::QPS, SubscriberType::SUM}); +tera::MetricCounter write_error_counter(kErrorCountMetric, kApiLabelWrite, + {SubscriberType::QPS, SubscriberType::SUM}); +tera::MetricCounter scan_error_counter(kErrorCountMetric, kApiLabelScan, + {SubscriberType::QPS, SubscriberType::SUM}); + +tera::MetricCounter 
read_range_error_counter(kRangeErrorMetric, kApiLabelRead, {SubscriberType::QPS}); +tera::MetricCounter write_range_error_counter(kRangeErrorMetric, kApiLabelWrite, {SubscriberType::QPS}); +tera::MetricCounter scan_range_error_counter(kRangeErrorMetric, kApiLabelScan, {SubscriberType::QPS}); + +TabletNodeImpl::CacheMetrics::CacheMetrics(leveldb::Cache* block_cache, leveldb::TableCache* table_cache) + : block_cache_hitrate_(kBlockCacheHitRateMetric, + std::unique_ptr(new LRUCacheCollector(block_cache, CacheCollectType::kHitRate))), + block_cache_entries_(kBlockCacheEntriesMetric, + std::unique_ptr(new LRUCacheCollector(block_cache, CacheCollectType::kEntries))), + block_cache_charge_(kBlockCacheChargeMetric, + std::unique_ptr(new LRUCacheCollector(block_cache, CacheCollectType::kCharge))), + table_cache_hitrate_(kTableCacheHitRateMetric, + std::unique_ptr(new TableCacheCollector(table_cache, CacheCollectType::kHitRate))), + table_cache_entries_(kTableCacheEntriesMetric, + std::unique_ptr(new TableCacheCollector(table_cache, CacheCollectType::kEntries))), + table_cache_charge_(kTableCacheChargeMetric, + std::unique_ptr(new TableCacheCollector(table_cache, CacheCollectType::kCharge))) {} TabletNodeImpl::TabletNodeImpl() : status_(kNotInited), tablet_manager_(new TabletManager()), zk_adapter_(NULL), release_cache_timer_id_(kInvalidTimerId), - thread_pool_(new ThreadPool(FLAGS_tera_tabletnode_impl_thread_max_num)) { + thread_pool_(new ThreadPool(FLAGS_tera_tabletnode_impl_thread_max_num)), + cache_metrics_(NULL) { if (FLAGS_tera_local_addr == "") { local_addr_ = utils::GetLocalHostName()+ ":" + FLAGS_tera_tabletnode_port; } else { @@ -157,24 +192,42 @@ TabletNodeImpl::~TabletNodeImpl() { } bool TabletNodeImpl::Init() { - if (FLAGS_tera_zk_enabled) { + if (FLAGS_tera_coord_type.empty()) { + LOG(ERROR) << "Note: We don't recommend that use '" + << "--tera_[zk|ins|mock_zk|mock_ins]_enabled' flag for your cluster coord" + << " replace by 
'--tera_coord_type=[zk|ins|mock_zk|mock_ins|fake_zk]'" + << " flag is usually recommended."; + } + if (FLAGS_tera_coord_type == "zk" || + (FLAGS_tera_coord_type.empty() && FLAGS_tera_zk_enabled)) { zk_adapter_.reset(new TabletNodeZkAdapter(this, local_addr_)); - } else if(FLAGS_tera_ins_enabled) { + } else if (FLAGS_tera_coord_type == "ins" || + (FLAGS_tera_coord_type.empty() && FLAGS_tera_ins_enabled)) { LOG(INFO) << "ins mode!"; zk_adapter_.reset(new InsTabletNodeZkAdapter(this, local_addr_)); - } else if (FLAGS_tera_mock_zk_enabled) { + } else if (FLAGS_tera_coord_type == "mock_zk" || + (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_zk_enabled)) { LOG(INFO) << "mock zk mode!"; zk_adapter_.reset(new MockTabletNodeZkAdapter(this, local_addr_)); - } else if (FLAGS_tera_mock_ins_enabled) { + } else if (FLAGS_tera_coord_type == "mock_ins" || + (FLAGS_tera_coord_type.empty() && FLAGS_tera_mock_ins_enabled)) { LOG(INFO) << "mock ins mode!"; zk_adapter_.reset(new MockInsTabletNodeZkAdapter(this, local_addr_)); - } else { + } else if (FLAGS_tera_coord_type == "fake_zk" || + FLAGS_tera_coord_type.empty()) { LOG(INFO) << "fake zk mode!"; zk_adapter_.reset(new FakeTabletNodeZkAdapter(this, local_addr_)); } SetTabletNodeStatus(kIsIniting); thread_pool_->AddTask(std::bind(&TabletNodeZkAdapterBase::Init, zk_adapter_.get())); + + // register cache metrics + cache_metrics_.reset(new CacheMetrics(ldb_block_cache_, ldb_table_cache_)); + // register snappy metrics + snappy_ratio_metric_.reset(new AutoCollectorRegister(kSnappyCompressionRatioMetric, std::unique_ptr( + new RatioCollector(&leveldb::snappy_before_size_counter, &leveldb::snappy_after_size_counter, true)))); + return true; } @@ -208,6 +261,8 @@ void TabletNodeImpl::InitCacheSystem() { } bool TabletNodeImpl::Exit() { + cache_metrics_.reset(NULL); + std::vector tablet_ios; tablet_manager_->GetAllTablets(&tablet_ios); @@ -309,6 +364,11 @@ void TabletNodeImpl::LoadTablet(const LoadTabletRequest* request, CHECK(i < 2) << 
"parent_tablets should less than 2: " << i; parent_tablets.push_back(request->parent_tablets(i)); } + std::set ignore_err_lgs; + for (int i = 0; i < request->ignore_err_lgs_size(); ++i) { + VLOG(10) << "oops lg:" << request->ignore_err_lgs(i); + ignore_err_lgs.insert(request->ignore_err_lgs(i)); + } io::TabletIO* tablet_io = NULL; StatusCode status = kTabletNodeOk; @@ -324,7 +384,7 @@ void TabletNodeImpl::LoadTablet(const LoadTabletRequest* request, ///TODO: User per user memery_cache according to user quota. tablet_io->SetMemoryCache(m_memory_cache); if (!tablet_io->Load(schema, request->path(), parent_tablets, - snapshots, rollbacks, ldb_logger_, + ignore_err_lgs, snapshots, rollbacks, ldb_logger_, ldb_block_cache_, ldb_table_cache_, &status)) { tablet_io->DecRef(); LOG(ERROR) << "fail to load tablet: " << request->path() @@ -466,28 +526,50 @@ void TabletNodeImpl::ReadTablet(int64_t start_micros, const ReadTabletRequest* request, ReadTabletResponse* response, google::protobuf::Closure* done) { + bool is_timeout = false; int32_t row_num = request->row_info_list_size(); uint64_t snapshot_id = request->snapshot_id() == 0 ? 
0 : request->snapshot_id(); uint32_t read_success_num = 0; + int64_t client_timeout_ms = std::numeric_limits::max() / 2; + if (request->has_client_timeout_ms()) { + client_timeout_ms = request->client_timeout_ms(); + } + int64_t end_time_ms = start_micros / 1000 + client_timeout_ms; + VLOG(20) << "start_ms: " << start_micros / 1000 << ", client_timeout_ms: " << client_timeout_ms + << " end_ms: " << end_time_ms; + for (int32_t i = 0; i < row_num; i++) { + int64_t time_remain_ms = end_time_ms - GetTimeStampInMs(); StatusCode row_status = kTabletNodeOk; io::TabletIO* tablet_io = tablet_manager_->GetTablet( request->tablet_name(), request->row_info_list(i).key(), &row_status); if (tablet_io == NULL) { - range_error_counter.Inc(); + read_error_counter.Inc(); + read_range_error_counter.Inc(); response->mutable_detail()->add_status(kKeyNotInRange); } else { + VLOG(20) << "time_remain_ms: " << time_remain_ms; if (tablet_io->ReadCells(request->row_info_list(i), response->mutable_detail()->add_row_result(), - snapshot_id, &row_status)) { + snapshot_id, &row_status, time_remain_ms)) { read_success_num++; } else { + if (row_status != kKeyNotExist && row_status != kRPCTimeout) { + read_error_counter.Inc(); + } response->mutable_detail()->mutable_row_result()->RemoveLast(); } tablet_io->DecRef(); response->mutable_detail()->add_status(row_status); } + + if (row_status == kRPCTimeout) { + is_timeout = true; + LOG(WARNING) << "seq_id: " << request->sequence_id() << " timeout," + << " clinet_timeout_ms: " << request->client_timeout_ms(); + break; + } } VLOG(10) << "seq_id: " << request->sequence_id() @@ -495,15 +577,14 @@ void TabletNodeImpl::ReadTablet(int64_t start_micros, << ", read_suc: " << read_success_num; response->set_sequence_id(request->sequence_id()); response->set_success_num(read_success_num); - response->set_status(kTabletNodeOk); - done->Run(); - int64_t now_ms = get_micros(); - int64_t used_ms = now_ms - start_micros; - if (used_ms <= 0) { - LOG(ERROR) << "now ms: 
"<< now_ms << " start_ms: "<< start_micros; + if (is_timeout) { + response->set_status(kRPCTimeout); + } else { + response->set_status(kTabletNodeOk); } - rand_read_delay.Add(used_ms); + + done->Run(); } void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request, @@ -527,12 +608,12 @@ void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request, return; } - Counter* row_done_counter = new Counter; + std::shared_ptr row_done_counter(new Counter); for (int32_t i = 0; i < row_num; i++) { io::TabletIO* tablet_io = tablet_manager_->GetTablet( request->tablet_name(), request->row_list(i).row_key(), &status); if (tablet_io == NULL) { - range_error_counter.Inc(); + write_range_error_counter.Inc(); } it = tablet_task_map.find(tablet_io); WriteTabletTask* tablet_task = NULL; @@ -579,6 +660,7 @@ void TabletNodeImpl::WriteTablet(const WriteTabletRequest* request, void TabletNodeImpl::WriteTabletFail(WriteTabletTask* tablet_task, StatusCode status) { int32_t row_num = tablet_task->row_status_vec.size(); + write_error_counter.Add(row_num); for (int32_t i = 0; i < row_num; i++) { tablet_task->row_status_vec[i] = status; } @@ -600,7 +682,6 @@ void TabletNodeImpl::WriteTabletCallback(WriteTabletTask* tablet_task, RpcTimerList::Instance()->Erase(tablet_task->timer); delete tablet_task->timer; } - delete tablet_task->row_done_counter; } delete tablet_task; @@ -806,12 +887,14 @@ void TabletNodeImpl::ScanTablet(const ScanTabletRequest* request, request->start(), &status); if (tablet_io == NULL) { - range_error_counter.Inc(); + scan_range_error_counter.Inc(); response->set_status(status); done->Run(); } else { response->set_end(tablet_io->GetEndKey()); - tablet_io->ScanRows(request, response, done); + if (!tablet_io->ScanRows(request, response, done)) { + scan_error_counter.Inc(); + } tablet_io->DecRef(); } } @@ -837,6 +920,14 @@ void TabletNodeImpl::SplitTablet(const SplitTabletRequest* request, done->Run(); return; } + // Master is not responsible for update children 
tablets to meta table, refuse to split + if (!request->has_master_update_meta() || !request->master_update_meta()) { + LOG(ERROR) << kSms <<"SplitRequest without master_update_meta, maybe " + "request from old master, refuse split!" << *tablet_io; + response->set_status(kTableNotSupport); + done->Run(); + + } if (!tablet_io->Split(&split_key, &status)) { LOG(ERROR) << "fail to split tablet: " << tablet_io->GetTablePath() @@ -852,10 +943,6 @@ void TabletNodeImpl::SplitTablet(const SplitTabletRequest* request, done->Run(); return; } - uint64_t tablet_size = 0; - tablet_io->GetDataSize(&tablet_size); - int64_t first_half_size = tablet_size / 2; - int64_t second_half_size = tablet_size / 2; LOG(INFO) << "split tablet: " << tablet_io->GetTablePath() << " [" << DebugString(tablet_io->GetStartKey()) << ", " << DebugString(tablet_io->GetEndKey()) @@ -888,11 +975,58 @@ void TabletNodeImpl::SplitTablet(const SplitTabletRequest* request, << ", " << DebugString(request->key_range().key_end()) << "], status: " << StatusCodeToString(status); } + response->set_status(kTabletNodeOk); + response->add_split_keys(split_key); + done->Run(); +} - UpdateMetaTableAsync(request, response, done, path, split_key, schema, - first_half_size, second_half_size, request->tablet_meta()); +void TabletNodeImpl::ComputeSplitKey(const SplitTabletRequest* request, + SplitTabletResponse* response, + google::protobuf::Closure* done) { + response->set_sequence_id(request->sequence_id()); + + std::string split_key = request->split_key(); + std::string path; + StatusCode status = kTabletNodeOk; + io::TabletIO* tablet_io = tablet_manager_->GetTablet(request->tablet_name(), + request->key_range().key_start(), + request->key_range().key_end(), + &status); + if (tablet_io == NULL) { + LOG(WARNING) << "split fail to get tablet: " << request->tablet_name() + << " [" << DebugString(request->key_range().key_start()) + << ", " << DebugString(request->key_range().key_end()) + << "], status: " << 
StatusCodeToString(status); + response->set_status(kKeyNotInRange); + done->Run(); + return; + } + + if (!tablet_io->Split(&split_key, &status)) { + LOG(ERROR) << "fail to split tablet: " << tablet_io->GetTablePath() + << " [" << DebugString(tablet_io->GetStartKey()) + << ", " << DebugString(tablet_io->GetEndKey()) + << "], split_key: " << DebugString(split_key) << ". status: " << StatusCodeToString(status); + if (status == kTableNotSupport) { + response->set_status(kTableNotSupport); + } else { + response->set_status((StatusCode)tablet_io->GetStatus()); + } + tablet_io->DecRef(); + done->Run(); + return; + } + LOG(INFO) << "split tablet: " << tablet_io->GetTablePath() + << " [" << DebugString(tablet_io->GetStartKey()) + << ", " << DebugString(tablet_io->GetEndKey()) + << "], split key: " << DebugString(split_key); + response->set_status(kTabletNodeOk); + response->add_split_keys(split_key); + tablet_io->DecRef(); + done->Run(); } + bool TabletNodeImpl::CheckInKeyRange(const KeyList& key_list, const std::string& key_start, const std::string& key_end) { @@ -954,7 +1088,7 @@ void TabletNodeImpl::LeaveSafeMode() { void TabletNodeImpl::ExitService() { LOG(FATAL) << "master kick me!"; - exit(1); + _exit(1); } void TabletNodeImpl::SetTabletNodeStatus(const TabletNodeStatus& status) { @@ -971,96 +1105,6 @@ void TabletNodeImpl::SetRootTabletAddr(const std::string& root_tablet_addr) { root_tablet_addr_ = root_tablet_addr; } -void TabletNodeImpl::UpdateMetaTableAsync(const SplitTabletRequest* rpc_request, - SplitTabletResponse* rpc_response, google::protobuf::Closure* rpc_done, - const std::string& path, const std::string& key_split, - const TableSchema& schema, int64_t first_size, int64_t second_size, - const TabletMeta& meta) { - WriteTabletRequest* request = new WriteTabletRequest; - WriteTabletResponse* response = new WriteTabletResponse; - request->set_sequence_id(this_sequence_id_++); - request->set_tablet_name(FLAGS_tera_master_meta_table_name); - 
request->set_is_sync(true); - request->set_is_instant(true); - - TabletMeta tablet_meta; - tablet_meta.CopyFrom(meta); - tablet_meta.set_server_addr(local_addr_); - tablet_meta.clear_parent_tablets(); - tablet_meta.add_parent_tablets(leveldb::GetTabletNumFromPath(path)); - - std::string meta_key, meta_value; - VLOG(5) << "update meta for split tablet: " << path - << " [" << DebugString(rpc_request->key_range().key_start()) - << ", " << DebugString(rpc_request->key_range().key_end()) << "]"; - - CHECK(2 == rpc_request->child_tablets_size()); - // first write 2nd half - tablet_meta.set_path(leveldb::GetChildTabletPath(path, rpc_request->child_tablets(0))); - tablet_meta.set_size(second_size); - tablet_meta.mutable_key_range()->set_key_start(key_split); - tablet_meta.mutable_key_range()->set_key_end(rpc_request->key_range().key_end()); - MakeMetaTableKeyValue(tablet_meta, &meta_key, &meta_value); - RowMutationSequence* mu_seq = request->add_row_list(); - mu_seq->set_row_key(meta_key); - Mutation* mutation = mu_seq->add_mutation_sequence(); - mutation->set_type(kPut); - mutation->set_value(meta_value); - VLOG(5) << "write meta: key [" << DebugString(meta_key) - << "], value_size: " << meta_value.size(); - - // then write 1st half - // update root_tablet_addr in fake zk mode - if (!FLAGS_tera_zk_enabled) { - zk_adapter_->GetRootTableAddr(&root_tablet_addr_); - } - TabletNodeClient meta_tablet_client(root_tablet_addr_); - - tablet_meta.set_path(leveldb::GetChildTabletPath(path, rpc_request->child_tablets(1))); - tablet_meta.set_size(first_size); - tablet_meta.mutable_key_range()->set_key_start(rpc_request->key_range().key_start()); - tablet_meta.mutable_key_range()->set_key_end(key_split); - MakeMetaTableKeyValue(tablet_meta, &meta_key, &meta_value); - mu_seq = request->add_row_list(); - mu_seq->set_row_key(meta_key); - mutation = mu_seq->add_mutation_sequence(); - mutation->set_type(kPut); - mutation->set_value(meta_value); - VLOG(5) << "write meta: key [" << 
DebugString(meta_key) - << "], value_size: " << meta_value.size(); - - std::function done = - std::bind(&TabletNodeImpl::UpdateMetaTableCallback, this, rpc_request, - rpc_response, rpc_done, _1, _2, _3, _4); - meta_tablet_client.WriteTablet(request, response, done); -} - - -void TabletNodeImpl::UpdateMetaTableCallback(const SplitTabletRequest* rpc_request, - SplitTabletResponse* rpc_response, google::protobuf::Closure* rpc_done, - WriteTabletRequest* request, WriteTabletResponse* response, bool failed, - int error_code) { - if (failed) { - rpc_response->set_status(kMetaTabletError); - } else if (response->status() != kTabletNodeOk) { - LOG(ERROR) << "fail to update meta for tablet: " - << request->tablet_name() << " [" - << DebugString(rpc_request->key_range().key_start()) - << ", " << DebugString(rpc_request->key_range().key_end()) - << "], status: " << StatusCodeToString(response->status()); - rpc_response->set_status(kMetaTabletError); - } else { - LOG(INFO) << "split tablet success: " << rpc_request->tablet_name() - << " [" << DebugString(rpc_request->key_range().key_start()) - << ", " << DebugString(rpc_request->key_range().key_end()) << "]"; - rpc_response->set_status(kTabletNodeOk); - } - - delete request; - delete response; - rpc_done->Run(); -} - /* * all cached tablets/files: * ------------------------------------------ @@ -1191,22 +1235,6 @@ std::string TabletNodeImpl::GetSessionId() { return session_id_; } -std::string TabletNodeImpl::BlockCacheProfileInfo() { - std::stringstream ss; - ss << ldb_block_cache_->HitRate(true); - ss << " " << ldb_block_cache_->Entries(); - ss << " " << ldb_block_cache_->TotalCharge(); - return ss.str(); -} - -std::string TabletNodeImpl::TableCacheProfileInfo() { - std::stringstream ss; - ss << ldb_table_cache_->HitRate(true); - ss << " " << ldb_table_cache_->TableEntries(); - ss << " " << ldb_table_cache_->ByteSize(); - return ss.str(); -} - TabletNodeSysInfo& TabletNodeImpl::GetSysInfo() { return sysinfo_; } diff --git 
a/src/tabletnode/tabletnode_impl.h b/src/tabletnode/tabletnode_impl.h index ed19d4ad6..b4d327a2d 100644 --- a/src/tabletnode/tabletnode_impl.h +++ b/src/tabletnode/tabletnode_impl.h @@ -6,8 +6,10 @@ #define TERA_TABLETNODE_TABLETNODE_IMPL_H_ #include +#include #include "common/base/scoped_ptr.h" +#include "common/metric/collector_report_publisher.h" #include "common/thread_pool.h" #include "io/tablet_io.h" @@ -38,7 +40,7 @@ class TabletNodeImpl { std::vector row_mutation_vec; std::vector row_status_vec; std::vector row_index_vec; - Counter* row_done_counter; + std::shared_ptr row_done_counter; const WriteTabletRequest* request; WriteTabletResponse* response; @@ -46,7 +48,7 @@ class TabletNodeImpl { WriteRpcTimer* timer; WriteTabletTask(const WriteTabletRequest* req, WriteTabletResponse* resp, - google::protobuf::Closure* d, WriteRpcTimer* t, Counter* c) + google::protobuf::Closure* d, WriteRpcTimer* t, std::shared_ptr c) : row_done_counter(c), request(req), response(resp), done(d), timer(t) {} }; @@ -112,6 +114,9 @@ class TabletNodeImpl { void SplitTablet(const SplitTabletRequest* request, SplitTabletResponse* response, google::protobuf::Closure* done); + void ComputeSplitKey(const SplitTabletRequest* request, + SplitTabletResponse* response, + google::protobuf::Closure* done); void EnterSafeMode(); void LeaveSafeMode(); @@ -125,10 +130,6 @@ class TabletNodeImpl { void SetSessionId(const std::string& session_id); std::string GetSessionId(); - std::string BlockCacheProfileInfo(); - - std::string TableCacheProfileInfo(); - TabletNodeSysInfo& GetSysInfo(); void RefreshSysInfo(); @@ -157,15 +158,6 @@ class TabletNodeImpl { const std::string& key_start, const std::string& key_end); - void UpdateMetaTableAsync(const SplitTabletRequest* request, - SplitTabletResponse* response, google::protobuf::Closure* done, - const std::string& path, const std::string& key_split, - const TableSchema& schema, int64_t first_size, int64_t second_size, - const TabletMeta& meta); - void 
UpdateMetaTableCallback(const SplitTabletRequest* rpc_request, - SplitTabletResponse* rpc_response, google::protobuf::Closure* rpc_done, - WriteTabletRequest* request, WriteTabletResponse* response, - bool failed, int error_code); void InitCacheSystem(); @@ -206,6 +198,22 @@ class TabletNodeImpl { leveldb::Cache* ldb_block_cache_; leveldb::Cache* m_memory_cache; leveldb::TableCache* ldb_table_cache_; + + // metric for caches + struct CacheMetrics { + tera::AutoCollectorRegister block_cache_hitrate_; + tera::AutoCollectorRegister block_cache_entries_; + tera::AutoCollectorRegister block_cache_charge_; + + tera::AutoCollectorRegister table_cache_hitrate_; + tera::AutoCollectorRegister table_cache_entries_; + tera::AutoCollectorRegister table_cache_charge_; + + CacheMetrics(leveldb::Cache* block_cache, leveldb::TableCache* table_cache); + }; + + scoped_ptr cache_metrics_; + scoped_ptr snappy_ratio_metric_; }; } // namespace tabletnode diff --git a/src/tabletnode/tabletnode_metric_name.h b/src/tabletnode/tabletnode_metric_name.h new file mode 100644 index 000000000..bca35a3dd --- /dev/null +++ b/src/tabletnode/tabletnode_metric_name.h @@ -0,0 +1,113 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef TERA_TABLETNODE_TABLETNODE_METRIC_NAME_H_ +#define TERA_TABLETNODE_TABLETNODE_METRIC_NAME_H_ + +#include + +#include "common/metric/hardware_collectors.h" + +namespace tera { +namespace tabletnode { + +// api labels +const char* const kApiLabelRead = "api:read"; +const char* const kApiLabelWrite = "api:write"; +const char* const kApiLabelScan = "api:scan"; +const char* const kApiLabelCompact = "api:compact"; + +// env lables +const char* const kEnvLabelDfs = "env:dfs"; +const char* const kEnvLabelSsd = "env:ssd"; +const char* const kEnvLabelPosix = "env:posix"; +const char* const kEnvLabelOther = "env:other"; + +// metric names +const char* const kRequestCountMetric = "tera_ts_request_count"; +const char* const kPendingCountMetric = "tera_ts_pending_count"; +const char* const kRejectCountMetric = "tera_ts_reject_count"; +const char* const kErrorCountMetric = "tera_ts_error_count"; +const char* const kRangeErrorMetric = "tera_ts_range_error_count"; + +const char* const kRowDelayMetric = "tera_ts_row_delay_us_total"; +const char* const kRowCountMetric = "tera_ts_row_count"; +const char* const kRowThroughPutMetric = "tera_ts_row_through_put"; +const char* const kLowLevelReadMetric = "tera_ts_low_level_read"; + +const char* const kRequestDelayMetric = "tera_ts_request_delay_us_total"; +const char* const kFinishedRequestCountMetric = "tera_ts_finished_request_count"; + +// cache metric names +const char* const kBlockCacheHitRateMetric = "tera_ts_block_cache_hit_percentage"; +const char* const kBlockCacheEntriesMetric = "tera_ts_block_cache_entry_count"; +const char* const kBlockCacheChargeMetric = "tera_ts_block_cache_charge_bytes"; + +const char* const kTableCacheHitRateMetric = "tera_ts_table_cache_hit_percentage"; +const char* const kTableCacheEntriesMetric = "tera_ts_table_cache_entry_count"; +const char* const kTableCacheChargeMetric = "tera_ts_table_cache_charge_bytes"; + +// env metric names +const char* const kDfsReadBytesThroughPut = 
"tera_ts_dfs_read_bytes_through_put"; +const char* const kDfsWriteBytesThroughPut = "tera_ts_dfs_write_bytes_through_put"; +const char* const kDfsReadDelayMetric = "tera_ts_dfs_read_delay_us_total"; +const char* const kDfsWriteDelayMetric = "tera_ts_dfs_write_delay_us_total"; +const char* const kDfsSyncDelayMetric = "tera_ts_dfs_sync_delay_us_total"; +const char* const kDfsReadCountMetric = "tera_ts_dfs_read_count"; +const char* const kDfsWriteCountMetric = "tera_ts_dfs_write_count"; +const char* const kDfsSyncCountMetric = "tera_ts_dfs_sync_count"; +const char* const kDfsReadDelayPerRequestMetric = "tera_ts_dfs_read_delay_us_per_request"; +const char* const kDfsWriteDelayPerRequestMetric = "tera_ts_dfs_write_delay_us_per_request"; +const char* const kDfsSyncDelayPerRequestMetric = "tera_ts_dfs_sync_delay_us_per_request"; +const char* const kDfsFlushCountMetric = "tera_ts_dfs_flush_count"; +const char* const kDfsListCountMetric = "tera_ts_dfs_list_count"; +const char* const kDfsOtherCountMetric = "tera_ts_dfs_other_count"; +const char* const kDfsExistsCountMetric = "tera_ts_dfs_exists_count"; +const char* const kDfsOpenCountMetric = "tera_ts_dfs_open_count"; +const char* const kDfsCloseCountMetric = "tera_ts_dfs_close_count"; +const char* const kDfsDeleteCountMetric = "tera_ts_dfs_delete_count"; +const char* const kDfsTellCountMetric = "tera_ts_dfs_tell_count"; +const char* const kDfsInfoCountMetric = "tera_ts_dfs_info_count"; +const char* const kDfsReadHangMetric = "tera_ts_dfs_read_hang_total"; +const char* const kDfsWriteHangMetric = "tera_ts_dfs_write_hang_total"; +const char* const kDfsSyncHangMetric = "tera_ts_dfs_sync_hang_total"; +const char* const kDfsFlushHangMetric = "tera_ts_dfs_flush_hang_total"; +const char* const kDfsListHangMetric = "tera_ts_dfs_list_hang_total"; +const char* const kDfsOtherHangMetric = "tera_ts_dfs_other_hang_total"; +const char* const kDfsExistsHangMetric = "tera_ts_dfs_exists_hang_total"; +const char* const kDfsOpenHangMetric = 
"tera_ts_dfs_open_hang_total"; +const char* const kDfsCloseHangMetric = "tera_ts_dfs_close_hang_total"; +const char* const kDfsDeleteHangMetric = "tera_ts_dfs_delete_hang_total"; +const char* const kDfsTellHangMetric = "tera_ts_dfs_tell_hang_total"; +const char* const kDfsInfoHangMetric = "tera_ts_dfs_info_hang_total"; + +const char* const kSsdReadCountMetric = "tera_ts_ssd_read_count"; +const char* const kSsdReadThroughPutMetric = "tera_ts_ssd_read_through_put"; +const char* const kSsdWriteCountMetric = "tera_ts_ssd_write_count"; +const char* const kSsdWriteThroughPutMetric = "tera_ts_ssd_write_through_put"; + +const char* const kPosixReadThroughPutMetric = "tera_ts_posix_read_through_put"; +const char* const kPosixWriteThroughPutMetric = "tera_ts_posix_write_through_put"; +const char* const kPosixReadCountMetric = "tera_ts_posix_read_count"; +const char* const kPosixWriteCountMetric = "tera_ts_posix_write_count"; +const char* const kPosixSyncCountMetric = "tera_ts_posix_sync_count"; +const char* const kPosixListCountMetric = "tera_ts_posix_list_count"; +const char* const kPosixExistsCountMetric = "tera_ts_posix_exists_count"; +const char* const kPosixOpenCountMetric = "tera_ts_posix_open_count"; +const char* const kPosixCloseCountMetric = "tera_ts_posix_close_count"; +const char* const kPosixDeleteCountMetric = "tera_ts_posix_delete_count"; +const char* const kPosixTellCountMetric = "tera_ts_posix_tell_count"; +const char* const kPosixSeekCountMetric = "tera_ts_posix_seek_count"; +const char* const kPosixInfoCountMetric = "tera_ts_posix_info_count"; +const char* const kPosixOtherCountMetric = "tera_ts_posix_other_count"; + +const char* const kRawkeyCompareCountMetric = "tera_ts_rawkey_compare_count"; +const char* const kSnappyCompressionRatioMetric = "tera_ts_snappy_compression_percentage"; +} // end namespace tabletnode +} // end namespace tera + +#endif // TERA_TABLETNODE_TABLETNODE_METRIC_NAME_H_ + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ + diff --git 
a/src/tabletnode/tabletnode_sysinfo.cc b/src/tabletnode/tabletnode_sysinfo.cc index b3c09520d..30b325df9 100644 --- a/src/tabletnode/tabletnode_sysinfo.cc +++ b/src/tabletnode/tabletnode_sysinfo.cc @@ -4,8 +4,7 @@ // // Author: Xu Peilin (xupeilin@baidu.com) -#include "tabletnode_sysinfo.h" - +#include #include #include #include @@ -16,17 +15,20 @@ #include #include +#include "tabletnode/tabletnode_sysinfo.h" #include "common/base/string_number.h" #include "proto/proto_helper.h" -#include "utils/timer.h" +#include "tabletnode/tabletnode_metric_name.h" +#include "common/timer.h" #include "utils/tprinter.h" #include "utils/utils_cmd.h" +#include "common/metric/collector_report_publisher.h" +#include "common/metric/ratio_subscriber.h" +#include "common/metric/prometheus_subscriber.h" -DEFINE_int32(tera_tabletnode_sysinfo_mem_collect_interval, 10, "interval of mem checking(s)"); -DEFINE_int32(tera_tabletnode_sysinfo_net_collect_interval, 5, "interval of net checking(s)"); -DEFINE_int32(tera_tabletnode_sysinfo_cpu_collect_interval, 5, "interval of cpu checking(s)"); DECLARE_bool(tera_tabletnode_dump_running_info); DECLARE_string(tera_tabletnode_running_info_dump_file); +DECLARE_int64(tera_tabletnode_sysinfo_check_interval); namespace leveldb { extern tera::Counter rawkey_compare_counter; @@ -49,9 +51,6 @@ extern tera::Counter posix_seek_counter; extern tera::Counter posix_info_counter; extern tera::Counter posix_other_counter; -extern tera::Counter snappy_before_size_counter; -extern tera::Counter snappy_after_size_counter; - extern tera::Counter dfs_read_counter; extern tera::Counter dfs_write_counter; extern tera::Counter dfs_read_delay_counter; @@ -87,17 +86,127 @@ extern tera::Counter ssd_write_counter; extern tera::Counter ssd_write_size_counter; } -tera::Counter rand_read_delay; -extern tera::Counter row_read_delay; -tera::Counter range_error_counter; -tera::Counter read_pending_counter; -tera::Counter write_pending_counter; -tera::Counter scan_pending_counter; 
-tera::Counter compact_pending_counter; namespace tera { namespace tabletnode { +// dfs metrics +tera::AutoCollectorRegister dfs_read_size_metric(kDfsReadBytesThroughPut, + std::unique_ptr(new CounterCollector(&leveldb::dfs_read_size_counter, true)), {SubscriberType::THROUGHPUT}); +tera::AutoCollectorRegister dfs_write_size_metric(kDfsWriteBytesThroughPut, + std::unique_ptr(new CounterCollector(&leveldb::dfs_write_size_counter, true)), {SubscriberType::THROUGHPUT}); +tera::AutoCollectorRegister dfs_read_delay_metric(kDfsReadDelayMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_read_delay_counter, true)), {}); +tera::AutoCollectorRegister dfs_write_delay_metric(kDfsWriteDelayMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_write_delay_counter, true)), {}); +tera::AutoCollectorRegister dfs_sync_delay_metric(kDfsSyncDelayMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_sync_delay_counter, true)), {}); +tera::AutoCollectorRegister dfs_read_metric(kDfsReadCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_read_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_write_metric(kDfsWriteCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_write_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_sync_metric(kDfsSyncCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_sync_counter, true)), {SubscriberType::QPS}); + +tera::AutoSubscriberRegister dfs_read_delay_avg_subscriber (std::unique_ptr(new RatioSubscriber( + MetricId(kDfsReadDelayPerRequestMetric), + std::unique_ptr(new PrometheusSubscriber(MetricId(kDfsReadDelayMetric), SubscriberType::SUM)), + std::unique_ptr(new PrometheusSubscriber(MetricId(kDfsReadCountMetric), SubscriberType::SUM))))); + +tera::AutoSubscriberRegister dfs_write_delay_avg_subscriber (std::unique_ptr(new RatioSubscriber( + MetricId(kDfsWriteDelayPerRequestMetric), + std::unique_ptr(new 
PrometheusSubscriber(MetricId(kDfsWriteDelayMetric), SubscriberType::SUM)), + std::unique_ptr(new PrometheusSubscriber(MetricId(kDfsWriteCountMetric), SubscriberType::SUM))))); + +tera::AutoSubscriberRegister dfs_sync_delay_avg_subscriber (std::unique_ptr(new RatioSubscriber( + MetricId(kDfsSyncDelayPerRequestMetric), + std::unique_ptr(new PrometheusSubscriber(MetricId(kDfsSyncDelayMetric), SubscriberType::SUM)), + std::unique_ptr(new PrometheusSubscriber(MetricId(kDfsSyncCountMetric), SubscriberType::SUM))))); + +tera::AutoCollectorRegister dfs_flush_metric(kDfsFlushCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_flush_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_list_metric(kDfsListCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_list_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_exists_metric(kDfsExistsCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_exists_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_open_metric(kDfsOpenCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_open_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_close_metric(kDfsCloseCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_close_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_delete_metric(kDfsDeleteCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_delete_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_tell_metric(kDfsTellCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_tell_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_info_metric(kDfsInfoCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_info_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_other_metric(kDfsOtherCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_other_counter, 
true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister dfs_read_hang_metric(kDfsReadHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_read_hang_counter, false))); +tera::AutoCollectorRegister dfs_write_hang_metric(kDfsWriteHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_write_hang_counter, false))); +tera::AutoCollectorRegister dfs_sync_hang_metric(kDfsSyncHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_sync_hang_counter, false))); +tera::AutoCollectorRegister dfs_flush_hang_metric(kDfsFlushHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_flush_hang_counter, false))); +tera::AutoCollectorRegister dfs_list_hang_metric(kDfsListHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_list_hang_counter, false))); +tera::AutoCollectorRegister dfs_exists_hang_metric(kDfsExistsHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_exists_hang_counter, false))); +tera::AutoCollectorRegister dfs_open_hang_metric(kDfsOpenHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_open_hang_counter, false))); +tera::AutoCollectorRegister dfs_close_hang_metric(kDfsCloseHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_close_hang_counter, false))); +tera::AutoCollectorRegister dfs_delete_hang_metric(kDfsDeleteHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_delete_hang_counter, false))); +tera::AutoCollectorRegister dfs_tell_hang_metric(kDfsTellHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_tell_hang_counter, false))); +tera::AutoCollectorRegister dfs_info_hang_metric(kDfsInfoHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_info_hang_counter, false))); +tera::AutoCollectorRegister dfs_other_hang_metric(kDfsOtherHangMetric, + std::unique_ptr(new CounterCollector(&leveldb::dfs_other_hang_counter, false))); +// ssd metrics +tera::AutoCollectorRegister ssd_read_through_put_metric(kSsdReadThroughPutMetric, + 
std::unique_ptr(new CounterCollector(&leveldb::ssd_read_size_counter, true)), {SubscriberType::THROUGHPUT}); +tera::AutoCollectorRegister ssd_write_through_put_metric(kSsdWriteThroughPutMetric, + std::unique_ptr(new CounterCollector(&leveldb::ssd_write_size_counter, true)), {SubscriberType::THROUGHPUT}); +tera::AutoCollectorRegister ssd_read_metric(kSsdReadCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::ssd_read_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister ssd_write_metric(kSsdWriteCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::ssd_write_counter, true)), {SubscriberType::QPS}); +// local metrics +tera::AutoCollectorRegister posix_read_size_metric(kPosixReadThroughPutMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_read_size_counter, true)), {SubscriberType::THROUGHPUT}); +tera::AutoCollectorRegister posix_write_size_metric(kPosixWriteThroughPutMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_write_size_counter, true)), {SubscriberType::THROUGHPUT}); +tera::AutoCollectorRegister posix_read_metric(kPosixReadCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_read_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_write_metric(kPosixWriteCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_write_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_sync_metric(kPosixSyncCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_sync_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_list_metric(kPosixListCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_list_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_exists_metric(kPosixExistsCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_exists_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_open_metric(kPosixOpenCountMetric, + 
std::unique_ptr(new CounterCollector(&leveldb::posix_open_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_close_metric(kPosixCloseCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_close_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_delete_metric(kPosixDeleteCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_delete_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_tell_metric(kPosixTellCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_tell_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_seek_metric(kPosixSeekCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_seek_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_info_metric(kPosixInfoCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_info_counter, true)), {SubscriberType::QPS}); +tera::AutoCollectorRegister posix_other_metric(kPosixOtherCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::posix_other_counter, true)), {SubscriberType::QPS}); + +tera::AutoCollectorRegister rawkey_compare_metric(kRawkeyCompareCountMetric, + std::unique_ptr(new CounterCollector(&leveldb::rawkey_compare_counter, true)), {SubscriberType::QPS}); + class TabletNodeSysInfoDumper { public: TabletNodeSysInfoDumper(const std::string& filename) : @@ -135,29 +244,16 @@ class TabletNodeSysInfoDumper { FILE* fp_; }; -TabletNodeSysInfo::TabletNodeSysInfo() - : mem_check_ts_(0), - net_check_ts_(0), - io_check_ts_(0), - net_tx_total_(0), - net_rx_total_(0), - cpu_check_ts_(0), - tablet_check_ts_(0) { +TabletNodeSysInfo::TabletNodeSysInfo() { + last_check_ts_ = get_micros(); } TabletNodeSysInfo::TabletNodeSysInfo(const TabletNodeInfo& info) - : info_(info), - mem_check_ts_(0), - net_check_ts_(0), - io_check_ts_(0), - net_tx_total_(0), - net_rx_total_(0), - cpu_check_ts_(0), - tablet_check_ts_(0) { + : info_(info) { + 
last_check_ts_ = get_micros(); } -TabletNodeSysInfo::~TabletNodeSysInfo() { -} +TabletNodeSysInfo::~TabletNodeSysInfo() {} void TabletNodeSysInfo::AddExtraInfo(const std::string& name, int64_t value) { MutexLock lock(&mutex_); @@ -176,32 +272,79 @@ void TabletNodeSysInfo::SetTimeStamp(int64_t ts) { info_.set_timestamp(ts); } +struct DBSize { + uint64_t size; + std::vector lg_size; +}; + void TabletNodeSysInfo::CollectTabletNodeInfo(TabletManager* tablet_manager, const string& server_addr) { + std::vector tablet_ios; + std::vector db_status_vec; + std::vector db_size_vec; + + int64_t ts = get_micros(); + bool need_check = false; + if (ts - last_check_ts_ > FLAGS_tera_tabletnode_sysinfo_check_interval) { + last_check_ts_ = ts; + need_check = true; + } + tablet_manager->GetAllTablets(&tablet_ios); + std::vector::iterator it = tablet_ios.begin(); + while (it != tablet_ios.end()) { + io::TabletIO* tablet_io = *it; + if (tablet_io->ShouldForceUnloadOnError()) { + LOG(WARNING) << *tablet_io << ", has internal error triggered unload"; + StatusCode status; + if (!tablet_io->Unload(&status)) { + LOG(ERROR) << *tablet_io << ", Unload tablet failed, status: " + << StatusCodeToString(status); + } + if (!tablet_manager->RemoveTablet(tablet_io->GetTableName(), + tablet_io->GetStartKey(), tablet_io->GetEndKey(), &status)) { + LOG(ERROR) << *tablet_io << ", remove from TabletManager failed, status: " + << StatusCodeToString(status); + } + tablet_io->DecRef(); + it = tablet_ios.erase(it); + continue; + } + + // check db status whether is corruption + TabletStatus tablet_status = static_cast(kTabletReady); + tablet_io->GetDBStatus(&tablet_status, need_check); + db_status_vec.push_back(tablet_status); + + DBSize db_size; + tablet_io->GetDataSize(&db_size.size, &db_size.lg_size); + db_size_vec.push_back(db_size); + + ++it; + } + MutexLock lock(&mutex_); - int64_t cur_ts = get_micros(); - int64_t interval = cur_ts - tablet_check_ts_; - tablet_check_ts_ = cur_ts; + std::shared_ptr 
latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport(); + int64_t interval = latest_report->interval_ms; + if (interval <= 0) { + // maybe happen at first report, the metric values must be 0 + // set to any non-zero value to avoid div 0 + VLOG(16) << "Metric Report interval is 0"; + interval = 1000; + } tablet_list_.Clear(); int64_t total_size = 0; - int64_t low_read_cell = 0; - int64_t scan_rows = 0; int64_t scan_kvs = 0; - int64_t scan_size = 0; - int64_t read_rows = 0; int64_t read_kvs = 0; - int64_t read_size = 0; - int64_t write_rows = 0; int64_t write_kvs = 0; - int64_t write_size = 0; int64_t busy_cnt = 0; + int64_t db_corruption_cnt = 0; + + for (uint32_t i = 0; i < tablet_ios.size(); i++) { + io::TabletIO* tablet_io = tablet_ios[i]; + TabletStatus tablet_status = db_status_vec[i]; + DBSize db_size = db_size_vec[i]; - std::vector tablet_ios; - tablet_manager->GetAllTablets(&tablet_ios); - std::vector::iterator it = tablet_ios.begin(); - for (; it != tablet_ios.end(); ++it) { - io::TabletIO* tablet_io = *it; TabletMeta* tablet_meta = tablet_list_.add_meta(); tablet_meta->set_status(TabletStatus(tablet_io->GetStatus())); tablet_meta->set_server_addr(server_addr); @@ -210,274 +353,185 @@ void TabletNodeSysInfo::CollectTabletNodeInfo(TabletManager* tablet_manager, tablet_meta->mutable_key_range()->set_key_start(tablet_io->GetStartKey()); tablet_meta->mutable_key_range()->set_key_end(tablet_io->GetEndKey()); - std::vector lgsize; - uint64_t size; - tablet_io->GetDataSize(&size, &lgsize); - tablet_meta->set_size(size); - for (size_t i = 0; i < lgsize.size(); ++i) { - tablet_meta->add_lg_size(lgsize[i]); + tablet_meta->set_size(db_size.size); + for (size_t i = 0; i < db_size.lg_size.size(); ++i) { + tablet_meta->add_lg_size(db_size.lg_size[i]); } tablet_meta->set_compact_status(tablet_io->GetCompactStatus()); total_size += tablet_meta->size(); TabletCounter* counter = tablet_list_.add_counter(); - tablet_io->GetAndClearCounter(counter); - 
low_read_cell += counter->low_read_cell(); - scan_rows += counter->scan_rows(); + const std::string& label_str = tablet_io->GetMetricLabel(); + counter->set_low_read_cell(latest_report->FindMetricValue(kLowReadCellMetricName, label_str)); + counter->set_scan_rows(latest_report->FindMetricValue(kScanRowsMetricName, label_str)); + counter->set_scan_kvs(latest_report->FindMetricValue(kScanKvsMetricName, label_str)); + counter->set_scan_size(latest_report->FindMetricValue(kScanThroughPutMetricName, label_str)); + counter->set_read_rows(latest_report->FindMetricValue(kReadRowsMetricName, label_str)); + counter->set_read_kvs(latest_report->FindMetricValue(kReadKvsMetricName, label_str)); + counter->set_read_size(latest_report->FindMetricValue(kReadThroughPutMetricName, label_str)); + counter->set_write_rows(latest_report->FindMetricValue(kWriteRowsMetricName, label_str)); + counter->set_write_kvs(latest_report->FindMetricValue(kWriteKvsMetricName, label_str)); + counter->set_write_size(latest_report->FindMetricValue(kWriteThroughPutMetricName, label_str)); + counter->set_is_on_busy(tablet_io->IsBusy()); + double write_workload = 0; + tablet_io->Workload(&write_workload); + counter->set_write_workload(write_workload); + counter->set_db_status(tablet_status); // set runtime counter + scan_kvs += counter->scan_kvs(); - scan_size += counter->scan_size(); - read_rows += counter->read_rows(); read_kvs += counter->read_kvs(); - read_size += counter->read_size(); - write_rows += counter->write_rows(); write_kvs += counter->write_kvs(); - write_size += counter->write_size(); if (counter->is_on_busy()) { busy_cnt++; } + if (counter->db_status() == kTabletCorruption) { + db_corruption_cnt++; + } tablet_io->DecRef(); } - info_.set_low_read_cell(low_read_cell * 1000000 / interval); - info_.set_scan_rows(scan_rows * 1000000 / interval); - info_.set_scan_kvs(scan_kvs * 1000000 / interval); - info_.set_scan_size(scan_size * 1000000 / interval); - info_.set_read_rows(read_rows * 1000000 
/ interval); - info_.set_read_kvs(read_kvs * 1000000 / interval); - info_.set_read_size(read_size * 1000000 / interval); - info_.set_write_rows(write_rows * 1000000 / interval); - info_.set_write_kvs(write_kvs * 1000000 / interval); - info_.set_write_size(write_size * 1000000 / interval); + + int64_t low_read_cell = + latest_report->FindMetricValue(kLowLevelReadMetric); + int64_t read_rows = + latest_report->FindMetricValue(kRowCountMetric, kApiLabelRead); + int64_t read_size = + latest_report->FindMetricValue(kRowThroughPutMetric, kApiLabelRead); + int64_t write_rows = + latest_report->FindMetricValue(kRowCountMetric, kApiLabelWrite); + int64_t write_size = + latest_report->FindMetricValue(kRowThroughPutMetric, kApiLabelWrite); + int64_t scan_rows = + latest_report->FindMetricValue(kRowCountMetric, kApiLabelScan); + int64_t scan_size = + latest_report->FindMetricValue(kRowThroughPutMetric, kApiLabelScan); + + info_.set_low_read_cell(low_read_cell * 1000 / interval); + info_.set_scan_rows(scan_rows * 1000 / interval); + info_.set_scan_kvs(scan_kvs * 1000 / interval); + info_.set_scan_size(scan_size * 1000 / interval); + info_.set_read_rows(read_rows * 1000 / interval); + info_.set_read_kvs(read_kvs * 1000 / interval); + info_.set_read_size(read_size * 1000 / interval); + info_.set_write_rows(write_rows * 1000 / interval); + info_.set_write_kvs(write_kvs * 1000 / interval); + info_.set_write_size(write_size * 1000 / interval); info_.set_tablet_onbusy(busy_cnt); + info_.set_tablet_corruption(db_corruption_cnt); // refresh tabletnodeinfo info_.set_load(total_size); info_.set_tablet_total(tablet_ios.size()); int64_t tmp; - tmp = leveldb::dfs_read_size_counter.Clear() * 1000000 / interval; + tmp = latest_report->FindMetricValue(kDfsReadBytesThroughPut) * 1000 / interval; info_.set_dfs_io_r(tmp); - tmp = leveldb::dfs_write_size_counter.Clear() * 1000000 / interval; + tmp = latest_report->FindMetricValue(kDfsWriteBytesThroughPut) * 1000 / interval; 
info_.set_dfs_io_w(tmp); - tmp = leveldb::posix_read_size_counter.Clear() * 1000000 / interval; + tmp = latest_report->FindMetricValue(kPosixReadThroughPutMetric) * 1000 / interval; info_.set_local_io_r(tmp); - tmp = leveldb::posix_write_size_counter.Clear() * 1000000 / interval; + tmp = latest_report->FindMetricValue(kPosixWriteThroughPutMetric) * 1000 / interval; info_.set_local_io_w(tmp); - info_.set_read_pending(read_pending_counter.Get()); - info_.set_write_pending(write_pending_counter.Get()); - info_.set_scan_pending(scan_pending_counter.Get()); + int64_t read_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelRead); + int64_t write_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelWrite); + int64_t scan_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelScan); + int64_t compact_pending = latest_report->FindMetricValue(kPendingCountMetric, kApiLabelCompact); + + info_.set_read_pending(read_pending); + info_.set_write_pending(write_pending); + info_.set_scan_pending(scan_pending); // collect extra infos info_.clear_extra_info(); ExtraTsInfo* einfo = info_.add_extra_info(); - if (read_rows == 0) { - tmp = 0; - } else { - tmp = rand_read_delay.Clear() / read_rows; - } - einfo->set_name("rand_read_delay"); - einfo->set_value(tmp / 1000); - einfo = info_.add_extra_info(); - if (read_rows == 0) { - tmp = 0; - } else { - tmp = row_read_delay.Clear() / read_rows; - } - einfo->set_name("row_read_delay"); - einfo->set_value(tmp / 1000); + int64_t range_error_sum = + latest_report->FindMetricValue(kRangeErrorMetric, kApiLabelRead) + + latest_report->FindMetricValue(kRangeErrorMetric, kApiLabelWrite) + + latest_report->FindMetricValue(kRangeErrorMetric, kApiLabelScan); - einfo = info_.add_extra_info(); - tmp = range_error_counter.Clear() * 1000000 / interval; + tmp = range_error_sum * 1000 / interval; einfo->set_name("range_error"); einfo->set_value(tmp); einfo = info_.add_extra_info(); - tmp = 
read_pending_counter.Get(); einfo->set_name("read_pending"); - einfo->set_value(tmp); + einfo->set_value(read_pending); einfo = info_.add_extra_info(); - tmp = write_pending_counter.Get(); einfo->set_name("write_pending"); - einfo->set_value(tmp); + einfo->set_value(write_pending); einfo = info_.add_extra_info(); - tmp = scan_pending_counter.Get(); einfo->set_name("scan_pending"); - einfo->set_value(tmp); + einfo->set_value(scan_pending); einfo = info_.add_extra_info(); - tmp = compact_pending_counter.Get(); einfo->set_name("compact_pending"); + einfo->set_value(compact_pending); + + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kRejectCountMetric, kApiLabelRead) * 1000 / interval; + einfo->set_name("read_reject"); einfo->set_value(tmp); -} -// return the number of ticks(jiffies) that this process -// has been scheduled in user and kernel mode. -static long long ProcessCpuTick() { - const int PATH_MAX_LEN = 64; - char path[PATH_MAX_LEN]; - sprintf(path, "/proc/%d/stat", getpid()); - FILE *fp = fopen(path, "r"); - if (fp == NULL) { - return 0; - } - long long utime = 0, stime = 0; - if (fscanf(fp, "%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %lld %lld", - &utime, &stime) < 2) { - LOG(ERROR) << "get cpu tick from /proc/" << getpid() << "/stat failed."; - } - fclose(fp); - return utime + stime; -} + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kRejectCountMetric, kApiLabelWrite) * 1000 / interval; + einfo->set_name("write_reject"); + einfo->set_value(tmp); -// return number of cpu(cores) -static int GetCpuCount() { -#if defined(_SC_NPROCESSORS_ONLN) - return sysconf(_SC_NPROCESSORS_ONLN); -#else - FILE *fp = fopen("/proc/stat", "r"); - if (fp == NULL) { - return 1; - } - const int LINE_MAX_LEN = 256; // enough in here - char *aline = (char*)malloc(LINE_MAX_LEN); - if (aline == NULL) { - LOG(ERROR) << "[HardWare System Info] malloc failed."; - return 1; - } - const int HEADER_MAX_LEN = 10; - char 
header[HEADER_MAX_LEN]; - int i=0; - size_t len=0; - getline(&aline, &len, fp); // drop the first line - while (getline(&aline, &len, fp)) { - i++; - sscanf(aline, "%s", header); - if (!strncmp(header, "intr", HEADER_MAX_LEN)) { - break; - } - } - fclose(fp); - free(aline); - return i-1 > 0 ? i-1 : 1; -#endif -} + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kRejectCountMetric, kApiLabelScan) * 1000 / interval; + einfo->set_name("scan_reject"); + einfo->set_value(tmp); -// irix_on == 1 --> irix mode on -// irix_on == 0 --> irix mode off -// -// return this process's the percentage of CPU usage ( %CPU ). -// -// NOTE: the first time call this function would get 0 as result. -static float GetCpuUsage(int is_irix_on) { - static int cpu_count = 1; // assume cpu count is not variable when process is running - static unsigned long hertz = 0; - if (hertz == 0) { - hertz = sysconf(_SC_CLK_TCK); - cpu_count = GetCpuCount(); - } + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kRequestCountMetric, kApiLabelRead) * 1000 / interval; + einfo->set_name("read_request"); + einfo->set_value(tmp); - static struct timeval oldtimev; - struct timeval timev; - gettimeofday(&timev, NULL); - float et = (timev.tv_sec - oldtimev.tv_sec) - + (float)(timev.tv_usec - oldtimev.tv_usec) / 1000000.0; - oldtimev.tv_sec = timev.tv_sec; - oldtimev.tv_usec = timev.tv_usec; - - float frame_etscale; - if (is_irix_on) { - frame_etscale = 100.0f / ((float)hertz * et); - } else { - frame_etscale = 100.0f / ((float)hertz * et * cpu_count); - } + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kRequestCountMetric, kApiLabelWrite) * 1000 / interval; + einfo->set_name("write_request"); + einfo->set_value(tmp); - static unsigned long oldtick; - unsigned long newtick; - newtick = ProcessCpuTick(); - float u = (newtick - (float)oldtick) * frame_etscale; - oldtick = newtick; + einfo = info_.add_extra_info(); + tmp = 
latest_report->FindMetricValue(kRequestCountMetric, kApiLabelScan) * 1000 / interval; + einfo->set_name("scan_request"); + einfo->set_value(tmp); - const float MAX_CPU_USAGE = 99.9f; - if (u > MAX_CPU_USAGE ) { - u = MAX_CPU_USAGE; - } + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kErrorCountMetric, kApiLabelRead) * 1000 / interval; + einfo->set_name("read_error"); + einfo->set_value(tmp); - // rounding cpu usage to 1 decimal places - const int USAGE_STR_MAX_LEN = 5; - char usage_str[USAGE_STR_MAX_LEN]; - sprintf(usage_str, "%.1f\n", u); - sscanf(usage_str, "%f", &u); - return u; + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kErrorCountMetric, kApiLabelWrite) * 1000 / interval; + einfo->set_name("write_error"); + einfo->set_value(tmp); + + einfo = info_.add_extra_info(); + tmp = latest_report->FindMetricValue(kErrorCountMetric, kApiLabelScan) * 1000 / interval; + einfo->set_name("scan_error"); + einfo->set_value(tmp); } void TabletNodeSysInfo::CollectHardwareInfo() { MutexLock lock(&mutex_); - int pid = getpid(); - FILE* f; - std::ostringstream ss; - ss << "/proc/" << pid << "/"; - int64_t cur_ts = get_micros(); - - int64_t interval = cur_ts - mem_check_ts_; - if (interval / 1000000 > FLAGS_tera_tabletnode_sysinfo_mem_collect_interval) { - mem_check_ts_ = cur_ts; - int64_t mem; - f = fopen((ss.str() + "statm").data(), "r"); - if (f == NULL) { - return; - } - fscanf(f, "%*d %ld", &mem); - mem = mem * 4 * 1024; - fclose(f); - info_.set_mem_used(mem); + std::shared_ptr latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport(); - VLOG(15) << "[HardWare System Info] Memory: " << mem * 4; - return; - } + int64_t cpu_usage = latest_report->FindMetricValue(kInstCpuMetricName); + info_.set_cpu_usage(static_cast(cpu_usage)); - interval = cur_ts - net_check_ts_; - if (interval / 1000000 > FLAGS_tera_tabletnode_sysinfo_net_collect_interval) { - net_check_ts_ = cur_ts; - int64_t net_rx = 0, net_tx = 0; - f 
= fopen((ss.str() + "net/dev").data(), "r"); - if (f == NULL) { - return; - } - int ret = fseek(f, 327, SEEK_SET); - CHECK_EQ(ret, 0); - for (int i = 0; i < 10; i++) { - while (':' != fgetc(f)); - ret = fscanf(f, "%ld%*d%*d%*d%*d%*d%*d%*d%ld", &net_rx, &net_tx); - if (ret >= 2 && net_rx > 0 && net_tx > 0) { - break; - } - } - fclose(f); - - int64_t tmp; - tmp = (net_rx - net_rx_total_) * 1000000 / interval; - info_.set_net_rx(tmp); - tmp = (net_tx - net_tx_total_) * 1000000 / interval; - info_.set_net_tx(tmp); - net_rx_total_ = net_rx; - net_tx_total_ = net_tx; - - VLOG(15) << "[HardWare System Info] Network RX/TX: " << net_rx << " / " << net_tx; - return; - } + int64_t mem_usage = latest_report->FindMetricValue(kInstMemMetricName); + info_.set_mem_used(mem_usage); - interval = cur_ts - cpu_check_ts_; - if (interval / 1000000 > FLAGS_tera_tabletnode_sysinfo_cpu_collect_interval) { - cpu_check_ts_ = cur_ts; - float cpu_usage = GetCpuUsage(0); - info_.set_cpu_usage(cpu_usage); - VLOG(15) << "[HardWare System Info] %CPU: "<< cpu_usage; - return; - } + int64_t net_rx_usage = latest_report->FindMetricValue(kInstNetRXMetricName); + info_.set_net_rx(net_rx_usage); + + int64_t net_tx_usage = latest_report->FindMetricValue(kInstNetTXMetricName); + info_.set_net_tx(net_tx_usage); } void TabletNodeSysInfo::GetTabletNodeInfo(TabletNodeInfo* info) { @@ -502,11 +556,17 @@ void TabletNodeSysInfo::SetStatus(StatusCode status) { void TabletNodeSysInfo::DumpLog() { MutexLock lock(&mutex_); - + std::shared_ptr latest_report = CollectorReportPublisher::GetInstance().GetCollectorReport(); + int64_t interval = latest_report->interval_ms; + TabletNodeSysInfoDumper dumper(FLAGS_tera_tabletnode_running_info_dump_file); - double snappy_ratio = (double)leveldb::snappy_before_size_counter.Clear() - / leveldb::snappy_after_size_counter.Clear(); + double snappy_ratio = latest_report->FindMetricValue(kSnappyCompressionRatioMetric); + if (snappy_ratio > 0) { + snappy_ratio /= 100.0; + } + + 
int64_t rawkey_compare_count = latest_report->FindMetricValue(kRawkeyCompareCountMetric); if (FLAGS_tera_tabletnode_dump_running_info) { dumper.DumpData("low_level", info_.low_read_cell()); @@ -517,7 +577,7 @@ void TabletNodeSysInfo::DumpLog() { dumper.DumpData("scan", info_.scan_rows()); dumper.DumpData("sspeed", info_.scan_size()); dumper.DumpData("snappy", snappy_ratio); - dumper.DumpData("rowcomp", leveldb::rawkey_compare_counter.Get()); + dumper.DumpData("rowcomp", rawkey_compare_count); } LOG(INFO) << "[SysInfo]" @@ -529,7 +589,7 @@ void TabletNodeSysInfo::DumpLog() { << " scan " << info_.scan_rows() << " sspeed " << utils::ConvertByteToString(info_.scan_size()) << " snappy " << snappy_ratio - << " rawcomp " << leveldb::rawkey_compare_counter.Clear(); + << " rawcomp " << rawkey_compare_count; // hardware info if (FLAGS_tera_tabletnode_dump_running_info) { @@ -549,15 +609,19 @@ void TabletNodeSysInfo::DumpLog() { << " cpu_usage " << info_.cpu_usage() << "%"; // net and io info + int64_t ssd_read_count = latest_report->FindMetricValue(kSsdReadCountMetric); + int64_t ssd_read_size = latest_report->FindMetricValue(kSsdReadThroughPutMetric); + int64_t ssd_write_count = latest_report->FindMetricValue(kSsdWriteCountMetric); + int64_t ssd_write_size = latest_report->FindMetricValue(kSsdWriteThroughPutMetric); if (FLAGS_tera_tabletnode_dump_running_info) { dumper.DumpData("dfs_r", info_.dfs_io_r()); dumper.DumpData("dfs_w", info_.dfs_io_w()); dumper.DumpData("local_r", info_.local_io_r()); dumper.DumpData("local_w", info_.local_io_w()); - dumper.DumpData("ssd_r_counter", leveldb::ssd_read_counter.Get()); - dumper.DumpData("ssd_r_size", leveldb::ssd_read_size_counter.Get()); - dumper.DumpData("ssd_w_counter", leveldb::ssd_write_counter.Get()); - dumper.DumpData("ssd_w_size", leveldb::ssd_write_size_counter.Get()); + dumper.DumpData("ssd_r_counter", ssd_read_count); + dumper.DumpData("ssd_r_size", ssd_read_size); + dumper.DumpData("ssd_w_counter", ssd_write_count); + 
dumper.DumpData("ssd_w_size", ssd_write_size); } LOG(INFO) << "[IO]" @@ -569,10 +633,79 @@ void TabletNodeSysInfo::DumpLog() { << utils::ConvertByteToString(info_.local_io_r()) << " local_w " << info_.local_io_w() << " " << utils::ConvertByteToString(info_.local_io_w()) - << " ssd_r " << leveldb::ssd_read_counter.Clear() << " " - << utils::ConvertByteToString(leveldb::ssd_read_size_counter.Clear()) - << " ssd_w " << leveldb::ssd_write_counter.Clear() << " " - << utils::ConvertByteToString(leveldb::ssd_write_size_counter.Clear()); + << " ssd_r " << ssd_read_count << " " + << utils::ConvertByteToString(ssd_read_size) + << " ssd_w " << ssd_write_count << " " + << utils::ConvertByteToString(ssd_write_size); + + // cache info + double block_cache_hitrate = static_cast(latest_report->FindMetricValue(kBlockCacheHitRateMetric)) / 100.0; + if (block_cache_hitrate < 0.0) { + block_cache_hitrate = NAN; + } + int64_t block_cache_entries = latest_report->FindMetricValue(kBlockCacheEntriesMetric); + int64_t block_cache_charge = latest_report->FindMetricValue(kBlockCacheChargeMetric); + double table_cache_hitrate = static_cast(latest_report->FindMetricValue(kTableCacheHitRateMetric)) / 100.0; + if (table_cache_hitrate < 0.0) { + table_cache_hitrate = NAN; + } + int64_t table_cache_entries = latest_report->FindMetricValue(kTableCacheEntriesMetric); + int64_t table_cache_charge = latest_report->FindMetricValue(kTableCacheChargeMetric); + if (FLAGS_tera_tabletnode_dump_running_info) { + dumper.DumpData("block_cache_hitrate", block_cache_hitrate); + dumper.DumpData("block_cache_entry", block_cache_entries); + dumper.DumpData("block_cache_bytes", block_cache_charge); + dumper.DumpData("table_cache_hitrate", table_cache_hitrate); + dumper.DumpData("table_cache_entry", table_cache_entries); + dumper.DumpData("table_cache_bytes", table_cache_charge); + } + LOG(INFO) << "[Cache HitRate/Cnt/Size] table_cache " + << table_cache_hitrate << " " + << table_cache_entries << " " + << 
table_cache_charge + << ", block_cache " + << block_cache_hitrate << " " + << block_cache_entries << " " + << block_cache_charge; + + int64_t finished_read_request = + latest_report->FindMetricValue(kFinishedRequestCountMetric, kApiLabelRead); + int64_t finished_write_request = + latest_report->FindMetricValue(kFinishedRequestCountMetric, kApiLabelWrite); + int64_t finished_scan_request = + latest_report->FindMetricValue(kFinishedRequestCountMetric, kApiLabelScan); + LOG(INFO) << "[Finished Requests] " + << "read: " << finished_read_request * 1000 / interval + << ", write: " << finished_write_request * 1000 / interval + << ", scan: " << finished_scan_request * 1000 / interval; + + int64_t read_request_delay = + (finished_read_request == 0 ? 0 : latest_report->FindMetricValue(kRequestDelayMetric, kApiLabelRead) / finished_read_request); + int64_t write_request_delay = + (finished_write_request == 0 ? 0 : latest_report->FindMetricValue(kRequestDelayMetric, kApiLabelWrite) / finished_write_request); + int64_t scan_request_delay = + (finished_scan_request == 0 ? 0 : latest_report->FindMetricValue(kRequestDelayMetric, kApiLabelScan) / finished_scan_request); + LOG(INFO) << "[Requests Delay In Ms] " + << "read: " << read_request_delay / 1000.0 + << ", write: " << write_request_delay / 1000.0 + << ", scan: " << scan_request_delay / 1000.0; + + int64_t read_rows = + latest_report->FindMetricValue(kRowCountMetric, kApiLabelRead); + int64_t write_rows = + latest_report->FindMetricValue(kRowCountMetric, kApiLabelWrite); + int64_t scan_rows = + latest_report->FindMetricValue(kRowCountMetric, kApiLabelScan); + int64_t row_read_delay = + (read_rows == 0 ? 0 : latest_report->FindMetricValue(kRowDelayMetric, kApiLabelRead) / read_rows); + int64_t row_write_delay = + (write_rows == 0 ? 0 : latest_report->FindMetricValue(kRowDelayMetric, kApiLabelWrite) / write_rows); + int64_t row_scan_delay = + (scan_rows == 0 ? 
0 : latest_report->FindMetricValue(kRowDelayMetric, kApiLabelScan) / scan_rows); + LOG(INFO) << "[Row Delay In Ms] " + << "row_read_delay: " << row_read_delay / 1000.0 + << ", row_write_delay: " << row_write_delay / 1000.0 + << ", row_scan_delay: " << row_scan_delay / 1000.0; // extra info std::ostringstream ss; @@ -587,102 +720,138 @@ void TabletNodeSysInfo::DumpLog() { LOG(INFO) << ss.str(); // DFS info - double rdelay = leveldb::dfs_read_counter.Get() ? - leveldb::dfs_read_delay_counter.Clear()/1000/leveldb::dfs_read_counter.Get() - : 0; - double wdelay = leveldb::dfs_write_counter.Get() ? - leveldb::dfs_write_delay_counter.Clear()/1000/leveldb::dfs_write_counter.Get() - : 0; - double sdelay = leveldb::dfs_sync_counter.Get() ? - leveldb::dfs_sync_delay_counter.Clear()/1000/leveldb::dfs_sync_counter.Get() - : 0; + int64_t dfs_read_delay = latest_report->FindMetricValue(kDfsReadDelayMetric); + int64_t dfs_write_delay = latest_report->FindMetricValue(kDfsWriteDelayMetric); + int64_t dfs_sync_delay = latest_report->FindMetricValue(kDfsSyncDelayMetric); + int64_t dfs_read_count = latest_report->FindMetricValue(kDfsReadCountMetric); + int64_t dfs_write_count = latest_report->FindMetricValue(kDfsWriteCountMetric); + int64_t dfs_sync_count = latest_report->FindMetricValue(kDfsSyncCountMetric); + int64_t dfs_flush_count = latest_report->FindMetricValue(kDfsFlushCountMetric); + int64_t dfs_list_count = latest_report->FindMetricValue(kDfsListCountMetric); + int64_t dfs_other_count = latest_report->FindMetricValue(kDfsOtherCountMetric); + int64_t dfs_exists_count = latest_report->FindMetricValue(kDfsExistsCountMetric); + int64_t dfs_open_count = latest_report->FindMetricValue(kDfsOpenCountMetric); + int64_t dfs_close_count = latest_report->FindMetricValue(kDfsCloseCountMetric); + int64_t dfs_delete_count = latest_report->FindMetricValue(kDfsDeleteCountMetric); + int64_t dfs_tell_count = latest_report->FindMetricValue(kDfsTellCountMetric); + int64_t dfs_info_count = 
latest_report->FindMetricValue(kDfsInfoCountMetric); + int64_t dfs_read_hang = latest_report->FindMetricValue(kDfsReadHangMetric); + int64_t dfs_write_hang = latest_report->FindMetricValue(kDfsWriteHangMetric); + int64_t dfs_sync_hang = latest_report->FindMetricValue(kDfsSyncHangMetric); + int64_t dfs_flush_hang = latest_report->FindMetricValue(kDfsFlushHangMetric); + int64_t dfs_list_hang = latest_report->FindMetricValue(kDfsListHangMetric); + int64_t dfs_other_hang = latest_report->FindMetricValue(kDfsOtherHangMetric); + int64_t dfs_exists_hang = latest_report->FindMetricValue(kDfsExistsHangMetric); + int64_t dfs_open_hang = latest_report->FindMetricValue(kDfsOpenHangMetric); + int64_t dfs_close_hang = latest_report->FindMetricValue(kDfsCloseHangMetric); + int64_t dfs_delete_hang = latest_report->FindMetricValue(kDfsDeleteHangMetric); + int64_t dfs_tell_hang = latest_report->FindMetricValue(kDfsTellHangMetric); + int64_t dfs_info_hang = latest_report->FindMetricValue(kDfsInfoHangMetric); + double rdelay = dfs_read_count ? static_cast(dfs_read_delay) / 1000.0 / dfs_read_count : 0; + double wdelay = dfs_write_count ? static_cast(dfs_write_delay) / 1000.0 / dfs_write_count : 0; + double sdelay = dfs_sync_count ? 
static_cast(dfs_sync_delay) / 1000.0 / dfs_sync_count : 0; if (FLAGS_tera_tabletnode_dump_running_info) { - dumper.DumpData("dfs_read", leveldb::dfs_read_counter.Get()); - dumper.DumpData("dfs_read_hang", leveldb::dfs_read_hang_counter.Get()); + dumper.DumpData("dfs_read", dfs_read_count); + dumper.DumpData("dfs_read_hang", dfs_read_hang); dumper.DumpData("dfs_rdealy", rdelay); - dumper.DumpData("dfs_write", leveldb::dfs_write_counter.Get()); - dumper.DumpData("dfs_write_hang", leveldb::dfs_write_hang_counter.Get()); + dumper.DumpData("dfs_write", dfs_write_count); + dumper.DumpData("dfs_write_hang", dfs_write_hang); dumper.DumpData("dfs_wdelay", wdelay); - dumper.DumpData("dfs_sync", leveldb::dfs_sync_counter.Get()); - dumper.DumpData("dfs_sync_hang", leveldb::dfs_sync_hang_counter.Get()); + dumper.DumpData("dfs_sync", dfs_sync_count); + dumper.DumpData("dfs_sync_hang", dfs_sync_hang); dumper.DumpData("dfs_sdelay", sdelay); - dumper.DumpData("dfs_flush", leveldb::dfs_flush_counter.Get()); - dumper.DumpData("dfs_flush_hang", leveldb::dfs_flush_hang_counter.Get()); - dumper.DumpData("dfs_list", leveldb::dfs_list_counter.Get()); - dumper.DumpData("dfs_list_hang", leveldb::dfs_list_hang_counter.Get()); - dumper.DumpData("dfs_info", leveldb::dfs_info_counter.Get()); - dumper.DumpData("dfs_info_hang", leveldb::dfs_info_hang_counter.Get()); - dumper.DumpData("dfs_exists", leveldb::dfs_exists_counter.Get()); - dumper.DumpData("dfs_exists_hang", leveldb::dfs_exists_hang_counter.Get()); - dumper.DumpData("dfs_open", leveldb::dfs_open_counter.Get()); - dumper.DumpData("dfs_open_hang", leveldb::dfs_open_hang_counter.Get()); - dumper.DumpData("dfs_close", leveldb::dfs_close_counter.Get()); - dumper.DumpData("dfs_close_hang", leveldb::dfs_close_hang_counter.Get()); - dumper.DumpData("dfs_delete", leveldb::dfs_delete_counter.Get()); - dumper.DumpData("dfs_delete_hang", leveldb::dfs_delete_hang_counter.Get()); - dumper.DumpData("dfs_tell", leveldb::dfs_tell_counter.Get()); - 
dumper.DumpData("dfs_tell_hang", leveldb::dfs_tell_hang_counter.Get()); - dumper.DumpData("dfs_other", leveldb::dfs_other_counter.Get()); - dumper.DumpData("dfs_other_hang", leveldb::dfs_other_hang_counter.Get()); + dumper.DumpData("dfs_flush", dfs_flush_count); + dumper.DumpData("dfs_flush_hang", dfs_flush_hang); + dumper.DumpData("dfs_list", dfs_list_count); + dumper.DumpData("dfs_list_hang", dfs_list_hang); + dumper.DumpData("dfs_info", dfs_info_count); + dumper.DumpData("dfs_info_hang", dfs_info_hang); + dumper.DumpData("dfs_exists", dfs_exists_count); + dumper.DumpData("dfs_exists_hang", dfs_exists_hang); + dumper.DumpData("dfs_open", dfs_open_count); + dumper.DumpData("dfs_open_hang", dfs_open_hang); + dumper.DumpData("dfs_close", dfs_close_count); + dumper.DumpData("dfs_close_hang", dfs_close_hang); + dumper.DumpData("dfs_delete", dfs_delete_count); + dumper.DumpData("dfs_delete_hang", dfs_delete_hang); + dumper.DumpData("dfs_tell", dfs_tell_count); + dumper.DumpData("dfs_tell_hang", dfs_tell_hang); + dumper.DumpData("dfs_other", dfs_other_count); + dumper.DumpData("dfs_other_hang", dfs_other_hang); } - LOG(INFO) << "[Dfs] read " << leveldb::dfs_read_counter.Clear() << " " - << leveldb::dfs_read_hang_counter.Get() << " " + LOG(INFO) << "[Dfs] read " << dfs_read_count << " " + << dfs_read_hang << " " << "rdelay " << rdelay << " " - << "write " << leveldb::dfs_write_counter.Clear() << " " - << leveldb::dfs_write_hang_counter.Get() << " " + << "rdelay_total " << dfs_read_delay << " " + << "write " << dfs_write_count << " " + << dfs_write_hang << " " << "wdelay " << wdelay << " " - << "sync " << leveldb::dfs_sync_counter.Clear() << " " - << leveldb::dfs_sync_hang_counter.Get() << " " + << "wdelay_total " << dfs_write_delay << " " + << "sync " << dfs_sync_count << " " + << dfs_sync_hang << " " << "sdelay " << sdelay << " " - << "flush " << leveldb::dfs_flush_counter.Clear() << " " - << leveldb::dfs_flush_hang_counter.Get() << " " - << "list " << 
leveldb::dfs_list_counter.Clear() << " " - << leveldb::dfs_list_hang_counter.Get() << " " - << "info " << leveldb::dfs_info_counter.Clear() << " " - << leveldb::dfs_info_hang_counter.Get() << " " - << "exists " << leveldb::dfs_exists_counter.Clear() << " " - << leveldb::dfs_exists_hang_counter.Get() << " " - << "open " << leveldb::dfs_open_counter.Clear() << " " - << leveldb::dfs_open_hang_counter.Get() << " " - << "close " << leveldb::dfs_close_counter.Clear() << " " - << leveldb::dfs_close_hang_counter.Get() << " " - << "delete " << leveldb::dfs_delete_counter.Clear() << " " - << leveldb::dfs_delete_hang_counter.Get() << " " - << "tell " << leveldb::dfs_tell_counter.Clear() << " " - << leveldb::dfs_tell_hang_counter.Get() << " " - << "other " << leveldb::dfs_other_counter.Clear() << " " - << leveldb::dfs_other_hang_counter.Get(); + << "sdelay_total " << dfs_sync_delay << " " + << "flush " << dfs_flush_count << " " + << dfs_flush_hang << " " + << "list " << dfs_list_count << " " + << dfs_list_hang << " " + << "info " << dfs_info_count << " " + << dfs_info_hang << " " + << "exists " << dfs_exists_count << " " + << dfs_exists_hang << " " + << "open " << dfs_open_count << " " + << dfs_open_hang << " " + << "close " << dfs_close_count << " " + << dfs_close_hang << " " + << "delete " << dfs_delete_count << " " + << dfs_delete_hang << " " + << "tell " << dfs_tell_count << " " + << dfs_tell_hang << " " + << "other " << dfs_other_count << " " + << dfs_other_hang; // local info + int64_t posix_read_count = latest_report->FindMetricValue(kPosixReadCountMetric); + int64_t posix_write_count = latest_report->FindMetricValue(kPosixWriteCountMetric); + int64_t posix_sync_count = latest_report->FindMetricValue(kPosixSyncCountMetric); + int64_t posix_list_count = latest_report->FindMetricValue(kPosixListCountMetric); + int64_t posix_info_count = latest_report->FindMetricValue(kPosixInfoCountMetric); + int64_t posix_exists_count = 
latest_report->FindMetricValue(kPosixExistsCountMetric); + int64_t posix_open_count = latest_report->FindMetricValue(kPosixOpenCountMetric); + int64_t posix_close_count = latest_report->FindMetricValue(kPosixCloseCountMetric); + int64_t posix_delete_count = latest_report->FindMetricValue(kPosixDeleteCountMetric); + int64_t posix_tell_count = latest_report->FindMetricValue(kPosixTellCountMetric); + int64_t posix_seek_count = latest_report->FindMetricValue(kPosixSeekCountMetric); + int64_t posix_other_count = latest_report->FindMetricValue(kPosixOtherCountMetric); if (FLAGS_tera_tabletnode_dump_running_info) { - dumper.DumpData("local_read", leveldb::posix_read_counter.Get()); - dumper.DumpData("local_write", leveldb::posix_write_counter.Get()); - dumper.DumpData("local_sync", leveldb::posix_sync_counter.Get()); - dumper.DumpData("local_list", leveldb::posix_list_counter.Get()); - dumper.DumpData("local_info", leveldb::posix_info_counter.Get()); - dumper.DumpData("local_exists", leveldb::posix_exists_counter.Get()); - dumper.DumpData("local_open", leveldb::posix_open_counter.Get()); - dumper.DumpData("local_close", leveldb::posix_close_counter.Get()); - dumper.DumpData("local_delete", leveldb::posix_delete_counter.Get()); - dumper.DumpData("local_tell", leveldb::posix_tell_counter.Get()); - dumper.DumpData("local_seek", leveldb::posix_seek_counter.Get()); - dumper.DumpData("local_other", leveldb::posix_other_counter.Get()); + dumper.DumpData("local_read", posix_read_count); + dumper.DumpData("local_write", posix_write_count); + dumper.DumpData("local_sync", posix_sync_count); + dumper.DumpData("local_list", posix_list_count); + dumper.DumpData("local_info", posix_info_count); + dumper.DumpData("local_exists", posix_exists_count); + dumper.DumpData("local_open", posix_open_count); + dumper.DumpData("local_close", posix_close_count); + dumper.DumpData("local_delete", posix_delete_count); + dumper.DumpData("local_tell", posix_tell_count); + dumper.DumpData("local_seek", 
posix_seek_count); + dumper.DumpData("local_other", posix_other_count); } - LOG(INFO) << "[Local] read " << leveldb::posix_read_counter.Clear() << " " - << "write " << leveldb::posix_write_counter.Clear() << " " - << "sync " << leveldb::posix_sync_counter.Clear() << " " - << "list " << leveldb::posix_list_counter.Clear() << " " - << "info " << leveldb::posix_info_counter.Clear() << " " - << "exists " << leveldb::posix_exists_counter.Clear() << " " - << "open " << leveldb::posix_open_counter.Clear() << " " - << "close " << leveldb::posix_close_counter.Clear() << " " - << "delete " << leveldb::posix_delete_counter.Clear() << " " - << "tell " << leveldb::posix_tell_counter.Clear() << " " - << "seek " << leveldb::posix_seek_counter.Clear() << " " - << "other " << leveldb::posix_other_counter.Clear(); + LOG(INFO) << "[Local] read " << posix_read_count << " " + << "write " << posix_write_count << " " + << "sync " << posix_sync_count << " " + << "list " << posix_list_count << " " + << "info " << posix_info_count << " " + << "exists " << posix_exists_count << " " + << "open " << posix_open_count << " " + << "close " << posix_close_count << " " + << "delete " << posix_delete_count << " " + << "tell " << posix_tell_count << " " + << "seek " << posix_seek_count << " " + << "other " << posix_other_count; } } // namespace tabletnode diff --git a/src/tabletnode/tabletnode_sysinfo.h b/src/tabletnode/tabletnode_sysinfo.h index 453f2df95..c20a2b519 100644 --- a/src/tabletnode/tabletnode_sysinfo.h +++ b/src/tabletnode/tabletnode_sysinfo.h @@ -50,15 +50,9 @@ class TabletNodeSysInfo { private: TabletNodeInfo info_; TabletMetaList tablet_list_; - int64_t mem_check_ts_; - int64_t net_check_ts_; - int64_t io_check_ts_; - int64_t net_tx_total_; - int64_t net_rx_total_; - int64_t cpu_check_ts_; - - int64_t tablet_check_ts_; + mutable Mutex mutex_; + int64_t last_check_ts_; }; } // namespace tabletnode } // namespace tera diff --git a/src/tabletnode/tabletnode_zk_adapter.cc 
b/src/tabletnode/tabletnode_zk_adapter.cc old mode 100644 new mode 100755 index 6c9ab06e0..d3e3d7322 --- a/src/tabletnode/tabletnode_zk_adapter.cc +++ b/src/tabletnode/tabletnode_zk_adapter.cc @@ -422,6 +422,9 @@ void InsTabletNodeZkAdapter::OnKickMarkCreated() { } void InsTabletNodeZkAdapter::OnLockChange(std::string session_id, bool deleted) { + LOG(INFO) << "[OnLockChange] session_id = " << session_id + << " deleted = " << deleted + << " now_session_id = " << ins_sdk_->GetSessionID(); if (deleted || session_id != ins_sdk_->GetSessionID()) { LOG(ERROR) << "I lost my lock , so quit"; _Exit(EXIT_FAILURE); diff --git a/src/tabletnode/test/tabletnode_impl_test.cc b/src/tabletnode/test/tabletnode_impl_test.cc index 808250b02..efc1d61b7 100644 --- a/src/tabletnode/test/tabletnode_impl_test.cc +++ b/src/tabletnode/test/tabletnode_impl_test.cc @@ -16,7 +16,7 @@ #include "proto/proto_helper.h" #include "io/mock_tablet_io.h" -DECLARE_bool(tera_zk_enabled); +DECLARE_string(tera_coord_type); DECLARE_int32(tera_tabletnode_retry_period); DECLARE_string(tera_leveldb_env_type); @@ -40,7 +40,7 @@ class TabletNodeImplTest : public ::testing::Test { m_ret_io_split(false), m_start_key("start_key"), m_end_key("end_key"), m_schema(DefaultTableSchema()) { - FLAGS_tera_zk_enabled = false; + FLAGS_tera_coord_type = "fake_zk"; m_tablet_meta.set_table_name("name"); m_tablet_meta.set_path("path"); diff --git a/src/tabletnode/test/tabletnode_sysinfo_test.cc b/src/tabletnode/test/tabletnode_sysinfo_test.cc index 4f4c06724..e15c83a7c 100644 --- a/src/tabletnode/test/tabletnode_sysinfo_test.cc +++ b/src/tabletnode/test/tabletnode_sysinfo_test.cc @@ -5,7 +5,7 @@ #define private public #include "tabletnode_sysinfo.h" -#include "utils/timer.h" +#include "common/timer.h" #include "gtest/gtest.h" namespace tera { diff --git a/src/tera_c.cc b/src/tera_c.cc index fd3fb2994..cd10eb1ba 100644 --- a/src/tera_c.cc +++ b/src/tera_c.cc @@ -39,7 +39,7 @@ static bool SaveError(char** errptr, const ErrorCode& 
s) { } if (errptr == NULL) { fprintf(stderr, "%s tera error: %s.\n", - common::timer::get_curtime_str().c_str(), s.GetReason().c_str()); + tera::get_curtime_str().c_str(), s.GetReason().c_str()); return true; } @@ -164,7 +164,7 @@ bool tera_table_put_kv(tera_table_t* table, const char* key, uint64_t keylen, delete mutation; if (SaveError(errptr, err)) { fprintf(stderr, "%s tera error: %s.\n", - common::timer::get_curtime_str().c_str(), err.GetReason().c_str()); + tera::get_curtime_str().c_str(), err.GetReason().c_str()); return false; } return true; @@ -197,7 +197,7 @@ bool tera_table_delete(tera_table_t* table, const char* row_key, uint64_t keylen delete mutation; if (SaveError(NULL, err)) { fprintf(stderr, "%s tera delete error: %s.\n", - common::timer::get_curtime_str().c_str(), err.GetReason().c_str()); + tera::get_curtime_str().c_str(), err.GetReason().c_str()); return false; } return true; diff --git a/src/tera_flags.cc b/src/tera_flags.cc old mode 100644 new mode 100755 index 70dba8404..b1364506d --- a/src/tera_flags.cc +++ b/src/tera_flags.cc @@ -19,8 +19,10 @@ DEFINE_int32(tera_heartbeat_retry_times, 5, "the max retry times when fail to se DEFINE_string(tera_working_dir, "./", "the base dir for system data"); -DEFINE_bool(tera_zk_enabled, true, "enable zk adapter to collaborate with other master instances"); -DEFINE_bool(tera_mock_zk_enabled, false, "enable mock zk adapter to collaborate with other master instances"); +DEFINE_string(tera_coord_type, "", "the coordinator service type for tera cluster [zk,ins,mock_zk,mock_ins,fake_zk]"); + +DEFINE_bool(tera_zk_enabled, true, "[obsoleted replace by --tera_coord_type=zk] enable zk adapter to coord"); +DEFINE_bool(tera_mock_zk_enabled, false, "[obsoleted replace by --tera_coord_type=mock_zk] enable mock zk adapter to coord"); DEFINE_string(tera_zk_addr_list, "localhost:2180", "zookeeper server list"); DEFINE_string(tera_zk_root_path, "/tera", "zookeeper root path"); DEFINE_string(tera_fake_zk_path_prefix, 
"../fakezk", "fake zk path prefix in onebox tera"); @@ -31,6 +33,12 @@ DEFINE_string(tera_zk_lib_log_path, "../log/zk.log", "zookeeper library log outp DEFINE_string(tera_log_prefix, "", "prefix of log file (INFO, WARNING)"); DEFINE_string(tera_local_addr, "", "local host's ip address"); DEFINE_bool(tera_online_schema_update_enabled, false, "enable online-schema-update"); +DEFINE_bool(tera_info_log_clean_enable, true, "enable log cleaner task, enable as default"); +DEFINE_int64(tera_info_log_clean_period_second, 2592000, "time period (in second) for log cleaner task, 30 days as default"); +DEFINE_int64(tera_info_log_expire_second, 2592000, "expire time (in second) of log file, 30 days as default"); +DEFINE_bool(tera_metric_http_server_enable, true, "enable metric http server, enable as default"); +DEFINE_int32(tera_metric_http_server_listen_port, 20221, "listen port for metric http server"); +DEFINE_int64(tera_hardware_collect_period_second, 5, "hardware metrics checking period (in second)"); ///////// io ///////// @@ -100,10 +108,14 @@ DEFINE_int32(tera_master_impl_retry_times, 5, "the max retry times when master i DEFINE_string(tera_master_meta_table_name, "meta_table", "the meta table name"); DEFINE_string(tera_master_meta_table_path, "meta", "the path of meta table"); -DEFINE_double(tera_master_workload_split_threshold, 3.5, "if workload(wwl) > 3.5, halve the splitsize"); +DEFINE_double(tera_master_workload_merge_threshold, 1.0, "if workload(wwl) < 1.0, enable merge on this tablet"); +DEFINE_double(tera_master_workload_split_threshold, 9.9, "if workload(wwl) > 9.9, trigger split by workload"); DEFINE_int64(tera_master_split_tablet_size, 512, "the size (in MB) of tablet to trigger split"); +DEFINE_int64(tera_master_min_split_size, 64, "the size (in MB) of tablet to trigger split"); +DEFINE_double(tera_master_min_split_ratio, 0.25, "min ratio of split size of tablet schema to trigger split"); +DEFINE_int64(tera_master_split_history_time_interval, 600000, "minimal 
split time interval(ms)"); DEFINE_int64(tera_master_merge_tablet_size, 0, "the size (in MB) of tablet to trigger merge"); -DEFINE_string(tera_master_gc_strategy, "incremental", "gc strategy, [default, incremental, trackable]"); +DEFINE_string(tera_master_gc_strategy, "trackable", "gc strategy, [default, trackable]"); DEFINE_int32(tera_master_max_split_concurrency, 1, "the max concurrency of tabletnode for split tablet"); DEFINE_int32(tera_master_max_load_concurrency, 5, "the max concurrency of tabletnode for load tablet"); @@ -118,10 +130,11 @@ DEFINE_bool(tera_master_move_tablet_enabled, true, "enable master to auto move t DEFINE_bool(tera_master_meta_isolate_enabled, false, "enable master to reserve a tabletnode for meta"); DEFINE_bool(tera_master_load_balance_table_grained, true, "whether the load balance policy only consider the specified table"); DEFINE_double(tera_master_load_balance_size_ratio_trigger, 1.2, "ratio of heaviest node size to lightest to trigger load balance"); -DEFINE_int32(tera_master_load_balance_ts_load_threshold, 5000, "threshold of one tabletnode in QPS load-balance decision"); +DEFINE_int32(tera_master_load_balance_ts_load_threshold, 1000000000, "threshold of one tabletnode in QPS load-balance decision"); +DEFINE_int64(tera_master_load_balance_ts_size_threshold, 0, "threshold of one tabletnode in Size load-balance decision"); DEFINE_int32(tera_master_load_balance_scan_weight, 300, "scan weight in load-balance decision"); -DEFINE_double(tera_safemode_tablet_locality_ratio, 0.3, "the tablet locality ratio threshold of safemode"); +DEFINE_double(tera_safemode_tablet_locality_ratio, 0.9, "the tablet locality ratio threshold of safemode"); DEFINE_bool(tera_master_kick_tabletnode_enabled, true, "enable master to kick tabletnode"); DEFINE_int32(tera_master_kick_tabletnode_query_fail_times, 10, "the number of query fail to kick tabletnode"); DEFINE_int32(tera_master_control_tabletnode_retry_period, 60000, "the retry period (in ms) for master 
control tabletnode"); @@ -147,27 +160,31 @@ DEFINE_int64(tera_master_stat_table_interval, 60, "interval of system status dum DEFINE_int64(tera_master_stat_table_splitsize, 100, "default split size of stat table"); DEFINE_int32(tera_master_gc_period, 60000, "the period (in ms) for master gc"); +DEFINE_bool(tera_master_gc_trash_enabled, true, "enable master gc trash"); +DEFINE_int64(tera_master_gc_trash_expire_time_s, 86400, "time (in second) for gc file keeped in trash"); +DEFINE_int64(tera_master_gc_trash_clean_period_s, 3600, "period (in second) for clean gc trash"); DEFINE_int64(tera_master_ins_session_timeout, 10000000, "ins session timeout(us), default 10sec"); DEFINE_bool(tera_master_availability_check_enabled, true, "whether execute availability check"); // reload config safety DEFINE_bool(tera_master_availability_show_details_enabled, false, "whether show details of not-ready tablets"); // reload config safety DEFINE_int64(tera_master_not_available_threshold, 0, "the threshold (in s) of not available"); // reload config safety DEFINE_int64(tera_master_availability_check_period, 60, "the period (in s) of availability check"); // reload config safety -DEFINE_int64(tera_master_availability_warning_threshold, 30, "30s, the threshold (in s) of warning availability"); // reload config safety -DEFINE_int64(tera_master_availability_error_threshold, 300, "5 minutes, the threshold (in s) of error availability"); // reload config safety -DEFINE_int64(tera_master_availability_fatal_threshold, 1800, "30 minutes, the threshold (in s) of fatal availability"); // reload config safety +DEFINE_int64(tera_master_availability_warning_threshold, 60, "1 minute, the threshold (in s) of warning availability"); // reload config safety +DEFINE_int64(tera_master_availability_error_threshold, 600, "10 minutes, the threshold (in s) of error availability"); // reload config safety +DEFINE_int64(tera_master_availability_fatal_threshold, 3600, "1 hour, the threshold (in s) of fatal 
availability"); // reload config safety +DEFINE_bool(tera_master_update_split_meta, true, "[split] update child tablets meta from master"); ///////// tablet node ///////// DEFINE_string(tera_tabletnode_port, "20000", "the tablet node port of tera system"); -DEFINE_int32(tera_tabletnode_ctrl_thread_num, 10, "control thread number of tablet node (query/load/unload/split)"); +DEFINE_int32(tera_tabletnode_ctrl_thread_num, 20, "control thread number of tablet node (query/load/unload/split)"); DEFINE_int32(tera_tabletnode_write_thread_num, 10, "write thread number of tablet node"); DEFINE_int32(tera_tabletnode_read_thread_num, 40, "read thread number of tablet node"); -DEFINE_int32(tera_tabletnode_scan_thread_num, 5, "scan thread number of tablet node"); +DEFINE_int32(tera_tabletnode_scan_thread_num, 30, "scan thread number of tablet node"); DEFINE_int32(tera_tabletnode_manual_compact_thread_num, 2, "the manual compact thread number of tablet node server"); DEFINE_int32(tera_tabletnode_impl_thread_min_num, 1, "the min thread number for tablet node impl operations"); DEFINE_int32(tera_tabletnode_impl_thread_max_num, 10, "the max thread number for tablet node impl operations"); -DEFINE_int32(tera_tabletnode_compact_thread_num, 10, "the max thread number for leveldb compaction"); +DEFINE_int32(tera_tabletnode_compact_thread_num, 30, "the max thread number for leveldb compaction"); DEFINE_int32(tera_tabletnode_scanner_cache_size, 5, "default tablet scanner manager cache no more than 100 stream"); DEFINE_int32(tera_tabletnode_connect_retry_times, 5, "the max retry times when connect to tablet node"); @@ -180,16 +197,20 @@ DEFINE_int32(tera_tabletnode_scan_pack_max_size, 10240, "the max size(KB) of the DEFINE_int32(tera_asyncwriter_pending_limit, 10000, "the max pending data size (KB) in async writer"); DEFINE_bool(tera_enable_level0_limit, true, "enable level0 limit"); -DEFINE_int32(tera_tablet_level0_file_limit, 20000, "the max level0 file num before write busy"); 
+DEFINE_int32(tera_tablet_level0_file_limit, 500, "the max level0 file num before write busy"); DEFINE_int32(tera_tablet_ttl_percentage, 99, "percentage of ttl tag in sst file begin to trigger compaction"); DEFINE_int32(tera_tablet_del_percentage, 20, "percentage of del tag in sst file begin to trigger compaction"); -DEFINE_int32(tera_asyncwriter_sync_interval, 100, "the interval (in ms) to sync write buffer to disk"); +DEFINE_int32(tera_asyncwriter_sync_interval, 10, "the interval (in ms) to sync write buffer to disk"); DEFINE_int32(tera_asyncwriter_sync_size_threshold, 1024, "force sync per X KB"); DEFINE_int32(tera_asyncwriter_batch_size, 1024, "write batch to leveldb per X KB"); DEFINE_int32(tera_request_pending_limit, 100000, "the max read/write request pending"); DEFINE_int32(tera_scan_request_pending_limit, 1000, "the max scan request pending"); DEFINE_int32(tera_garbage_collect_period, 1800, "garbage collect period in s"); DEFINE_int32(tera_garbage_collect_debug_log, 0, "garbage collect debug log"); +DEFINE_bool(tera_leveldb_ignore_corruption_in_open, false, "ignore fs error when open db"); +DEFINE_int32(tera_leveldb_slow_down_level0_score_limit, 100, "control level 0 score compute, score / 2 or sqrt(score / 2)"); +DEFINE_int32(tera_leveldb_max_background_compactions, 8, "multi-thread compaction number"); +DEFINE_int32(tera_tablet_max_sub_parallel_compaction, 10, "max sub compaction in parallel"); DEFINE_int32(tera_tabletnode_write_meta_rpc_timeout, 60000, "the timeout period (in ms) for tabletnode write meta"); DEFINE_int32(tera_tabletnode_retry_period, 100, "the retry interval period (in ms) when operate tablet"); @@ -219,6 +240,7 @@ DEFINE_int32(tera_tabletnode_tcm_cache_release_period, 180, "the period (in sec) DEFINE_int64(tera_tabletnode_tcm_cache_size, 838860800, "TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES"); DEFINE_bool(tera_tabletnode_dump_running_info, true, "dump tabletnode running info"); DEFINE_string(tera_tabletnode_running_info_dump_file, 
"../monitor/ts.info.data", "file path for dump running info"); +DEFINE_int64(tera_tabletnode_sysinfo_check_interval, 9223372036854775806, "sysinfo check db health interval in us, default int64_max - 1"); ///////// SDK ///////// DEFINE_string(tera_sdk_impl_type, "tera", "the activated type of SDK impl"); @@ -248,9 +270,9 @@ DEFINE_int32(tera_sdk_timeout_precision, 100, "precision of sdk read/write timeo DEFINE_int32(tera_sdk_delay_send_internal, 2, "the sdk resend the request internal time(s)"); DEFINE_int32(tera_sdk_scan_buffer_limit, 2048000, "the pack size limit for scan operation"); DEFINE_bool(tera_sdk_write_sync, false, "sync flag for write"); -DEFINE_int32(tera_sdk_batch_size, 100, "batch_size"); -DEFINE_int32(tera_sdk_write_send_interval, 100, "write batch send interval time"); -DEFINE_int32(tera_sdk_read_send_interval, 10, "read batch send interval time"); +DEFINE_int32(tera_sdk_batch_size, 250, "batch_size"); +DEFINE_int32(tera_sdk_write_send_interval, 10, "write batch send interval time"); +DEFINE_int32(tera_sdk_read_send_interval, 5, "read batch send interval time"); DEFINE_int64(tera_sdk_max_mutation_pending_num, INT64_MAX, "default number of pending mutations in async put op"); DEFINE_int64(tera_sdk_max_reader_pending_num, INT64_MAX, "default number of pending readers in async get op"); DEFINE_bool(tera_sdk_async_blocking_enabled, true, "enable blocking when async writing and reading"); @@ -264,24 +286,120 @@ DEFINE_int32(tera_sdk_cookie_update_interval, 600, "the interval of cookie updat DEFINE_bool(tera_sdk_perf_counter_enabled, true, "enable performance counter log"); DEFINE_int64(tera_sdk_perf_counter_log_interval, 60, "the interval period (in sec) of performance counter log dumping"); +DEFINE_bool(tera_sdk_perf_collect_enabled, false, "enable collect perf counter for metrics"); +DEFINE_int32(tera_sdk_perf_collect_interval, 10000, "the interval of collect perf counter(ms)"); DEFINE_bool(tera_sdk_batch_scan_enabled, true, "enable batch scan"); 
DEFINE_int64(tera_sdk_scan_buffer_size, 65536, "default buffer limit for scan"); DEFINE_int64(tera_sdk_scan_number_limit, 1000000000, "default number limit for scan"); DEFINE_int32(tera_sdk_max_batch_scan_req, 30, "the max number of concurrent scan req"); -DEFINE_int32(tera_sdk_batch_scan_max_retry, 60, "the max retry times for session scan"); DEFINE_int64(tera_sdk_scan_timeout, 30000, "scan timeout"); +DEFINE_int32(tera_sdk_batch_scan_max_retry, 60, "the max retry times for session scan"); DEFINE_int64(batch_scan_delay_retry_in_us, 1000000, "timewait in us before retry batch scan"); +DEFINE_int32(tera_sdk_sync_scan_max_retry, 10, "the max retry times for sync scan"); +DEFINE_int64(sync_scan_delay_retry_in_ms, 1000, "timewait in ms before retry sync scan"); DEFINE_string(tera_ins_addr_list, "", "the ins cluster addr. e.g. abc.com:1234,abb.com:1234"); DEFINE_string(tera_ins_root_path, "", "root path on ins. e.g /ps/sandbox"); -DEFINE_bool(tera_ins_enabled, false, "option to open ins naming"); -DEFINE_bool(tera_mock_ins_enabled, false, "option to open mock ins naming"); +DEFINE_bool(tera_ins_enabled, false, "[obsoleted replace by --tera_coord_type=ins] option to open ins naming"); +DEFINE_bool(tera_mock_ins_enabled, false, "[obsoleted replace by --tera_coord_type=mock_ins] option to open mock ins naming"); DEFINE_int64(tera_ins_session_timeout, 600000000, "ins session timeout(us), default 10min"); +DEFINE_int64(tera_sdk_ins_session_timeout, 10000000, "ins session timeout(us), default 10s"); DEFINE_int64(tera_sdk_status_timeout, 600, "(s) check tablet/tabletnode status timeout"); +DEFINE_uint64(tera_sdk_read_max_qualifiers, 18446744073709551615U, "read qu limit of each cf, default value is the max of uint64"); ///////// http ///////// DEFINE_string(tera_http_port, "8657", "the http proxy port of tera"); DEFINE_int32(tera_http_request_thread_num, 30, "the http proxy thread num for handle client request"); DEFINE_int32(tera_http_ctrl_thread_num, 10, "the http proxy 
thread num for it self"); + +///////// timeoracle ///////// +DEFINE_string(tera_timeoracle_port, "30000", "the timeoracle port of tera"); +DEFINE_int32(tera_timeoracle_max_lease_second, 30, "timeoracle work this seconds for a lease"); +DEFINE_int32(tera_timeoracle_refresh_lease_second, 10, "timeoracle refresh lease before this seconds"); + +// only used by timeoracle +DEFINE_bool(tera_timeoracle_mock_enabled, false, "used local filesystem replace zk and ins."); +DEFINE_string(tera_timeoracle_mock_root_path, "/tmp/", "the root path of local filesystem."); +DEFINE_int32(tera_timeoracle_work_thread_num, 16, "timeoracle sofarpc server work_thread_number"); +DEFINE_int32(tera_timeoracle_io_service_pool_size, 4, "timeoracle sofarpc server io_service_pool_size"); + +///////// global transaction //////// +DEFINE_bool(tera_sdk_client_for_gtxn, false, "build thread_pool for global transaction"); +DEFINE_bool(tera_sdk_tso_client_enabled, false, "get timestamp from timeoracle, default from local timestamp"); +DEFINE_int32(tera_gtxn_thread_max_num, 20, "the max thread number for global transaction operations"); +DEFINE_int32(tera_gtxn_timeout_ms, 600000, "global transaction timeout limit (ms) default 10 minutes"); +DEFINE_int32(tera_gtxn_get_waited_times_limit, 10, "global txn wait other locked times limit"); +DEFINE_int32(tera_gtxn_all_puts_size_limit, 10000, "global txn all puts data size limit"); + +//////// observer /////// +DEFINE_int32(observer_proc_thread_num, 3, ""); +DEFINE_int64(observer_max_pending_task, 10000, ""); +DEFINE_int32(observer_scanner_thread_num, 20, ""); +DEFINE_int32(observer_read_thread_num, 20, "observer read thread num"); +DEFINE_int32(observer_ack_conflict_timeout, 3600, "timeout for ack column conflict check"); +DEFINE_int32(observer_rowlock_client_thread_num, 20, ""); + +//////// rowlock server //////// +DEFINE_bool(rowlock_rpc_limit_enabled, false, "enable the rpc traffic limit in sdk"); +DEFINE_int32(rowlock_rpc_limit_max_inflow, 10, "the max 
bandwidth (in MB/s) for sdk rpc traffic limitation on input flow"); +DEFINE_int32(rowlock_rpc_limit_max_outflow, 10, "the max bandwidth (in MB/s) for sdk rpc traffic limitation on output flow"); +DEFINE_int32(rowlock_rpc_max_pending_buffer_size, 200, "max pending buffer size (in MB) for sdk rpc"); +DEFINE_int32(rowlock_rpc_work_thread_num, 2, "thread num of sdk rpc client"); + +DEFINE_string(rowlock_server_ip, "0.0.0.0", "rowlock server ip"); +DEFINE_string(rowlock_server_port, "22222", "rowlock server port"); +DEFINE_string(rowlock_zk_root_path, "/rowlock", ""); +DEFINE_int32(rowlock_zk_timeout, 10000, "zk timeout"); +DEFINE_string(rowlock_ins_root_path, "/rowlock", "ins rowlock root path"); +DEFINE_int32(rowlock_server_node_num, 1, "number of rowlock servers in cluster"); + +DEFINE_int32(rowlock_db_ttl, 600000, "timeout for an unlocked lock, 10min"); +DEFINE_int32(rowlock_timing_wheel_patch_num, 600, "the number of timing wheel, every patch_num step the oldest data will be cleared"); +DEFINE_int32(rowlock_db_sharding_number, 1024, "sharding number, enhance concurrency"); +DEFINE_string(rowlock_fake_root_path, "../fakezk/rowlock", "one box fake zk root path"); +DEFINE_int32(rowlock_thread_max_num, 20, "the max thread number of rowlock server"); +DEFINE_int32(rowlock_client_max_fail_times, 5, "client max failure time"); + +DEFINE_bool(rowlock_proxy_async_enable, false, "sync | async"); +DEFINE_string(rowlock_proxy_port, "22223", "rowlock proxy port"); +///////// load balancer //////// +DEFINE_string(tera_lb_server_addr, "0.0.0.0", "default load balancer rpc server addr"); +DEFINE_string(tera_lb_server_port, "31000", "default load balancer rpc server port"); +DEFINE_int32(tera_lb_server_thread_num, 2, "default load balancer rpc server thread pool num"); +DEFINE_int32(tera_lb_impl_thread_num, 1, "default load balancer impl thread pool num"); +DEFINE_int32(tera_lb_load_balance_period_s, 300, "default load balance period(s)"); +DEFINE_int32(tera_lb_max_compute_steps, 
1000000, "default max compute steps for one balance procedure"); +DEFINE_int32(tera_lb_max_compute_steps_per_tablet, 1000, "default max compute steps per tablet for one balance procedure"); +DEFINE_int32(tera_lb_max_compute_time_ms, 30000, "default max compute time(ms) for one balance procedure"); +DEFINE_double(tera_lb_min_cost_need_balance, 0.1, "min cost needed for balance"); +DEFINE_double(tera_lb_move_count_cost_weight, 10, "move cost weight"); +DEFINE_int32(tera_lb_tablet_max_move_num, 10, "default tablet max move num for one balance procedure"); +DEFINE_double(tera_lb_tablet_max_move_percent, 0.001, "default tablet max move percent for one balance procedure"); +DEFINE_double(tera_lb_move_frequency_cost_weight, 10, "move frequency cost weight"); +DEFINE_int32(tera_lb_tablet_move_too_frequently_threshold_s, 600, "if move a tablet in this threshold time(s) again, it's been moved too frequently"); +DEFINE_double(tera_lb_abnormal_node_cost_weight, 10, "abnormal node cost weight"); +DEFINE_double(tera_lb_abnormal_node_ratio, 0.5, "abnormal node ratio"); +DEFINE_double(tera_lb_read_pending_node_cost_weight, 10, "read pending node cost weight"); +DEFINE_double(tera_lb_write_pending_node_cost_weight, 10, "write pending node cost weight"); +DEFINE_double(tera_lb_scan_pending_node_cost_weight, 10, "scan pending node cost weight"); +DEFINE_double(tera_lb_tablet_count_cost_weight, 0, "tablet count cost weight"); +DEFINE_double(tera_lb_size_cost_weight, 100, "size cost weight"); +DEFINE_double(tera_lb_read_load_cost_weight, 0, "read load cost weight"); +DEFINE_double(tera_lb_write_load_cost_weight, 0, "write load cost weight"); +DEFINE_double(tera_lb_scan_load_cost_weight, 0, "scan load cost weight"); +DEFINE_bool(tera_lb_debug_mode_enabled, false, "debug mode"); + +DEFINE_int32(rowlock_io_service_pool_size, 4, "rowlock server sofarpc server io_service_pool_size"); + +DEFINE_bool(mock_rowlock_enable, false, "test case switch"); +DEFINE_int64(tera_metric_hold_max_time, 
300000, "interval of prometheus collectors push a value to hold_queue in ms"); + +////////// PROFILER /////////// +DEFINE_bool(cpu_profiler_enabled, false, "enable cpu profiler"); +DEFINE_bool(heap_profiler_enabled, false, "enable heap profiler"); +DEFINE_int32(cpu_profiler_dump_interval, 120, "cpu profiler dump interval"); +DEFINE_int32(heap_profiler_dump_interval, 120, "heap profiler dump interval"); +DEFINE_int64(heap_profile_allocation_interval, 1073741824, "Env variable for heap profiler's allocation interval"); +DEFINE_int64(heap_profile_inuse_interval, 1073741824, "Env variable for heap profiler's inuse interval"); diff --git a/src/tera_main.cc b/src/tera_main.cc index 2331436b9..aa86c952f 100644 --- a/src/tera_main.cc +++ b/src/tera_main.cc @@ -8,12 +8,20 @@ #include #include "common/base/scoped_ptr.h" +#include "common/log/log_cleaner.h" +#include "common/heap_profiler.h" +#include "common/cpu_profiler.h" #include "tera_entry.h" #include "utils/utils_cmd.h" #include "version.h" +DECLARE_bool(cpu_profiler_enabled); +DECLARE_bool(heap_profiler_enabled); +DECLARE_int32(cpu_profiler_dump_interval); +DECLARE_int32(heap_profiler_dump_interval); DECLARE_string(tera_log_prefix); DECLARE_string(tera_local_addr); +DECLARE_bool(tera_info_log_clean_enable); extern std::string GetTeraEntryName(); extern tera::TeraEntry* GetTeraEntry(); @@ -27,11 +35,25 @@ static void SignalIntHandler(int sig) { int main(int argc, char** argv) { ::google::ParseCommandLineFlags(&argc, &argv, true); ::google::InitGoogleLogging(argv[0]); - if (!FLAGS_tera_log_prefix.empty()) { - tera::utils::SetupLog(FLAGS_tera_log_prefix); - } else { - tera::utils::SetupLog(GetTeraEntryName()); + + + if (FLAGS_tera_log_prefix.empty()) { + FLAGS_tera_log_prefix = GetTeraEntryName(); + if (FLAGS_tera_log_prefix.empty()) { + FLAGS_tera_log_prefix = "tera"; + } } + tera::utils::SetupLog(FLAGS_tera_log_prefix); + + tera::CpuProfiler cpu_profiler; + cpu_profiler.SetEnable(FLAGS_cpu_profiler_enabled) + 
.SetInterval(FLAGS_cpu_profiler_dump_interval) + .SetProfilerFile("Cpu"); + + tera::HeapProfiler heap_profiler; + heap_profiler.SetEnable(FLAGS_heap_profiler_enabled) + .SetInterval(FLAGS_heap_profiler_dump_interval) + .SetProfilerFile("Heap"); if (argc > 1) { std::string ext_cmd = argv[1]; @@ -52,6 +74,14 @@ int main(int argc, char** argv) { if (!entry->Start()) { return -1; } + + // start log cleaner + if (FLAGS_tera_info_log_clean_enable) { + common::LogCleaner::StartCleaner(); + LOG(INFO) << "start log cleaner"; + } else { + LOG(INFO) << "log cleaner is disable"; + } while (!g_quit) { if (!entry->Run()) { @@ -63,6 +93,8 @@ int main(int argc, char** argv) { LOG(INFO) << "received interrupt signal from user, will stop"; } + common::LogCleaner::StopCleaner(); + if (!entry->Shutdown()) { return -1; } diff --git a/src/tera_test_main.cc b/src/tera_test_main.cc index f7fb788c7..915c172d9 100644 --- a/src/tera_test_main.cc +++ b/src/tera_test_main.cc @@ -34,7 +34,6 @@ DEFINE_int64(pending_num, 100000, ""); DECLARE_string(flagfile); using namespace tera; -using namespace common::timer; void Usage(const std::string& prg_name) { std::cout << "DESCRIPTION \n\ @@ -43,13 +42,13 @@ void Usage(const std::string& prg_name) { version \n"; } -static common::Counter w_pending; -static common::Counter w_succ; -static common::Counter w_total; -static common::Counter r_pending; -static common::Counter r_succ; -static common::Counter r_total; -static common::Counter launch_time; +static Counter w_pending; +static Counter w_succ; +static Counter w_total; +static Counter r_pending; +static Counter r_succ; +static Counter r_total; +static Counter launch_time; void PrintStat() { LOG(INFO) << "Write total " << w_total.Get() @@ -298,7 +297,7 @@ int32_t SharedTableImplTest(int32_t argc, char** argv, ErrorCode* err) { thread_pool.AddTask(task); } while (thread_pool.PendingNum() > 0) { - std::cerr << common::timer::get_time_str(time(NULL)) << " " + std::cerr << get_time_str(time(NULL)) << " " 
<< "waiting for test finish, pending " << thread_pool.PendingNum() << " tasks ..." << std::endl; sleep(1); diff --git a/src/teracli_main.cc b/src/teracli_main.cc index 49c29dd6d..31c9dd55a 100644 --- a/src/teracli_main.cc +++ b/src/teracli_main.cc @@ -2,7 +2,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // - #include #include #include @@ -10,13 +9,14 @@ #include #include +#include #include #include #include #include #include #include - +#include #include #include @@ -26,6 +26,10 @@ #include "common/console/progress_bar.h" #include "common/file/file_path.h" #include "io/coding.h" +#include "io/utils_leveldb.h" +#include "leveldb/dfs.h" +#include "util/nfs.h" +#include "util/hdfs.h" #include "proto/kv_helper.h" #include "proto/proto_helper.h" #include "proto/tabletnode.pb.h" @@ -36,6 +40,7 @@ #include "sdk/sdk_zk.h" #include "sdk/table_impl.h" #include "tera.h" +#include "types.h" #include "utils/crypt.h" #include "utils/string_util.h" #include "utils/tprinter.h" @@ -50,6 +55,15 @@ DECLARE_string(tera_zk_root_path); DECLARE_bool(tera_sdk_batch_scan_enabled); DECLARE_int64(tera_sdk_status_timeout); +DECLARE_string(tera_leveldb_env_type); +DECLARE_string(tera_leveldb_env_dfs_type); +DECLARE_string(tera_leveldb_env_nfs_mountpoint); +DECLARE_string(tera_leveldb_env_nfs_conf_path); +DECLARE_string(tera_leveldb_env_hdfs2_nameservice_list); +DECLARE_string(tera_dfs_so_path); +DECLARE_string(tera_dfs_conf); +DECLARE_uint64(tera_sdk_read_max_qualifiers); + DEFINE_int32(tera_client_batch_put_num, 1000, "num of each batch in batch put mode"); DEFINE_int32(tera_client_scan_package_size, 1024, "the package size (in KB) of each scan request"); @@ -59,6 +73,7 @@ DEFINE_string(rollback_name, "", "rollback operation's name"); DEFINE_int32(lg, -1, "locality group number."); DEFINE_int32(concurrency, 1, "concurrency for compact table."); +DEFINE_int32(compact_timeout, 120000, "tablet compact timeout(ms), default 20min"); 
DEFINE_int64(timestamp, -1, "timestamp."); DEFINE_string(tablets_file, "", "tablet set file"); @@ -71,6 +86,15 @@ DEFINE_bool(rowkey_count, false, "is print rowkey count when scan"); DEFINE_bool(stdout_is_tty, true, "is stdout connected to a tty"); DEFINE_bool(reorder_tablets, false, "reorder tablets by ts list"); +// dfs related FLAGS +DEFINE_bool(asowner, false, "become owner and execute the command"); +DEFINE_bool(e, false, "test dfs file exist or not"); +DEFINE_bool(z, false, "test dfs file is zero or not"); +DEFINE_bool(d, false, "test dfs file is directory or not"); +DEFINE_bool(override, false, "dfs put file override the existing one"); +DEFINE_bool(attribute, false, "dfs list file detail attribute"); +DEFINE_bool(recursive, false, "dfs remove file recursively"); + volatile int32_t g_start_time = 0; volatile int32_t g_end_time = 0; volatile int32_t g_used_time = 0; @@ -88,16 +112,25 @@ using namespace tera; typedef std::shared_ptr
TablePtr; typedef std::shared_ptr TableImplPtr; typedef std::map CommandTable; - +// FileSystem command table +typedef std::map FSCommandTable; +//typedef std::map > FSCommandTable; /// global variables of single-row-txn used in interactive mode tera::Transaction* g_row_txn = NULL; Table* g_row_txn_table = NULL; +leveldb::Dfs* g_dfs = NULL; + static CommandTable& GetCommandTable(){ static CommandTable command_table; return command_table; } +static FSCommandTable& GetFSCommandTable() { + static FSCommandTable fs_command_table; + return fs_command_table; +} + const char* builtin_cmd_list[] = { "create", "create [] \n\ @@ -224,6 +257,13 @@ const char* builtin_cmd_list[] = { commit \n\ (only support single row transaction)", + "cas", + "cas \n\ + Compare and set a value atomically. (The txn value of table schema must be 'on') \n\ + This command will compare the value at rowkey:columnfamily:qualifier with : \n\ + -> equal : put to this location. \n\ + -> not equal: do nothing.", + "user", "user \n\ create \n\ @@ -236,8 +276,14 @@ const char* builtin_cmd_list[] = { "tablet", "tablet \n\ move \n\ + movex \n\ + * only for force move tablet ignore error \n\ reload \n\ force to unload and load on the same ts \n\ + reloadx \n\ + force to unload and load on the same ts \n\ + * only for force reload tablet ignore error \n\ + lg_list : lg1:lg2:lg3 \n\ compact \n\ split \n\ merge \n\ @@ -290,9 +336,27 @@ const char* builtin_cmd_list[] = { "help [cmd] \n\ show manual for a or all cmd(s)", + "dfs", + "dfs [cmd] args \n\ + mkdir $NFS_PATH \n\ + touchz $NFS_PATH \n\ + test [-e|-z|-d] $NFS_PATH \n\ + get $NFS_PATH $LOCAL_PATH \n\ + put [--override] $LOCAL_PATH $NFS_PATH \n\ + ls [--attribute] $NFS_PATH \n\ + lsr [--attribute] $NFS_PATH \n\ + dus $NFS_PATH \n\ + rm [--recursive] $NFS_PATH \n\ + stat $NFS_PATH \n\ + rename $NFS_PATH_SRC $NFS_PATH_DEST \n\ + unlockdir $NFS_PATH \n\ + checksum $NFS_PATH $OFFSET $LENGTH \n\ + forcerelease $NFS_PATH", + "version", "version \n\ show version 
info", + }; static void PrintCmdHelpInfo(const char* msg) { @@ -662,21 +726,21 @@ int32_t PutOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { value = argv[5]; } - RowMutation* mutation = table->NewRowMutation(rowkey); + std::unique_ptr mutation(table->NewRowMutation(rowkey)); if (FLAGS_timestamp == -1) { mutation->Put(columnfamily, qualifier, value); } else { mutation->Put(columnfamily, qualifier, FLAGS_timestamp, value); } if (g_row_txn != NULL) { - g_row_txn->ApplyMutation(mutation); + g_row_txn->ApplyMutation(mutation.get()); } else { - table->ApplyMutation(mutation); + table->ApplyMutation(mutation.get()); } if (mutation->GetError().GetType() != tera::ErrorCode::kOK) { std::cout << mutation->GetError().ToString() << std::endl; + return -1; } - delete mutation; return 0; } @@ -912,7 +976,7 @@ int32_t GetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { std::string columnfamily = ""; std::string qualifier = ""; std::string value; - RowReader* reader = table->NewRowReader(rowkey); + std::unique_ptr reader(table->NewRowReader(rowkey)); if (argc == 4) { // use table as kv or get row } else if (argc == 5) { @@ -924,10 +988,11 @@ int32_t GetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { reader->AddColumnFamily(columnfamily); } } + reader->SetMaxQualifiers(FLAGS_tera_sdk_read_max_qualifiers); if (g_row_txn != NULL) { - g_row_txn->Get(reader); + g_row_txn->Get(reader.get()); } else { - table->Get(reader); + table->Get(reader.get()); } while (!reader->Done()) { std::cout << PrintableFormatter(reader->RowName()) << ":" @@ -939,8 +1004,8 @@ int32_t GetOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (reader->GetError().GetType() != tera::ErrorCode::kOK && reader->GetError().GetType() != tera::ErrorCode::kNotFound) { std::cout << reader->GetError().ToString() << std::endl; + return -1; } - delete reader; return 0; } @@ -1052,6 +1117,7 @@ int32_t ScanRange(TablePtr& table, ScanDescriptor& 
desc, ErrorCode* err) { desc.SetBufferSize(FLAGS_tera_client_scan_package_size << 10); desc.SetAsync(FLAGS_tera_sdk_batch_scan_enabled); desc.SetSnapshot(FLAGS_snapshot); + desc.SetMaxQualifiers(FLAGS_tera_sdk_read_max_qualifiers); ResultStream* result_stream; if ((result_stream = table->Scan(desc, err)) == NULL) { @@ -1161,7 +1227,7 @@ std::string BytesNumberToString(const uint64_t size) { std::string DateNumberToString(int64_t ts) { if (FLAGS_stdout_is_tty) { - return common::timer::get_time_str(ts); + return get_time_str(ts); } return NumberToString(ts); } @@ -1172,6 +1238,10 @@ static std::string GetTabletStatusString(const TabletMetaList& tablet_list, int6 // new tera master int64_t delta = now - tablet_list.timestamp(i); TabletStatus status = tablet_list.meta(i).status(); + TabletStatus db_status = tablet_list.counter(i).db_status(); + if (db_status == kTabletCorruption) { + return StatusCodeToString(db_status); + } if ((status == kTableReady) && (delta > FLAGS_tera_sdk_status_timeout * 1000000)) { return "kUnknown"; } else { @@ -1187,7 +1257,7 @@ int32_t ShowTabletList(const TabletMetaList& tablet_list, bool is_server_addr, b TPrinter printer; int cols; std::vector row; - int64_t now = common::timer::get_micros(); + int64_t now = get_micros(); if (is_x) { if (is_server_addr) { cols = 14; @@ -1492,7 +1562,7 @@ int32_t ShowSingleTable(Client* client, const string& table_name, if (FLAGS_stdout_is_tty) { std::cout << std::endl; std::cout << "create time: " - << common::timer::get_time_str(table_meta.create_time()) << std::endl; + << get_time_str(table_meta.create_time()) << std::endl; std::cout << std::endl; } ShowTabletList(tablet_list, true, is_x); @@ -1514,7 +1584,7 @@ int32_t ShowSingleTabletNodeInfo(Client* client, const string& addr, std::cout << " address: " << info.addr() << std::endl; std::cout << " status: " << info.status_m() << std::endl; std::cout << " update time: " - << common::timer::get_time_str(info.timestamp() / 1000000) << "\n\n"; + << 
get_time_str(info.timestamp() / 1000000) << "\n\n"; int cols = 4; TPrinter printer(cols, "workload", "tablets", "load", "split"); @@ -1582,7 +1652,7 @@ int32_t ShowTabletNodesInfo(Client* client, bool is_x, ErrorCode* err) { return -1; } - int64_t now = common::timer::get_micros(); + int64_t now = get_micros(); int cols; TPrinter printer; if (is_x) { @@ -2256,7 +2326,7 @@ int32_t CompactTablet(TabletInfo& tablet, int lg) { request.set_tablet_name(tablet.table_name); request.mutable_key_range()->set_key_start(tablet.start_key); request.mutable_key_range()->set_key_end(tablet.end_key); - tabletnode::TabletNodeClient tabletnode_client(tablet.server_addr, 60000); + tabletnode::TabletNodeClient tabletnode_client(tablet.server_addr, FLAGS_compact_timeout); std::string path; if (lg >= 0) { @@ -2292,6 +2362,77 @@ int32_t CompactTablet(TabletInfo& tablet, int lg) { return 0; } +static bool ComputeCompactInsertKeys(RawKey rawkey, std::string* start_key, std::string* end_key) { + static std::string x0("\x0", 1); + static std::string x1("\x1", 1); + *start_key = (rawkey == Readable ? *start_key + x1 : *start_key + x0); + + // pop all '\x0' charcters at the tailing of end_key. Note that Readable should not contain any + // '\x0' characters but here we do not + while (end_key->size() > 0) { + unsigned char last = end_key->at(end_key->size() - 1); + if (last == '\x0') { + end_key->pop_back(); + } + // for Readable key, if the last nonzero character of end_key is '\x1', the wanted key that + // is barely smaller than end_key is computed as: end_key.substr(0, end_key.rfind('\x1')); + // eg: end_key: abcde'\x1' -> wanted key: abcde + else if (rawkey == Readable && last == '\x1'){ + end_key->pop_back(); + return true; + } + else { + break; + } + } + // for other case, the wanted key that is barely smaller than end_key is computed as: minus the + // last char of end_key with 1 and append '\x255' to end key until it reaches the max keysize + // allowed. 
Notice that the last char of end_key will not be '\x0' for Binary key and not be + // '\x0' nor '\x1' for Readable key here + if (end_key->size() > 0) { + (*end_key)[end_key->size() - 1] = char((*end_key)[end_key->size() - 1] - 1); + } + end_key->resize(kRowkeySize - 1, char(255)); + return true; +} + +void CompactPreprocess(TableImplPtr table, const std::vector& tablet_infos) { + std::vector readers; + for (std::size_t i = 0; i < tablet_infos.size(); ++i) { + const TabletInfo& tablet_info = tablet_infos[i]; + std::string start_key(tablet_info.start_key); + std::string end_key(tablet_info.end_key); + ComputeCompactInsertKeys(table->GetTableSchema().raw_key(), &start_key, &end_key); + std::vector readers; + RowReader* start_reader = table->NewRowReader(start_key); + RowReader* end_reader = table->NewRowReader(end_key); + readers.push_back(start_reader); + readers.push_back(end_reader); + } + if (readers.size() > 0) { + table->Get(readers); + } + std::vector mutations; + for (std::size_t i = 0; i < readers.size(); ++i) { + if (readers[i]->GetError().GetType() == tera::ErrorCode::kNotFound) { + RowMutation* mutation = table->NewRowMutation(readers[i]->RowKey()); + mutation->DeleteRow(); + mutations.push_back(mutation); + } + delete readers[i]; + } + if (mutations.size() > 0) { + table->ApplyMutation(mutations); + for (std::size_t i = 0; i < mutations.size(); ++i) { + if (mutations[i]->GetError().GetType() != tera::ErrorCode::kOK) { + LOG(WARNING) <<"write key " << DebugString(mutations[i]->RowKey()) + << " failed, error: " << mutations[i]->GetError().ToString(); + } + delete mutations[i]; + } + } +} + int32_t CompactTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { if (argc != 4) { PrintCmdHelpInfo(argv[1]); @@ -2337,6 +2478,18 @@ int32_t CompactTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCo << ", total tablets: " << tablet_list.size(); return -4; } + std::string command = argv[1]; + if (command == "compactx") + { + 
tera::ClientImpl* client_impl = static_cast(client); + TableImplPtr table_impl(client_impl->OpenTableInternal(table, err)); + if (table_impl == NULL) { + LOG(ERROR) << "fail to open table: " << table; + return -5; + } + std::vector tablet_infos(1, *tablet_it); + CompactPreprocess(table_impl, tablet_infos); + } return CompactTablet(*tablet_it, lg); } @@ -2409,32 +2562,34 @@ int32_t ScanTabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* } int32_t TabletOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { - if ((argc != 4) && (argc != 5)) { + if ((argc != 4) && (argc != 5) && (argc != 6)) { PrintCmdHelpInfo(argv[1]); return -1; } std::string op = argv[2]; + std::string tablet_id = argv[3]; + std::string server_addr; - if (op == "compact") { + std::vector arg_list; + arg_list.push_back(op); + arg_list.push_back(tablet_id); + if (op == "compact" || op == "compactx") { return CompactTabletOp(client, argc, argv, err); } else if (op == "scan" || op == "scanallv") { return ScanTabletOp(client, argc, argv, err); - } else if (op != "move" && op != "split" && op != "merge" && op != "reload") { + } else if (argc == 4 && (op == "reload" || op == "merge" || op == "split")) { + // nothing to do + } else if (argc == 5 && (op == "reloadx" || op == "move" || op == "split")) { + // reloadx->lg_list move->server_addr split->split_key + arg_list.push_back(argv[4]); + } else if (argc == 6 && op == "movex") { + arg_list.push_back(argv[4]); // server_addr + arg_list.push_back(argv[5]); // lg_list + } else { PrintCmdHelpInfo(argv[1]); return -1; } - - std::string tablet_id = argv[3]; - std::string server_addr; - if (argc == 5) { - server_addr = argv[4]; - } - - std::vector arg_list; - arg_list.push_back(op); - arg_list.push_back(tablet_id); - arg_list.push_back(server_addr); if (!client->CmdCtrl("tablet", arg_list, NULL, NULL, err)) { LOG(ERROR) << "fail to " << op << " tablet " << tablet_id; return -1; @@ -2543,6 +2698,19 @@ int32_t CompactOp(Client* 
client, int32_t argc, std::string* argv, ErrorCode* er } ReorderTabletList(&tablet_list); + std::string command = argv[1]; + if (command == "compactx") + { + tera::ClientImpl* client_impl = static_cast(client); + TableImplPtr table_impl(client_impl->OpenTableInternal(tablename, err)); + if (table_impl == NULL) { + LOG(ERROR) << "fail to open table: " << tablename; + return -5; + } + std::cout << "begin compact preprocess tablet: " << tablename << std::endl; + CompactPreprocess(table_impl, tablet_list); + } + int conc = FLAGS_concurrency; if (conc <= 0 || conc > 1000) { LOG(ERROR) << "compact concurrency illegal: " << conc; @@ -2556,7 +2724,7 @@ int32_t CompactOp(Client* client, int32_t argc, std::string* argv, ErrorCode* er thread_pool.AddTask(task); } while (thread_pool.PendingNum() > 0) { - std::cerr << common::timer::get_time_str(time(NULL)) << " " + std::cerr << get_time_str(time(NULL)) << " " << thread_pool.PendingNum() << " tablets waiting for compact ..." << std::endl; sleep(5); @@ -3189,6 +3357,65 @@ int TxnOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { } } +int32_t CasOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { + if (argc != 7) { + LOG(ERROR) << "args number error: " << argc << ", need 7"; + PrintCmdHelpInfo(argv[1]); + return -1; + } + + const std::string& tablename = argv[2]; + TablePtr table(client->OpenTable(tablename, err)); + if (!table) { + LOG(ERROR) << "fail to open table"; + return -1; + } + + const std::string& rowkey = argv[3]; + const std::string& old_val = argv[5]; + const std::string& new_val = argv[6]; + std::string columnfamily = ""; + std::string qualifier = ""; + ParseCfQualifier(argv[4], &columnfamily, &qualifier); + + std::unique_ptr txn(table->StartRowTransaction(rowkey)); + if (!txn) { + LOG(ERROR) << "fail to start row txn"; + return -1; + } + + std::unique_ptr reader(table->NewRowReader(rowkey)); + reader->AddColumn(columnfamily, qualifier); + txn->Get(reader.get()); + if 
(reader->Done()) { + std::cout << "cas failed: NotFound" << std::endl; + return -1; + } + std::string cur_val = reader->Value(); + if (cur_val != old_val) { + std::cout << "cas failed: NotEqual" << std::endl; + return -1; + } + + std::unique_ptr mutation(table->NewRowMutation(rowkey)); + mutation->Put(columnfamily, qualifier, new_val); + txn->ApplyMutation(mutation.get()); + if (mutation->GetError().GetType() != tera::ErrorCode::kOK) { + std::cout << "cas failed: " << tera::strerr(mutation->GetError()) << std::endl; + return -1; + } + + auto error_code = txn->Commit(); + if (error_code.GetType() != tera::ErrorCode::kOK) { + std::cout << "cas failed: " << tera::strerr(error_code) << std::endl; + return -1; + } else { + std::cout << "cas success" << std::endl; + } + + return 0; +} + int32_t HelpOp(Client*, int32_t argc, std::string* argv, ErrorCode*) { if (argc == 2) { PrintAllCmd(); @@ -3217,6 +3444,469 @@ bool ParseCommand(int argc, char** arg_list, std::vector* parsed_ar return true; } + +int32_t InitDfsClient() { + if (g_dfs != NULL) { + return 0; + } + if (FLAGS_tera_leveldb_env_dfs_type == "nfs") { + if (access(FLAGS_tera_leveldb_env_nfs_conf_path.c_str(), R_OK) == 0) { + LOG(INFO) << "init nfs system: use configure file" << FLAGS_tera_leveldb_env_nfs_conf_path; + leveldb::Nfs::Init(FLAGS_tera_leveldb_env_nfs_mountpoint, FLAGS_tera_leveldb_env_nfs_conf_path); + g_dfs = leveldb::Nfs::GetInstance(); + } + else { + LOG(FATAL) << "init nfs system: no configure file found"; + return -1; + } + } else if (FLAGS_tera_leveldb_env_dfs_type == "hdfs2") { + LOG(INFO) << "hdfs2 system support currently, please use hadoop-client"; + g_dfs = new leveldb::Hdfs2(FLAGS_tera_leveldb_env_hdfs2_nameservice_list); + } else if (FLAGS_tera_leveldb_env_dfs_type == "hdfs") { + g_dfs = new leveldb::Hdfs(); + } + else { + LOG(INFO) << "init dfs system: " << FLAGS_tera_dfs_so_path << "(" << FLAGS_tera_dfs_conf << ")"; + g_dfs = leveldb::Dfs::NewDfs(FLAGS_tera_dfs_so_path, 
FLAGS_tera_dfs_conf); + } + return 0; +} + +int32_t FileSystemOp(Client* client, int32_t argc, std::string* argv, ErrorCode* err) { + if (argc < 4) { + PrintCmdHelpInfo(argv[1]); + return -1; + } + if (0 != InitDfsClient()) { + LOG(FATAL) << "InitDfsClient failed"; + return -1; + } + std::string operation = argv[2]; + if (GetFSCommandTable().find(operation) == GetFSCommandTable().end()) { + std::cerr << "unsupported dfs command: " << operation << std::endl; + return -1; + } + int ret = (GetFSCommandTable().find(operation)->second)(argc, argv, err); + return ret; +} + +int DfsPrintAttr(const char* pathname, struct stat* st, void* arg = NULL) { + char mode_str[10]; + memset(mode_str, '-', sizeof(mode_str)); + char time_str[64]; + strftime(time_str, sizeof(time_str), "%b %d %H:%M %Y", localtime(&st->st_mtime)); + printf("%c%c%c%c%c%c%c%c%c%c %16lx %16ld %s %s", + (S_IFDIR & st->st_mode) ? 'd' : '-', + (S_IRUSR & st->st_mode) ? 'r' : '-', + (S_IWUSR & st->st_mode) ? 'w' : '-', + (S_IXUSR & st->st_mode) ? 'x' : '-', + (S_IRGRP & st->st_mode) ? 'r' : '-', + (S_IWGRP & st->st_mode) ? 'w' : '-', + (S_IXGRP & st->st_mode) ? 'x' : '-', + (S_IROTH & st->st_mode) ? 'r' : '-', + (S_IWOTH & st->st_mode) ? 'w' : '-', + (S_IXOTH & st->st_mode) ? 
'x' : '-', + st->st_ino, + st->st_size, time_str, pathname); + if (S_IFDIR & st->st_mode) { + printf("/"); + } + printf("\n"); + return 0; +} + +static std::string FormatPath(const std::string pathname) { + std::string result; + bool need_strip = false; + for (std::string::size_type i = 0; i < pathname.length(); ++i) { + if (pathname.at(i) == '/') { + if (need_strip) { + continue; + } + else { + result.push_back(pathname.at(i)); + need_strip = true; + } + } else { + need_strip = false; + result.push_back(pathname.at(i)); + } + } + if (result.at(result.length() - 1) == '/') { + result.pop_back(); + } + return result; +} + +int32_t DfsPrintPath(const char* pathname, struct stat* st, void* arg = NULL) { + printf("%s", FormatPath(pathname).c_str()); + if (S_IFDIR & st->st_mode) { + printf("/"); + } + printf("\n"); + return 0; +} + +int32_t DfsSizeSum(const char* pathname, struct stat* st, void* arg) { + uint64_t* sum = reinterpret_cast(arg); + if (!(S_IFDIR & st->st_mode)) { + *sum += st->st_size; + } + return 0; +} + +int32_t DfsTryLockParentPath(const std::string path) { + std::string parent_path = path; + if (parent_path.at(parent_path.length() - 1) == '/') { + parent_path.pop_back(); + } + std::string::size_type pos = parent_path.rfind("/"); + if (pos == std::string::npos) { + fprintf(stderr, "invalid path: %s\n", path.c_str()); + return -1; + } + if (pos == 0) { + parent_path = "/"; + } + parent_path = parent_path.substr(0, pos); + return g_dfs->LockDirectory(parent_path); +} + +int32_t DfsRmPath(const char* pathname, struct stat* st, void*) { + int ret = 0; + if (S_IFDIR & st->st_mode) { + ret = g_dfs->DeleteDirectory(pathname); + if (0 != ret) { + perror("RmDir fail"); + return ret; + } + } else { + ret = g_dfs->Delete(pathname); + if (0 != ret) { + perror("unlink fail"); + } + } + return ret; +} + +typedef int(*WalkFunc)(const char*, struct stat*, void* arg); +int32_t DfsDirWalk(const char* dir_name, WalkFunc func, bool is_recursive, void* arg = NULL) { + 
struct stat st; + memset(&st, 0, sizeof(struct stat)); + char fullpath[4096] = {0}; + // not a directory, end of recursive call + if (0 == g_dfs->Stat(dir_name, &st) && !(S_IFDIR & st.st_mode)) { + return 0; + } + std::vector sub_paths; + if (0 != g_dfs->ListDirectory(dir_name, &sub_paths)) { + return -1; + } + if (func == DfsRmPath && FLAGS_asowner) { + if (0 != g_dfs->LockDirectory(dir_name)) { + fprintf(stderr, "Lock Directory %s failed", dir_name); + return -1; + } + } + for (std::size_t i = 0; i < sub_paths.size(); ++i) { + snprintf(fullpath, sizeof(fullpath), "%s/%s", dir_name, sub_paths[i].c_str()); + memset(&st, 0, sizeof(struct stat)); + if (g_dfs->Stat(fullpath, &st) < 0) { + perror("Stat failed"); + continue; + } + if (is_recursive && (S_IFDIR & st.st_mode)) { + DfsDirWalk(fullpath, func, true, arg); + } + func(fullpath, &st, arg); + } + return 0; +} + + +int32_t DfsGetOp(int32_t argc, std::string* argv, ErrorCode* err) { + if (argc != 5) { + fprintf(stderr, "Invalid arguments"); + return -1; + } + int ret = 0; + const std::string& src_path = argv[3]; + const std::string& local_path = argv[4]; + std::string local_file_path = local_path; + int local_fd = 0; + if (local_path != "-") { + struct stat st; + if (stat(local_path.c_str(), &st) == 0 && (S_IFDIR & st.st_mode)) { + char* tmp_src_path = strdup(src_path.c_str()); + char* filename = basename(tmp_src_path); + local_file_path.append("/").append(filename); + free(tmp_src_path); + } + local_fd = open(local_file_path.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0644); + if (local_fd < 0) { + fprintf(stderr, "local file open fail, path=%s, errno=%d", local_file_path.c_str(), errno); + return errno; + } + } + leveldb::DfsFile* file = g_dfs->OpenFile(src_path, leveldb::RDONLY); + if (NULL == file) { + fprintf(stderr, "open dfs file fail, path=%s, errno=%d", src_path.c_str(), errno); + return errno; + } + char buf[128 * 1024]; + ssize_t ret_size = 0; + while ((ret_size = file->Read(buf, sizeof(buf))) > 0) { + 
ssize_t writelen = write(local_fd, buf, ret_size); + if (writelen < 0) { + fprintf(stderr, "write local file fail, path=%s, errno=%d", local_file_path.c_str(), errno); + break; + ret = errno; + } + } + if (local_fd > 0) { + close(local_fd); + } + file->CloseFile(); + + return ret; +} + +int32_t DfsPutOp(int32_t argc, std::string* argv, ErrorCode* err) { + fprintf(stderr, "not implemented"); + return -1; +} + + +int32_t DfsLsOp(int32_t argc, std::string* argv, ErrorCode* err) { + const std::string& filename = argv[3]; + struct stat fstat; + int ret = 0; + if (0 == g_dfs->Stat(filename.c_str(), &fstat)) { + if (S_IFDIR & fstat.st_mode) { + if (FLAGS_attribute) { + DfsPrintAttr(filename.c_str(), &fstat); + ret = DfsDirWalk(filename.c_str(), DfsPrintAttr, FLAGS_recursive); + } else { + DfsPrintPath(filename.c_str(), &fstat); + ret = DfsDirWalk(filename.c_str(), DfsPrintPath, FLAGS_recursive); + } + } + else { + if (FLAGS_attribute) { + DfsPrintAttr(filename.c_str(), &fstat); + } + else { + DfsPrintPath(filename.c_str(), &fstat); + } + } + } + return ret; +} +int32_t DfsLsrOp(int32_t argc, std::string* argv, ErrorCode* err) { + + bool old_recursive_flag = FLAGS_recursive; + FLAGS_recursive = true; + DfsLsOp(argc, argv, err); + FLAGS_recursive = old_recursive_flag; + return errno; +} + +int32_t DfsDusOp(int32_t argc, std::string* argv, ErrorCode* err) { + struct stat st; + const std::string& path = argv[3]; + uint64_t size = 0; + if (g_dfs->Stat(path, &st) != 0) { + perror("Stat failed"); + return errno; + } + if (S_IFDIR & st.st_mode) { + DfsDirWalk(path.c_str(), DfsSizeSum, true, &size); + } else { + DfsSizeSum(path.c_str(), &st, &size); + } + fprintf(stdout, "%s:\t%lu\n", path.c_str(), size); + return 0; +} + +int32_t DfsTouchzOp(int32_t argc, std::string* argv, ErrorCode* err) { + const std::string& path = argv[3]; + struct stat st; + std::string::size_type pos = path.rfind("/"); + if (pos == std::string::npos || pos == path.length() - 1) { + fprintf(stderr, "invalid 
filepath: %s", path.c_str()); + return -1; + } + + int ret = g_dfs->Stat(path, &st); + if (0 != ret) { + if (errno != ENOENT) { + perror("Stat failed"); + return errno; + } + std::string parent_path = path.substr(0, pos); + ret = g_dfs->CreateDirectory(parent_path); + if (0 != ret) { + perror("create parent path failed"); + return errno; + } + if (FLAGS_asowner) { + DfsTryLockParentPath(path); + } + leveldb::DfsFile* file = g_dfs->OpenFile(path, leveldb::WRONLY); + if (NULL == file) { + perror("create or open file fail"); + return errno; + } + } else { + if (S_IFDIR & st.st_mode) { + fprintf(stderr, "Touchz fail: %s not Regular file", path.c_str()); + ret = EISDIR; + } else { + fprintf(stdout, "%s already exists", path.c_str()); + ret = EEXIST; + } + } + return ret; +} + +int32_t DfsMkdirOp(int32_t argc, std::string* argv, ErrorCode* err) { + const std::string& path = argv[3]; + if (FLAGS_asowner) { + if (0 != DfsTryLockParentPath(path)) { + fprintf(stderr, "Try lock parent path failed"); + return -1; + } + } + int ret = g_dfs->CreateDirectory(path); + if (0 != ret) { + fprintf(stderr, "Create Path: %s failed, errno=%d\n", path.c_str(), errno); + ret = errno; + } + return ret; +} + +int32_t DfsRmOp(int32_t argc, std::string* argv, ErrorCode* err) { + const std::string& path = argv[3]; + struct stat st; + if (0 != g_dfs->Stat(path.c_str(), &st)) { + perror("Stat fail: "); + return -1; + } + int ret = 0; + if (FLAGS_asowner) { + DfsTryLockParentPath(path); + } + if (st.st_mode & S_IFDIR) { + if (FLAGS_recursive) { + DfsDirWalk(path.c_str(), DfsRmPath, true, NULL); + ret = g_dfs->DeleteDirectory(path); + } else { + ret = g_dfs->DeleteDirectory(path); + } + } else { + ret = g_dfs->Delete(path); + } + if (0 != ret) { + perror("delete failed: "); + } + + return errno; +} + +int32_t DfsTestOp(int32_t argc, std::string* argv, ErrorCode* err) { + fprintf(stderr, "not implemented\n"); + return -1; +} + +int32_t DfsStatOp(int32_t argc, std::string* argv, ErrorCode* err) { + 
struct stat st; + const std::string& filename = argv[3]; + if (0 != g_dfs->Stat(filename, &st)) { + return errno; + } + const char* file_type; + if (S_IFREG & st.st_mode) { + file_type = "Regular"; + } else if (S_IFDIR & st.st_mode) { + file_type = "Directory"; + } else { + file_type = "Symlink"; + } + fprintf(stdout, "File:\t%s\n", filename.c_str()); + fprintf(stdout, "Inode:\t0x%lx\n", st.st_ino); + fprintf(stdout, "Type:\t%s\n", file_type); + fprintf(stdout, "Size:\t%lu\n", st.st_size); + fprintf(stdout, "Mode:\t%o\n", st.st_mode & 0777); + fprintf(stdout, "Link:\t%lu\n", st.st_nlink); + fprintf(stdout, "Atime:\t%lu\t%s", st.st_atime, ctime(&st.st_atime)); + fprintf(stdout, "Mtime:\t%lu\t%s", st.st_mtime, ctime(&st.st_mtime)); + fprintf(stdout, "Ctime:\t%lu\t%s", st.st_ctime, ctime(&st.st_ctime)); + + return 0; +} + +int32_t DfsRenameOp(int32_t argc, std::string* argv, ErrorCode* err) { + if (argc != 5) { + fprintf(stderr, "invalid arguments\n"); + return -1; + } + std::string& src_path = argv[3]; + std::string& dest_path = argv[4]; + if (FLAGS_asowner) { + if (0 != DfsTryLockParentPath(dest_path)) { + fprintf(stderr, "Lock ParentPath failed"); + return -1; + } + } + + int ret = g_dfs->Rename(src_path, dest_path); + if (0 != ret) { + perror("Rename fail"); + ret = errno; + } + return ret; +} + +int32_t DfsUnlockDirOp(int32_t argc, std::string* argv, ErrorCode* err) { + const std::string& path = argv[3]; + return g_dfs->ClearDirOwner(path); +} + +int32_t DfsChecksumOp(int32_t argc, std::string* argv, ErrorCode* err) { + fprintf(stderr, "Not Implemented"); + return -1; +} + +int32_t DfsLChecksumOp(int32_t argc, std::string* argv, ErrorCode* err) { + fprintf(stderr, "Not Implemented"); + return -1; +} + +int32_t DfsForceReleaseOp(int32_t argc, std::string* argv, ErrorCode* err) { + fprintf(stderr, "Not Implemented"); + return -1; +} + +static void InitializeFileSystemCommandTable() { + FSCommandTable& fs_command_table = GetFSCommandTable(); + 
fs_command_table["get"] = DfsGetOp; + fs_command_table["put"] = DfsPutOp; + fs_command_table["lsr"] = DfsLsrOp; + fs_command_table["ls"] = DfsLsOp; + fs_command_table["dus"] = DfsDusOp; + fs_command_table["touchz"] = DfsTouchzOp; + fs_command_table["mkdir"] = DfsMkdirOp; + fs_command_table["rm"] = DfsRmOp; + fs_command_table["test"] = DfsTestOp; + fs_command_table["stat"] = DfsStatOp; + fs_command_table["rename"] = DfsRenameOp; + fs_command_table["unlockdir"] = DfsUnlockDirOp; + fs_command_table["checksum"] = DfsChecksumOp; + fs_command_table["lchecksum"] = DfsLChecksumOp; + fs_command_table["forcerelease"] = DfsForceReleaseOp; + return; +} + static void InitializeCommandTable(){ CommandTable& command_table = GetCommandTable(); command_table["create"] = CreateOp; @@ -3257,6 +3947,7 @@ static void InitializeCommandTable(){ command_table["rename"] = RenameOp; command_table["meta"] = MetaOp; command_table["compact"] = CompactOp; + command_table["compactx"] = CompactOp; command_table["findmaster"] = FindMasterOp; command_table["findts"] = FindTsOp; command_table["findtablet"] = FindTabletOp; @@ -3270,6 +3961,9 @@ static void InitializeCommandTable(){ command_table["rangex"] = RangeOp; command_table["txn"] = TxnOp; command_table["help"] = HelpOp; + command_table["cas"] = CasOp; + command_table["dfs"] = FileSystemOp; + InitializeFileSystemCommandTable(); } int ExecuteCommand(Client* client, int argc, char** arg_list) { diff --git a/src/terautil.cc b/src/terautil.cc new file mode 100644 index 000000000..e4f5727d0 --- /dev/null +++ b/src/terautil.cc @@ -0,0 +1,732 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+// + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "ins_sdk.h" + +#include "common/base/string_ext.h" +#include "common/base/string_number.h" +#include "common/console/progress_bar.h" +#include "common/file/file_path.h" +#include "io/coding.h" +#include "proto/kv_helper.h" +#include "proto/proto_helper.h" +#include "proto/tabletnode.pb.h" +#include "proto/tabletnode_client.h" +#include "sdk/client_impl.h" +#include "sdk/cookie.h" +#include "sdk/sdk_utils.h" +#include "sdk/sdk_zk.h" +#include "sdk/table_impl.h" +#include "tera.h" +#include "types.h" +#include "utils/config_utils.h" +#include "utils/crypt.h" +#include "utils/schema_utils.h" +#include "utils/string_util.h" +#include "utils/tprinter.h" +#include "utils/utils_cmd.h" +#include "version.h" + +DECLARE_string(flagfile); +DECLARE_string(log_dir); +DECLARE_string(tera_master_meta_table_name); + +DEFINE_string(dump_tera_src_conf, "../conf/src_tera.flag", "src cluster for tera"); +DEFINE_string(dump_tera_dest_conf, "../conf/dest_tera.flag", "dest cluster for tera"); +DEFINE_string(dump_tera_src_root_path, "/xxx_", "src tera root path"); +DEFINE_string(dump_tera_dest_root_path, "/xxx_", "dest tera root path"); +DEFINE_string(ins_cluster_addr, "terautil_ins", "terautil dump ins cluster conf"); +DEFINE_string(ins_cluster_root_path, "/terautil/dump/xxxx", "dump meta ins"); +DEFINE_string(dump_tera_src_meta_addr, "", "src addr for meta_table"); +DEFINE_string(dump_tera_dest_meta_addr, "", "dest addr for meta_table"); +DEFINE_int64(dump_manual_split_interval, 1000, "manual split interval in ms"); +DEFINE_bool(dump_enable_manual_split, false, "manual split may take a long time, so disable it"); + +using namespace tera; + +const char* terautil_builtin_cmds[] = { + "dump", + "dump \n\ + prepare_safe \n\ + prepare \n\ + run \n\ + show \n\ + check", + + "help", + "help [cmd] \n\ + show manual for a or all cmd(s)", + + "version", + 
"version \n\ + show version info", +}; + +static void ShowCmdHelpInfo(const char* msg) { + if (msg == NULL) { + return; + } + int count = sizeof(terautil_builtin_cmds)/sizeof(char*); + for (int i = 0; i < count; i+=2) { + if(strncmp(msg, terautil_builtin_cmds[i], 32) == 0) { + std::cout << terautil_builtin_cmds[i + 1] << std::endl; + return; + } + } +} + +static void ShowAllCmd() { + std::cout << "there is cmd list:" << std::endl; + int count = sizeof(terautil_builtin_cmds)/sizeof(char*); + bool newline = false; + for (int i = 0; i < count; i+=2) { + std::cout << std::setiosflags(std::ios::left) << std::setw(20) << terautil_builtin_cmds[i]; + if (newline) { + std::cout << std::endl; + newline = false; + } else { + newline = true; + } + } + std::cout << std::endl << "help [cmd] for details." << std::endl; +} + +int32_t HelpOp(int32_t argc, char** argv) { + if (argc == 2) { + ShowAllCmd(); + } else if (argc == 3) { + ShowCmdHelpInfo(argv[2]); + } else { + ShowCmdHelpInfo("help"); + } + return 0; +} + +int DumpRange(const std::string& ins_cluster_addr, + const std::string& ins_cluster_root_path, + const tera::TableMetaList& table_list, + const tera::TabletMetaList& tablet_list) { + int res = 0; + galaxy::ins::sdk::SDKError ins_err; + galaxy::ins::sdk::InsSDK ins_sdk(ins_cluster_addr); + std::string table_path = ins_cluster_root_path + "/table"; + std::string tablet_path = ins_cluster_root_path + "/tablet"; + //std::string lock_path = ins_cluster_root_path + "/lock"; + + for (int32_t i = 0; i < table_list.meta_size(); i++) { + const tera::TableMeta& meta = table_list.meta(i); + if (meta.table_name() == FLAGS_tera_master_meta_table_name) { + continue; + } + std::string key = table_path + "/" + meta.table_name(); + if(!ins_sdk.Put(key, meta.table_name(), &ins_err)) { + LOG(WARNING) << "ins put: " << key << ", error " << ins_err; + return -1; + } + } + + for (int32_t i = 0; i < tablet_list.meta_size(); i++) { + const tera::TabletMeta& meta = tablet_list.meta(i); + if 
(meta.table_name() == FLAGS_tera_master_meta_table_name) { + continue; + } + std::string table_name = meta.table_name(); + std::string key = tablet_path + "/" + meta.table_name() + "/" + meta.key_range().key_start(); + std::string val = "0"; + val.append(meta.key_range().key_end()); + if(!ins_sdk.Put(key, val, &ins_err)) { + LOG(WARNING) << "ins put: " << key << ", error " << ins_err; + return -1; + } + //std::string lock_key = lock_path + "/" + meta.table_name() + "/" + meta.key_range().key_start(); + } + return res; +} + +int ScanAndDumpMeta(const std::string& src_meta_tablet_addr, + const std::string& dest_meta_tablet_addr, + tera::TableMetaList* table_list, + tera::TabletMetaList* tablet_list) { + uint64_t seq_id = 0; + tera::ScanTabletRequest request; + tera::ScanTabletResponse response; + tera::WriteTabletRequest write_request; + tera::WriteTabletResponse write_response; + uint64_t request_size = 0; + write_request.set_sequence_id(seq_id++); + write_request.set_tablet_name(FLAGS_tera_master_meta_table_name); + write_request.set_is_sync(true); + write_request.set_is_instant(true); + + request.set_sequence_id(seq_id++); + request.set_table_name(FLAGS_tera_master_meta_table_name); + request.set_start(""); + request.set_end(""); + tera::tabletnode::TabletNodeClient src_meta_node_client(src_meta_tablet_addr); + bool success = true; + while ((success = src_meta_node_client.ScanTablet(&request, &response))) { + if (response.status() != tera::kTabletNodeOk) { + LOG(WARNING) << "dump: fail to load meta table: " + << StatusCodeToString(response.status()); + return -1; + } + int32_t record_size = response.results().key_values_size(); + LOG(INFO) << "scan meta table: " << record_size << " records"; + + bool need_dump = false; + std::string last_record_key; + for (int32_t i = 0; i < record_size; i++) { + const tera::KeyValuePair& record = response.results().key_values(i); + last_record_key = record.key(); + char first_key_char = record.key()[0]; + + TableMeta table_meta; 
+ TabletMeta tablet_meta; + if (first_key_char == '~') { + LOG(INFO) << "(user: " << record.key().substr(1) << ")"; + } else if (first_key_char == '@') { + //ParseMetaTableKeyValue(record.key(), record.value(), table_list->add_meta()); + table_meta.Clear(); + ParseMetaTableKeyValue(record.key(), record.value(), &table_meta); + + std::string key, val; + //table_meta.set_status(kTableDisable); + table_meta.mutable_schema()->set_merge_size(0); // never merge during dump + table_meta.mutable_schema()->set_split_size(10000000); // never split during dump + MakeMetaTableKeyValue(table_meta, &key, &val); + + RowMutationSequence* mu_seq = write_request.add_row_list(); + mu_seq->set_row_key(record.key()); + Mutation* mutation = mu_seq->add_mutation_sequence(); + mutation->set_type(tera::kPut); + mutation->set_value(val); + request_size += mu_seq->ByteSize(); + if (request_size >= kMaxRpcSize) { // write req too large, dump into new tera cluster + need_dump = true; + } + + TableMeta* table_meta2 = table_list->add_meta(); + table_meta2->CopyFrom(table_meta); + } else if (first_key_char > '@') { + //ParseMetaTableKeyValue(record.key(), record.value(), tablet_list->add_meta()); + tablet_meta.Clear(); + ParseMetaTableKeyValue(record.key(), record.value(), &tablet_meta); + + std::string key, val; + tablet_meta.clear_parent_tablets(); + //tablet_meta.set_status(kTabletDisable); + MakeMetaTableKeyValue(tablet_meta, &key, &val); + + RowMutationSequence* mu_seq = write_request.add_row_list(); + mu_seq->set_row_key(record.key()); + Mutation* mutation = mu_seq->add_mutation_sequence(); + mutation->set_type(tera::kPut); + mutation->set_value(val); + request_size += mu_seq->ByteSize(); + if (request_size >= kMaxRpcSize) { // write req too large, dump into new tera cluster + need_dump = true; + } + + TabletMeta* tablet_meta2 = tablet_list->add_meta(); + tablet_meta2->CopyFrom(tablet_meta); + } else { + LOG(WARNING) << "dump: invalid meta record: " << record.key(); + } + } + + if 
((need_dump || record_size <= 0) && + write_request.row_list_size() > 0) { + tabletnode::TabletNodeClient dest_meta_node_client(dest_meta_tablet_addr); + if (!dest_meta_node_client.WriteTablet(&write_request, &write_response)) { + LOG(WARNING) << "dump: fail to dump meta tablet: " + << StatusCodeToString(kRPCError); + return -1; + } + tera::StatusCode status = write_response.status(); + if (status == tera::kTabletNodeOk && write_response.row_status_list_size() > 0) { + status = write_response.row_status_list(0); + } + if (status != kTabletNodeOk) { + LOG(WARNING) << "dump: fail to dump meta tablet: " + << StatusCodeToString(status); + return -1; + } + write_request.clear_row_list(); + write_response.Clear(); + request_size = 0; + } + if (record_size <= 0) { + response.Clear(); + LOG(INFO) << "dump: scan meta table success"; + break; + } + + std::string next_record_key = tera::NextKey(last_record_key); + request.set_start(next_record_key); + request.set_end(""); + request.set_sequence_id(seq_id++); + response.Clear(); + } + return success? 
0: -1; +} + +int DumpPrepareOp() { + int res = 0; + std::string tera_src_conf = FLAGS_dump_tera_src_conf; + std::string tera_src_root = FLAGS_dump_tera_src_root_path; + std::string tera_dest_conf = FLAGS_dump_tera_dest_conf; + std::string tera_dest_root = FLAGS_dump_tera_dest_root_path; + + // read src meta ts addr and dest meta ts addr + std::string src_meta_addr, dest_meta_addr; + src_meta_addr = FLAGS_dump_tera_src_meta_addr; + dest_meta_addr = FLAGS_dump_tera_dest_meta_addr; + + // scan and dump meta + tera::TableMetaList table_list; + tera::TabletMetaList tablet_list; + if ((res = ScanAndDumpMeta(src_meta_addr, dest_meta_addr, &table_list, &tablet_list)) >= 0) { + // create key range in nexus + std::string ins_cluster_addr = FLAGS_ins_cluster_addr; + std::string ins_cluster_root_path = FLAGS_ins_cluster_root_path; + res = DumpRange(ins_cluster_addr, ins_cluster_root_path, table_list, tablet_list); + } + return res; +} + +int GetAndLockDumpRange(const std::string& ins_cluster_root_path, + std::string* table_name, + std::string* start_key, + std::string* end_key, + galaxy::ins::sdk::InsSDK* ins_sdk) { + int res = -1; + galaxy::ins::sdk::SDKError ins_err; + //std::string table_path = ins_cluster_root_path + "/table"; + std::string tablet_path = ins_cluster_root_path + "/tablet"; + std::string lock_path = ins_cluster_root_path + "/lock"; + + std::string start = tablet_path + "/"; + std::string end = tablet_path + "/"; + if (table_name->size()) { + start.append(*table_name); + start.append("/"); + start.append(*start_key); + if (*start_key == "") { + start.append(1, '\0'); + } + } + end.append(1, '\255'); + galaxy::ins::sdk::ScanResult* result = ins_sdk->Scan(start, end); + while (!result->Done()) { + if (result->Error() != galaxy::ins::sdk::kOK) { + LOG(INFO) << "scan fail: start " << start << ", end " << end << ", err " << result->Error(); + res = -1; + break; + } + std::string key = result->Key(); + std::string val = result->Value(); + std::string has_done = 
val.substr(0, 1); + if (has_done == "1") { // someone has copy it + result->Next(); + continue; + } + + //std::string key = tablet_path + "/" + meta.table_name() + "/" + meta.key_range().key_start(); + std::string str = key.substr(tablet_path.length() + 1); + std::size_t pos = str.find('/'); + *table_name = str.substr(0, pos); + *start_key = str.substr(pos + 1); + *end_key = val.substr(1); + + std::string lock_key = lock_path + "/" + *table_name + "/" + *start_key + "/"; + if (!ins_sdk->TryLock(lock_key, &ins_err)) { + LOG(INFO) << "ins: TryLock fail: " << lock_key << ", err " << ins_err; + result->Next(); + continue; + } + + std::string val1; + if (ins_sdk->Get(key, &val1, &ins_err)) { + has_done = val1.substr(0, 1); + } else { + LOG(INFO) << "ins: get fail: " << key << ", err " << ins_err; + } + if (has_done == "1") { // someone has copy it + if (!ins_sdk->UnLock(lock_key, &ins_err)) { + LOG(INFO) << "ins: unlock fail: " << lock_key << ", err " << ins_err; + } + result->Next(); + continue; + } + + res = 0; + break; // begin to scan + } + delete result; + return res; +} + +int ReleaseAndUnlockDumpRange(const std::string& ins_cluster_root_path, + const std::string& table_name, + const std::string& start_key, + const std::string& end_key, + galaxy::ins::sdk::InsSDK* ins_sdk) { + int res = 0; + galaxy::ins::sdk::SDKError ins_err; + //std::string table_path = ins_cluster_root_path + "/table"; + std::string tablet_path = ins_cluster_root_path + "/tablet"; + std::string lock_path = ins_cluster_root_path + "/lock"; + + std::string key = tablet_path + "/" + table_name + "/" + start_key; + std::string val = "1"; + val.append(end_key); + + if(!ins_sdk->Put(key, val, &ins_err)) { + LOG(WARNING) << "ins put: " << key << ", error " << ins_err; + } + + std::string lock_key = lock_path + "/" + table_name + "/" + start_key + "/"; + if (!ins_sdk->UnLock(lock_key, &ins_err)) { + LOG(WARNING) << "ins unlock fail: " << lock_key << ", error " << ins_err; + } + return res; +} + +struct 
ScanDumpContext { + Counter counter; + volatile bool fail; + std::string reason; +}; + +void ScanAndDumpCallBack(RowMutation* mu) { + ScanDumpContext* ctx = (ScanDumpContext*)mu->GetContext(); + if (mu->GetError().GetType() != tera::ErrorCode::kOK) { + if (ctx->fail == false) { + ctx->fail = true; + ctx->reason = mu->GetError().ToString(); + } + } + delete mu; + + ctx->counter.Dec(); + return; +} + +int ScanAndDumpData(Table* src, Table* dest, + const std::string& table_name, + const std::string& start_key, + const std::string& end_key) { + int res = 0; + ErrorCode err; + + ScanDescriptor desc(start_key); + desc.SetEnd(end_key); + desc.SetMaxVersions(std::numeric_limits::max()); + ResultStream* result_stream; + if ((result_stream = src->Scan(desc, &err)) == NULL) { + LOG(INFO) << "scan dump fail(new scan): " << table_name << ", start " << start_key + << ", end " << end_key; + return -1; + } + ScanDumpContext* ctx = new ScanDumpContext; + ctx->counter.Set(1); + ctx->fail = false; + while (!result_stream->Done(&err)) { + RowMutation* mu = dest->NewRowMutation(result_stream->RowName()); + mu->Put(result_stream->Family(), result_stream->Qualifier(), + result_stream->Value(), result_stream->Timestamp()); + ctx->counter.Inc(); + mu->SetContext(ctx); + mu->SetCallBack(ScanAndDumpCallBack); + dest->ApplyMutation(mu); + + result_stream->Next(); + } + delete result_stream; + ctx->counter.Dec(); + + while (ctx->counter.Get() > 0) { + sleep(3); + } + if (ctx->fail == true) { + LOG(INFO) << "scan dump fail: " << table_name << ", start " << start_key + << ", end " << end_key << ", reason " << ctx->reason; + res = -1; + } + delete ctx; + + if (err.GetType() != tera::ErrorCode::kOK) { + LOG(INFO) << "scan dump fail: " << table_name << ", start " << start_key + << ", end " << end_key << ", reason " << err.GetReason(); + res = -1; + } + return res; +} + +int DumpRunOp() { + int res = 0; + std::string ins_cluster_addr = FLAGS_ins_cluster_addr; + std::string ins_cluster_root_path = 
FLAGS_ins_cluster_root_path; + std::string tera_src_conf = FLAGS_dump_tera_src_conf; + std::string tera_dest_conf = FLAGS_dump_tera_dest_conf; + + // get and lock range + ErrorCode err; + Client* src_client = Client::NewClient(tera_src_conf, &err); + if (src_client == NULL) { + LOG(INFO) << "open src client fail: " << tera_src_conf << ", err " << err.ToString(); + return -1; + } + Client* dest_client = Client::NewClient(tera_dest_conf, &err); + if (dest_client == NULL) { + delete src_client; + src_client = NULL; + LOG(INFO) << "open dest client fail: " << tera_dest_conf << ", err " << err.ToString(); + return -1; + } + Table* src_table = NULL; + Table* dest_table = NULL; + + galaxy::ins::sdk::InsSDK ins_sdk(ins_cluster_addr); + std::string table_name, start_key, end_key, last_table_name; + while (GetAndLockDumpRange(ins_cluster_root_path, &table_name, &start_key, &end_key, &ins_sdk) == 0) { + if (last_table_name != table_name) { // table change + delete src_table; + delete dest_table; + src_table = NULL; + dest_table = NULL; + src_table = src_client->OpenTable(table_name, &err); + if (src_table == NULL) { + LOG(INFO) << "open src table fail: " << table_name << ", err " << err.ToString(); + continue; + } + dest_table = dest_client->OpenTable(table_name, &err); + if (dest_table == NULL) { + delete src_table; + src_table = NULL; + LOG(INFO) << "open dest table fail: " << table_name << ", err " << err.ToString(); + continue; + } + } + last_table_name = table_name; + if ((res = ScanAndDumpData(src_table, dest_table, table_name, start_key, end_key)) < 0) { + LOG(INFO) << "scan dump data fail: " << table_name << ", start " << start_key + << ", end " << end_key; + } else { + ReleaseAndUnlockDumpRange(ins_cluster_root_path, table_name, start_key, end_key, &ins_sdk); + } + start_key = end_key; + } + delete src_client; + delete dest_client; + return res; +} + +void GetTableKeyRange(const std::string& table_name, + const TabletMetaList& tablet_list, + std::vector* delimiters) 
{ + for (int32_t i = 0; i < tablet_list.meta_size(); i++) { + const tera::TabletMeta& meta = tablet_list.meta(i); + if (table_name == meta.table_name() && + meta.key_range().key_start().size() > 0) { + delimiters->push_back(meta.key_range().key_start()); + } + } +} + +int ManualCreateTable(tera::ClientImpl* client, + const std::string& table_name, + const TableSchema& schema, + const std::vector& delimiters) { + ErrorCode err; + TableDescriptor table_desc; + table_desc.SetTableName(table_name); + TableSchemaToDesc(schema, &table_desc); + table_desc.SetSplitSize(10000000); + table_desc.SetMergeSize(0); + if (!client->CreateTable(table_desc, delimiters, &err)) { + LOG(INFO) << "manual create error: " << table_name << ", err: " << err.ToString(); + return -1; + } + return 0; +} + +int ManualSplitTable(tera::ClientImpl* client, + const std::string& table_name, + const std::vector& delimiters) { + ErrorCode err; + std::vector arg_list; + arg_list.push_back("split"); + arg_list.push_back(table_name); + for (uint32_t i = 0; i < delimiters.size(); i++) { + arg_list.push_back(delimiters[i]); + if (!client->CmdCtrl("table", arg_list, NULL, NULL, &err)) { + LOG(INFO) << "manual split table fail(ignore old master): " << table_name + << ", delimiters_size: " << delimiters.size() + << ", err: " << err.ToString(); + } + usleep(FLAGS_dump_manual_split_interval); + arg_list.pop_back(); + } + return 0; +} + +bool SchemaCompare(const TableSchema& src, const TableSchema& dest) { + return ((src.raw_key() == dest.raw_key()) && + (src.kv_only() == dest.kv_only()) && + (src.name() == dest.name()) && + (!IsSchemaCfDiff(src, dest)) && + (!IsSchemaLgDiff(src, dest))); +} + +int GetOrSetTabletLocationSafe(Client* src_client, + Client* dest_client, + TableMetaList* table_list, + TabletMetaList* tablet_list) { + // get src and dest tablet location + ErrorCode err; + TableMetaList src_table_list; + TabletMetaList src_tablet_list; + tera::ClientImpl* src_client_impl = static_cast(src_client); + 
if (!src_client_impl->ShowTablesInfo(&src_table_list, &src_tablet_list, false, &err)) { + LOG(INFO) << "tera_master show src cluster fail: " << err.ToString(); + return -1; + } + + TableMetaList dest_table_list; + TabletMetaList dest_tablet_list; + tera::ClientImpl* dest_client_impl = static_cast(dest_client); + if (!dest_client_impl->ShowTablesInfo(&dest_table_list, &dest_tablet_list, false, &err)) { + LOG(INFO) << "tera_master show dest cluster fail: " << err.ToString(); + return -1; + } + + // get table meta set + std::map src_table_set; + for (int32_t i = 0; i < src_table_list.meta_size(); i++) { + const tera::TableMeta& meta = src_table_list.meta(i); + TableSchema& schema = src_table_set[meta.table_name()]; + schema.CopyFrom(meta.schema()); + } + std::map dest_table_set; + for (int32_t i = 0; i < dest_table_list.meta_size(); i++) { + const tera::TableMeta& meta = dest_table_list.meta(i); + TableSchema& schema = dest_table_set[meta.table_name()]; + schema.CopyFrom(meta.schema()); + } + + // create or split table, and filter schema not match meta + for (int32_t i = 0; i < src_table_list.meta_size(); i++) { + const tera::TableMeta& meta = src_table_list.meta(i); + if (meta.table_name() == FLAGS_tera_master_meta_table_name) { + continue; + } + std::vector delimiters; + GetTableKeyRange(meta.table_name(), src_tablet_list, &delimiters); + if (dest_table_set.find(meta.table_name()) == dest_table_set.end()) { + if (ManualCreateTable(dest_client_impl, meta.table_name(), meta.schema(), delimiters) < 0) { + return -1; + } + } else if (SchemaCompare(dest_table_set[meta.table_name()], meta.schema())) { + if (FLAGS_dump_enable_manual_split && + ManualSplitTable(dest_client_impl, meta.table_name(), delimiters) < 0) { + return -1; + } + } else { + LOG(INFO) << "table schema not match: " << meta.table_name() << ", src schema: " << meta.schema().ShortDebugString() + << ", dest schema: " << dest_table_set[meta.table_name()].ShortDebugString(); + 
src_table_set.erase(meta.table_name()); + continue; + } + tera::TableMeta* meta2 = table_list->add_meta(); + meta2->CopyFrom(meta); + } + + // filter key range + for (int32_t i = 0; i < src_tablet_list.meta_size(); i++) { + const tera::TabletMeta& meta = src_tablet_list.meta(i); + if (src_table_set.find(meta.table_name()) == src_table_set.end()) { + continue; + } + tera::TabletMeta* meta2 = tablet_list->add_meta(); + meta2->CopyFrom(meta); + } + return 0; +} + +int DumpPrepareSafeOp() { + int res = 0; + std::string ins_cluster_addr = FLAGS_ins_cluster_addr; + std::string ins_cluster_root_path = FLAGS_ins_cluster_root_path; + std::string tera_src_conf = FLAGS_dump_tera_src_conf; + std::string tera_dest_conf = FLAGS_dump_tera_dest_conf; + + ErrorCode err; + std::unique_ptr src_client(Client::NewClient(tera_src_conf, &err)); + if (src_client == nullptr) { + LOG(INFO) << "open src client fail: " << tera_src_conf << ", err " << err.ToString(); + return -1; + } + std::unique_ptr dest_client(Client::NewClient(tera_dest_conf, &err)); + if (dest_client == nullptr) { + src_client = nullptr; + LOG(INFO) << "open dest client fail: " << tera_dest_conf << ", err " << err.ToString(); + return -1; + } + + // dump src cluster range into ins + TableMetaList table_list; + TabletMetaList tablet_list; + if (GetOrSetTabletLocationSafe(src_client.get(), dest_client.get(), &table_list, &tablet_list) < 0) { + return -1; + } + res = DumpRange(ins_cluster_addr, ins_cluster_root_path, table_list, tablet_list); + return res; +} + +int main(int argc, char* argv[]) { + ::google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_flagfile == "") { + FLAGS_flagfile = "../conf/tera.flag"; + if (access(FLAGS_flagfile.c_str(), R_OK) != 0) { + FLAGS_flagfile = "./tera.flag"; + } + utils::LoadFlagFile(FLAGS_flagfile); + } + + if (argc > 1 && std::string(argv[1]) == "version") { + PrintSystemVersion(); + } else if (argc > 2 && std::string(argv[1]) == "dump" && std::string(argv[2]) == "prepare") { + 
return DumpPrepareOp(); + } else if (argc > 2 && std::string(argv[1]) == "dump" && std::string(argv[2]) == "prepare_safe") { + return DumpPrepareSafeOp(); + } else if (argc > 2 && std::string(argv[1]) == "dump" && std::string(argv[2]) == "run") { + return DumpRunOp(); + //} else if (argc > 2 && std::string(argv[1]) == "dump" && std::string(argv[2]) == "show") { + // return DumpShowOp(); + //} else if (argc > 2 && std::string(argv[1]) == "dump" && std::string(argv[2]) == "check") { + // return DumpCheckOp(): + } else { + HelpOp(argc, argv); + return -1; + } + return 0; +} + diff --git a/src/timeoracle/bench/timeoracle_bench.cc b/src/timeoracle/bench/timeoracle_bench.cc new file mode 100644 index 000000000..4140005bc --- /dev/null +++ b/src/timeoracle/bench/timeoracle_bench.cc @@ -0,0 +1,48 @@ +#include +#include +#include +#include "common/mutex.h" +#include "common/timer.h" +#include "common/thread_pool.h" +#include "common/this_thread.h" +#include "sdk/sdk_zk.h" + +#include "sdk/timeoracle_client_impl.h" +#include + +DEFINE_int64(client_thread_num, 10, ""); + +using namespace tera; +using namespace tera::timeoracle; + +std::shared_ptr g_thread_pool; + + +void worker() { + tera::sdk::ClusterFinder* cluster_finder = sdk::NewTimeoracleClusterFinder(); + tera::timeoracle::TimeoracleClientImpl client(g_thread_pool.get(), cluster_finder); + + while (true) { + int64_t st = client.GetTimestamp(1); + if (st <= 0) { + std::cout << "rpc failed" << std::endl; + ThisThread::Sleep(200); + } + } +} + +int main(int argc, char** argv) { + ::google::ParseCommandLineFlags(&argc, &argv, true); + g_thread_pool.reset(new common::ThreadPool(FLAGS_client_thread_num + 1)); + + std::vector thread_list; + for (int64_t i = 0; i < FLAGS_client_thread_num; ++i) { + thread_list.push_back(std::thread(&worker)); + } + + for (auto& th : thread_list) { + th.join(); + } + + return 0; +} diff --git a/src/timeoracle/remote_timeoracle.h b/src/timeoracle/remote_timeoracle.h new file mode 100644 index 
000000000..588bd0547
--- /dev/null
+++ b/src/timeoracle/remote_timeoracle.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_TIMEORACLE_REMOTE_TIMEORACLE_H
+#define TERA_TIMEORACLE_REMOTE_TIMEORACLE_H
+
+// NOTE(review): the system header named on this line was lost in this copy of
+// the patch. <stdint.h> covers the int64_t used below -- confirm against the
+// original file.
+#include <stdint.h>
+#include "common/thread_pool.h"
+#include "proto/timeoracle_rpc.pb.h"
+#include "timeoracle/timeoracle.h"
+
+namespace tera {
+namespace timeoracle {
+
+// Scope guard for an RPC completion closure: the destructor runs the closure
+// unless release() has been called first, so every return path of a handler
+// completes the call.
+class ClosureGuard {
+public:
+    ClosureGuard(::google::protobuf::Closure* done) : done_(done) {
+    }
+
+    ~ClosureGuard() {
+        if (done_) {
+            done_->Run();
+        }
+    }
+
+    // Hands the closure back to the caller and disarms the guard.
+    ::google::protobuf::Closure* release() {
+        auto done = done_;
+        done_ = nullptr;
+        return done;
+    }
+
+private:
+    // A copy (constructed or assigned) would make two guards run the same
+    // closure, so both forms are disabled.
+    ClosureGuard(const ClosureGuard&) = delete;
+    ClosureGuard& operator=(const ClosureGuard&) = delete;
+private:
+    ::google::protobuf::Closure* done_;
+};
+
+// RPC service that hands out timestamp ranges from an embedded Timeoracle.
+class RemoteTimeoracle : public TimeoracleServer {
+public:
+    RemoteTimeoracle(int64_t start_timestamp) : timeoracle_(start_timestamp) {
+    }
+
+    // Allocates request->count() consecutive timestamps.
+    // On success the response carries the first timestamp, the count and
+    // kTimeoracleOk; otherwise only kTimeoracleBusy is set.
+    // `done` is always run when this function returns.
+    virtual void GetTimestamp(::google::protobuf::RpcController* controller,
+                              const ::tera::GetTimestampRequest* request,
+                              ::tera::GetTimestampResponse* response,
+                              ::google::protobuf::Closure* done) {
+        ClosureGuard closure_guard(done);
+
+        int64_t count = request->count();
+        if (count < 0) {
+            // Timeoracle::GetTimestamp() adds `count` to its counter before
+            // checking anything, so a negative value from the wire would move
+            // the counter backwards and allow timestamps to be issued twice.
+            response->set_status(kTimeoracleBusy);
+            return;
+        }
+
+        // 0 means the allocation failed (see Timeoracle::GetTimestamp).
+        int64_t start_timestamp = timeoracle_.GetTimestamp(count);
+
+        if (start_timestamp) {
+            response->set_start_timestamp(start_timestamp);
+            response->set_count(count);
+            response->set_status(kTimeoracleOk);
+        } else {
+            response->set_status(kTimeoracleBusy);
+        }
+    }
+
+    // Exposes the embedded allocator; the pointer stays owned by this object.
+    Timeoracle* GetTimeoracle() {
+        return &timeoracle_;
+    }
+
+private:
+    Timeoracle timeoracle_;
+};
+
+} // namespace timeoracle
+} // namespace tera
+
+#endif // TERA_TIMEORACLE_REMOTE_TIMEORACLE_H
diff --git a/src/timeoracle/test/timeoracle_test.cc b/src/timeoracle/test/timeoracle_test.cc
new file mode 100644
index 000000000..e7b6f4472
--- /dev/null
+++ b/src/timeoracle/test/timeoracle_test.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2017, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// NOTE(review): the angle-bracket header names were lost in this copy of the
+// patch. The ones below are what this file visibly uses (int64_t,
+// DECLARE_string, InitGoogleLogging, gtest macros) -- confirm against the
+// original file.
+#include <stdint.h>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "timeoracle/timeoracle.h"
+#include "utils/utils_cmd.h"
+
+DECLARE_string(log_dir);
+DECLARE_string(tera_coord_type);
+DECLARE_string(tera_leveldb_env_type);
+DECLARE_string(tera_fake_zk_path_prefix);
+
+namespace tera {
+namespace timeoracle {
+
+class TimeoracleTest: public ::testing::Test {
+public:
+};
+
+// Consecutive calls must return strictly increasing millisecond values.
+TEST_F(TimeoracleTest, UniqueTimestampMsTest) {
+    int64_t ts0 = Timeoracle::UniqueTimestampMs();
+    for (int i = 0; i < 10000; ++i) {
+        int64_t ts = Timeoracle::UniqueTimestampMs();
+        EXPECT_LT(ts0, ts);
+        ts0 = ts;
+    }
+}
+
+// Walks the allocator through its states. GetTimestamp() advances the counter
+// even when it fails, which is why the expected values below skip ahead.
+TEST_F(TimeoracleTest, TimeoracleFunc) {
+    Timeoracle to(1024LL);
+
+    // limit is still 0: allocation fails, counter moves 1024 -> 1034
+    auto tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 0);
+
+    tmp = to.UpdateLimitTimestamp(10LL);
+    EXPECT_EQ(tmp, 10);
+
+    // limit (10) is below the counter: fails again, counter moves 1034 -> 1044
+    tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 0);
+
+    tmp = to.UpdateLimitTimestamp(2000LL);
+    EXPECT_EQ(tmp, 2000);
+
+    // now below the limit: returns the counter value before the add
+    tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 1044);
+
+    tmp = to.GetTimestamp(10LL);
+    EXPECT_EQ(tmp, 1054);
+
+    EXPECT_EQ(to.GetStartTimestamp(), 1064);
+
+    // the counter is far behind the wall clock, so it jumps to the current timestamp
+    tmp = to.UpdateStartTimestamp();
+
+    EXPECT_GT(tmp, 1064);
+
+    // the counter is now past the limit of 2000, so allocation fails
+    auto new_ts = to.GetTimestamp(10LL);
+    EXPECT_EQ(new_ts, 0);
+}
+
+} // namespace timeoracle
+} // namespace tera
+
+int main(int argc, char** argv) {
+    ::google::ParseCommandLineFlags(&argc, &argv, true);
+    ::google::InitGoogleLogging(argv[0]);
+    FLAGS_tera_coord_type = "fake_zk";
+    FLAGS_tera_leveldb_env_type = "local";
+
+    // log prefix was misspelled "timeorcale_test"
+    tera::utils::SetupLog("timeoracle_test");
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
+
diff --git a/src/timeoracle/timeoracle.cc b/src/timeoracle/timeoracle.cc
new file mode 100644
index 000000000..9d755445b
--- /dev/null
+++ b/src/timeoracle/timeoracle.cc
@@ -0,0 +1,13 @@
+// Copyright (c) 2015, Baidu.com, Inc.
All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "timeoracle/timeoracle.h"
+
+namespace tera {
+namespace timeoracle {
+
+// Last value handed out by Timeoracle::UniqueTimestampMs(); has static storage
+// duration, so it starts at zero.
+std::atomic<int64_t> Timeoracle::s_last_timestamp_ms;
+
+} // namespace timeoracle
+} // namespace tera
diff --git a/src/timeoracle/timeoracle.h b/src/timeoracle/timeoracle.h
new file mode 100644
index 000000000..eb690de56
--- /dev/null
+++ b/src/timeoracle/timeoracle.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef TERA_TIMEORACLE_TIMEORACLE_H_
+#define TERA_TIMEORACLE_TIMEORACLE_H_
+
+// NOTE(review): the angle-bracket header names were lost in this copy of the
+// patch. The ones below are what this header visibly uses (int64_t,
+// clock_gettime, std::atomic, LOG) -- confirm against the original file.
+#include <stdint.h>
+#include <time.h>
+#include <atomic>
+#include <glog/logging.h>
+
+namespace tera {
+namespace timeoracle {
+
+constexpr int64_t kTimestampPerMilliSecond = 10000ULL;
+constexpr int64_t kTimestampPerSecond = kTimestampPerMilliSecond * 1000ULL;
+// Epoch offset in milliseconds: 2017-01-01 00:00 at UTC+8
+// (1483228800000 would be midnight UTC).
+constexpr int64_t kBaseTimestampMilliSecond = 1483200000000ULL; // 20170101 00:00
+
+// Wall-clock milliseconds elapsed since kBaseTimestampMilliSecond.
+inline int64_t clock_realtime_ms() {
+    struct timespec tp;
+    ::clock_gettime(CLOCK_REALTIME, &tp);
+    return tp.tv_sec * 1000ULL + tp.tv_nsec / 1000000ULL - kBaseTimestampMilliSecond;
+}
+
+// Allocates timestamps from the half-open window [start, limit).
+// `start` is bumped by every allocation attempt; `limit` is raised from
+// outside through UpdateLimitTimestamp().
+class Timeoracle {
+public:
+    Timeoracle(int64_t start_timestamp) : start_timestamp_(start_timestamp),
+        limit_timestamp_(0) {
+    }
+
+    // if num == 0, see next timstamp
+    // if return 0, allocate timestamp failed
+    // The counter is advanced by num even when the call fails.
+    int64_t GetTimestamp(int64_t num) {
+        if (num < 0) {
+            // a negative count would move the counter backwards and let
+            // already-issued timestamps be handed out again
+            return 0;
+        }
+
+        int64_t start_timestamp = start_timestamp_.fetch_add(num);
+
+        if ((start_timestamp + num) >= limit_timestamp_) {
+            return 0;
+        }
+
+        return start_timestamp;
+    }
+
+    // Raises the limit. Returns the new limit, or 0 (and logs an error) when
+    // limit_timestamp is not greater than the current limit.
+    int64_t UpdateLimitTimestamp(int64_t limit_timestamp) {
+        if (limit_timestamp > limit_timestamp_) {
+            limit_timestamp_ = limit_timestamp;
+        } else {
+            LOG(ERROR) << "update limit timestamp failed, limit_timestamp_=" << limit_timestamp_
+                << ",update to " << limit_timestamp;
+            return 0;
+        }
+        return limit_timestamp;
+    }
+
+    // Re-aligns the counter and returns its resulting value:
+    //  - behind the wall clock     -> jump forward to the current timestamp
+    //  - otherwise, past the limit -> pull back to the limit
+    //  - otherwise                 -> leave unchanged
+    // A failed compare-exchange means another thread moved the counter, so the
+    // checks are repeated against the fresh value.
+    int64_t UpdateStartTimestamp() {
+        const int64_t cur_timestamp = CurrentTimestamp();
+
+        int64_t start_timestamp = 0;
+        while (1) {
+            start_timestamp = start_timestamp_;
+            if (start_timestamp < cur_timestamp) {
+                if (start_timestamp_.compare_exchange_strong(start_timestamp, cur_timestamp)) {
+                    return cur_timestamp;
+                }
+                continue;
+            }
+
+            int64_t limit_timestamp = limit_timestamp_;
+            if (start_timestamp > limit_timestamp) {
+                if (start_timestamp_.compare_exchange_strong(start_timestamp, limit_timestamp)) {
+                    LOG(WARNING) << "adjust start timestamp to limit timestamp " << limit_timestamp;
+                    return limit_timestamp;
+                }
+                continue;
+            }
+
+            break;
+        }
+
+        LOG(INFO) << "ignore to adjust start timestamp, current timestamp is " << cur_timestamp;
+        return start_timestamp;
+    }
+
+    int64_t GetStartTimestamp() const {
+        return start_timestamp_;
+    }
+
+    int64_t GetLimitTimestamp() const {
+        return limit_timestamp_;
+    }
+
+private:
+    std::atomic<int64_t> start_timestamp_;
+    std::atomic<int64_t> limit_timestamp_;
+
+public:
+    // Millisecond clock reading that never repeats within this process: when
+    // the clock has not advanced past the last value handed out, the last
+    // value plus one is returned instead.
+    static int64_t UniqueTimestampMs() {
+        while (true) {
+            int64_t ts = clock_realtime_ms();
+            int64_t last_timestamp_ms = s_last_timestamp_ms;
+
+            if (ts <= last_timestamp_ms) {
+                return s_last_timestamp_ms.fetch_add(1) + 1;
+            }
+
+            if (s_last_timestamp_ms.compare_exchange_strong(last_timestamp_ms, ts)) {
+                return ts;
+            }
+        }
+    }
+
+    // UniqueTimestampMs() scaled to timestamp units
+    // (kTimestampPerMilliSecond units per millisecond).
+    static int64_t CurrentTimestamp() {
+        return UniqueTimestampMs() * kTimestampPerMilliSecond;
+    }
+
+private:
+    static std::atomic<int64_t> s_last_timestamp_ms;
+};
+
+} // namespace timeoracle
+} // namespace tera
+
+#endif // TERA_TIMEORACLE_TIMEORACLE_H_
diff --git a/src/timeoracle/timeoracle_entry.cc b/src/timeoracle/timeoracle_entry.cc
new file mode 100644
index 000000000..8bff587ad
--- /dev/null
+++ b/src/timeoracle/timeoracle_entry.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "timeoracle/timeoracle_entry.h"
+
+// NOTE(review): the angle-bracket header names were lost in this copy of the
+// patch. The ones below are what this file visibly uses (assert,
+// DECLARE_*, LOG) -- confirm against the original file.
+#include <assert.h>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "common/net/ip_address.h"
+#include "common/this_thread.h"
+#include "utils/utils_cmd.h"
+
+#include "timeoracle/remote_timeoracle.h"
+#include "timeoracle/timeoracle_zk_adapter.h"
+
+DECLARE_string(tera_local_addr);
+DECLARE_string(tera_timeoracle_port);
+DECLARE_int32(tera_timeoracle_refresh_lease_second);
+DECLARE_int32(tera_timeoracle_max_lease_second);
+DECLARE_bool(tera_timeoracle_mock_enabled);
+DECLARE_int32(tera_timeoracle_work_thread_num);
+DECLARE_int32(tera_timeoracle_io_service_pool_size);
+DECLARE_string(tera_coord_type);
+
+namespace tera {
+namespace timeoracle {
+
+// Builds the RPC server from the timeoracle flags and derives the address
+// (host:port) this instance announces.
+TimeoracleEntry::TimeoracleEntry() :
+    remote_timeoracle_(nullptr),
+    startup_timestamp_(0),
+    need_quit_(false) {
+    sofa::pbrpc::RpcServerOptions rpc_options;
+    rpc_options.work_thread_num = FLAGS_tera_timeoracle_work_thread_num;
+    rpc_options.io_service_pool_size = FLAGS_tera_timeoracle_io_service_pool_size;
+    rpc_options.no_delay = false; //use Nagle's Algorithm
+    rpc_options.write_buffer_base_block_factor = 0; //64Bytes per malloc
+    rpc_options.read_buffer_base_block_factor = 7; //8kBytes per malloc
+    sofa_pbrpc_server_.reset(new sofa::pbrpc::RpcServer(rpc_options));
+
+    if (FLAGS_tera_local_addr.empty()) {
+        local_addr_ = utils::GetLocalHostName()+ ":" + FLAGS_tera_timeoracle_port;
+    } else {
+        local_addr_ = FLAGS_tera_local_addr + ":" + FLAGS_tera_timeoracle_port;
+    }
+}
+
+// Connects to the coordination service, picks the startup timestamp (the
+// larger of the stored value and the current clock) and starts serving.
+bool TimeoracleEntry::Start() {
+    if (!InitZKAdaptor()) {
+        return false;
+    }
+
+    int64_t current_timestamp = Timeoracle::CurrentTimestamp();
+    if (startup_timestamp_ < current_timestamp) {
+        startup_timestamp_ = current_timestamp;
+    } else {
+        LOG(WARNING) << "startup timestamp big than current timestamp,"
+            << "startup timestamp is " << startup_timestamp_
+            << "current timestamp is " << current_timestamp;
+    }
+
+    LOG(INFO) << "set startup timestamp to " << startup_timestamp_;
+
+    if (!StartServer()) {
+        return false;
+    }
+
+    return true;
+}
+
+// Stops the lease thread before the members it uses are destroyed.
+TimeoracleEntry::~TimeoracleEntry() {
+    need_quit_ = true;
+    if (lease_thread_.joinable()) {
+        lease_thread_.join();
+    }
+}
+
+// Chooses the coordination adapter (mock / zk / ins) from the flags and lets
+// it load the persisted timestamp into startup_timestamp_.
+bool TimeoracleEntry::InitZKAdaptor() {
+    if (FLAGS_tera_timeoracle_mock_enabled) {
+        LOG(INFO) << "mock mode" ;
+        zk_adapter_.reset(new TimeoracleMockAdapter(local_addr_));
+    } else if (FLAGS_tera_coord_type == "zk") {
+        LOG(INFO) << "zk mode" ;
+        zk_adapter_.reset(new TimeoracleZkAdapter(local_addr_));
+    } else if (FLAGS_tera_coord_type == "ins") {
+        LOG(INFO) << "ins mode" ;
+        zk_adapter_.reset(new TimeoracleInsAdapter(local_addr_));
+    } else {
+        LOG(FATAL) << "invalid configure for coord service, please check "
+            << "--tera_timeoracle_mock_enabled=true or "
+            << "--tera_coord_type=zk|ins";
+        assert(0);
+    }
+
+    return zk_adapter_->Init(&startup_timestamp_);
+}
+
+// Creates the service object, starts the lease thread, waits for the first
+// lease and then opens the RPC port.
+bool TimeoracleEntry::StartServer() {
+    IpAddress timeoracle_addr("0.0.0.0", FLAGS_tera_timeoracle_port);
+    LOG(INFO) << "Start timeoracle RPC server at: " << timeoracle_addr.ToString();
+
+    remote_timeoracle_ = new RemoteTimeoracle(startup_timestamp_);
+    std::thread lease_thread(&TimeoracleEntry::LeaseThread, this);
+    lease_thread_ = std::move(lease_thread);
+
+    auto timeoracle = remote_timeoracle_->GetTimeoracle();
+
+    // Wait until the lease thread has raised the limit above the startup
+    // timestamp; until then Timeoracle::GetTimestamp() fails for every request.
+    // The previous condition (startup < limit) was inverted: it skipped the
+    // wait while the limit was still 0, and once the lease thread had raised
+    // the limit it could only be left through need_quit_.
+    while (startup_timestamp_ >= timeoracle->GetLimitTimestamp()) {
+        if (need_quit_) {
+            return false;
+        }
+        ThisThread::Sleep(100);
+    }
+
+    sofa_pbrpc_server_->RegisterService(remote_timeoracle_);
+    if (!sofa_pbrpc_server_->Start(timeoracle_addr.ToString())) {
+        LOG(ERROR) << "start timeoracle RPC server error";
+        return false;
+    }
+
+    LOG(INFO) << "finish start timeoracle RPC server";
+    return true;
+}
+
+// One iteration of the main loop: re-align the counter with the clock / limit
+// and sleep. Returns false once shutdown has been requested.
+bool TimeoracleEntry::Run() {
+    if (need_quit_) {
+        return false;
+    }
+
+    int64_t start_timestamp = remote_timeoracle_->GetTimeoracle()->UpdateStartTimestamp();
+
+    VLOG(100) << "adjust start timestamp finished, start timestmap is " << start_timestamp;
+
+    ThisThread::Sleep(1000);
+    return true;
+}
+
+void TimeoracleEntry::ShutdownServer() {
+    need_quit_ = true;
+    sofa_pbrpc_server_->Stop();
+}
+
+// Background loop that keeps the limit ahead of the counter. When the counter
+// comes within the refresh window of the limit, a new limit is first recorded
+// through the adapter and only then applied locally. A failed adapter update
+// requests shutdown.
+void TimeoracleEntry::LeaseThread() {
+    auto timeoracle = remote_timeoracle_->GetTimeoracle();
+
+    while (!need_quit_) {
+        int64_t start_timestamp = timeoracle->GetStartTimestamp();
+        int64_t limit_timestamp = timeoracle->GetLimitTimestamp();
+        int64_t refresh_lease_timestamp =
+            FLAGS_tera_timeoracle_refresh_lease_second * kTimestampPerSecond;
+
+        if (start_timestamp + refresh_lease_timestamp >= limit_timestamp) {
+            // need to require lease
+            if (limit_timestamp < start_timestamp) {
+                limit_timestamp = start_timestamp;
+            }
+
+            int64_t next_limit_timestamp =
+                limit_timestamp + FLAGS_tera_timeoracle_max_lease_second * kTimestampPerSecond;
+
+            if (!zk_adapter_->UpdateTimestamp(next_limit_timestamp)) {
+                need_quit_ = true;
+                return;
+            }
+
+            timeoracle->UpdateLimitTimestamp(next_limit_timestamp);
+        }
+
+        ThisThread::Sleep(1000);
+    }
+}
+
+} // namespace timeoracle
+} // namespace tera
diff --git a/src/timeoracle/timeoracle_entry.h b/src/timeoracle/timeoracle_entry.h
new file mode 100644
index 000000000..356ae452a
--- /dev/null
+++ b/src/timeoracle/timeoracle_entry.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+ +#ifndef TERA_TIMEORACLE_TIMEORACLE_ENTRY_H_ +#define TERA_TIMEORACLE_TIMEORACLE_ENTRY_H_ + +#include + +#include "tera_entry.h" +#include +#include +#include + +namespace tera { +namespace timeoracle { + +class RemoteTimeoracle; +class TimeoracleZkAdapterBase; + +class TimeoracleEntry : public TeraEntry { +public: + TimeoracleEntry(); + ~TimeoracleEntry(); + + + virtual bool Start() override; + virtual bool Run() override; + virtual void ShutdownServer() override; + +private: + bool InitZKAdaptor(); + bool StartServer(); + void LeaseThread(); + +private: + std::string local_addr_; + RemoteTimeoracle* remote_timeoracle_; + std::unique_ptr sofa_pbrpc_server_; + int64_t startup_timestamp_; + std::unique_ptr zk_adapter_; + std::thread lease_thread_; + std::atomic need_quit_; +}; + +} // namespace timeoracle +} // namespace tera + +#endif // TERA_TIMEORACLE_TIMEORACLE_ENTRY_H_ diff --git a/src/timeoracle/timeoracle_zk_adapter.cc b/src/timeoracle/timeoracle_zk_adapter.cc new file mode 100644 index 000000000..58dd4a554 --- /dev/null +++ b/src/timeoracle/timeoracle_zk_adapter.cc @@ -0,0 +1,477 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include +#include +#include "timeoracle/timeoracle_zk_adapter.h" +#include "common/file/file_path.h" +#include "common/this_thread.h" +#include "types.h" +#include "zk/zk_util.h" +#include "ins_sdk.h" + +DECLARE_string(tera_zk_addr_list); +DECLARE_string(tera_zk_root_path); +DECLARE_string(tera_fake_zk_path_prefix); +DECLARE_int32(tera_zk_timeout); +DECLARE_int64(tera_zk_retry_period); +DECLARE_int32(tera_zk_retry_max_times); + +DECLARE_string(tera_ins_addr_list); +DECLARE_string(tera_ins_root_path); +DECLARE_int64(tera_master_ins_session_timeout); +DECLARE_string(tera_timeoracle_mock_root_path); + +namespace tera { +namespace timeoracle { + +void TimeoracleZkAdapterBase::OnNodeValueChanged(const std::string& path, + const std::string& value) { + LOG(INFO) << "zk OnNodeValueChanged, path=" << path; +} + +void TimeoracleZkAdapterBase::OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) { + LOG(INFO) << "zk OnChildrenChanged, path=" << path; +} + +void TimeoracleZkAdapterBase::OnNodeCreated(const std::string& path) { + LOG(INFO) << "zk OnNodeCreated, path=" << path; +} + +void TimeoracleZkAdapterBase::OnNodeDeleted(const std::string& path) { + LOG(INFO) << "zk OnNodeDeleted, path=" << path; + Finalize(); + _Exit(EXIT_FAILURE); +} + +void TimeoracleZkAdapterBase::OnWatchFailed(const std::string& path, int watch_type, + int err) { + LOG(INFO) << "zk OnWatchFailed, path=" << path; + Finalize(); + _Exit(EXIT_FAILURE); +} + +void TimeoracleZkAdapterBase::OnSessionTimeout() { + LOG(ERROR) << "zk session timeout!"; + _Exit(EXIT_FAILURE); +} + +TimeoracleZkAdapter::~TimeoracleZkAdapter() { +} + +bool TimeoracleZkAdapter::Init(int64_t* last_timestamp) { + if (!InitZk()) { + return false; + } + + if (!LockTimeoracleLock()) { + return false; + } + + if (ReadTimestamp(last_timestamp)) { + LOG(INFO) << "read timestamp sucess,get start_timestamp=" << *last_timestamp; + return CreateTimeoracleNode(); + } + + return false; 
+} + +bool TimeoracleZkAdapter::CreateTimeoracleNode() { + LOG(INFO) << "try create timeoracle nod,path=" << kTimeoracleNodePath; + int32_t retry_count = 0; + int zk_errno = zk::ZE_OK; + while (!CreateEphemeralNode(kTimeoracleNodePath, server_addr_, &zk_errno)) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to create timeoracle node"; + return false; + } + LOG(ERROR) << "retry create timeoracle node in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + LOG(INFO) << "create timeoracle node success"; + return true; +} + +bool TimeoracleZkAdapter::InitZk() { + LOG(INFO) << "try to init zk,zk_addr_list=" << FLAGS_tera_zk_addr_list + << ",zk_root_path=" << FLAGS_tera_zk_root_path; + int zk_errno = zk::ZE_OK; + int32_t retry_count = 0; + while (!ZooKeeperAdapter::Init(FLAGS_tera_zk_addr_list, + FLAGS_tera_zk_root_path, + FLAGS_tera_zk_timeout, + server_addr_, &zk_errno)) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to init zk: " << zk::ZkErrnoToString(zk_errno); + return false; + } + LOG(ERROR) << "init zk fail: " << zk::ZkErrnoToString(zk_errno) + << ". 
retry in " << FLAGS_tera_zk_retry_period << " ms, retry: " + << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + LOG(INFO) << "init zk success"; + return true; +} + +bool TimeoracleZkAdapter::LockTimeoracleLock() { + LOG(INFO) << "try to lock timeoracle lock,path=" << kTimeoracleLockPath; + int32_t retry_count = 0; + int zk_errno = zk::ZE_OK; + while (!SyncLock(kTimeoracleLockPath, &zk_errno, -1)) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to acquire timeoracle lock"; + return false; + } + LOG(ERROR) << "retry lock timeoracle lock in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + LOG(INFO) << "acquire timeoracle lock success"; + return true; +} + +bool TimeoracleZkAdapter::ReadTimestamp(int64_t* timestamp) { + LOG(INFO) << "try to read timestamp, path=" << kTimeoracleTimestampPath; + + std::string timestamp_str; + int32_t retry_count = 0; + int zk_errno = zk::ZE_OK; + while (!ReadNode(kTimeoracleTimestampPath, ×tamp_str, &zk_errno) + && zk_errno != zk::ZE_NOT_EXIST) { + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(ERROR) << "fail to read timestamp node"; + return false; + } + LOG(ERROR) << "retry read timestamp node in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + } + if (zk_errno == zk::ZE_NOT_EXIST) { + *timestamp = 0; + return true; + } + + char * pEnd = nullptr; + *timestamp = ::strtoull(timestamp_str.c_str(), &pEnd, 10); + if (*pEnd != '\0') { + // TODO (chenzongjia) + LOG(WARNING) << "read invalid timestamp value=" << timestamp_str; + return false; + } + + LOG(INFO) << "read timestamp value=" << timestamp_str; + + return true; +} + +bool TimeoracleZkAdapter::UpdateTimestamp(int64_t timestamp) { + char timestamp_str[64]; + snprintf(timestamp_str, sizeof(timestamp_str), "%lu", 
timestamp); + LOG(INFO) << "try to update timestamp to " << timestamp; + int zk_errno = zk::ZE_OK; + while (!WriteNode(kTimeoracleTimestampPath, timestamp_str, &zk_errno) + && zk_errno != zk::ZE_NOT_EXIST) { + return false; + /* + if (retry_count++ >= FLAGS_tera_zk_retry_max_times) { + LOG(INFO) << "fail to update timestamp"; + return false; + } + LOG(ERROR) << "retry update timestamp in " + << FLAGS_tera_zk_retry_period << " ms, retry=" << retry_count; + ThisThread::Sleep(FLAGS_tera_zk_retry_period); + zk_errno = zk::ZE_OK; + */ + } + if (zk_errno == zk::ZE_OK) { + LOG(INFO) << "update zk path=" << kTimeoracleTimestampPath << " to " + << timestamp_str << " success."; + return true; + } + + LOG(INFO) << "timestamp node not exist, try create timestamp node"; + zk_errno = zk::ZE_OK; + while (!CreatePersistentNode(kTimeoracleTimestampPath, timestamp_str, &zk_errno)) { + return false; + } + LOG(INFO) << "create timestamp node success"; + return true; + +} + +TimeoracleInsAdapter::~TimeoracleInsAdapter() { + if (ins_sdk_) { + std::string lock_path = FLAGS_tera_ins_root_path + kTimeoracleLockPath; + galaxy::ins::sdk::SDKError err; + ins_sdk_->UnLock(lock_path, &err); + } +} + +bool TimeoracleInsAdapter::Init(int64_t* last_timestamp) { + if (!InitInsAndLock()) { + return false; + } + + if (ReadTimestamp(last_timestamp)) { + LOG(INFO) << "read timestamp sucess,get start_timestamp=" << *last_timestamp; + return CreateTimeoracleNode(); + } + + return false; +} + +bool TimeoracleInsAdapter::CreateTimeoracleNode() { + std::string put_path = FLAGS_tera_ins_root_path + kTimeoracleNodePath; + + LOG(INFO) << "try write timeoracle nod,path=" << put_path; + + galaxy::ins::sdk::SDKError err; + + if (!ins_sdk_->Put(put_path, server_addr_, &err)) { + LOG(ERROR) << "update timestamp node, path=" << put_path << ",failed " + << ins_sdk_->ErrorToString(err); + return false; + } + + LOG(INFO) << "update timeoracle node success"; + return true; +} + +static void InsOnSessionTimeout(void * 
context) { + TimeoracleInsAdapter* ins_adp = static_cast(context); + ins_adp->OnSessionTimeout(); +} + +static void InsOnLockChange(const galaxy::ins::sdk::WatchParam& param, + galaxy::ins::sdk::SDKError error) { + TimeoracleInsAdapter* ins_adp = static_cast(param.context); + ins_adp->OnLockChange(param.value, param.deleted); +} + +bool TimeoracleInsAdapter::InitInsAndLock() { + MutexLock lock(&mutex_); + LOG(INFO) << "try to init ins,ins_addr_list=" << FLAGS_tera_ins_addr_list + << ",ins_root_path=" << FLAGS_tera_ins_root_path; + ins_sdk_ = new galaxy::ins::sdk::InsSDK(FLAGS_tera_ins_addr_list); + ins_sdk_->SetTimeoutTime(FLAGS_tera_master_ins_session_timeout); + + std::string lock_path = FLAGS_tera_ins_root_path + kTimeoracleLockPath; + + galaxy::ins::sdk::SDKError err; + + ins_sdk_->RegisterSessionTimeout(InsOnSessionTimeout, this); + + if (!ins_sdk_->Lock(lock_path, &err)) { + LOG(ERROR) << "try to lock timeoracle lock,path=" << kTimeoracleLockPath << " failed," + << ins_sdk_->ErrorToString(err); + return false; + } + + LOG(INFO) << "try to lock timeoracle lock,path=" << kTimeoracleLockPath << " success"; + + if (!ins_sdk_->Watch(lock_path, InsOnLockChange, this, &err)) { + LOG(ERROR) << "try to watch timeoracle lock,path=" << kTimeoracleLockPath << " failed," + << ins_sdk_->ErrorToString(err); + return false; + } + + LOG(INFO) << "try to watch timeoracle lock,path=" << kTimeoracleLockPath << " success"; + + return true; +} + +bool TimeoracleInsAdapter::ReadTimestamp(int64_t* timestamp) { + std::string read_path = FLAGS_tera_ins_root_path + kTimeoracleTimestampPath; + + LOG(INFO) << "try to read timestamp, path=" << read_path; + + std::string timestamp_str; + galaxy::ins::sdk::SDKError err; + + if (!ins_sdk_->Get(read_path, ×tamp_str, &err)) { + if (err == galaxy::ins::sdk::SDKError::kNoSuchKey) { + *timestamp = 0; + return true; + } + + LOG(ERROR) << "try to read timestamp, path=" << read_path << ",failed " + << ins_sdk_->ErrorToString(err); + return false; + 
} + + char * pEnd = nullptr; + *timestamp = ::strtoull(timestamp_str.c_str(), &pEnd, 10); + if (*pEnd != '\0') { + // TODO (chenzongjia) + LOG(WARNING) << "read invalid timestamp value=" << timestamp_str; + return false; + } + + LOG(INFO) << "read timestamp value=" << timestamp_str; + return true; +} + +bool TimeoracleInsAdapter::UpdateTimestamp(int64_t timestamp) { + char buf[64]; + snprintf(buf, sizeof(buf), "%lu", timestamp); + LOG(INFO) << "try to update timestamp to " << timestamp; + + std::string timestamp_str(buf); + galaxy::ins::sdk::SDKError err; + std::string put_path = FLAGS_tera_ins_root_path + kTimeoracleTimestampPath; + + if (!ins_sdk_->Put(put_path, timestamp_str, &err)) { + LOG(ERROR) << "update timestamp, path=" << put_path << ",failed " + << ins_sdk_->ErrorToString(err); + return false; + } + + return true; +} + +void TimeoracleInsAdapter::OnLockChange(std::string session_id, bool deleted) { + if (deleted || session_id != ins_sdk_->GetSessionID()) { + LOG(ERROR) << "timeoracle lock losted"; + exit(1); + } +} + +class FdGuard { +public: + explicit FdGuard(int fd) : fd_(fd) {} + + FdGuard() : fd_(-1) {} + + ~FdGuard() { + if (fd_ >= 0) { + ::close(fd_); + } + } + + operator int() const { + return fd_; + } + + void reset(int fd) { + if (fd_ >= 0) { + ::close(fd_); + } + fd_ = fd; + } + + int relese() { + const int ret = fd_; + fd_ = -1; + return ret; + } + +private: + FdGuard(const FdGuard&) = delete; + void operator=(const FdGuard&) = delete; + int fd_; +}; + +// not thread safe +bool TimeoracleMockAdapter::Init(int64_t* last_timestamp) { + std::string lock_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleLockPath; + static FdGuard lock_fd(::open(lock_path.c_str(), O_CREAT | O_RDWR, 0666)); + + if (lock_fd < 0) { + return false; + } + + LOG(INFO) << "TimeoracleMockAdapter try to get lock for file=" << lock_path; + + if (::flock(lock_fd, LOCK_EX) < 0) { + LOG(WARNING) << "lock file failed for path=" << lock_path; + return false; + } + + 
LOG(INFO) << "TimeoracleMockAdapter got the lock for file=" << lock_path; + + std::string get_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleTimestampPath; + + FdGuard tmp_fd(::open(get_path.c_str(), O_CREAT | O_RDWR, 0666)); + + if (tmp_fd < 0) { + LOG(WARNING) << "open file failed for file=" << get_path; + return false; + } + + char buf[64]; + + ssize_t len = pread(tmp_fd, buf, sizeof(buf), 0); + if (len < 0) { + LOG(WARNING) << "read file failed for file=" << get_path; + return false; + } + + if (len == 0) { + *last_timestamp = 0; + return true; + } + + buf[len] = '\0'; + char * pEnd = nullptr; + *last_timestamp = ::strtoull(buf, &pEnd, 10); + if (*pEnd != '\0') { + // TODO (chenzongjia) + LOG(WARNING) << "read invalid timestamp value=" << buf; + return false; + } + + LOG(INFO) << "read timestamp value=" << *last_timestamp; + + std::string put_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleNodePath; + + tmp_fd.reset(::open(put_path.c_str(), O_CREAT | O_RDWR, 0666)); + + if (tmp_fd < 0) { + LOG(WARNING) << "open file failed for file=" << put_path; + return false; + } + + if (::pwrite(tmp_fd, server_addr_.data(), server_addr_.size(), 0) + != (ssize_t)server_addr_.size()) { + LOG(WARNING) << "write file failed for file=" << put_path; + return false; + } + + return true; +} + +// not thread safe +bool TimeoracleMockAdapter::UpdateTimestamp(int64_t new_timestamp) { + std::string put_path = FLAGS_tera_timeoracle_mock_root_path + kTimeoracleTimestampPath; + FdGuard tmp_fd(::open(put_path.c_str(), O_CREAT | O_RDWR, 0666)); + + if (tmp_fd < 0) { + LOG(WARNING) << "open file failed for file=" << put_path; + return false; + } + + char buf[64]; + snprintf(buf, sizeof(buf), "%lu", new_timestamp); + std::string timestamp_str(buf); + LOG(INFO) << "try to update timestamp to " << put_path; + + if (::pwrite(tmp_fd, timestamp_str.data(), timestamp_str.size(), 0) + != (ssize_t)timestamp_str.size()) { + LOG(WARNING) << "write file failed for file=" << put_path; 
+ return false; + } + + return true; +} + +} // namespace timeoracle +} // namespace tera diff --git a/src/timeoracle/timeoracle_zk_adapter.h b/src/timeoracle/timeoracle_zk_adapter.h new file mode 100644 index 000000000..b0f6a970c --- /dev/null +++ b/src/timeoracle/timeoracle_zk_adapter.h @@ -0,0 +1,124 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef TERA_TIMEORACLE_TIMEORACLE_ZK_ADAPTER_H +#define TERA_TIMEORACLE_TIMEORACLE_ZK_ADAPTER_H + +#include +#include +#include "zk/zk_adapter.h" + +// forward declare +namespace galaxy{ +namespace ins{ +namespace sdk { + class InsSDK; +} +} +} + +namespace tera { +namespace timeoracle { + +class TimeoracleZkAdapterBase : public zk::ZooKeeperAdapter { +public: + virtual ~TimeoracleZkAdapterBase() {}; + + // not thread safe + virtual bool Init(int64_t* last_timestamp) = 0; + + // not thread safe + virtual bool UpdateTimestamp(int64_t new_timestamp) = 0; + + virtual void OnChildrenChanged(const std::string& path, + const std::vector& name_list, + const std::vector& data_list) override; + + virtual void OnNodeValueChanged(const std::string& path, + const std::string& value) override; + + virtual void OnNodeCreated(const std::string& path) override; + + virtual void OnNodeDeleted(const std::string& path) override; + + virtual void OnWatchFailed(const std::string& path, int watch_type, + int err) override; + + virtual void OnSessionTimeout() final; +}; + +class TimeoracleZkAdapter : public TimeoracleZkAdapterBase { +public: + TimeoracleZkAdapter(const std::string& server_addr) : server_addr_(server_addr) {} + + virtual ~TimeoracleZkAdapter(); + + virtual bool Init(int64_t* last_timestamp) override; + + virtual bool UpdateTimestamp(int64_t new_timestamp) override; + +private: + bool InitZk(); + + bool LockTimeoracleLock(); + + bool ReadTimestamp(int64_t* timestamp); + + bool CreateTimeoracleNode(); + 
+private: + std::string server_addr_; +}; + +class TimeoracleInsAdapter : public TimeoracleZkAdapterBase { +public: + TimeoracleInsAdapter(const std::string & server_addr) : server_addr_(server_addr) {} + + virtual ~TimeoracleInsAdapter(); + + virtual bool Init(int64_t* last_timestamp) override; + + virtual bool UpdateTimestamp(int64_t new_timestamp) override; + + void OnLockChange(std::string session_id, bool deleted); + +private: + bool InitInsAndLock(); + + bool ReadTimestamp(int64_t* timestamp); + + bool CreateTimeoracleNode(); + +private: + mutable Mutex mutex_; + std::string server_addr_; + galaxy::ins::sdk::InsSDK* ins_sdk_{NULL}; +}; + + +/* + * This is not zookeeper! + * Just used on onebox for tasting tera briefly. + * This is implemented through local file system. + * Not support watching. + */ +class TimeoracleMockAdapter: public TimeoracleZkAdapterBase { +public: + TimeoracleMockAdapter(const std::string& server_addr) : server_addr_(server_addr) { + } + + // not thread safe + virtual bool Init(int64_t* last_timestamp) override; + + // not thread safe + virtual bool UpdateTimestamp(int64_t new_timestamp) override; + +private: + std::string server_addr_; +}; + +} // namespace timeoracle +} // namespace tera + +#endif // TERA_TIMEORACLE_TIMEORACLE_ZK_ADAPTER_H diff --git a/src/timeoracle_main.cc b/src/timeoracle_main.cc new file mode 100644 index 000000000..3c7f713be --- /dev/null +++ b/src/timeoracle_main.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include +#include + +#include +#include + +#include "common/base/scoped_ptr.h" +#include "tera_entry.h" +#include "utils/utils_cmd.h" +#include "version.h" +#include "timeoracle/timeoracle_entry.h" + +DECLARE_string(tera_log_prefix); + +volatile sig_atomic_t g_quit = 0; + +static void SignalIntHandler(int sig) { + g_quit = 1; +} + +int main(int argc, char* argv[]) { + ::google::SetUsageMessage("./timeoracle --flagfile=xxx.flag"); + ::google::ParseCommandLineFlags(&argc, &argv, true); + ::google::InitGoogleLogging(argv[0]); + if (!FLAGS_tera_log_prefix.empty()) { + tera::utils::SetupLog(FLAGS_tera_log_prefix); + } else { + tera::utils::SetupLog("timeoracle"); + } + + if (argc > 1) { + std::string ext_cmd = argv[1]; + if (ext_cmd == "version") { + PrintSystemVersion(); + return 0; + } + } + + signal(SIGINT, SignalIntHandler); + signal(SIGTERM, SignalIntHandler); + + scoped_ptr entry(new tera::timeoracle::TimeoracleEntry()); + + if (!entry->Start()) { + return -1; + } + + while (!g_quit) { + if (!entry->Run()) { + LOG(ERROR) << "Server run error ,and then exit now "; + break; + } + } + if (g_quit) { + LOG(INFO) << "received interrupt signal from user, will stop"; + } + + if (!entry->Shutdown()) { + return -1; + } + + return 0; +} + +/* vim: set ts=4 sw=4 sts=4 tw=100 */ diff --git a/src/types.h b/src/types.h index bfad100da..1f50f0f8f 100644 --- a/src/types.h +++ b/src/types.h @@ -27,6 +27,10 @@ const std::string kTsListPath = "/ts"; const std::string kKickPath = "/kick"; const std::string kRootTabletNodePath = "/root_table"; const std::string kSafeModeNodePath = "/safemode"; +const std::string kTimeoracleNodePath = "/timeoracle"; +const std::string kTimeoracleLockPath = "/timeoracle-lock"; +const std::string kTimeoracleTimestampPath = "/timeoracle-timestamp"; +const std::string kClientsNodePath = "/clients"; const std::string kSms = "[SMS] "; const std::string kMail = "[MAIL] "; const int64_t kLatestTs = INT64_MAX; @@ -36,6 +40,16 @@ const uint64_t kRowkeySize = 
(64 << 10); // 64KB const uint64_t kQualifierSize = (64 << 10); // 64KB const uint64_t kValueSize = (32 << 20); // 32MB +// observer +const std::string kRowlockNodeIdListPath = "/id_lock"; +const std::string kRowlockNodeHostListPath = "/host_lock"; +const std::string kRowlockNodeNumPath = "/node_num"; +const std::string kRowlockProxyPath = "/proxy"; +const uint64_t kObserverWaitTime = 1000000; + +// global transaction +const char* const kNotifyColumnFamily = "_N_"; + } // namespace tera #endif // TERA_TYPES_H_ diff --git a/src/utils/atomic.h b/src/utils/atomic.h deleted file mode 100644 index 69434be09..000000000 --- a/src/utils/atomic.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef TERA_UTILS_ATOMIC_H_ -#define TERA_UTILS_ATOMIC_H_ - -namespace tera { - -static inline int atomic_add(volatile int *mem, int add) -{ - asm volatile( - "lock xadd %0, (%1);" - : "=a"(add) - : "r"(mem), "a"(add) - : "memory" - ); - return add; -} - -static inline int64_t atomic_add64(volatile int64_t* mem, int64_t add) -{ - asm volatile( - "lock xaddq %0, (%1)" - : "=a" (add) - : "r" (mem), "a" (add) - : "memory" - ); - return add; -} - -static inline void atomic_inc(volatile int *mem) -{ - asm volatile( - "lock incl %0;" - : "=m"(*mem) - : "m"(*mem) - ); -} -static inline void atomic_inc64(volatile int64_t *mem) -{ - asm volatile( - "lock incq %0;" - : "=m"(*mem) - : "m"(*mem) - ); -} - -static inline void atomic_dec(volatile int *mem) -{ - asm volatile( - "lock decl %0;" - : "=m"(*mem) - : "m"(*mem) - ); -} - -static inline void atomic_dec64(volatile int64_t *mem) -{ - asm volatile( - "lock decq %0;" - : "=m"(*mem) - : "m"(*mem) - ); -} - -static inline int atomic_swap(volatile void *lockword, int value) -{ - asm volatile( - "lock xchg %0, (%1);" - : "=a"(value) - : "r"(lockword), "a"(value) - : "memory" - ); - return value; 
-} - -static inline int64_t atomic_swap64(volatile void *lockword, int64_t value) -{ - asm volatile( - "lock xchg %0, (%1);" - : "=a"(value) - : "r"(lockword), "a"(value) - : "memory" - ); - return value; -} - -static inline int atomic_comp_swap(volatile void *mem, int xchg, int cmp) -{ - asm volatile( - "lock cmpxchg %1, (%2)" - :"=a"(cmp) - :"d"(xchg), "r"(mem), "a"(cmp) - ); - return cmp; -} - -static inline int64_t atomic_comp_swap64(volatile void *mem, int64_t xchg, int64_t cmp) -{ - asm volatile( - "lock cmpxchg %1, (%2)" - :"=a"(cmp) - :"d"(xchg), "r"(mem), "a"(cmp) - ); - return cmp; -} - -} -#endif // TERA_UTILS_ATOMIC_H_ diff --git a/src/utils/counter.h b/src/utils/counter.h deleted file mode 100644 index 3f4da00a9..000000000 --- a/src/utils/counter.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef TERA_UTILS_COUNTER_H_ -#define TERA_UTILS_COUNTER_H_ - -#include - -#include "atomic.h" -#include "timer.h" - -namespace tera { - -class Counter { -public: - Counter() : val_(0) {} - int64_t Add(int64_t v) { - return atomic_add64(&val_, v) + v; - } - int64_t Sub(int64_t v) { - return atomic_add64(&val_, -v) - v; - } - int64_t Inc() { - return atomic_add64(&val_, 1) + 1; - } - int64_t Dec() { - return atomic_add64(&val_, -1) - 1; - } - int64_t Get() { - return val_; - } - int64_t Set(int64_t v) { - return atomic_swap64(&val_, v); - } - int64_t Clear() { - return atomic_swap64(&val_, 0); - } - -private: - volatile int64_t val_; -}; - -class AutoCounter { -public: - AutoCounter(Counter* counter, const char* msg1, const char* msg2 = NULL) - : counter_(counter), - msg1_(msg1), - msg2_(msg2) { - start_ = get_micros(); - counter_->Inc(); - } - ~AutoCounter() { - int64_t end = get_micros(); - if (end - start_ > 5000000) { - int64_t t = (end - start_) / 1000000; - if (!msg2_) { - fprintf(stderr, "%s [AutoCounter] 
%s hang for %ld s\n", - get_curtime_str().data(), msg1_, t); - } else { - fprintf(stderr, "%s [AutoCounter] %s %s hang for %ld s\n", - get_curtime_str().data(), msg1_, msg2_, t); - } - } - counter_->Dec(); - } - -private: - Counter* counter_; - int64_t start_; - const char* msg1_; - const char* msg2_; -}; -} - -#endif // TERA_UTILS_COUNTER_H_ diff --git a/src/utils/timer.h b/src/utils/timer.h deleted file mode 100644 index 62428c754..000000000 --- a/src/utils/timer.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2015, Baidu.com, Inc. All Rights Reserved -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#ifndef TERA_UTILS_TIMER_H_ -#define TERA_UTILS_TIMER_H_ - -#include -#include - -namespace tera { - -static inline std::string get_curtime_str() { - struct tm tt; - char buf[20]; - time_t t = time(NULL); - strftime(buf, 20, "%Y%m%d-%H:%M:%S", localtime_r(&t, &tt)); - return std::string(buf, 17); -} - -static inline std::string get_curtime_str_plain() { - struct tm tt; - char buf[20]; - time_t t = time(NULL); - strftime(buf, 20, "%Y%m%d%H%M%S", localtime_r(&t, &tt)); - return std::string(buf); -} - -static inline int64_t get_micros() { - struct timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - return static_cast(ts.tv_sec) * 1000000 + static_cast(ts.tv_nsec) / 1000; -} - -static inline int64_t get_millis() { - return get_micros() / 1000; -} - -static inline int64_t get_unique_micros(int64_t ref) { - int64_t now; - do { - now = get_micros(); - } while (now == ref); - return now; -} - -static inline int64_t GetTimeStampInUs() { - return get_micros(); -} - -static inline int64_t GetTimeStampInMs() { - return get_millis(); -} - -} // namespace tera - -#endif // TERA_UTILS_TIMER_H_ diff --git a/src/zk/zk_adapter.cc b/src/zk/zk_adapter.cc old mode 100644 new mode 100755 index 1b83d6f87..9fd1aa1ac --- a/src/zk/zk_adapter.cc +++ b/src/zk/zk_adapter.cc @@ -51,7 +51,8 @@ bool ZooKeeperAdapter::Init(const 
std::string& server_list, const std::string& root_path, uint32_t session_timeout, const std::string& id, - int* zk_errno) { + int* zk_errno, + int wait_timeout) { MutexLock mutex(&state_mutex_); if (NULL != handle_) { @@ -79,7 +80,12 @@ bool ZooKeeperAdapter::Init(const std::string& server_list, } while (state_ == ZS_DISCONN || state_ == ZS_CONNECTING) { - state_cond_.Wait(); + if (wait_timeout > 0) { + state_cond_.TimeWait(wait_timeout); + break; + } else { + state_cond_.Wait(); + } } int code = ZE_OK; @@ -427,7 +433,7 @@ bool ZooKeeperAdapter::ListAndWatchChildren(const std::string& path, } } -bool ZooKeeperAdapter::CheckExist(const std::string&path, bool* is_exist, +bool ZooKeeperAdapter::CheckExist(const std::string& path, bool* is_exist, int* zk_errno) { MutexLock mutex(&state_mutex_); if (!ZooKeeperUtil::IsValidPath(path)) { diff --git a/src/zk/zk_adapter.h b/src/zk/zk_adapter.h index 56cf8e2b3..010efed75 100644 --- a/src/zk/zk_adapter.h +++ b/src/zk/zk_adapter.h @@ -9,7 +9,7 @@ #include #include -#include +#include #include "common/mutex.h" #include "common/thread_pool.h" @@ -17,6 +17,7 @@ #include "zk/zk_lock.h" #include "zk/zk_util.h" + namespace tera { namespace zk { @@ -39,7 +40,8 @@ class ZooKeeperAdapter { virtual ~ZooKeeperAdapter(); bool Init(const std::string& server_list, const std::string& root_path, - uint32_t session_timeout, const std::string& id, int* zk_errno); + uint32_t session_timeout, const std::string& id, int* zk_errno, + int wait_timeout = -1); // default wait until zk server ready void Finalize(); bool GetSessionId(int64_t* session_id, int* zk_errno); diff --git a/src/zk/zk_util.cc b/src/zk/zk_util.cc index 446ef6108..579a59f0d 100644 --- a/src/zk/zk_util.cc +++ b/src/zk/zk_util.cc @@ -9,7 +9,7 @@ #include #include -#include +#include #include "common/file/file_path.h" #include "common/file/file_stream.h"