From 03b60dc3b8e4e8fa4c81a92f4a5703542a4e8000 Mon Sep 17 00:00:00 2001 From: alejandro Date: Thu, 14 Sep 2017 18:11:37 +0200 Subject: [PATCH] Benchmark configuration refactor. A new function in common_benchmarks permits configuring any benchmark in a centralized way without duplicating code. It uses specific variables "use_$engine{hadoop,hive,spark}" to set the requirements to download, prepare and start necessary services. The objective of this modification is to reduce benchmark definition complexity, code duplication, and code that runs unnecessarily more than once, and to improve readability. Some sourcing is still maintained to provide support for legacy benchmarks until they are all ported to the new system. Modified benchmark configuration conditionals to avoid incorrect uses. BugFix: Hive- and Derby-only benchmarks such as hive-test and derby-test that are not yet ported to the new configuration system need to source Hadoop and Java for them to work. Eventually, once all benchmarks are ported, these sources need to go away. Added an init method in each engine that handles the initialization and configuration. Also, initialization of monitoring tools is moved to benchmark_suite_start_config as part of the benchmark configuration; this avoids an issue where some monitoring tools were not correctly started. Modified the way we check if an engine is needed, using use_$engine=1. This adds safety by not initializing the engine if some garbage string is set in the variables. 
Solve package download and configuration sync --- aloja-bench/run_benchs.sh | 6 ++--- shell/common/benchmark_BigBench.sh | 31 +++++++++---------------- shell/common/common_BigBench.sh | 28 ----------------------- shell/common/common_benchmarks.sh | 35 ++++++++++++++++++++++++----- shell/common/common_derby.sh | 8 ++++++- shell/common/common_hadoop.sh | 8 +++++++ shell/common/common_hive.sh | 7 +++++- shell/common/common_java.sh | 4 ++++ shell/common/common_spark.sh | 10 +++++---- shell/common/common_tez.sh | 6 +++++ shell/conf/benchmarks_defaults.conf | 2 +- 11 files changed, 81 insertions(+), 64 deletions(-) diff --git a/aloja-bench/run_benchs.sh b/aloja-bench/run_benchs.sh index da30d7e7..e60ae53c 100755 --- a/aloja-bench/run_benchs.sh +++ b/aloja-bench/run_benchs.sh @@ -32,16 +32,14 @@ prepare_folder "$DISK" # Save globals at the beginning (for debugging purposes) save_env "$JOB_PATH/config.sh" -# Check if needed to download files and configs -install_files +# Prepare benchmark configuration +benchmark_suite_config # 3.) Run the benchmarks # At this point, if the user presses ctrl+c or the script is killed to clean up afterwards and copy the files if remote is defined update_traps "benchmark_suite_cleanup; rsync_extenal '$JOB_NAME';" "update_logger" -benchmark_suite_config - start_time=$(date '+%s') ######################################################## diff --git a/shell/common/benchmark_BigBench.sh b/shell/common/benchmark_BigBench.sh index 4c88532b..48525d8c 100644 --- a/shell/common/benchmark_BigBench.sh +++ b/shell/common/benchmark_BigBench.sh @@ -18,31 +18,22 @@ if [ ! 
$user_suplied_bench_list ]; then fi benchmark_suite_config() { - initialize_hadoop_vars - prepare_hadoop_config "$NET" "$DISK" "$BENCH_SUITE" - start_hadoop + add_engine "hadoop" + add_engine "hive" - if [ "$BB_SERVER_DERBY" == "true" ]; then - logger "WARNING: Using Derby DB in client/server mode" - USE_EXTERNAL_DATABASE="true" - initialize_derby_vars "BigBench_DB" - start_derby - else - logger "WARNING: Using Derby DB in embedded mode" - fi + #BigBench uses Spark under specific circumstances: spark-SQL as SQL engine or spark-MLlib as Machine learning framework + if [ "$ENGINE" == "spark_sql" ] || [ "$HIVE_ML_FRAMEWORK" == "spark" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-csv" ] \ + || [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then - initialize_hive_vars - prepare_hive_config "$HIVE_SETTINGS_FILE" "$HIVE_SETTINGS_FILE_PATH" + add_engine "spark" - if [ ! -z "$use_spark" ]; then - initialize_spark_vars - prepare_spark_config + if [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then + logger "WARNING: Using spark 2 as SQL engine and Machine Learning framework" + SPARK_HIVE="spark_hive-2.1.1" + fi fi - if [ "$HIVE_ENGINE" == "tez" ]; then - initialize_tez_vars - prepare_tez_config -fi + benchmark_suite_start_config initialize_BigBench_vars prepare_BigBench } diff --git a/shell/common/common_BigBench.sh b/shell/common/common_BigBench.sh index 2bcdfc3b..a3e1b5b4 100644 --- a/shell/common/common_BigBench.sh +++ b/shell/common/common_BigBench.sh @@ -1,31 +1,3 @@ -# Start Spark if needed -if [ "$ENGINE" == "spark_sql" ] || [ "$HIVE_ML_FRAMEWORK" == "spark" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-csv" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then - - if [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then - logger "WARNING: Using spark 2 as SQL engine and Machine Learning framework" - SPARK_HIVE="spark_hive-2.1.1" - fi - use_spark=true - source_file "$ALOJA_REPO_PATH/shell/common/common_spark.sh" - set_spark_requires -# HIVE_ENGINE="mr" -fi - -# Start Hive -source_file 
"$ALOJA_REPO_PATH/shell/common/common_hive.sh" -set_hive_requires - -# Start Tez if needed -if [ "$HIVE_ENGINE" == "tez" ]; then - source_file "$ALOJA_REPO_PATH/shell/common/common_tez.sh" - set_tez_requires -fi - -if [ "$BB_SERVER_DERBY" == "true" ]; then - source_file "$ALOJA_REPO_PATH/shell/common/common_derby.sh" - set_derby_requires -fi - BIG_BENCH_FOLDER="Big-Data-Benchmark-for-Big-Bench" if [ "$BENCH_SCALE_FACTOR" == 0 ] ; then #Should only happen when BENCH_SCALE_FACTOR is not set and BENCH_DATA_SIZE < 1GB diff --git a/shell/common/common_benchmarks.sh b/shell/common/common_benchmarks.sh index 33134053..d3fd5714 100644 --- a/shell/common/common_benchmarks.sh +++ b/shell/common/common_benchmarks.sh @@ -162,11 +162,39 @@ get_options() { } +add_engine(){ + bench_engines+=("$1") +} + +source_engines(){ + for engine in "${bench_engines[@]}"; do + source_file "$ALOJA_REPO_PATH/shell/common/common_$engine.sh" + function_call "set_${engine}_requires" + done +} + +init_engines(){ + for engine in "${bench_engines[@]}"; do + function_call "init_$engine" + done +} # Temple functions, re implement in benchmark if needed -benchmark_suite_config() { - logger "DEBUG: No specific ${FUNCNAME[0]} defined for $BENCH_SUITE" +benchmark_suite_start_config() { + logger "INFO: preparing confiuration for $BENCH_SUITE" + + # Source needed engines and mark files to download + source_engines + + # Check if needed to download files and configs + install_files + + # Configure the engine and start services if needed + init_engines + + # Specify which binaries to use for monitoring + set_monit_binaries } # Iterate the specified benchmarks in the suite @@ -1281,9 +1309,6 @@ $($DSH "ls -lah '$HDD/../'; ls -lah '$HDD_TMP/../' " ) else logger "DEBUG: Base dirs created successfully" fi - - # specify which binaries to use for monitoring - set_monit_binaries } # Cleanup after a benchmark suite run, and before starting one diff --git a/shell/common/common_derby.sh b/shell/common/common_derby.sh 
index 5b450d46..c40edcd1 100644 --- a/shell/common/common_derby.sh +++ b/shell/common/common_derby.sh @@ -1,6 +1,5 @@ source_file "$ALOJA_REPO_PATH/shell/common/common_java.sh" set_java_requires - # Sets the required files to download/copy set_derby_requires() { [ ! "$DERBY_VERSION" ] && die "No DERBY_VERSION specified" @@ -117,6 +116,13 @@ initialize_derby_vars() { fi } +init_derby() { + initialize_derby_vars "Aloja_DB" + logger "WARNING: Using Derby DB in client/server mode" + USE_EXTERNAL_DATABASE="true" + start_derby +} + clean_derby() { stop_derby } \ No newline at end of file diff --git a/shell/common/common_hadoop.sh b/shell/common/common_hadoop.sh index df65536e..ce50d2e0 100644 --- a/shell/common/common_hadoop.sh +++ b/shell/common/common_hadoop.sh @@ -411,6 +411,14 @@ cp $HADOOP_CONF_DIR/* $JOB_PATH/conf_$node/" & fi } + +init_hadoop() { + initialize_hadoop_vars + prepare_hadoop_config "$NET" "$DISK" "$BENCH_SUITE" + use_hadoop=1 # Control variable, useful only in certain benchmarks + start_hadoop +} + # Returns if Hadoop v1 or v2 # $1 the hadoop string (optional, if not uses $HADOOP_VERSION) get_hadoop_major_version() { diff --git a/shell/common/common_hive.sh b/shell/common/common_hive.sh index 016d216f..5063e03b 100644 --- a/shell/common/common_hive.sh +++ b/shell/common/common_hive.sh @@ -1,4 +1,3 @@ -#HIVE SPECIFIC FUNCTIONS source_file "$ALOJA_REPO_PATH/shell/common/common_hadoop.sh" set_hadoop_requires @@ -117,6 +116,12 @@ initialize_hive_vars() { fi } +init_hive() { + initialize_hive_vars + prepare_hive_config + use_hive=1 # Control variable, useful only in certain benchmarks +} + get_hive_major_version() { local hive_string="$HIVE_VERSION" local major_version="" diff --git a/shell/common/common_java.sh b/shell/common/common_java.sh index 4a3c4c25..015be0f6 100644 --- a/shell/common/common_java.sh +++ b/shell/common/common_java.sh @@ -21,6 +21,10 @@ get_java_exports() { fi } +init_java(){ + set_java_requires +} + # Sets the JAVA_HOME for the 
benchmark # TODO this assumes you are in the head node and only sets it there, should finish export_var_path funct # also that it is run from the main run_bench.sh file diff --git a/shell/common/common_spark.sh b/shell/common/common_spark.sh index 67da0030..4b547c1c 100644 --- a/shell/common/common_spark.sh +++ b/shell/common/common_spark.sh @@ -1,7 +1,3 @@ -#SPARK SPECIFIC FUNCTIONS -source_file "$ALOJA_REPO_PATH/shell/common/common_hadoop.sh" -set_hadoop_requires - # Sets the required files to download/copy set_spark_requires() { [ ! "$SPARK_VERSION" ] && die "No SPARK_VERSION specified" @@ -103,6 +99,12 @@ execute_spark-sql(){ execute_spark "$bench" "$cmd" "$time_exec" "spark-sql" } +init_spark() { + initialize_spark_vars + prepare_spark_config + use_spark=1 # Control variable, useful only in certain benchmarks +} + initialize_spark_vars() { if [ "$clusterType" == "PaaS" ]; then SPARK_HOME="/usr" ## TODO ONLY WORKING IN HDI diff --git a/shell/common/common_tez.sh b/shell/common/common_tez.sh index 12646383..8b4c363d 100644 --- a/shell/common/common_tez.sh +++ b/shell/common/common_tez.sh @@ -38,6 +38,12 @@ initialize_tez_vars() { fi } +init_tez() { + set_tez_requires + initialize_tez_vars + prepare_tez_config +} + # Sets the substitution values for the tez config get_tez_substitutions() { diff --git a/shell/conf/benchmarks_defaults.conf b/shell/conf/benchmarks_defaults.conf index a9d4e0f9..aa1b0c91 100644 --- a/shell/conf/benchmarks_defaults.conf +++ b/shell/conf/benchmarks_defaults.conf @@ -85,7 +85,7 @@ BENCH_SCALE_FACTOR="$(( BENCH_DATA_SIZE / 1000000000 ))" #in GB [ ! "$HIVE_FILEFORMAT" ] && HIVE_FILEFORMAT="ORC" # Available options are: EXTFILE, RCFILE, ORC(default), SEQUENCEFILE, PARQUET, AVRO, [ ! "$HIVE_ENGINE" ] && HIVE_ENGINE="tez" # Available options are: tez (default, hadoop 2 only), mr (MapReduce) [ ! "$HIVE_ML_FRAMEWORK" ] && HIVE_ML_FRAMEWORK="mahout" # Available options are: spark, spark-2, mahout(default) -[ ! 
"$BB_SERVER_DERBY" ] && BB_SERVER_DERBY="true" # Available options are: true(Server/Client deployment), false(embeded only) +[ ! "$BB_SERVER_DERBY" ] && BB_SERVER_DERBY=1 # Available options are: true(Server/Client deployment), false(embeded only) if [ "$HIVE_ENGINE" == "mr" ]; then #For MapReduce DO NOT do MapJoins, MR uses lots of memory and tends to fail anyways because of high Garbage Collection times. HIVE_JOINS="false"