From 03b60dc3b8e4e8fa4c81a92f4a5703542a4e8000 Mon Sep 17 00:00:00 2001 From: alejandro Date: Thu, 14 Sep 2017 18:11:37 +0200 Subject: [PATCH] Benchmark configuration refactor. A new function in common_benchmarks permits configuring any benchmark in a centralized way without duplicating code. It uses specific variables "use_$engine{hadoop,hive,spark}" to set the requirements to download, prepare and start necessary services. The objective of this modification is to reduce benchmark definition complexity, code duplication, and code that runs unnecessarily more than once, and to improve readability. Some sourcing is still maintained to provide support for legacy benchmarks until they are all ported to the new system. Modified benchmark configuration conditionals to avoid incorrect uses. BugFix: Hive- and Derby-only benchmarks such as hive-test and derby-test that are not yet ported to the new configuration system need to source Hadoop and Java for them to work. Eventually, once all benchmarks are ported, these sources need to go away. Added an init method in each engine that handles the initialization and configuration. Also, initialization of monitoring tools is moved to benchmark_suite_start_config as part of the benchmark configuration; this avoids an issue where some monitoring tools were not correctly started. Modified the way we check if an engine is needed, using use_$engine=1. This adds safety by not initializing the engine if some garbage string is set in the variables. 
Solve package download and configuration sync --- aloja-bench/run_benchs.sh | 6 ++--- shell/common/benchmark_BigBench.sh | 31 +++++++++---------------- shell/common/common_BigBench.sh | 28 ----------------------- shell/common/common_benchmarks.sh | 35 ++++++++++++++++++++++++----- shell/common/common_derby.sh | 8 ++++++- shell/common/common_hadoop.sh | 8 +++++++ shell/common/common_hive.sh | 7 +++++- shell/common/common_java.sh | 4 ++++ shell/common/common_spark.sh | 10 +++++---- shell/common/common_tez.sh | 6 +++++ shell/conf/benchmarks_defaults.conf | 2 +- 11 files changed, 81 insertions(+), 64 deletions(-) diff --git a/aloja-bench/run_benchs.sh b/aloja-bench/run_benchs.sh index da30d7e7..e60ae53c 100755 --- a/aloja-bench/run_benchs.sh +++ b/aloja-bench/run_benchs.sh @@ -32,16 +32,14 @@ prepare_folder "$DISK" # Save globals at the beginning (for debugging purposes) save_env "$JOB_PATH/config.sh" -# Check if needed to download files and configs -install_files +# Prepare benchmark configuration +benchmark_suite_config # 3.) Run the benchmarks # At this point, if the user presses ctrl+c or the script is killed to clean up afterwards and copy the files if remote is defined update_traps "benchmark_suite_cleanup; rsync_extenal '$JOB_NAME';" "update_logger" -benchmark_suite_config - start_time=$(date '+%s') ######################################################## diff --git a/shell/common/benchmark_BigBench.sh b/shell/common/benchmark_BigBench.sh index 4c88532b..48525d8c 100644 --- a/shell/common/benchmark_BigBench.sh +++ b/shell/common/benchmark_BigBench.sh @@ -18,31 +18,22 @@ if [ ! 
$user_suplied_bench_list ]; then fi benchmark_suite_config() { - initialize_hadoop_vars - prepare_hadoop_config "$NET" "$DISK" "$BENCH_SUITE" - start_hadoop + add_engine "hadoop" + add_engine "hive" - if [ "$BB_SERVER_DERBY" == "true" ]; then - logger "WARNING: Using Derby DB in client/server mode" - USE_EXTERNAL_DATABASE="true" - initialize_derby_vars "BigBench_DB" - start_derby - else - logger "WARNING: Using Derby DB in embedded mode" - fi + #BigBench uses Spark under specific circumstances: spark-SQL as SQL engine or spark-MLlib as Machine learning framework + if [ "$ENGINE" == "spark_sql" ] || [ "$HIVE_ML_FRAMEWORK" == "spark" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-csv" ] \ + || [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then - initialize_hive_vars - prepare_hive_config "$HIVE_SETTINGS_FILE" "$HIVE_SETTINGS_FILE_PATH" + add_engine "spark" - if [ ! -z "$use_spark" ]; then - initialize_spark_vars - prepare_spark_config + if [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then + logger "WARNING: Using spark 2 as SQL engine and Machine Learning framework" + SPARK_HIVE="spark_hive-2.1.1" + fi fi - if [ "$HIVE_ENGINE" == "tez" ]; then - initialize_tez_vars - prepare_tez_config -fi + benchmark_suite_start_config initialize_BigBench_vars prepare_BigBench } diff --git a/shell/common/common_BigBench.sh b/shell/common/common_BigBench.sh index 2bcdfc3b..a3e1b5b4 100644 --- a/shell/common/common_BigBench.sh +++ b/shell/common/common_BigBench.sh @@ -1,31 +1,3 @@ -# Start Spark if needed -if [ "$ENGINE" == "spark_sql" ] || [ "$HIVE_ML_FRAMEWORK" == "spark" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-csv" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then - - if [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then - logger "WARNING: Using spark 2 as SQL engine and Machine Learning framework" - SPARK_HIVE="spark_hive-2.1.1" - fi - use_spark=true - source_file "$ALOJA_REPO_PATH/shell/common/common_spark.sh" - set_spark_requires -# HIVE_ENGINE="mr" -fi - -# Start Hive -source_file 
"$ALOJA_REPO_PATH/shell/common/common_hive.sh" -set_hive_requires - -# Start Tez if needed -if [ "$HIVE_ENGINE" == "tez" ]; then - source_file "$ALOJA_REPO_PATH/shell/common/common_tez.sh" - set_tez_requires -fi - -if [ "$BB_SERVER_DERBY" == "true" ]; then - source_file "$ALOJA_REPO_PATH/shell/common/common_derby.sh" - set_derby_requires -fi - BIG_BENCH_FOLDER="Big-Data-Benchmark-for-Big-Bench" if [ "$BENCH_SCALE_FACTOR" == 0 ] ; then #Should only happen when BENCH_SCALE_FACTOR is not set and BENCH_DATA_SIZE < 1GB diff --git a/shell/common/common_benchmarks.sh b/shell/common/common_benchmarks.sh index 33134053..d3fd5714 100644 --- a/shell/common/common_benchmarks.sh +++ b/shell/common/common_benchmarks.sh @@ -162,11 +162,39 @@ get_options() { } +add_engine(){ + bench_engines+=("$1") +} + +source_engines(){ + for engine in "${bench_engines[@]}"; do + source_file "$ALOJA_REPO_PATH/shell/common/common_$engine.sh" + function_call "set_${engine}_requires" + done +} + +init_engines(){ + for engine in "${bench_engines[@]}"; do + function_call "init_$engine" + done +} # Temple functions, re implement in benchmark if needed -benchmark_suite_config() { - logger "DEBUG: No specific ${FUNCNAME[0]} defined for $BENCH_SUITE" +benchmark_suite_start_config() { + logger "INFO: preparing confiuration for $BENCH_SUITE" + + # Source needed engines and mark files to download + source_engines + + # Check if needed to download files and configs + install_files + + # Configure the engine and start services if needed + init_engines + + # Specify which binaries to use for monitoring + set_monit_binaries } # Iterate the specified benchmarks in the suite @@ -1281,9 +1309,6 @@ $($DSH "ls -lah '$HDD/../'; ls -lah '$HDD_TMP/../' " ) else logger "DEBUG: Base dirs created successfully" fi - - # specify which binaries to use for monitoring - set_monit_binaries } # Cleanup after a benchmark suite run, and before starting one diff --git a/shell/common/common_derby.sh b/shell/common/common_derby.sh 
index 5b450d46..c40edcd1 100644 --- a/shell/common/common_derby.sh +++ b/shell/common/common_derby.sh @@ -1,6 +1,5 @@ source_file "$ALOJA_REPO_PATH/shell/common/common_java.sh" set_java_requires - # Sets the required files to download/copy set_derby_requires() { [ ! "$DERBY_VERSION" ] && die "No DERBY_VERSION specified" @@ -117,6 +116,13 @@ initialize_derby_vars() { fi } +init_derby() { + initialize_derby_vars "Aloja_DB" + logger "WARNING: Using Derby DB in client/server mode" + USE_EXTERNAL_DATABASE="true" + start_derby +} + clean_derby() { stop_derby } \ No newline at end of file diff --git a/shell/common/common_hadoop.sh b/shell/common/common_hadoop.sh index df65536e..ce50d2e0 100644 --- a/shell/common/common_hadoop.sh +++ b/shell/common/common_hadoop.sh @@ -411,6 +411,14 @@ cp $HADOOP_CONF_DIR/* $JOB_PATH/conf_$node/" & fi } + +init_hadoop() { + initialize_hadoop_vars + prepare_hadoop_config "$NET" "$DISK" "$BENCH_SUITE" + use_hadoop=1 # Control variable, useful only in certain benchmarks + start_hadoop +} + # Returns if Hadoop v1 or v2 # $1 the hadoop string (optional, if not uses $HADOOP_VERSION) get_hadoop_major_version() { diff --git a/shell/common/common_hive.sh b/shell/common/common_hive.sh index 016d216f..5063e03b 100644 --- a/shell/common/common_hive.sh +++ b/shell/common/common_hive.sh @@ -1,4 +1,3 @@ -#HIVE SPECIFIC FUNCTIONS source_file "$ALOJA_REPO_PATH/shell/common/common_hadoop.sh" set_hadoop_requires @@ -117,6 +116,12 @@ initialize_hive_vars() { fi } +init_hive() { + initialize_hive_vars + prepare_hive_config + use_hive=1 # Control variable, useful only in certain benchmarks +} + get_hive_major_version() { local hive_string="$HIVE_VERSION" local major_version="" diff --git a/shell/common/common_java.sh b/shell/common/common_java.sh index 4a3c4c25..015be0f6 100644 --- a/shell/common/common_java.sh +++ b/shell/common/common_java.sh @@ -21,6 +21,10 @@ get_java_exports() { fi } +init_java(){ + set_java_requires +} + # Sets the JAVA_HOME for the 
benchmark # TODO this assumes you are in the head node and only sets it there, should finish export_var_path funct # also that it is run from the main run_bench.sh file diff --git a/shell/common/common_spark.sh b/shell/common/common_spark.sh index 67da0030..4b547c1c 100644 --- a/shell/common/common_spark.sh +++ b/shell/common/common_spark.sh @@ -1,7 +1,3 @@ -#SPARK SPECIFIC FUNCTIONS -source_file "$ALOJA_REPO_PATH/shell/common/common_hadoop.sh" -set_hadoop_requires - # Sets the required files to download/copy set_spark_requires() { [ ! "$SPARK_VERSION" ] && die "No SPARK_VERSION specified" @@ -103,6 +99,12 @@ execute_spark-sql(){ execute_spark "$bench" "$cmd" "$time_exec" "spark-sql" } +init_spark() { + initialize_spark_vars + prepare_spark_config + use_spark=1 # Control variable, useful only in certain benchmarks +} + initialize_spark_vars() { if [ "$clusterType" == "PaaS" ]; then SPARK_HOME="/usr" ## TODO ONLY WORKING IN HDI diff --git a/shell/common/common_tez.sh b/shell/common/common_tez.sh index 12646383..8b4c363d 100644 --- a/shell/common/common_tez.sh +++ b/shell/common/common_tez.sh @@ -38,6 +38,12 @@ initialize_tez_vars() { fi } +init_tez() { + set_tez_requires + initialize_tez_vars + prepare_tez_config +} + # Sets the substitution values for the tez config get_tez_substitutions() { diff --git a/shell/conf/benchmarks_defaults.conf b/shell/conf/benchmarks_defaults.conf index a9d4e0f9..aa1b0c91 100644 --- a/shell/conf/benchmarks_defaults.conf +++ b/shell/conf/benchmarks_defaults.conf @@ -85,7 +85,7 @@ BENCH_SCALE_FACTOR="$(( BENCH_DATA_SIZE / 1000000000 ))" #in GB [ ! "$HIVE_FILEFORMAT" ] && HIVE_FILEFORMAT="ORC" # Available options are: EXTFILE, RCFILE, ORC(default), SEQUENCEFILE, PARQUET, AVRO, [ ! "$HIVE_ENGINE" ] && HIVE_ENGINE="tez" # Available options are: tez (default, hadoop 2 only), mr (MapReduce) [ ! "$HIVE_ML_FRAMEWORK" ] && HIVE_ML_FRAMEWORK="mahout" # Available options are: spark, spark-2, mahout(default) -[ ! 
"$BB_SERVER_DERBY" ] && BB_SERVER_DERBY="true" # Available options are: true(Server/Client deployment), false(embeded only) +[ ! "$BB_SERVER_DERBY" ] && BB_SERVER_DERBY=1 # Available options are: true(Server/Client deployment), false(embeded only) if [ "$HIVE_ENGINE" == "mr" ]; then #For MapReduce DO NOT do MapJoins, MR uses lots of memory and tends to fail anyways because of high Garbage Collection times. HIVE_JOINS="false"