Skip to content
This repository was archived by the owner on Jan 4, 2026. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions aloja-bench/run_benchs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,14 @@ prepare_folder "$DISK"
# Save globals at the beginning (for debugging purposes)
save_env "$JOB_PATH/config.sh"

# Check if needed to download files and configs
install_files
# Prepare benchmark configuration
benchmark_suite_config

# 3.) Run the benchmarks

# From this point on, if the user presses Ctrl+C or the script is killed, clean up afterwards and copy the files if a remote is defined
update_traps "benchmark_suite_cleanup; rsync_extenal '$JOB_NAME';" "update_logger"

benchmark_suite_config

start_time=$(date '+%s')

########################################################
Expand Down
31 changes: 11 additions & 20 deletions shell/common/benchmark_BigBench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,31 +18,22 @@ if [ ! $user_suplied_bench_list ]; then
fi

benchmark_suite_config() {
initialize_hadoop_vars
prepare_hadoop_config "$NET" "$DISK" "$BENCH_SUITE"
start_hadoop
add_engine "hadoop"
add_engine "hive"

if [ "$BB_SERVER_DERBY" == "true" ]; then
logger "WARNING: Using Derby DB in client/server mode"
USE_EXTERNAL_DATABASE="true"
initialize_derby_vars "BigBench_DB"
start_derby
else
logger "WARNING: Using Derby DB in embedded mode"
fi
#BigBench uses Spark under specific circumstances: spark-SQL as SQL engine or spark-MLlib as Machine learning framework
if [ "$ENGINE" == "spark_sql" ] || [ "$HIVE_ML_FRAMEWORK" == "spark" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-csv" ] \
|| [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then

initialize_hive_vars
prepare_hive_config "$HIVE_SETTINGS_FILE" "$HIVE_SETTINGS_FILE_PATH"
add_engine "spark"

if [ ! -z "$use_spark" ]; then
initialize_spark_vars
prepare_spark_config
if [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then
logger "WARNING: Using spark 2 as SQL engine and Machine Learning framework"
SPARK_HIVE="spark_hive-2.1.1"
fi
fi

if [ "$HIVE_ENGINE" == "tez" ]; then
initialize_tez_vars
prepare_tez_config
fi
benchmark_suite_start_config
initialize_BigBench_vars
prepare_BigBench
}
Expand Down
28 changes: 0 additions & 28 deletions shell/common/common_BigBench.sh
Original file line number Diff line number Diff line change
@@ -1,31 +1,3 @@
# Load Spark support if needed: this only sources common_spark.sh and calls
# set_spark_requires; nothing is started here.
# Spark is selected when the SQL engine is spark_sql or when the ML framework
# is one of spark, spark-csv or spark-2.
if [ "$ENGINE" == "spark_sql" ] || [ "$HIVE_ML_FRAMEWORK" == "spark" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-csv" ] || [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then

# spark-2 overrides SPARK_HIVE with the value below
if [ "$HIVE_ML_FRAMEWORK" == "spark-2" ]; then
logger "WARNING: Using spark 2 as SQL engine and Machine Learning framework"
SPARK_HIVE="spark_hive-2.1.1"
fi
# Control variable marking that Spark is in use
use_spark=true
source_file "$ALOJA_REPO_PATH/shell/common/common_spark.sh"
set_spark_requires
# HIVE_ENGINE="mr"
fi

# Load Hive support (always): source common_hive.sh and call set_hive_requires
source_file "$ALOJA_REPO_PATH/shell/common/common_hive.sh"
set_hive_requires

# Load Tez support only when Hive runs on the tez engine
if [ "$HIVE_ENGINE" == "tez" ]; then
source_file "$ALOJA_REPO_PATH/shell/common/common_tez.sh"
set_tez_requires
fi

# Load Derby support only when BB_SERVER_DERBY is exactly the string "true"
if [ "$BB_SERVER_DERBY" == "true" ]; then
source_file "$ALOJA_REPO_PATH/shell/common/common_derby.sh"
set_derby_requires
fi

BIG_BENCH_FOLDER="Big-Data-Benchmark-for-Big-Bench"

if [ "$BENCH_SCALE_FACTOR" == 0 ] ; then #Should only happen when BENCH_SCALE_FACTOR is not set and BENCH_DATA_SIZE < 1GB
Expand Down
35 changes: 30 additions & 5 deletions shell/common/common_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,39 @@ get_options() {

}

# Registers an engine (e.g. "hadoop", "hive") in the global bench_engines
# list, which source_engines and init_engines iterate over.
# Globals:   bench_engines (read, appended to)
# Arguments: $1 - engine name
# Returns:   0
# An engine that is already registered is not added again, so that its common
# file is not sourced twice and its init function is not run twice.
add_engine(){
  local requested="$1"
  local registered

  for registered in "${bench_engines[@]}"; do
    [ "$registered" == "$requested" ] && return 0
  done

  bench_engines+=("$requested")
}

# For every engine in bench_engines, sources its common file
# ($ALOJA_REPO_PATH/shell/common/common_<engine>.sh) and then calls
# set_<engine>_requires through function_call.
# Globals: bench_engines, ALOJA_REPO_PATH (read)
source_engines(){
  # Keep the loop variable local so a caller's or global $engine is not clobbered
  local engine

  for engine in "${bench_engines[@]}"; do
    source_file "$ALOJA_REPO_PATH/shell/common/common_$engine.sh"
    function_call "set_${engine}_requires"
  done
}

# Calls init_<engine> (through function_call) for every engine in
# bench_engines, in registration order.
# Globals: bench_engines (read)
init_engines(){
  # Keep the loop variable local so a caller's or global $engine is not clobbered
  local engine

  for engine in "${bench_engines[@]}"; do
    function_call "init_$engine"
  done
}

# Template functions; re-implement them in the benchmark if needed

benchmark_suite_config() {
logger "DEBUG: No specific ${FUNCNAME[0]} defined for $BENCH_SUITE"
# Common configuration sequence for a benchmark suite.
# Logs the suite name and then runs four stages in a fixed order:
#   source_engines     - source needed engines and mark files to download
#   install_files      - check if files and configs need to be downloaded
#   init_engines       - configure the engines and start services if needed
#   set_monit_binaries - specify which binaries to use for monitoring
# Globals: BENCH_SUITE (read)
# Returns: the status of the last stage (set_monit_binaries)
benchmark_suite_start_config() {
  logger "INFO: preparing confiuration for $BENCH_SUITE"

  local _config_stage
  for _config_stage in source_engines install_files init_engines set_monit_binaries; do
    "$_config_stage"
  done
}

# Iterate the specified benchmarks in the suite
Expand Down Expand Up @@ -1281,9 +1309,6 @@ $($DSH "ls -lah '$HDD/../'; ls -lah '$HDD_TMP/../' " )
else
logger "DEBUG: Base dirs created successfully"
fi

# specify which binaries to use for monitoring
set_monit_binaries
}

# Cleanup after a benchmark suite run, and before starting one
Expand Down
8 changes: 7 additions & 1 deletion shell/common/common_derby.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
source_file "$ALOJA_REPO_PATH/shell/common/common_java.sh"
set_java_requires

# Sets the required files to download/copy
set_derby_requires() {
[ ! "$DERBY_VERSION" ] && die "No DERBY_VERSION specified"
Expand Down Expand Up @@ -117,6 +116,13 @@ initialize_derby_vars() {
fi
}

# Engine init hook for Derby (named to match the init_<engine> convention).
# Initializes the Derby variables for the "Aloja_DB" database, logs that
# client/server mode is used, flags the external database and calls
# start_derby.
# Globals: USE_EXTERNAL_DATABASE (set to "true")
# Returns: the status of start_derby
init_derby() {
  local _derby_db_name="Aloja_DB"

  initialize_derby_vars "$_derby_db_name"

  logger "WARNING: Using Derby DB in client/server mode"
  USE_EXTERNAL_DATABASE=true

  start_derby
}

# Cleanup hook for Derby: delegates to stop_derby and returns its status.
clean_derby() {
stop_derby
}
8 changes: 8 additions & 0 deletions shell/common/common_hadoop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,14 @@ cp $HADOOP_CONF_DIR/* $JOB_PATH/conf_$node/" &
fi
}


# Engine init hook for Hadoop (named to match the init_<engine> convention).
# Initializes the Hadoop variables, prepares the Hadoop configuration for the
# current network, disk and benchmark suite, and then calls start_hadoop.
# Globals: NET, DISK, BENCH_SUITE (read); use_hadoop (set to 1)
# Returns: the status of start_hadoop
init_hadoop() {
  initialize_hadoop_vars
  prepare_hadoop_config "${NET}" "${DISK}" "${BENCH_SUITE}"

  # Control variable, useful only in certain benchmarks
  use_hadoop="1"

  start_hadoop
}

# Returns if Hadoop v1 or v2
# $1 the hadoop string (optional, if not uses $HADOOP_VERSION)
get_hadoop_major_version() {
Expand Down
7 changes: 6 additions & 1 deletion shell/common/common_hive.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#HIVE SPECIFIC FUNCTIONS
source_file "$ALOJA_REPO_PATH/shell/common/common_hadoop.sh"
set_hadoop_requires

Expand Down Expand Up @@ -117,6 +116,12 @@ initialize_hive_vars() {
fi
}

# Engine init hook for Hive (named to match the init_<engine> convention).
# Initializes the Hive variables, prepares the Hive configuration and marks
# Hive as in use.
# Globals: use_hive (set to 1)
# NOTE(review): prepare_hive_config is called without arguments here, while
# the BigBench suite config called it with "$HIVE_SETTINGS_FILE" and
# "$HIVE_SETTINGS_FILE_PATH" - confirm that no arguments are needed.
init_hive() {
  initialize_hive_vars
  prepare_hive_config

  # Control variable, useful only in certain benchmarks
  use_hive="1"
}

get_hive_major_version() {
local hive_string="$HIVE_VERSION"
local major_version=""
Expand Down
4 changes: 4 additions & 0 deletions shell/common/common_java.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ get_java_exports() {
fi
}

# Engine init hook for Java (named to match the init_<engine> convention).
# It only calls set_java_requires and returns its status.
init_java(){
set_java_requires
}

# Sets the JAVA_HOME for the benchmark
# TODO this assumes you are in the head node and only sets it there, should finish export_var_path funct
# also that it is run from the main run_bench.sh file
Expand Down
10 changes: 6 additions & 4 deletions shell/common/common_spark.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
#SPARK SPECIFIC FUNCTIONS
source_file "$ALOJA_REPO_PATH/shell/common/common_hadoop.sh"
set_hadoop_requires

# Sets the required files to download/copy
set_spark_requires() {
[ ! "$SPARK_VERSION" ] && die "No SPARK_VERSION specified"
Expand Down Expand Up @@ -103,6 +99,12 @@ execute_spark-sql(){
execute_spark "$bench" "$cmd" "$time_exec" "spark-sql"
}

# Engine init hook for Spark (named to match the init_<engine> convention).
# Initializes the Spark variables, prepares the Spark configuration and marks
# Spark as in use.
# Globals: use_spark (set to 1)
init_spark() {
  initialize_spark_vars
  prepare_spark_config

  # Control variable, useful only in certain benchmarks
  use_spark="1"
}

initialize_spark_vars() {
if [ "$clusterType" == "PaaS" ]; then
SPARK_HOME="/usr" ## TODO ONLY WORKING IN HDI
Expand Down
6 changes: 6 additions & 0 deletions shell/common/common_tez.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ initialize_tez_vars() {
fi
}

# Engine init hook for Tez (named to match the init_<engine> convention).
# Calls, in order: set_tez_requires, initialize_tez_vars, prepare_tez_config.
# Returns the status of prepare_tez_config.
init_tez() {
# NOTE(review): source_engines already calls set_<engine>_requires for each
# registered engine - confirm that calling it again here is intended.
set_tez_requires
initialize_tez_vars
prepare_tez_config
}

# Sets the substitution values for the tez config
get_tez_substitutions() {

Expand Down
2 changes: 1 addition & 1 deletion shell/conf/benchmarks_defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ BENCH_SCALE_FACTOR="$(( BENCH_DATA_SIZE / 1000000000 ))" #in GB
[ ! "$HIVE_FILEFORMAT" ] && HIVE_FILEFORMAT="ORC" # Available options are: EXTFILE, RCFILE, ORC(default), SEQUENCEFILE, PARQUET, AVRO,
[ ! "$HIVE_ENGINE" ] && HIVE_ENGINE="tez" # Available options are: tez (default, hadoop 2 only), mr (MapReduce)
[ ! "$HIVE_ML_FRAMEWORK" ] && HIVE_ML_FRAMEWORK="mahout" # Available options are: spark, spark-2, mahout(default)
[ ! "$BB_SERVER_DERBY" ] && BB_SERVER_DERBY="true" # Available options are: true(Server/Client deployment), false(embeded only)
[ ! "$BB_SERVER_DERBY" ] && BB_SERVER_DERBY=1 # Available options are: true(Server/Client deployment), false(embeded only)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment should be updated to match the new behaviour. Do/did we ever check for false before? To be honest, this one might make more sense with descriptive strings, e.g. embedded instead of false (or change the variable name to something a bit more descriptive, like $BB_DERBY_USE_EMBEDDED).

....or just don't touch it at all in this commit!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Many Aloja variables need a refactor; for instance, ENGINE={hive,spark-sql} needs to be renamed. I propose refactoring benchmark_defaults and changing all related variables in a single branch. We also need to discuss standardising the names and the way the variables are populated under different conditions; for instance, when using a control variable, should use_spark be set to 1 or to "true", etc.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not proposing that we do sweeping refactors through the whole code base to change these things, I'm just saying that if you change the code here, you should update the comment to match.

For now I'm happy with using 1 and 0 for the new variables (and making sure the tests check the value, rather than just whether the variable is set or not).

The thing about ENGINE seems completely unrelated (and I don't understand what the problem with the current names is).


if [ "$HIVE_ENGINE" == "mr" ]; then #For MapReduce DO NOT do MapJoins, MR uses lots of memory and tends to fail anyways because of high Garbage Collection times.
HIVE_JOINS="false"
Expand Down