Skip to content

Commit

Permalink
[SYSTEMDS-31] Shell and Python scripts to run SystemDS locally
Browse files Browse the repository at this point in the history
[SYSTEMDS-32] Shell script to run SystemDS with spark-submit
  • Loading branch information
corepointer committed Aug 24, 2019
1 parent 2b4ab23 commit 7239c9b
Show file tree
Hide file tree
Showing 5 changed files with 492 additions and 2 deletions.
45 changes: 45 additions & 0 deletions bin/sparkDML2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#-------------------------------------------------------------
#
# Copyright 2019 Graz University of Technology
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#-------------------------------------------------------------

#set -x

# This script is a simplified version of sparkDML.sh in order to
# allow a simple drop-in replacement for 'hadoop jar' without
# the need to change any command line arguments.

#export HADOOP_CONF_DIR=/etc/hadoop/conf
#SPARK_HOME=../spark-2.3.1-bin-hadoop2.7
#export HADOOP_HOME=${HADOOP_HOME:-/usr/hdp/2.5.0.0-1245/hadoop}
#HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/usr/hdp/2.5.0.0-1245/hadoop/conf}

# Exported into the environment of spark-submit (presumably selects the
# Spark 2.x client on installations that ship several Spark versions --
# TODO confirm).
export SPARK_MAJOR_VERSION=2

# Submit to YARN with fixed resource settings. All command line arguments of
# this script are forwarded unchanged to spark-submit.
# Spark properties passed via --conf must have the form key=value; the JVM
# option is therefore attached to the property name with '='.
#$SPARK_HOME/bin/spark-submit \
spark-submit \
     --master yarn \
     --driver-memory 80g \
     --num-executors 1 \
     --executor-memory 60g \
     --executor-cores 19 \
     --conf "spark.yarn.am.extraJavaOptions=-Dhdp.version=2.5.0.0-1245" \
     "$@"

# # run spark submit locally
# spark-submit \
# "$@"
133 changes: 133 additions & 0 deletions bin/systemds-standalone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python
#-------------------------------------------------------------
#
# Copyright 2019 Graz University of Technology
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#-------------------------------------------------------------

import os
import sys
from os.path import join
import argparse
import platform
from utils import get_env_systemds_root, find_dml_file, log4j_path, config_path


def default_classpath(systemds_root):
    """
    Assemble the classpath entries required for execution

    systemds_root: root directory of the SystemDS installation
    return: tuple of four strings
        Wildcard entries for the build, library and hadoop directories
        below 'target', followed by the path of SystemDS.jar
    """
    target_dir = join(systemds_root, 'target')
    library_dir = join(target_dir, 'lib')
    return (
        join(target_dir, '*'),
        join(library_dir, '*'),
        join(library_dir, 'hadoop', '*'),
        join(target_dir, 'SystemDS.jar'),
    )


def standalone_execution_entry(nvargs, args, config, explain, debug, stats, gpu, heapmem, f):
    """
    Run a DML script with SystemDS in singlenode mode.

    The java command line is assembled as an argument list and executed as
    a child process without an intermediate shell, so paths and values that
    contain spaces or shell metacharacters reach java unchanged.

    nvargs: list of name=value script arguments or None
    args: list of positional script arguments or None
    config: path of the configuration file; if None the path returned by
        config_path() is used
    explain: value passed after -explain or None
    debug: -debug is passed unless this is False
    stats: value passed after -stats or None
    gpu: value passed after -gpu, an empty string for a plain -gpu, or None
    heapmem: maximum JVM heap size, appended to -Xmx (e.g. '8g')
    f: DML script to execute, resolved through find_dml_file()
    return: Integer
        Exit code of the java process, 127 if java could not be started
    """
    # local import keeps this function independent of the module import list
    import subprocess

    systemds_root = get_env_systemds_root()
    script_file = find_dml_file(systemds_root, f)

    # classpath entries are separated by ';' on Windows and by ':' elsewhere
    separator = ';' if platform.system() == 'Windows' else ':'
    default_cp = separator.join(default_classpath(systemds_root))

    # NOTE(review): the initial heap size is fixed to 4g, a heapmem below
    # that value will presumably be rejected by the JVM -- confirm
    java_memory = ['-Xmx' + heapmem, '-Xms4g', '-Xmn1g']

    # Log4j
    log4j = log4j_path(systemds_root)
    log4j_properties_path = '-Dlog4j.configuration=file:{}'.format(log4j)

    # Config
    default_config = config_path(systemds_root) if config is None else config

    ds_options = []
    if nvargs is not None:
        ds_options.append('-nvargs')
        ds_options.extend(nvargs)
    if args is not None:
        ds_options.append('-args')
        ds_options.extend(args)
    if explain is not None:
        ds_options.append('-explain')
        if explain:
            ds_options.append(explain)
    if debug is not False:
        ds_options.append('-debug')
    # an empty value means that the flag was requested without an argument
    for flag, value in (('-stats', stats), ('-gpu', gpu)):
        if value is not None:
            ds_options.append(flag)
            if value:
                ds_options.append(value)

    os.environ['HADOOP_HOME'] = '/tmp/systemds'

    cmd = ['java'] + java_memory + [
        log4j_properties_path,
        '-cp', default_cp, 'org.tugraz.sysds.api.DMLScript',
        '-f', script_file, '-exec', 'singlenode', '-config', default_config,
    ] + ds_options

    # show the command before java starts writing its own output
    print(' '.join(cmd))
    sys.stdout.flush()

    try:
        return subprocess.call(cmd)
    except OSError as err:
        # keep the contract of returning a non-zero code instead of raising
        print('Could not start java: {}'.format(err))
        return 127


if __name__ == '__main__':
    # Command line entry point: parse the options, run the DML script and
    # report a failure through the exit status of this process.

    fn = sys.argv[0]
    if os.path.exists(fn):
        # directory of this script; dirname() also handles a script name
        # that contains no path separator at all
        print(os.path.dirname(fn) or os.curdir)

    cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                      description='System-DS Standalone Script')

    # SYSTEM-DS Options
    cparser.add_argument('-nvargs', help='List of attributeName-attributeValue pairs', nargs='+', metavar='')
    cparser.add_argument('-args', help='List of positional argument values', metavar='', nargs='+')
    cparser.add_argument('-config', help='System-DS configuration file (e.g SystemDS-config.xml)', metavar='')
    cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, '
                                          'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='')
    cparser.add_argument('-debug', help='runs in debug mode', action='store_true')
    cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, '
                                        'heavy hitter <count> is 10 unless overridden', nargs='?', const='10',
                         metavar='')
    # const='' marks "-gpu given without a value"; without a const the
    # parsed value would be None and the flag would be dropped silently
    cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
                                      'set <force> option to skip conservative memory estimates '
                                      'and use GPU wherever possible', nargs='?', const='')
    cparser.add_argument('-heapmem', help='maximum JVM heap memory', metavar='', default='8g')
    cparser.add_argument('-f', required=True, help='specifies dml file to execute; '
                                                   'path can be local/hdfs/gpfs', metavar='')

    args = cparser.parse_args()
    arg_dict = vars(args)
    return_code = standalone_execution_entry(**arg_dict)

    if return_code != 0:
        print('Failed to run SystemDS. Exit code :' + str(return_code))
        # make the failure visible to the caller; values that do not fit
        # into an exit status (1..255) are mapped to 1
        sys.exit(return_code if 0 < return_code < 256 else 1)
196 changes: 196 additions & 0 deletions bin/systemds-standalone.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#!/usr/bin/env bash
#-------------------------------------------------------------
#
# Copyright 2019 Graz University of Technology
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#-------------------------------------------------------------


# error help print
printSimpleUsage()
{
  # Print the short usage text and terminate the script with exit code 1.
  printf '%s\n' \
    "Usage: $0 <dml-filename> [arguments] [-help]" \
    "-help - Print detailed help message"
  exit 1
}

# Script internally invokes 'java -Xmx8g -Xms4g -Xmn1g -cp <classpath> [Custom-Java-Options] org.tugraz.sysds.api.DMLScript -f <dml-filename> -exec singlenode -config <project-root>/conf/SystemDS-config.xml [Optional-Arguments]'

# At least one argument (the DML script or an option) is required.
if [ -z "$1" ] ; then
  echo "Wrong Usage.";
  printSimpleUsage
fi

# Determine the project root. The variable is quoted so that a root path
# containing spaces does not break the test.
if [ -n "$SYSTEMDS_ROOT" ]; then
  PROJECT_ROOT_DIR="$SYSTEMDS_ROOT"
  echo "SYSTEMDS_ROOT is set to: $SYSTEMDS_ROOT"
else
  # find the systemDS root path which contains the bin folder, the script folder and the target folder
  # tolerate path with spaces
  SCRIPT_DIR=$( dirname "$0" )
  PROJECT_ROOT_DIR=$( cd "${SCRIPT_DIR}/.." ; pwd -P )
fi

# directory the script was started from
USER_DIR=$PWD

BUILD_DIR=${PROJECT_ROOT_DIR}/target
HADOOP_LIB_DIR=${BUILD_DIR}/lib
DML_SCRIPT_CLASS=${BUILD_DIR}/classes/org/tugraz/sysds/api/DMLScript.class

BUILD_ERR_MSG="You must build the project before running this script."
BUILD_DIR_ERR_MSG="Could not find target directory \"${BUILD_DIR}\". ${BUILD_ERR_MSG}"
HADOOP_LIB_ERR_MSG="Could not find required libraries \"${HADOOP_LIB_DIR}/*\". ${BUILD_ERR_MSG}"
DML_SCRIPT_ERR_MSG="Could not find \"${DML_SCRIPT_CLASS}\". ${BUILD_ERR_MSG}"

# check if the project had been built and the jar files exist
if [ ! -d "${BUILD_DIR}" ]; then echo "${BUILD_DIR_ERR_MSG}"; exit 1; fi
if [ ! -d "${HADOOP_LIB_DIR}" ]; then echo "${HADOOP_LIB_ERR_MSG}"; exit 1; fi
if [ ! -f "${DML_SCRIPT_CLASS}" ]; then echo "${DML_SCRIPT_ERR_MSG}"; exit 1; fi


echo "================================================================================"

# When started from the project root or from its bin folder, the temp folder
# of the project is used as user.dir instead.
case "$USER_DIR" in
  "$PROJECT_ROOT_DIR"|"$PROJECT_ROOT_DIR/bin")
    USER_DIR=${PROJECT_ROOT_DIR}/temp
    echo "Output dir: $USER_DIR"
    ;;
esac

# SystemDS-config.xml and log4j.properties are created from their templates
# if they do not exist yet.
for CONF_NAME in SystemDS-config.xml log4j.properties
do
  CONF_FILE="${PROJECT_ROOT_DIR}/conf/${CONF_NAME}"
  if [ ! -f "${CONF_FILE}" ]
  then
    cp "${CONF_FILE}.template" "${CONF_FILE}"
    echo "... created ${CONF_FILE}"
  fi
done
unset CONF_NAME CONF_FILE




# add hadoop libraries which were generated by the build to the classpath
# The escaped double quotes become part of the value; the assembled command
# line is later run through eval, which is where they take effect.
CLASSPATH=\"${BUILD_DIR}/lib/*\"

#SYSTEM_DS_JAR=$( find $PROJECT_ROOT_DIR/target/system-ds-*-SNAPSHOT.jar )
# Despite its name the variable refers to the directory of compiled classes,
# not to a jar file.
SYSTEM_DS_JAR=\"${BUILD_DIR}/classes\"

CLASSPATH=${CLASSPATH}:${SYSTEM_DS_JAR}

echo "================================================================================"

# Set default Java options
# (heap sizes, classpath, log4j configuration file and user.dir; the single
# quotes are kept literally in the value and are resolved by the later eval)
SYSTEMDS_DEFAULT_JAVA_OPTS="\
-Xmx8g -Xms4g -Xmn1g \
-cp $CLASSPATH \
-Dlog4j.configuration=file:'$PROJECT_ROOT_DIR/conf/log4j.properties' \
-Duser.dir='$USER_DIR'"

# Add any custom Java options set by the user at command line, overriding defaults as necessary.
# The variable is unset afterwards, so the check below only sees a value
# assigned while sourcing systemds-env.sh.
if [ ! -z "${SYSTEMDS_JAVA_OPTS}" ]; then
SYSTEMDS_DEFAULT_JAVA_OPTS+=" ${SYSTEMDS_JAVA_OPTS}"
unset SYSTEMDS_JAVA_OPTS
fi

# Add any custom Java options set by the user in the environment variables file, overriding defaults as necessary.
if [ -f "${PROJECT_ROOT_DIR}/conf/systemds-env.sh" ]; then
# the file is sourced into this shell, so it can set SYSTEMDS_JAVA_OPTS
. "${PROJECT_ROOT_DIR}/conf/systemds-env.sh"
if [ ! -z "${SYSTEMDS_JAVA_OPTS}" ]; then
SYSTEMDS_DEFAULT_JAVA_OPTS+=" ${SYSTEMDS_JAVA_OPTS}"
fi
fi


printUsageExit()
{
  # Run DMLScript with -help so that it prints its own help output, then
  # end the script with exit code 0.
  CMD="java ${SYSTEMDS_DEFAULT_JAVA_OPTS} org.tugraz.sysds.api.DMLScript -help"
  eval ${CMD}
  exit 0
}

# Option handling: help requests and unknown options end the script via
# printUsageExit, -f drops one positional parameter.
# Both h and f are declared with a trailing colon, i.e. getopts expects an
# argument for each of them.
# NOTE(review): "-help" is presumably parsed as -h with the argument "elp" -- confirm
while getopts "h:f:" options; do
case $options in
h ) echo Warning: Help requested. Will exit after usage message
printUsageExit
;;
# getopts reports options it does not know as '?'
\? ) echo Warning: Help requested. Will exit after usage message
printUsageExit
;;
# presumably removes a leading -f so that the script file name becomes $1
# for the code after the loop -- verify against the supported call patterns
f ) #echo "Shifting args due to -f"
shift
;;
# fallback branch: only prints a message, the loop continues afterwards
* ) echo Error: Unexpected error while processing options
esac
done

# Peel off first argument so that $@ contains arguments to DML script
SCRIPT_FILE=$1
shift

# if the script file path was omitted, try to complete the script path
if [ ! -f "$SCRIPT_FILE" ]
then
  # quoted, so that a script name containing spaces stays one word
  SCRIPT_FILE_NAME=$(basename "$SCRIPT_FILE")
  # the name may exist several times below scripts/; only the first match
  # is used because the java command needs exactly one file
  SCRIPT_FILE_FOUND=$(find "$PROJECT_ROOT_DIR/scripts" -type f -name "$SCRIPT_FILE_NAME" | head -n 1)
  if [ -z "$SCRIPT_FILE_FOUND" ]
  then
    echo "Could not find DML script: $SCRIPT_FILE"
    printSimpleUsage
  else
    SCRIPT_FILE=$SCRIPT_FILE_FOUND
    echo "DML script: $SCRIPT_FILE"
  fi
fi


# Assemble the java command line; the remaining script arguments are appended
# as a single space separated string.
CMD="\
java ${SYSTEMDS_DEFAULT_JAVA_OPTS} \
org.tugraz.sysds.api.DMLScript \
-f '$SCRIPT_FILE' \
-exec singlenode \
-config '$PROJECT_ROOT_DIR/conf/SystemDS-config.xml' \
$*"

export HADOOP_HOME=/tmp/systemds
# eval resolves the quotes that are embedded in the command string
eval ${CMD}

RETURN_CODE=$?

# if there was an error, display the full java command (in case some of the variable substitutions broke it)
if [ $RETURN_CODE -ne 0 ]
then
  echo "Failed to run SystemDS. Exit code: $RETURN_CODE"
  LF=$'\n'
  echo " ${CMD// /$LF }"
fi

# hand the result of the java process to the caller; without this the script
# would report success even if SystemDS failed
exit $RETURN_CODE

Loading

0 comments on commit 7239c9b

Please sign in to comment.