This repository was archived by the owner on Jan 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathexclude_nodes.sh
More file actions
executable file
·54 lines (46 loc) · 1.78 KB
/
exclude_nodes.sh
File metadata and controls
executable file
·54 lines (46 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/bin/bash
#
# exclude_nodes.sh [NUM_REQUIRED_NODES]
#
# Builds the list of nodes that should be left out of an aprun launch:
#   1. every node on which "geopmread POWER_PACKAGE_TDP package 0" fails
#      (the MSR could not be read), plus
#   2. as many additional working nodes as needed so that exactly
#      NUM_REQUIRED_NODES working nodes remain.
#
# Arguments:
#   $1 - number of nodes required (optional, default 0).
#        NOTE(review): with the default of 0 every working node counts as
#        "extra" and ends up in the exclude list - confirm this is intended.
# Environment:
#   COBALT_JOBSIZE - node count passed to "aprun -n" (read).
# Globals written:
#   NUM_REQUIRED_NODES, BAD_NODES, NUM_BAD_NODES, NUM_EXTRA_NODES,
#   EXTRA_NODES, EXCLUDE_LIST
# Outputs:
#   stdout - "-E <id>,<id>,..." without a trailing newline, or nothing when
#            the exclude list is empty.
#   stderr - diagnostics.
#   ./exclude_nodes.log - the four values computed by this run.
# Files:
#   ./check_rdmsr.sh and ./get_hostname.sh are created in the current
#   directory, launched through aprun and removed again.
# Exit status:
#   1 if the argument is not a non-negative integer or if fewer working
#   nodes are available than required.

#######################################
# Join newline-separated node ids into one comma-separated list.
# Inputs:   stdin  - one id per line
# Outputs:  stdout - "id,id,...", empty for empty input
#######################################
join_node_ids() {
    # A comma is placed at every end-of-word (\> is a GNU sed extension),
    # then blanks and the trailing comma are dropped.
    tr '\n' ' ' | sed 's/\>/,/g;s/ //g;s/,$//'
}

#######################################
# Create an executable helper script that runs a single command line.
# Arguments: $1 - path of the script to create
#            $2 - command line to place in the script
#######################################
write_node_script() {
    # The final "true" makes the helper exit with status 0 no matter how the
    # command line itself ended.
    printf '%s\n' '#!/bin/bash' "$2" 'true' > "$1"
    chmod u+x "$1"
}

#######################################
# Probe the nodes and record the ones where the MSR read fails.
# Globals:  COBALT_JOBSIZE (read), BAD_NODES, NUM_BAD_NODES (written)
#######################################
find_bad_nodes() {
    local probe=check_rdmsr.sh

    # Try to read a MSR. Print the hostname (without the "nid" prefix and the
    # zeros following it) if that fails.
    write_node_script "$probe" \
        "geopmread POWER_PACKAGE_TDP package 0 >& /dev/null || hostname | sed -e 's/nid[0]*//'"
    BAD_NODES=$(aprun -n "$COBALT_JOBSIZE" -N1 -q "./$probe" | join_node_ids)
    rm -f -- "$probe"

    if [[ -z "$BAD_NODES" ]]; then
        NUM_BAD_NODES=0
    else
        # A non-empty list holds one more id than it has separating commas.
        local commas=${BAD_NODES//[^,]/}
        NUM_BAD_NODES=$(( ${#commas} + 1 ))
    fi
}

#######################################
# Pick the surplus working nodes that must be excluded as well.
# Arguments: $1 - number of working nodes
# Globals:   NUM_REQUIRED_NODES, NUM_BAD_NODES, BAD_NODES (read),
#            NUM_EXTRA_NODES, EXTRA_NODES (written)
#######################################
find_extra_nodes() {
    local num_good=$1
    # Deliberately not called HOSTNAME: that name is a variable maintained by
    # bash itself and must not be overwritten.
    local lister=get_hostname.sh
    # Kept in a local array so that nothing inherited from the caller's
    # environment can end up on the aprun command line.
    local -a exclude_args=()

    # Reset explicitly; a stale value from the environment must never reach
    # the final exclude list.
    EXTRA_NODES=''
    NUM_EXTRA_NODES=$(( num_good - NUM_REQUIRED_NODES ))
    if (( NUM_EXTRA_NODES <= 0 )); then
        return 0
    fi

    write_node_script "$lister" "hostname | sed -e 's/nid[0]*//'"
    if (( NUM_BAD_NODES > 0 )); then
        exclude_args=(-E "$BAD_NODES")
    fi
    # List the working nodes and take the last NUM_EXTRA_NODES of the sorted
    # ids as the surplus.
    EXTRA_NODES=$(aprun "${exclude_args[@]}" -n "$num_good" -N1 -q "./$lister" \
        | sort | tail -n "$NUM_EXTRA_NODES" | join_node_ids)
    rm -f -- "$lister"
}

#######################################
# Entry point; see the header at the top of the file for the full contract.
# Arguments: $1 - number of nodes required (optional)
#######################################
main() {
    local num_good

    NUM_REQUIRED_NODES=${1:-0}
    if [[ ! "$NUM_REQUIRED_NODES" =~ ^[0-9]+$ ]]; then
        echo "Error: number of required nodes must be a non-negative integer, got '$NUM_REQUIRED_NODES'" >&2
        exit 1
    fi
    # Force base 10 so that a value such as "08" is not rejected as octal.
    NUM_REQUIRED_NODES=$(( 10#$NUM_REQUIRED_NODES ))

    find_bad_nodes

    num_good=$(( COBALT_JOBSIZE - NUM_BAD_NODES ))
    if (( NUM_REQUIRED_NODES > num_good )); then
        echo "Error: number of msr-safe enabled nodes is less than number of nodes required!" >&2
        echo "Warning: msr-safe failure detected on the following nodes: $BAD_NODES" >&2
        exit 1
    fi

    find_extra_nodes "$num_good"

    # Concatenate both lists; either may be empty, so trim a dangling comma
    # at either end.
    EXCLUDE_LIST="$BAD_NODES,$EXTRA_NODES"
    EXCLUDE_LIST=${EXCLUDE_LIST%,}
    EXCLUDE_LIST=${EXCLUDE_LIST#,}
    if [[ -n "$EXCLUDE_LIST" ]]; then
        # No trailing newline; the caller presumably splices this straight
        # into another command line - verify against the caller.
        printf '%s' "-E $EXCLUDE_LIST"
    fi

    printf '%s\n' \
        "NUM_BAD_NODES=$NUM_BAD_NODES" \
        "BAD_NODES=$BAD_NODES" \
        "NUM_EXTRA=$NUM_EXTRA_NODES" \
        "EXTRA_NODES=$EXTRA_NODES" > exclude_nodes.log
}

main "$@"