#!/usr/bin/env python
"""Nagios check: report EC2 instances running on degraded hardware.

Usage: check_degraded_instances REGION APP_NAME ENV_NAME

The check asks EC2 for every instance in REGION that carries a scheduled
event described as "The instance is running on degraded hardware", keeps
the ones tagged App=APP_NAME and Env=ENV_NAME, and exits with:

    OK        no matching instance is on degraded hardware
    CRITICAL  at least one matching instance is (ids are listed)
    UNKNOWN   the check itself failed (bad region, missing boto, API error)

Provenance: reconstructed from a two-commit patch series by whd against
chef/cookbooks/persona-monitor/files/default/usr/local/nagios/libexec/
check_degraded_instances:
    79a92e5 (2013-07-10)  First pass at a degraded instance monitor
    7fbf8fc (2013-08-19)  Degraded instance check per issue #98
"""
import argparse
import sys

# Nagios plugin exit codes, keyed by the status word printed on the first line.
ERRORS = {'OK': 0, 'WARNING': 1, 'CRITICAL': 2, 'UNKNOWN': 3}

# Exact text of the EC2 scheduled-event description this check filters on.
DEGRADED_EVENT_DESCRIPTION = 'The instance is running on degraded hardware'


def sendSignal(status, message):
    """Print ``STATUS: message`` and exit with the matching Nagios code.

    status  -- one of the keys of ERRORS ('OK', 'WARNING', 'CRITICAL',
               'UNKNOWN'); any other value raises KeyError.
    message -- human-readable text shown in the Nagios UI.

    Never returns: always raises SystemExit.
    """
    # Single-argument call form behaves identically on Python 2.7 and 3.
    print(status + ': ' + message)
    sys.exit(ERRORS[status])


def matches_tags(instance, constraints):
    """Return True when ``instance.tags`` has every key/value in constraints."""
    tags = instance.tags
    return all(tags.get(key) == value for key, value in constraints.items())


def find_degraded_instances(connection):
    """Return the instance objects EC2 currently flags as degraded.

    connection -- an EC2 connection exposing ``get_all_instance_status`` and
                  ``get_all_instances`` (boto's EC2Connection).

    Returns a flat list of instances (possibly empty), taken from the
    ``instances`` attribute of every reservation returned.
    """
    degraded = connection.get_all_instance_status(
        instance_ids=None,
        filters={'event.description': DEGRADED_EVENT_DESCRIPTION})
    instance_ids = [status.id for status in degraded]

    # An empty id list must not reach get_all_instances: boto leaves the
    # InstanceId filter out of the request when the list is empty, so the
    # call would describe every instance in the region and the check would
    # report all of them as degraded.
    if not instance_ids:
        return []

    reservations = connection.get_all_instances(instance_ids=instance_ids)
    return [instance
            for reservation in reservations
            for instance in reservation.instances]


def main(argv=None):
    """Run the check; always terminates the process through sendSignal.

    argv -- argument list to parse; defaults to ``sys.argv[1:]``.
    """
    try:
        parser = argparse.ArgumentParser(
            description='Check for EC2 instances on degraded hardware.')
        parser.add_argument('region', help='EC2 region name')
        parser.add_argument('app_name', help='App tag value')
        parser.add_argument('env_name', help='Env tag value')
        conf = parser.parse_args(argv)

        # Imported inside the try block so that a missing or broken boto
        # install is reported as UNKNOWN instead of an import traceback.
        import boto.ec2

        connection = boto.ec2.connect_to_region(conf.region)
        if connection is None:
            # connect_to_region returns None for a region it does not know.
            sendSignal('UNKNOWN', 'unknown EC2 region: ' + conf.region)

        constraints = {'App': conf.app_name, 'Env': conf.env_name}
        actionable = [instance
                      for instance in find_degraded_instances(connection)
                      if matches_tags(instance, constraints)]

        if not actionable:
            sendSignal('OK', 'No instances running on degraded hardware')
        else:
            sendSignal('CRITICAL',
                       "{} instances running on degraded hardware: {}".format(
                           len(actionable),
                           ",".join([i.id for i in actionable])))
    except Exception as exception:
        # sendSignal's SystemExit is not an Exception subclass, so the
        # exits above pass straight through this handler.
        sendSignal('UNKNOWN', 'exception: ' + str(exception))


### main logic
if __name__ == "__main__":
    main()