diff --git a/ipmi/conf.d/ipmi.pyconf b/ipmi/conf.d/ipmi.pyconf index 600c5f07..90d16620 100644 --- a/ipmi/conf.d/ipmi.pyconf +++ b/ipmi/conf.d/ipmi.pyconf @@ -8,21 +8,34 @@ modules { value = "ipmi" } - # IP of rhe IPMI - param ipmi_ip { - value = "10.1.2.3" + # use sudo + param use_sudo { + value = False } + + # IP of the IPMI (optional) + #param ipmi_ip { + # value = "10.1.2.3" + #} - param username { - value = "admin" - } + # IPMI username (optional) + #param username { + # value = "admin" + #} - param password { - value = "secret" - } + # IPMI password (optional) + #param password { + # value = "secret" + #} - param level { - value = "USER" + # IPMI auth level (optional) + #param level { + # value = "USER" + #} + + # timeout on ipmitool command + param timeout { + value = 15 } # Location of timeout binary @@ -31,7 +44,7 @@ modules { } # Location of ipmitool binary - param timeout_bin { + param ipmitool_bin { value = "/usr/bin/ipmitool" } diff --git a/ipmi/python_modules/ipmi.py b/ipmi/python_modules/ipmi.py index 24769c11..7b6d54f4 100644 --- a/ipmi/python_modules/ipmi.py +++ b/ipmi/python_modules/ipmi.py @@ -1,3 +1,4 @@ +import os import sys import re import time @@ -7,38 +8,143 @@ METRICS = { 'time' : 0, - 'data' : {} + 'data' : {}, + 'units': {}, + 'descr': {} } METRICS_CACHE_MAX = 5 stats_pos = {} -def get_metrics(params): +# Try to make different vendors' sensor names at least somewhat consistent... +# This list is admittedly a bit Dell centric, as I have HP and Dell +# hardware and Dell's sensor names (mostly) make more sense to me than +# HP's... 
#   --troy
# Maps vendor-specific IPMI sensor names to a unified, human-readable name.
# Keys are compared against the sensor name after surrounding whitespace has
# been stripped.
unified_metric_names = {
    # HP sensor names
    "01-Inlet Ambient": "Inlet Temp",
    "43-Sys Exhaust": "Exhaust Temp",
    "02-CPU 1": "CPU 1 Temp",
    "03-CPU 2": "CPU 2 Temp",
    "04-P1 DIMM 1-4": "CPU 1 MemBank 1 Temp",
    "05-P1 DIMM 5-8": "CPU 1 MemBank 2 Temp",
    "06-P2 DIMM 1-4": "CPU 2 MemBank 1 Temp",
    "07-P2 DIMM 5-8": "CPU 2 MemBank 2 Temp",
    "34-Coprocessor 1": "Coprocessor 1 Temp",
    "35-Coprocessor 2": "Coprocessor 2 Temp",
    "36-Coprocessor 3": "Coprocessor 3 Temp",
    "42-P/S Board": "Pwr Supply 1 Temp",
    "Power Meter": "Pwr Consumption",
    "Temp 1": "Inlet Temp",
    "Temp 2 (CPU 1)": "CPU 1 Temp",
    "Temp 3 (CPU 2)": "CPU 2 Temp",
    "Temp 4 (MemD1)": "CPU 1 MemBank 1 Temp",
    "Temp 5 (MemD2)": "CPU 2 MemBank 1 Temp",
    "Temp 16 (GPU2)": "Coprocessor 2 Temp",
    "Temp 17 (GPU3)": "Coprocessor 3 Temp",
    "Temp 18 (GPU1)": "Coprocessor 1 Temp",
    # Dell sensor names
    "Fan1": "Fan 1",
    "Fan2": "Fan 2",
    "Fan3": "Fan 3",
    "Fan4": "Fan 4",
    "Fan5": "Fan 5",
    "Fan6": "Fan 6",
    "Fan7": "Fan 7",
    "Fan8": "Fan 8",
    "Fan1A": "Fan 1A",
    "Fan1B": "Fan 1B",
    "Fan2A": "Fan 2A",
    "Fan2B": "Fan 2B",
    "Fan3A": "Fan 3A",
    "Fan3B": "Fan 3B",
    "Fan4A": "Fan 4A",
    "Fan4B": "Fan 4B",
    "Fan5A": "Fan 5A",
    "Fan5B": "Fan 5B",
    "Fan6A": "Fan 6A",
    "Fan6B": "Fan 6B",
    "Fan7A": "Fan 7A",
    "Fan7B": "Fan 7B",
    "Fan8A": "Fan 8A",
    "Fan8B": "Fan 8B",
    # Intel(?) sensor names
    "Front Panel Temp": "Inlet Temp",
    "Exit Air Temp": "Exhaust Temp",
    "System Fan 1": "Fan 1",
    "System Fan 2": "Fan 2",
    "Processor 1 Fan": "Fan 3",
    "Processor 2 Fan": "Fan 4",
    "PS1 Temperature": "Pwr Supply 1 Temp",
    "PS2 Temperature": "Pwr Supply 2 Temp",
}


def metric_description(metric_name):
    """Return the human-readable description for a raw sensor name.

    The name is stripped of surrounding whitespace and, when it appears in
    ``unified_metric_names``, replaced by its unified equivalent; otherwise
    the stripped name itself is returned.
    """
    stripped = metric_name.strip()
    # Single hashed lookup; avoids scanning a key list on Python 2.
    return unified_metric_names.get(stripped, stripped)


def mangle_metric_name(metric_name, prefix):
    """Return the metric identifier for a raw sensor name.

    The sensor name is unified exactly as in ``metric_description``, then
    lower-cased, "+" is removed, and spaces and "-" become "_".  The result
    is prepended with ``prefix`` and an underscore.
    """
    # Share the lookup with metric_description so the identifier and the
    # description can never disagree about which sensor names are unified.
    name = metric_description(metric_name).lower()
    name = name.replace("+", "").replace(" ", "_").replace("-", "_")
    return prefix + "_" + name
ipmitool is guaranteed + # to fail + if ( 'ipmi_ip' not in params.keys() and + not os.path.exists('/dev/ipmi0') and + not os.path.exists('/dev/ipmi/0') and + not os.path.exists('/dev/ipmidev/0') ): + pass + # otherwise, run ipmitool if we're outside the cache timeout + elif (time.time() - METRICS['time']) > METRICS_CACHE_MAX: + new_metrics = {} + units = {} + descr = {} + + command = [ params['timeout_bin'], str(params['timeout']) ] + if ( 'use_sudo' in params.keys() and params['use_sudo'] ): + command.append('sudo') + command.append(params['ipmitool_bin']) + if ( 'ipmi_ip' in params.keys() ): + command.append("-H") + command.append(params['ipmi_ip']) + if ( 'username' in params.keys() ): + command.append("-U") + command.append(params['username']) + if ( 'password' in params.keys() ): + command.append('-P') + command.append(params['password']) + if ('level' in params.keys() ): + command.append('-L') + command.append(params['level']) + command.append('sensor') + p = subprocess.Popen(command, stdout=subprocess.PIPE).communicate()[0][:-1] + dell_temp_count = 1 for i, v in enumerate(p.split("\n")): data = v.split("|") try: - metric_name = data[0].strip().lower().replace("+", "").replace(" ", "_") + if ( data[0].strip()=="Temp" ): + # Dell names all CPU temperature sensors "Temp"; + # thus, the following stupidity: + description = "CPU "+str(dell_temp_count)+" Temp" + metric_name = mangle_metric_name(description,params['metric_prefix']) + dell_temp_count = dell_temp_count+1 + else: + description = metric_description(data[0]) + metric_name = mangle_metric_name(data[0],params['metric_prefix']) value = data[1].strip() # Skip missing sensors @@ -53,16 +159,18 @@ def get_metrics(params): new_metrics[metric_name] = metric_value units[metric_name] = data[2].strip().replace("degrees C", "C") - + descr[metric_name] = description + except ValueError: continue except IndexError: continue - - METRICS = { + + METRICS = { 'time': time.time(), 'data': new_metrics, - 'units': units + 
'units': units, + 'descr': descr } return [METRICS] @@ -72,14 +180,15 @@ def get_value(name): """Return a value for the requested metric""" try: + + metrics = get_metrics()[0] - metrics = get_metrics()[0] - - name = name.lstrip('ipmi_') + if ( name in metrics['data'].keys() ): + result = metrics['data'][name] + else: + result = 0 - result = metrics['data'][name] - - except Exception: + except Exception as e: result = 0 return result @@ -91,7 +200,7 @@ def create_desc(skel, prop): return d def metric_init(params): - global descriptors, metric_map, Desc_Skel + global descriptors, metric_map, Desc_Skel, global_params descriptors = [] @@ -107,34 +216,41 @@ def metric_init(params): 'groups' : 'XXX', } - metrics = get_metrics(params)[0] + global_params = params + + metrics = get_metrics()[0] for item in metrics['data']: - descriptors.append(create_desc(Desc_Skel, { - "name" : params['metric_prefix'] + "_" + item, - 'groups' : params['metric_prefix'], - 'units' : metrics['units'][item] - })) + descriptors.append(create_desc(Desc_Skel, { + 'name' : item, + 'description' : metrics['descr'][item], + 'groups' : params['metric_prefix'], + 'units' : metrics['units'][item] + })) return descriptors + def metric_cleanup(): '''Clean up the metric module.''' pass + #This code is for debugging and unit testing if __name__ == '__main__': params = { - "metric_prefix" : "ipmi", - "ipmi_ip" : "10.1.2.3", - "username" : "ADMIN", - "password" : "secret", - "level" : "USER", - "ipmitool_bin" : "/usr/bin/ipmitool", - "timeout_bin" : "/usr/bin/timeout" - } + "use_sudo" : False, + "metric_prefix" : "ipmi", + #"ipmi_ip" : "10.1.2.3", + #"username" : "ADMIN", + #"password" : "secret", + #"level" : "USER", + "timeout" : 15, + "ipmitool_bin" : "/usr/bin/ipmitool", + "timeout_bin" : "/usr/bin/timeout" + } descriptors = metric_init(params) while True: diff --git a/moab/conf.d/moab.pyconf b/moab/conf.d/moab.pyconf new file mode 100644 index 00000000..0dbf05fa --- /dev/null +++ 
"""Ganglia gmond Python metric module for the Moab workload manager.

Runs ``showq -s --xml`` and publishes the node, core and job counts found in
its output.  Metric names are ``<metric_prefix>_<name>`` so that they match
the ``name_match`` pattern of the collection group below.

Sample gmond configuration (conf.d/moab.pyconf)::

    modules {
      module {
        name = "moab"
        language = "python"

        # If you change this entry make sure you put it under name_match section
        param metric_prefix {
          value = "moab"
        }

        # Debug flag
        param debug {
          value = False
        }

        # location of Moab config files
        param moab_home_dir {
          value = "/var/spool/moab"
        }

        # Moab server host
        #param moab_server {
        #  value = "moabsrv.mydomain.org"
        #}

        # Moab server port
        #param moab_port {
        #  value = 42559
        #}

        # Location of showq binary
        param showq_bin {
          value = "/opt/moab/bin/showq"
        }

        # timeout on Moab client commands
        param timeout {
          value = 15
        }
      }
    }

    collection_group {
      collect_every = 60
      time_threshold = 90

      metric {
        name_match = "moab_(.+)"
        value_threshold = 1.0
      }
    }
"""

import os
import subprocess
import sys
import time
from xml.dom import minidom

# Cache of the most recent successful showq sample.
METRICS = {
    'time' : 0,
    'data' : {},
    'units': {},
    'descr': {}
}

# Minimum number of seconds between two showq invocations.
METRICS_CACHE_MAX = 60

# Module parameters; replaced by metric_init().  Starting with an empty dict
# lets get_metrics()/get_value() be called safely before initialisation.
global_params = {}

# showq <cluster> attribute -> (metric name, units, description)
_CLUSTER_ATTRS = {
    "LocalActiveNodes": ("allocated_nodes", "nodes", "Allocated Nodes"),
    "LocalIdleNodes": ("idle_nodes", "nodes", "Idle Nodes"),
    "LocalUpNodes": ("up_nodes", "nodes", "Up Nodes"),
    "LocalAllocProcs": ("allocated_cores", "cores",
                        "Allocated Processor Cores"),
    "LocalIdleProcs": ("idle_cores", "cores", "Idle Processor Cores"),
    "LocalUpProcs": ("up_cores", "cores", "Up Processor Cores"),
}

# showq <queue option="..."> value -> (metric name, description)
_QUEUE_OPTIONS = {
    "active": ("running_jobs", "Running Jobs"),
    "eligible": ("eligible_jobs", "Eligible Jobs"),
    "blocked": ("blocked_jobs", "Blocked Jobs"),
}


def _flag_enabled(value):
    """Interpret a configuration flag given as a bool or as its string form."""
    if isinstance(value, str):
        # NOTE(review): presumably gmond hands parameter values over as
        # strings, in which case "False" must not count as enabled - confirm.
        return value.strip().lower() not in ("", "0", "false", "no", "off")
    return bool(value)


def _metric_key(prefix, name):
    """Return ``name`` qualified with ``prefix`` (unchanged if no prefix)."""
    if prefix:
        return "%s_%s" % (prefix, name)
    return name


def _parse_showq_xml(xml_text, prefix):
    """Extract (data, units, descr) dicts from ``showq --xml`` output."""
    data = {}
    units = {}
    descr = {}
    xmldoc = minidom.parseString(xml_text)

    # Every recognised attribute of every <cluster> element is one metric.
    for cluster in xmldoc.getElementsByTagName("cluster"):
        for attr, (name, unit, text) in _CLUSTER_ATTRS.items():
            if cluster.hasAttribute(attr):
                key = _metric_key(prefix, name)
                data[key] = int(cluster.getAttribute(attr))
                units[key] = unit
                descr[key] = text

    # <queue option="active|eligible|blocked" count="N"/> gives job counts.
    for queue in xmldoc.getElementsByTagName("queue"):
        if queue.hasAttribute("option") and queue.hasAttribute("count"):
            known = _QUEUE_OPTIONS.get(queue.getAttribute("option"))
            if known is not None:
                key = _metric_key(prefix, known[0])
                data[key] = int(queue.getAttribute("count"))
                units[key] = "jobs"
                descr[key] = known[1]

    return data, units, descr


def get_metrics():
    """Return all metrics as a one-element list holding the METRICS cache.

    showq is run only when 'showq_bin' is configured and the cache is older
    than METRICS_CACHE_MAX seconds.  If showq cannot be started or its output
    cannot be parsed, a warning is written to stderr and the previous cache
    is returned unchanged.
    """
    global METRICS

    params = global_params

    if 'showq_bin' not in params:
        return [METRICS]
    if (time.time() - METRICS['time']) <= METRICS_CACHE_MAX:
        return [METRICS]

    if 'moab_home_dir' in params:
        os.environ['MOABHOMEDIR'] = params['moab_home_dir']
    command = [params['showq_bin'], "-s", "--xml"]
    if 'moab_server' in params:
        command.append("--host=%s" % params['moab_server'])
    if 'moab_port' in params:
        command.append("--port=%s" % str(params['moab_port']))
    if 'timeout' in params:
        command.append("--timeout=%s" % str(params['timeout']))
    # Honour the flag's value; its mere presence (e.g. debug = False) must
    # not turn debugging output on.
    if _flag_enabled(params.get('debug', False)):
        print(str(command))

    try:
        proc = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                close_fds=True)
        # communicate() drains both pipes and reaps the child, so no zombie
        # is left behind.  stderr is kept out of the XML stream so that a
        # diagnostic message from showq cannot break parsing.
        output = proc.communicate()[0]
        new_metrics, units, descr = _parse_showq_xml(
            output, params.get('metric_prefix'))
    except Exception as e:
        sys.stderr.write("WARNING: %s\n" % str(e))
    else:
        METRICS = {
            'time': time.time(),
            'data': new_metrics,
            'units': units,
            'descr': descr
        }

    return [METRICS]


def get_value(name):
    """Return a value for the requested metric, or 0 if it is unavailable."""
    try:
        return get_metrics()[0]['data'].get(name, 0)
    except Exception:
        # Any failure while sampling is reported to the caller as 0.
        return 0


def create_desc(skel, prop):
    """Return a copy of ``skel`` with the entries of ``prop`` applied."""
    d = skel.copy()
    d.update(prop)
    return d


def metric_init(params):
    """Store ``params`` and return one descriptor per available metric.

    The descriptor list is built from a fresh (or cached) showq sample, so
    it is empty when showq is not configured or could not be queried.
    """
    global descriptors, Desc_Skel, global_params

    descriptors = []

    Desc_Skel = {
        'name'        : 'XXX',
        'call_back'   : get_value,
        'time_max'    : METRICS_CACHE_MAX,
        'value_type'  : 'uint',
        'format'      : '%d',
        'units'       : 'count/s',
        'slope'       : 'both', # zero|positive|negative|both
        'description' : 'XXX',
        'groups'      : 'XXX',
    }

    global_params = params

    metrics = get_metrics()[0]

    for item in metrics['data']:
        descriptors.append(create_desc(Desc_Skel, {
            'name'        : item,
            'description' : metrics['descr'][item],
            'groups'      : params['metric_prefix'],
            'units'       : metrics['units'][item]
        }))

    return descriptors


def metric_cleanup():
    """Clean up the metric module"""
    pass


# Debugging / unit-testing driver: initialise the module with sample
# parameters, then print every metric once per cache interval, forever.
if __name__ == '__main__':

    params = {
        "metric_prefix" : "moab",
        # Optional settings, shown here with example values:
        #"debug"         : True,
        "moab_home_dir" : "/var/spool/moab",
        #"moab_server"   : "moabsrv.mydomain.org",
        #"moab_port"     : 42559,
        "showq_bin"     : "/opt/moab/bin/showq",
        "timeout"       : 30,
    }

    descriptors = metric_init(params)

    while True:
        for d in descriptors:
            v = d['call_back'](d['name'])
            # Single-argument parenthesised form prints identically under
            # the Python 2 print statement and the Python 3 print function.
            print('%s = %s' % (d['name'], v))
        print('Sleeping %d seconds\n' % METRICS_CACHE_MAX)
        time.sleep(METRICS_CACHE_MAX)