# check_gputemp

#!/bin/bash

################################################################################
#                                                                              #
#  Copyright (C) 2011 Jack-Benny Persson <jack-benny@cyberinfo.se>             #
#                                                                              #
#   This program is free software; you can redistribute it and/or modify       #
#   it under the terms of the GNU General Public License as published by       #
#   the Free Software Foundation; either version 2 of the License, or          #
#   (at your option) any later version.                                        #
#                                                                              #
#   This program is distributed in the hope that it will be useful,            #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of             #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              #
#   GNU General Public License for more details.                               #
#                                                                              #
#   You should have received a copy of the GNU General Public License          #
#   along with this program; if not, write to the Free Software                #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA  #
#                                                                              #
################################################################################

###############################################################################
#                                                                             #
# Nagios plugin to monitor GPU temperature with aticonfig.                    #
# This only works on ATI cards with the proprietary driver (fglrx).           #
# Written in Bash (and uses sed & awk).                                       #
#                                                                             #
# Version 1.1: Fixed a typo (EXIT_UNKOWN to STATE_UNKNOWN on line 163)        #
###############################################################################

# Plugin identification, shown by the version and help screens
VERSION='Version 1.1'
AUTHOR='(c) 2011 Jack-Benny Persson (jack-benny@cyberinfo.se)'

# Absolute path of the aticonfig binary that is queried for the temperature
SENSORPROG='/usr/bin/aticonfig'

# Exit statuses handed back to Nagios
STATE_OK=0 STATE_WARNING=1 STATE_CRITICAL=2 STATE_UNKNOWN=3

# Turn on extended pattern matching, e.g. +([0-9])
shopt -s extglob

#### Functions ####

# Print version information.
# Globals:  VERSION (read)
# Outputs:  two newlines followed by "<script path> - <version>" on stdout
print_version()
{
        # Fixed format string: $0 and $VERSION are passed as data, so a '%'
        # or a backslash in either of them is printed literally instead of
        # being interpreted as a printf directive.
        printf '\n\n%s - %s\n' "$0" "$VERSION"
}

# Print help information: the version banner, the author line, a one-line
# description and the list of supported options.
# Globals:  AUTHOR (read)
# Outputs:  help text on stdout
print_help()
{
        print_version
        # AUTHOR is passed as data, never as the format string
        printf '%s\n' "$AUTHOR"
        printf 'Monitor GPU temperature with the use of aticonfig (fglrx)\n'
        # Quoted delimiter: the text below is literal, nothing is expanded
        /bin/cat <<'EOT'

Options:
-h, --help
   Print detailed help screen
-V, --version
   Print version information
-v, --verbose
   Verbose output (give it twice to get debugging information)

--adapter NUM
   Set which GPU adapter to monitor, for example 0 or 1. Default is 0

-w, --warning INTEGER
   Exit with WARNING status if above INTEGER degrees
-c, --critical INTEGER
   Exit with CRITICAL status if above INTEGER degrees
EOT

}


###### MAIN ########

# Warning threshold (set with -w/--warning)
thresh_warn=
# Critical threshold (set with -c/--critical)
thresh_crit=
# Hardware to monitor (set with --adapter)
adapter=0
# Verbosity level (raised by -v/--verbose); initialised explicitly so that a
# value inherited from the environment cannot leak into the script
verbosity=0

# See if we have the aticonfig program installed and can execute it.
# The message is printed as a single line (Nagios plugin output conventionally
# carries the status text on its first line) and the exit status is
# STATE_UNKNOWN; the previous misspelt variable name expanded to nothing, which
# made the script exit with status 0 (OK) here.
if [[ ! -x "$SENSORPROG" ]]; then
        printf "It appears you don't have aticonfig installed in %s\n" \
                "$SENSORPROG"

        exit "$STATE_UNKNOWN"
fi

# Parse command line options.
#
# Globals written: thresh_warn, thresh_crit, adapter, verbosity
# Globals read:    STATE_OK, STATE_UNKNOWN
# Arguments:       the script's command line ("$@")
# Exits with STATE_OK after printing the help or version screen, and with
# STATE_UNKNOWN on an invalid option, a missing option argument or a threshold
# that is not an unsigned integer.
parse_args()
{
   # Loop on the argument count instead of on "$1" being non-empty, so that an
   # empty argument is reported as invalid rather than silently ending parsing.
   while (( $# > 0 )); do
      case "$1" in

          # The question mark is escaped: unescaped it is a glob that matches
          # ANY single-letter option, which made unknown options exit with OK.
          -h | --help | -\?)
              print_help
              exit "$STATE_OK"
              ;;

          -V | --version)
              print_version
              exit "$STATE_OK"
              ;;

          -v | --verbose)
              verbosity=$(( verbosity + 1 ))
              shift
              ;;

          -w | --warning | -c | --critical)
              if [[ -z "$2" ]]; then
                  # Threshold not provided
                  printf "\nOption '%s' requires an argument" "$1"
                  print_help
                  exit "$STATE_UNKNOWN"
              elif [[ ! "$2" =~ ^[0-9]+$ ]]; then
                  # Threshold is not an integer
                  printf "\nThreshold must be an integer"
                  print_help
                  exit "$STATE_UNKNOWN"
              fi
              # Force base 10 so that a value with a leading zero ("08") is
              # stored as 8 instead of failing later as an invalid octal number.
              case "$1" in
                  -w | --warning) thresh_warn=$(( 10#$2 )) ;;
                  *)              thresh_crit=$(( 10#$2 )) ;;
              esac
              shift 2
              ;;

          --adapter)
              if [[ -z "$2" ]]; then
                  printf "\nOption '%s' requires an argument" "$1"
                  print_help
                  exit "$STATE_UNKNOWN"
              fi
              adapter=$2
              shift 2
              ;;

          *)
              # The offending argument is passed as data so that '%' or a
              # backslash in it cannot be interpreted by printf.
              printf "\nInvalid option '%s'" "$1"
              print_help
              exit "$STATE_UNKNOWN"
              ;;
      esac
   done
}

parse_args "$@"


# Guard: refuse to continue when no adapter (sensor) is selected
[[ -n "$adapter" ]] || {
        # Nothing to monitor: explain, show the help screen and give up
        printf "\nNo sensor specified"
        print_help
        exit $STATE_UNKNOWN
}


# Extract the integer part of the first temperature reading from the text on
# stdin. As in the original parsing, the reading is taken to be the fifth
# whitespace-separated field of a line containing "Temperature".
# Everything from the decimal point on is dropped, so "105.00" yields "105"
# and "7.50" yields "7" (cutting to two characters gave "10" and "7.").
# Outputs: the integer part on stdout, or nothing if no such line is found.
extract_temperature()
{
        awk '/Temperature/ && !seen { sub(/\..*$/, "", $5); print $5; seen = 1 }'
}

# Get the temperature
TEMP=$("$SENSORPROG" --adapter="$adapter" --od-gettemperature \
        | extract_temperature)


# Both thresholds are mandatory
if [[ -z "$thresh_warn" ]] || [[ -z "$thresh_crit" ]]; then
        # At least one of the two thresholds is missing
        printf "\nThreshold not set"
        print_help
        exit $STATE_UNKNOWN
fi

# The critical threshold may not be below the warning threshold
if [[ "$thresh_crit" -lt "$thresh_warn" ]]; then
        printf "\nWarning temperature should be lower than critical"
        print_help
        exit $STATE_UNKNOWN
fi


# Verbose output: when the verbosity level is 2 or higher, dump the thresholds,
# the verbosity level, the temperature held in TEMP and the raw "Temperature"
# lines printed by aticonfig.
if [[ "$verbosity" -ge 2 ]]; then
   /bin/cat <<__EOT
Debugging information:
  Warning threshold: $thresh_warn
  Critical threshold: $thresh_crit
  Verbosity level: $verbosity
  Current GPU $adapter temperature: $TEMP
__EOT
   printf "\n  Temperature lines directly from aticonfig:\n"
   # Quoted so that an adapter value containing whitespace or glob characters
   # reaches aticonfig as one --adapter argument instead of being split.
   "$SENSORPROG" --adapter="$adapter" --od-gettemperature | grep "Temperature"
   printf "\n\n"
fi


# And finally check the temperature against our thresholds.
#
# A reading that is not a plain integer (for example empty, because the sensor
# program printed nothing usable) is reported as UNKNOWN. Without this guard
# both numeric comparisons below evaluate to false for such a value and the
# plugin would fall through to the OK branch.
# Every branch exits, so nothing after this block is reached.
if [[ ! "$TEMP" =~ ^-?[0-9]+$ ]]; then
        # No usable reading
        echo "GPU $adapter UNKNOWN - Temperature could not be read"
        exit "$STATE_UNKNOWN"

  elif [[ "$TEMP" -gt "$thresh_crit" ]]; then
        # Temperature is above critical threshold
        echo "GPU $adapter CRITICAL - Temperature is $TEMP"
        exit "$STATE_CRITICAL"

  elif [[ "$TEMP" -gt "$thresh_warn" ]]; then
        # Temperature is above warning threshold
        echo "GPU $adapter WARNING - Temperature is $TEMP"
        exit "$STATE_WARNING"

  else
        # Temperature is ok
        echo "GPU $adapter OK - Temperature is $TEMP"
        exit "$STATE_OK"
fi