Monitoring

De Wiki de Nelly & Richard
Sauter à la navigation Sauter à la recherche

Introduction

  • Afin de faciliter la surveillance des serveurs, plusieurs outils existent, notamment cacti, mrtg, ... Ils sont très complets mais souvent compliqués à configurer ; j'ai donc préféré munin pour sa simplicité et les possibilités d'extensions disponibles ici.
  • Pour être prévenu lors de l'arrêt, du changement d'état, ... des services, il existe monit.

Munin

Installation

  • L'installation sur une debian est facile :
    • apt-get install munin munin-node
    • Puis une configuration simple dans apache du type :
# /etc/apache/conf.d/munin
#
# Publishes the Munin HTML output under the /munin URL and restricts access
# to the hosts listed below.

Alias /munin /var/www/munin

# <Directory> (filesystem path) is used instead of <Location>, which matches
# URL paths: requests arrive as /munin/..., so a <Location /var/www/munin>
# section would never match and the restrictions below would not be applied.
<Directory /var/www/munin>
       Order deny,allow
       Deny from all
       Allow from toto.com
       Allow from 192.168.0.234
</Directory>

Configuration

  • Pour activer les plugins manuels présents dans /usr/share/munin/plugins/, il suffit de créer un lien symbolique vers chacun d'eux dans /etc/munin/plugins, puis de vérifier les différents paramètres à configurer dans /etc/munin/plugin-conf.d/munin-node ; les manpages ainsi que le site de munin vous seront utiles.
  • Pour ajouter ou créer des plugins non disponibles dans /usr/share/munin/plugins/ le site des plugins pour munin ainsi que les manpages vous aideront.

Exemples

  • Cela m'a permis de constater que ma machine était un peu sur-dimensionnée, mais bon, un geek sans serveur ;)
Netstat-week.png Sensors temp-day.png

Monit

  • Monit permet de surveiller les services présents sur un serveur, de tenter de les redémarrer en cas d'indisponibilité et/ou de vous envoyer un mail en cas d'échec.
  • La documentation en ligne est très bien faite ; de nombreux exemples y sont présents (documentation de monit | site officiel de monit).

Exemple de configuration

  • Fichier de configuration /etc/monit/monitrc
################
# GLOBAL SECTION
# Run monit as a daemon with a cycle value of 120 (seconds, per the Monit
# manual).
set daemon  120
# Log through syslog using the log_daemon facility.
set logfile syslog facility log_daemon

#############
# MAIL ALERTS
# Alerts go through the mail server on localhost, with monit@mail.com as
# sender and toto@mail.com as recipient.
set mailserver   localhost
set mail-format { from: monit@mail.com }
set alert toto@mail.com

#######################
# HTTP SERVER FOR MONIT
# Embedded web interface on port 2812, bound to address 192.168.0.2.
set httpd port 2812 and
  use address 192.168.0.2
# hosts allowed to connect
  allow 192.168.0.2
# HTTP authentication (user:password)
# NOTE(review): example credentials stored in clear text - replace them
# before use and keep this file readable by root only.
  allow toto:121315456464

#####################
# MONITORING SERVICES

# SYSTEM
# Host-level resource checks; every threshold below only raises an alert.
 check system localhost
   # Load average limits over 1 and 5 minutes.
   if loadavg (1min) > 4 then alert
   if loadavg (5min) > 2 then alert
   # Memory usage limit.
   if memory usage > 75% then alert
   # CPU usage limits, split by user / system / wait.
   if cpu usage (user) > 70% then alert
   if cpu usage (system) > 30% then alert
   if cpu usage (wait) > 20% then alert

# POSTFIX
# Watches the postfix master process through its pidfile; member of the
# "mail" group.
 check process postfix with pidfile /var/spool/postfix/pid/master.pid
   group mail
   start program = "/etc/init.d/postfix start"
   stop  program = "/etc/init.d/postfix stop"
   # Restart when the SMTP protocol test on port 25 fails.
   if failed port 25 protocol smtp then restart
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Tied to the init-script check defined below.
   depends on postfix_rc

# Init-script guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails.
 check file postfix_rc with path /etc/init.d/postfix
   group mail
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# SPAMASSASSIN
# Watches spamd through its pidfile; member of the "mail" group.
 check process spamd with pidfile /var/run/spamd.pid
   group mail
   start program = "/etc/init.d/spamassassin start"
   stop  program = "/etc/init.d/spamassassin stop"
   # Restart when a TCP connection to localhost:783 fails.
   if failed host localhost port 783 type TCP then restart
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Tied to the binary and init-script checks defined below.
   depends on spamd_bin
   depends on spamd_rc

# Binary guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails.
 check file spamd_bin with path /usr/sbin/spamd
   group mail
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# Init-script guard, same tests as for the binary.
 check file spamd_rc with path /etc/init.d/spamassassin
   group mail
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# APACHE2
# Watches apache2 through its pidfile; member of the "www-data" group.
 check process apache2 with pidfile /var/run/apache2.pid
   group www-data
   start program = "/etc/init.d/apache2 start"
   stop  program = "/etc/init.d/apache2 stop"
   # Restart when the HTTP test on port 80 or the TCPSSL test on port 443
   # against bigben.nerux.org fails.
   if failed host bigben.nerux.org port 80 protocol http then restart
   if failed host bigben.nerux.org port 443 type TCPSSL then restart
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Tied to the init-script check defined below.
   depends on apache2_rc

# Init-script guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails. Note this check is placed in group "root", not
# "www-data".
 check file apache2_rc with path /etc/init.d/apache2
   group root
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# MYSQL
# Watches mysqld through its pidfile; member of the "mysql" group.
 check process mysql with pidfile /var/run/mysqld/mysqld.pid
   group mysql
   start program = "/etc/init.d/mysql start"
   stop  program = "/etc/init.d/mysql stop"
   # Restart when the mysql protocol test over the local unix socket fails.
   if failed unixsocket /var/run/mysqld/mysqld.sock protocol mysql then restart
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Tied to the init-script check defined below.
   depends on mysql_rc

# Init-script guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails.
 check file mysql_rc with path /etc/init.d/mysql
   group mysql
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# BIND9
# Watches named through its pidfile; member of the "bind" group.
 check process bind9 with pidfile /var/run/bind/run/named.pid
   group bind
   start program = "/etc/init.d/bind9 start"
   stop  program = "/etc/init.d/bind9 stop"
   # Restart when the DNS protocol test on port 53 fails.
   if failed port 53 protocol dns then restart
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Tied to the init-script check defined below.
   depends on bind9_rc

# Init-script guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails.
 check file bind9_rc with path /etc/init.d/bind9
   group bind
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# SSHD
# Watches the ssh daemon through its pidfile.
 check process ssh with pidfile /var/run/sshd.pid
   # "=" added so the start/stop statements use the same form as every other
   # service stanza in this file.
   start program = "/etc/init.d/ssh start"
   stop  program = "/etc/init.d/ssh stop"
   # Restart when the SSH protocol test on globule.ner.com:13000 fails.
   if failed host globule.ner.com port 13000 protocol ssh then restart
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout

# CUPSD
# Watches cupsd through its pidfile; member of the "lpadmin" group.
 check process cupsd with pidfile /var/run/cups/cupsd.pid
   group lpadmin
   start program = "/etc/init.d/cupsys start"
   stop  program = "/etc/init.d/cupsys stop"
   # Restart when the connection test on the local unix socket fails.
   if failed unixsocket /var/run/cups/cups.sock then restart
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Tied to the binary and init-script checks defined below.
   depends on cupsd_bin
   depends on cupsd_rc

# Binary guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails.
 check file cupsd_bin with path /usr/sbin/cupsd
   group lpadmin
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# Init-script guard, same tests as for the binary.
 check file cupsd_rc with path /etc/init.d/cupsys
   group lpadmin
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# APCUPSD
# Watches apcupsd through its pidfile; member of the "root" group.
 check process apcupsd with pidfile /var/run/apcupsd.pid
   group root
   start program = "/etc/init.d/apcupsd start"
   stop  program = "/etc/init.d/apcupsd stop"
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Restart when a TCP connection to localhost:3551 fails.
   if failed host localhost port 3551 type TCP then restart
   # Tied to the binary and init-script checks defined below.
   depends on apcupsd_bin
   depends on apcupsd_rc

# Binary guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails. Unlike the process check, no group is assigned here.
 check file apcupsd_bin with path /sbin/apcupsd
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# Init-script guard, same tests as for the binary.
 check file apcupsd_rc with path /etc/init.d/apcupsd
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# DOVECOT
# Watches the dovecot master process through its pidfile; member of the
# "root" group.
 check process dovecot with pidfile /var/run/dovecot/master.pid
   group root
   start program = "/etc/init.d/dovecot start"
   stop  program = "/etc/init.d/dovecot stop"
   # Give up (timeout) after 5 restarts within 5 cycles.
   if 5 restarts within 5 cycles then timeout
   # Restart when a TCP connection to localhost:10143 fails.
   # NOTE(review): 10143 is not the standard IMAP port (143); presumably
   # dovecot is configured to listen there - confirm against dovecot.conf.
   if failed host localhost port 10143 type TCP then restart
   # Tied to the binary and init-script checks defined below.
   depends on dovecot_bin
   depends on dovecot_rc

# Binary guard: unmonitor if the checksum, permission 755, uid root or
# gid root test fails. Unlike the process check, no group is assigned here.
 check file dovecot_bin with path /usr/sbin/dovecot
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor

# Init-script guard, same tests as for the binary.
 check file dovecot_rc with path /etc/init.d/dovecot
   if failed checksum then unmonitor
   if failed permission 755 then unmonitor
   if failed uid root then unmonitor
   if failed gid root then unmonitor