Ganglia 和 Nagios 等工具是业界常用的监控告警工具;这类工具主要面向运维,也可以用来进行性能和稳定性测试。 分布式系统的测试耗时都比较长,而且往往一台机器上安装多套系统,影响监控指标的准确性。 下面是一种进行进程级别监控的方法,可以通过扩展来增强集群的监控力度;同时将监控脚本加入告警,防止脚本异常退出(Nagios扩展另文描述)。 GEngin.py:总体的引擎,根据conf下配置文件的配置项,轮询监控指标,调用gmetric广播出去 conf:目录中保存metrix配置文件,配置参数指标 flag:目录中仅保存一个flag文件,文件名是任务名,监控指标将根据任务名分离,便于汇总统计对比 log: 目录中记录GEngin的log及每个指标收集脚本的log pid: 目录中保存GEngin的pid,供告警脚本使用 script: 具体的指标收集脚本 cat conf/metrix.cfg: YARN|ResourceManager|cpu|ResourceManager_cpu.py|ResourceManager_cpu.txt|int16|Percent| YARN|ResourceManager|mem|ResourceManager_mem.py|ResourceManager_mem.txt|int16|Percent| YARN|ResourceManager|lsof|ResourceManager_lsof.py|ResourceManager_lsof.txt|int16|Number| ls flag/: yarntestD001.flag ll log/: -rw-r--r-- 1 yarn users 168 Mar 19 20:02 yarntestD001_YARNResourceManagercputdw-10-16-19-91.txt -rw-r--r-- 1 yarn users 168 Mar 19 20:02 yarntestD001_YARNResourceManagerlsoftdw-10-16-19-91.txt -rw-r--r-- 1 yarn users 168 Mar 19 20:02 yarntestD001_YARNResourceManagermemtdw-10-16-19-91.txt ll script/: -rw-r--r-- 1 yarn users 882 Feb 28 17:20 ResourceManager_cpu.py -rw-r--r-- 1 yarn users 1093 Feb 28 17:45 ResourceManager_lsof.py -rw-r--r-- 1 yarn users 882 Feb 28 17:18 ResourceManager_mem.py cat script/SAMPLE.py: #!/usr/bin/env python # coding=gbk import sys import os import datetime import time def CheckInput(): "Check Input parameters , they should be a pysql file." if len(sys.argv) < 2 : print "Usage: " + sys.argv[0] + " FileNamePrefix " sys.exit() if __name__== '__main__': CheckInput() # check parameter and asign PyFileName ## result file log to directory of LOG LogFile = open("log/"+sys.argv[1],'a') res = "29" ## Interface to Gmetrix ,must be value:Value print "value:"+res ntime = str(time.strftime("%Y-%m-%d %X",time.localtime())) LogFile.write(ntime+" "+res+" ") LogFile.close()
#!/usr/bin/env python
# coding=gbk
"""GEngin.py - polling engine that feeds process-level metrics to Ganglia.

For every entry in conf/metrix.cfg the engine runs the collector script from
script/, reads the ``value:<v>`` line the script prints, and broadcasts the
value with ``gmetric``.  The task name is taken from the single
``<task>.flag`` file in flag/ and prefixes the output file name handed to each
collector script.  The engine writes its own pid to pid/pid.txt and appends
its log under log/.
"""
import sys
import os
import random
import datetime
import time
import signal
import subprocess
from time import sleep

CONFIG_PATH = "conf/metrix.cfg"
PID_PATH = "pid/pid.txt"
FLAG_DIR = "flag"
FLAG_SUFFIX = ".flag"
IDLE_SLEEP = 10   # seconds to wait while there is no single active task
POLL_SLEEP = 8    # seconds between two polling passes of the same task


def KillPrevious(pid_text):
    """Kill the process whose pid is given as text; return True if a signal was sent.

    Anything that is not a positive integer is ignored: pid 0 and negative
    pids address whole process groups, and the text comes from a file on disk.
    A pid that no longer exists (or cannot be signalled) is not an error.
    """
    try:
        pid = int(pid_text)
    except ValueError:
        return False
    if pid <= 0 or pid == os.getpid():
        return False
    try:
        os.kill(pid, signal.SIGKILL)
    except OSError:
        return False
    return True


def CheckInput():
    """Check the config file exists and take over pid/pid.txt.

    Exits with status 1 when conf/metrix.cfg is missing.  Every pid recorded
    in pid/pid.txt is killed (restart support) and the file is rewritten with
    the pid of the current process.
    """
    print("Usage : python ./" + sys.argv[0])
    if not os.path.exists(CONFIG_PATH):
        print("Error : config file conf/metrix.cfg does not exsits ! ")
        sys.exit(1)

    # Kill the previous engine so a restart never leaves two pollers running.
    if os.path.exists(PID_PATH):
        with open(PID_PATH, 'r') as pfile:
            for entry in pfile:
                KillPrevious(entry.strip())

    # 'w' truncates, so the file always holds exactly the current pid.
    with open(PID_PATH, 'w') as pfile:
        pfile.write(str(os.getpid()))


def RunCommand(args):
    """Run the argv list *args* and return its stdout as a list of lines.

    Returns an empty list when the program cannot be started.  The child is
    always waited for, so no pipe or zombie process is left behind.
    """
    try:
        proc = subprocess.Popen(args, stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError:
        return []
    output = proc.communicate()[0]
    return output.splitlines()


def CurrentTask(entries, default):
    """Return the task name encoded by the ``*.flag`` entry in *entries*.

    Only the ``.flag`` suffix is removed, so a task name may itself contain
    dots.  When several entries match, the last one wins; when none matches,
    *default* is returned.
    """
    task = default
    for name in entries:
        if name.endswith(FLAG_SUFFIX):
            task = name[:-len(FLAG_SUFFIX)].strip()
    return task


def DetectHost():
    """Return the local host name if it starts with "tdw", else ""."""
    host = ""
    for line in RunCommand(["hostname"]):
        if line.startswith("tdw"):
            host = line.strip()
    return host


def ParseMetricLine(line):
    """Split one config line into its seven fields, or return None.

    Expected layout: ``group|name|item|script|file|type|units|``.  Blank
    lines, ``#`` comment lines and lines with fewer than seven fields are
    skipped instead of aborting the engine.
    """
    text = line.strip()
    if not text or text.startswith("#"):
        return None
    fields = [part.strip() for part in text.split('|')]
    if len(fields) < 7:
        return None
    return tuple(fields[:7])


def ExtractValue(lines):
    """Return the value of the last ``value:<v>`` line in *lines*.

    Other output lines are ignored wherever they appear.  "0" is returned
    when no usable value was printed, so gmetric never gets an empty ``-v``.
    """
    value = ""
    for line in lines:
        if line.startswith("value:"):
            # Split once only: the value itself may contain ':'.
            value = line.split(':', 1)[1].strip()
    return value or "0"


def CollectValue(mShell, outPutFile):
    """Run collector script *mShell* with *outPutFile* as its argument.

    ``.py`` scripts are run through ``python``; ``.sh`` scripts are executed
    directly.  Any other script type yields "0".
    """
    script = "script/" + mShell
    if mShell.endswith(".py"):
        args = ["python", script, outPutFile]
    elif mShell.endswith(".sh"):
        args = [script, outPutFile]
    else:
        return "0"
    return ExtractValue(RunCommand(args))


def BuildGmetricArgs(mGroup, mName, mItem, value, mUnit, mWeiht, host):
    """Return the gmetric argv list for one metric.

    *mUnit* is passed as ``-t`` and *mWeiht* as ``-u``, as in the config
    layout.  ``-S host:host`` is added only when *host* is known, because an
    empty spoof argument (":") is not a valid value.
    """
    args = ["gmetric",
            "-n", mGroup + "_" + mName + "_" + mItem,
            "-v", value,
            "-t", mUnit,
            "-u", mWeiht]
    if host:
        args.extend(["-S", host + ":" + host])
    return args


def PollMetrics(task, host, log_file):
    """Collect and broadcast every metric in conf/metrix.cfg once."""
    with open(CONFIG_PATH, 'r') as cfg:
        for line in cfg:
            metric = ParseMetricLine(line)
            if metric is None:
                continue
            mGroup, mName, mItem, mShell, mFile, mUnit, mWeiht = metric
            outPutFile = task + "_" + mGroup + mName + mItem + host + ".txt"
            value = CollectValue(mShell, outPutFile)
            args = BuildGmetricArgs(mGroup, mName, mItem, value,
                                    mUnit, mWeiht, host)
            cmd = " ".join(args)
            print(cmd)
            RunCommand(args)
            ntime = time.strftime("%Y-%m-%d %X", time.localtime())
            log_file.write(ntime + " " + cmd + "\n")


def main():
    """Run the engine: poll forever, following the task flag in flag/."""
    CheckInput()
    # basename keeps the log inside log/ even when started as /path/GEngin.py
    log_file = open("log/" + os.path.basename(sys.argv[0]) + ".log", 'a')
    try:
        task = CurrentTask(os.listdir(FLAG_DIR), "noTask")
        host = DetectHost()
        log_file.write("******** Start task " + task + " monitoring *******\n")

        while True:
            # List the directory once per pass so all checks see one snapshot.
            entries = os.listdir(FLAG_DIR)
            active = CurrentTask(entries, None)
            if len(entries) != 1 or active is None:
                # No task, several files, or a stray non-flag file: idle.
                sleep(IDLE_SLEEP)
                log_file.write("Finish previous take " + task +
                               " .... No task ,Main loop .....\n")
                log_file.flush()
                continue

            if active != task:
                log_file.write("Finish previous take " + task + ".....\n")
                task = active
                log_file.write("***** Start New Task " + task +
                               " monitoring *******\n")

            PollMetrics(task, host, log_file)
            log_file.flush()

            # Sleep only while the same task is still flagged; a changed or
            # removed flag is handled right away by the next pass.
            entries = os.listdir(FLAG_DIR)
            if len(entries) == 1 and CurrentTask(entries, None) == task:
                sleep(POLL_SLEEP)
    finally:
        log_file.close()


if __name__ == '__main__':
    main()

# Article note: the broadcast metrics are then shown in Ganglia.  Add the
# running GEngin.py process to the alerting system as well, so that an
# abnormal exit of the engine is noticed.