DolphinScheduler/script/monitor-server.py

105 lines
4.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
'''
Prerequisites (the kazoo ZooKeeper client library must be installed):
1. install pip with yum:
    yum -y install python-pip
2. install kazoo with pip:
    pip install kazoo
or
3. install kazoo with conda:
    conda install -c conda-forge kazoo
How to run the script, and a description of its parameters:
nohup python -u monitor-server.py /data1_1T/escheduler 192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181 /escheduler/masters /escheduler/workers> monitor_server.log 2>&1 &
The parameters are as follows:
/data1_1T/escheduler : the value comes from installPath in install.sh
192.168.xx.xx:2181,192.168.xx.xx:2181,192.168.xx.xx:2181 : the value comes from zkQuorum in install.sh
/escheduler/masters : the value comes from zkMasters in install.sh
/escheduler/workers : the value comes from zkWorkers in install.sh
'''
import sys
import socket
import os
import sched
import time
from datetime import datetime
from kazoo.client import KazooClient
# Module-wide event scheduler shared by the monitor: it reads the clock with
# time.time and waits between events with time.sleep.
schedule = sched.scheduler(timefunc=time.time, delayfunc=time.sleep)
class ZkClient(object):
    """Watchdog that restarts master/worker servers missing from ZooKeeper.

    The class reads these module-level names, which the script entry point
    assigns before creating an instance: ``zookeepers``, ``install_path``,
    ``masters_zk_path`` and ``workers_zk_path``. It re-arms itself through
    the module-level ``schedule`` scheduler.
    """

    def __init__(self):
        # hosts: the ZooKeeper cluster address list (host:port,host:port,...)
        self.zk = KazooClient(hosts=zookeepers)
        self.zk.start()

    def read_file(self, path):
        """Read a ``key=value`` configuration file into a dictionary.

        Each line is stripped and split on its FIRST ``=`` only, so a value
        may itself contain ``=``. Lines without any ``=`` are ignored.

        :param path: path of the configuration file to read
        :return: dictionary mapping each key to its (string) value
        """
        config = {}
        with open(path, 'r') as f:
            for line in f:
                key, separator, value = line.strip().partition('=')
                if separator:
                    config[key] = value
        return config

    def get_ip_by_hostname(self, hostname):
        """Return the IP address that ``hostname`` resolves to."""
        return socket.gethostbyname(hostname)

    def _configured_hosts(self, config_dict, key):
        """Return the comma-separated host list stored under ``key``.

        A missing key yields an empty list instead of failing on ``None``;
        blank entries (e.g. from a trailing comma) are dropped.
        """
        hosts = [host.strip() for host in config_dict.get(key, '').split(',')]
        return [host for host in hosts if host]

    def _restart_missing(self, expected_ips, zk_path, role):
        """Start ``<role>-server`` over ssh on every IP not registered in ZooKeeper.

        :param expected_ips: IP addresses that should be running the server
        :param zk_path: ZooKeeper node whose children are the live servers
        :param role: ``'master'`` or ``'worker'``
        """
        # Nothing is compared (or restarted) when the parent node is absent.
        if not self.zk.exists(zk_path):
            return
        # The part of each child name before the first '_' is compared
        # against the configured IPs.
        registered = set(node.split('_')[0] for node in self.zk.get_children(zk_path))
        for ip in set(expected_ips) - registered:
            print(role + " " + ip + " server has down")
            os.system('ssh ' + ip + ' sh ' + install_path + '/bin/escheduler-daemon.sh start ' + role + '-server')

    def restart_server(self, inc):
        """Run one monitoring pass, then schedule the next one.

        Reads the configured master and worker hosts from
        ``<install_path>/conf/config/run_config.conf``, resolves them to IPs,
        restarts any that are not registered in ZooKeeper, prints a
        timestamp and re-enters itself into the scheduler.

        :param inc: delay in seconds before the next pass
        """
        config_dict = self.read_file(install_path + '/conf/config/run_config.conf')

        master_list = self._configured_hosts(config_dict, 'masters')
        print(master_list)
        master_ips = [self.get_ip_by_hostname(host) for host in master_list]

        worker_list = self._configured_hosts(config_dict, 'workers')
        print(worker_list)
        worker_ips = [self.get_ip_by_hostname(host) for host in worker_list]

        self._restart_missing(master_ips, masters_zk_path, 'master')
        self._restart_missing(worker_ips, workers_zk_path, 'worker')

        print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        schedule.enter(inc, 0, self.restart_server, (inc,))

    def main(self, inc=60):
        """Start the monitoring loop; blocks inside ``schedule.run()``.

        :param inc: seconds between two monitoring passes (default 60)
        """
        # schedule.enter takes: delay, priority (ordering of events due at
        # the same time), the function to call, and its arguments as a tuple.
        # A delay of 0 makes the first pass run immediately.
        schedule.enter(0, 0, self.restart_server, (inc,))
        schedule.run()
if __name__ == '__main__':
    # sys.argv[0] is the script name, so the four required parameters
    # (read below as sys.argv[1]..sys.argv[4]) need at least 5 entries.
    if len(sys.argv) < 5:
        print('please input install_path,zookeepers,masters_zk_path and worker_zk_path')
        # Stop here instead of failing later with an IndexError.
        sys.exit(1)
    # These module-level names are read by ZkClient.
    install_path = sys.argv[1]
    zookeepers = sys.argv[2]
    masters_zk_path = sys.argv[3]
    workers_zk_path = sys.argv[4]
    zkClient = ZkClient()
    # Check the servers every 300 seconds.
    zkClient.main(300)