diff -up sos-2.2/sos/plugins/cluster.py.orig sos-2.2/sos/plugins/cluster.py
--- sos-2.2/sos/plugins/cluster.py.orig	2011-02-04 14:50:31.510521528 -0500
+++ sos-2.2/sos/plugins/cluster.py	2011-02-04 14:51:19.485399800 -0500
@@ -14,12 +14,12 @@ import sos.plugintools
 import os, re
-import time, libxml2
-import glob
+from glob import glob
 
 class cluster(sos.plugintools.PluginBase):
     """cluster suite and GFS related information
     """
+
     optionList = [("gfslockdump", 'gather output of gfs lockdumps', 'slow', False),
                   ('lockdump', 'gather dlm lockdumps', 'slow', False)]
@@ -40,190 +40,9 @@ class cluster(sos.plugintools.PluginBase
             self.files = [ "/etc/cluster/cluster.conf" ]
         return sos.plugintools.PluginBase.checkenabled(self)
 
-    def has_gfs(self):
-        return (len(self.doRegexFindAll(r'^\S+\s+\S+\s+gfs\s+.*$', "/etc/mtab")) > 0)
-
-    def diagnose(self):
+    def setup(self):
         rhelver = self.policy().rhelVersion()
 
-        # check if the minimum set of packages is installed
-        # for RHEL4 RHCS(ccs, cman, cman-kernel, magma, magma-plugins, (dlm, dlm-kernel) || gulm, perl-Net-Telnet, rgmanager, fence)
-        # RHEL4 GFS (GFS, GFS-kernel, ccs, lvm2-cluster, fence)
-
-        pkgs_check = []
-        mods_check = []
-        serv_check = []
-
-        if rhelver == 4:
-            pkgs_check.extend( [ "ccs", "cman", "magma", "magma-plugins", "perl-Net-Telnet", "rgmanager", "fence" ] )
-            mods_check.extend( [ "cman", "dlm" ] )
-            if self.has_gfs():
-                mods_check.append("gfs")
-            serv_check.extend( [ "cman", "ccsd", "rgmanager", "fenced" ] )
-            if self.has_gfs():
-                serv_check.extend( ["gfs", "clvmd"] )
-        elif rhelver == 5:
-            pkgs_check.extend ( [ "cman", "perl-Net-Telnet", "rgmanager" ] )
-            mods_check.extend( [ "dlm" ] )
-            if self.has_gfs():
-                mods_check.extend( ["gfs", "gfs2"] )
-            serv_check.extend( [ "cman", "rgmanager" ] )
-            if self.has_gfs():
-                serv_check.extend( ["gfs", "clvmd"] )
-        elif rhelver == 6:
-            serv_check.extend( [ "fenced", "corosync", "dlm_controld"] )
-            if self.has_gfs():
-                serv_check.extend( ["gfs_controld"] )
-
-        # check that kernel module packages are installed for
-        # running kernel version
-
-        for modname in mods_check:
-            found = 0
-
-            if self.policy().allPkgsByNameRegex( "^" + modname ):
-                found = 1
-
-            ret, out, time = self.callExtProg('/sbin/modinfo -F vermagic ' + modname)
-
-            if ret == 0:
-                found = 2
-
-            if len(self.fileGrep("^%s\s+" % modname, "/proc/modules")) > 0:
-                found = 3
-
-            if found == 0:
-                self.addDiagnose("required kernel module is missing: %s" % modname)
-            elif found == 1:
-                self.addDiagnose("required module is not available for current kernel: %s" % modname)
-            elif found == 2:
-                self.addDiagnose("required module is available but not loaded: %s" % modname)
-
-        for pkg in pkgs_check:
-            if not self.isInstalled(pkg):
-                self.addDiagnose("required package is missing: %s" % pkg)
-
-        if rhelver == "4":
-            # (dlm, dlm-kernel) || gulm
-            if not ((self.isInstalled("dlm") and self.isInstalled("dlm-kernel")) or self.isInstalled("gulm")):
-                self.addDiagnose("required packages are missing: (dlm, dlm-kernel) || gulm")
-
-        # check if all the needed daemons are active at sosreport time
-        # check if they are started at boot time in RHEL4 RHCS (cman, ccsd, rgmanager, fenced)
-        # and GFS (gfs, ccsd, clvmd, fenced)
-
-        for service in serv_check:
-            ret, out, time = self.callExtProg("/sbin/service %s status &> /dev/null" % service)
-            if ret != 0:
-                self.addDiagnose("service %s is not running" % service)
-
-            if not self.policy().runlevelDefault() in self.policy().runlevelByService(service):
-                self.addDiagnose("service %s is not started in default runlevel" % service)
-
-        # FIXME: missing important cman services
-        # FIXME: any cman service whose state != run ?
-        # Fence Domain: "default" 2 2 run -
-
-        # is cluster quorate
-        if not self.is_cluster_quorate():
-            self.addDiagnose("cluster node is not quorate")
-
-        # if there is no cluster.conf, diagnose() finishes here.
-        try:
-            os.stat("/etc/cluster/cluster.conf")
-        except:
-            self.addDiagnose("/etc/cluster/cluster.conf is missing")
-            return
-
-        # setup XML xpath context
-        xml = libxml2.parseFile("/etc/cluster/cluster.conf")
-        xpathContext = xml.xpathNewContext()
-
-        # make sure that the node names are valid according to RFC 2181
-        for hostname in xpathContext.xpathEval('/cluster/clusternodes/clusternode/@name'):
-            if not re.match('^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?(\.[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?)*$', hostname.content):
-                self.addDiagnose("node name (%s) contains invalid characters" % hostname.content)
-
-        # do not rely on DNS to resolve node names, must have them in /etc/hosts
-        for hostname in xpathContext.xpathEval('/cluster/clusternodes/clusternode/@name'):
-            if len(self.fileGrep(r'^.*\W+%s' % hostname.content , "/etc/hosts")) == 0:
-                self.addDiagnose("node %s is not defined in /etc/hosts" % hostname.content)
-
-        # check fencing (warn on no fencing)
-        if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[not(fence/method/device)]")):
-            if self.has_gfs():
-                self.addDiagnose("one or more nodes have no fencing agent configured: fencing is required for GFS to work")
-            else:
-                self.addDiagnose("one or more nodes have no fencing agent configured: the cluster infrastructure might not work as intended")
-
-        # check fencing (warn on manual)
-        if len(xpathContext.xpathEval("/cluster/clusternodes/clusternode[/cluster/fencedevices/fencedevice[@agent='fence_manual']/@name=fence/method/device/@name]")):
-            self.addDiagnose("one or more nodes have manual fencing agent configured (data integrity is not guaranteed)")
-
-        # if fence_ilo or fence_drac, make sure acpid is not running
-        ret, hostname, time = self.callExtProg("/bin/uname -n")
-        hostname = hostname.split(".")[0]
-        if len(xpathContext.xpathEval('/cluster/clusternodes/clusternode[@name = "%s" and /cluster/fencedevices/fencedevice[@agent="fence_rsa" or @agent="fence_drac"]/@name=fence/method/device/@name]' % hostname )):
-            ret, out, time = self.callExtProg("/sbin/service acpid status")
-            if ret == 0 or self.policy().runlevelDefault() in self.policy().runlevelByService("acpid"):
-                self.addDiagnose("acpid is enabled, this may cause problems with your fencing method.")
-
-        # check for fs exported via nfs without nfsid attribute
-        if len(xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]/nfsexport")):
-            for xmlNode in xpathContext.xpathEval("/cluster/rm/service//fs[not(@fsid)]"):
-                fsRefAttribute = xmlNode.xpathEval("@ref")
-                if (len(fsRefAttribute) > 0) :
-                    fsRefName = fsRefAttribute[0].content
-                    if len(xpathContext.xpathEval("cluster/rm/resources/fs[@name='%s'][not(@fsid)]" % fsRefName)):
-                        self.addDiagnose("one or more nfs export do not have a fsid attribute set.")
-                        break
-                else:
-                    self.addDiagnose("one or more nfs export do not have a fsid attribute set.")
-
-        # cluster.conf file version and the in-memory cluster configuration version matches
-        status, cluster_version, time = self.callExtProg("cman_tool status | grep 'Config version'")
-        if not status:
-            cluster_version = cluster_version[16:]
-        else:
-            cluster_version = None
-        conf_version = xpathContext.xpathEval("/cluster/@config_version")[0].content
-
-        if status == 0 and conf_version != cluster_version:
-            self.addDiagnose("cluster.conf and in-memory configuration version differ (%s != %s)" % (conf_version, cluster_version) )
-
-        status, output, time = self.callExtProg("/usr/sbin/rg_test test /etc/cluster/cluster.conf")
-        if output.find("Error: ") > 0:
-            self.addDiagnose("configuration errors are present according to rg_test")
-
-        # make sure the first part of the lock table matches the cluster name
-        # and that the locking protocol is sane
-        cluster_name = xpathContext.xpathEval("/cluster/@name")[0].content
-
-        for fs in self.fileGrep(r'^[^#][/\w]*\W*[/\w]*\W*gfs', "/etc/fstab"):
-            # for each gfs entry
-            fs = fs.split()
-            lockproto = self.get_gfs_sb_field(fs[0], "sb_lockproto")
-            if lockproto and lockproto != self.get_locking_proto():
-                self.addDiagnose("gfs mountpoint (%s) is using the wrong locking protocol (%s)" % (fs[0], lockproto) )
-
-            locktable = self.get_gfs_sb_field(fs[0], "sb_locktable")
-            try: locktable = locktable.split(":")[0]
-            except: continue
-            if locktable != cluster_name:
-                self.addDiagnose("gfs mountpoint (%s) is using the wrong locking table" % fs[0])
-
-        # Check for existence of weak-updates in gfs2 prior to 2.6.18-128
-        if rhelver == 5:
-            ret, vermagic, time = self.callExtProg("modinfo -F vermagic gfs2")
-            # just kernel release from vermagic line
-            vermagic = vermagic.split()[0].lstrip('2.6.18-')
-            vermagic = vermagic[:vermagic.find('.')]
-            if int(vermagic) < 128:
-                self.addDiagnose('GFS2 is being used via weak-updates, kmod-gfs2 should be uninstalled and system reboot' \
-                                 'to allow for kernel provided gfs2 module to be used.')
-
-    def setup(self):
-        self.collectExtOutput("/sbin/fdisk -l")
         self.addCopySpec("/etc/cluster.conf")
         self.addCopySpec("/etc/cluster.xml")
         self.addCopySpec("/etc/cluster")
@@ -231,79 +50,82 @@ class cluster(sos.plugintools.PluginBase
         self.addCopySpec("/etc/sysconfig/cman")
         self.addCopySpec("/var/lib/ricci")
         self.addCopySpec("/var/lib/luci")
+        self.addCopySpec("/var/log/cluster")
+        self.addCopySpec("/var/log/luci/luci.log")
+
+        if self.getOption('gfslockdump'):
+            self.do_gfslockdump()
+
+        if self.getOption('lockdump'):
+            self.do_lockdump()
+
         self.collectExtOutput("/usr/sbin/rg_test test /etc/cluster/cluster.conf")
-        self.collectExtOutput("cman_tool status")
-        self.collectExtOutput("cman_tool -a nodes")
-        self.collectOutputNow("group_tool ls -g1")
-        self.collectOutputNow("group_tool dump")
-        self.collectExtOutput("ccs_tool lsnode")
         self.collectExtOutput("fence_tool ls -n")
-        self.collectExtOutput("dlm_tool ls -n")
         self.collectExtOutput("gfs_control ls -n")
-        self.collectExtOutput("fence_tool dump", root_symlink="fenced.txt")
-        self.collectExtOutput("dlm_tool dump", root_symlink="dlm_controld.txt")
-        self.collectExtOutput("gfs_control dump", root_symlink="gfs_controld.txt")
-        self.collectExtOutput("dlm_tool log_plock", root_symlink="log_plock.txt")
-        self.addCopySpec("/var/log/cluster")
-        self.addCopySpec("/var/log/luci/luci.log")
-
-        self.collectExtOutput("clustat")
+        self.collectExtOutput("dlm_tool log_plock")
+        self.collectExtOutput("/sbin/fdisk -l")
+        self.collectOutputNow("clustat")
+        self.collectOutputNow("group_tool dump")
+        self.collectExtOutput("cman_tool services")
+        self.collectExtOutput("cman_tool nodes")
+        self.collectExtOutput("cman_tool status")
+        self.collectExtOutput("ccs_tool lsnode")
         self.collectExtOutput("/sbin/ipvsadm -L")
 
-        if self.getOption('gfslockdump'): self.do_gfslockdump()
-        if self.getOption('lockdump'): self.do_lockdump()
-
-        return
+        if rhelver == 4:
self.addCopySpec("/proc/cluster/*") + self.collectExtOutput("cman_tool nodes") + + if rhelver is not 4: # 5+ + self.collectExtOutput("cman_tool -a nodes") + + if rhelver is 5: + self.collectExtOutput("group_tool -v") + self.collectExtOutput("group_tool dump fence") + self.collectExtOutput("group_tool dump gfs") + + if rhelver not in (4,5): # 6+ + self.collectExtOutput("corosync-quorumtool -l") + self.collectExtOutput("corosync-quorumtool -s") + self.collectExtOutput("corosync-cpgtool") + self.collectExtOutput("corosync-objctl") + self.collectExtOutput("group_tool ls -g1") + self.collectExtOutput("gfs_control ls -n") + self.collectExtOutput("gfs_control dump") + self.collectExtOutput("fence_tool dump") + self.collectExtOutput("dlm_tool dump") + self.collectExtOutput("dlm_tool ls -n") def do_lockdump(self): - status, output, time= self.callExtProg("cman_tool services") - if status: - # command somehow failed - return False - rhelver = self.policy().rhelVersion() - if rhelver == "4": - regex = r'^DLM Lock Space:\s*"([^"]*)".*$' - elif rhelver == "5Server" or rhelver == "5Client": - regex = r'^dlm\s+[^\s]+\s+([^\s]+)\s.*$' - - reg=re.compile(regex,re.MULTILINE) - for lockspace in reg.findall(output): - ret, out, time = self.callExtProg("echo %s > /proc/cluster/dlm_locks" % lockspace) - self.collectOutputNow("cat /proc/cluster/dlm_locks", root_symlink = "dlm_locks_%s" % lockspace) - - def get_locking_proto(self): - # FIXME: what's the best way to find out ? - return "lock_dlm" - return "lock_gulm" + if rhelver is 4: + status, output, time = self.callExtProg("cman_tool services") + for lockspace in re.compile(r'^DLM Lock Space:\s*"([^"]*)".*$', re.MULTILINE).findall(output): + self.callExtProg("echo %s > /proc/cluster/dlm_locks" % lockspace) + self.collectOutputNow("cat /proc/cluster/dlm_locks", + suggest_filename = "dlm_locks_%s" % lockspace) + + if rhelver is 5: + status, output, time = self.callExtProg("group_tool") + for lockspace in re.compile(r'^dlm\s+[^\s]+\s+([^\s]+)$', re.MULTILINE).findall(output): + self.collectExtOutput("dlm_tool lockdebug '%s'" % lockspace, + suggest_filename = "dlm_locks_%s" % lockspace) + + else: # RHEL6 or recent Fedora + status, output, time = self.callExtProg("dlm_tool ls") + for lockspace in re.compile(r'^name\s+([^\s]+)$', re.MULTILINE).findall(output): + self.collectExtOutput("dlm_tool lockdebug -svw '%s'" % lockspace, + suggest_filename = "dlm_locks_%s" % lockspace) def do_gfslockdump(self): for mntpoint in self.doRegexFindAll(r'^\S+\s+([^\s]+)\s+gfs\s+.*$', "/proc/mounts"): - self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntpoint, root_symlink = "gfs_lockdump_" + self.mangleCommand(mntpoint) ) - - def do_rgmanager_bt(self): - # FIXME: threads backtrace via SIGALRM - return + self.collectExtOutput("/sbin/gfs_tool lockdump %s" % mntpoint, + suggest_filename = "gfs_lockdump_" + self.mangleCommand(mntpoint)) def postproc(self): - for cluster_conf in glob.glob("/etc/cluster/cluster.conf*"): + for cluster_conf in glob("/etc/cluster/cluster.conf*"): self.doRegexSub(cluster_conf, r"(\s*\