corosync.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619
  1. '''CTS: Cluster Testing System: corosync...
  2. '''
  3. __copyright__='''
  4. Copyright (c) 2010 Red Hat, Inc.
  5. '''
  6. # All rights reserved.
  7. #
  8. # Author: Angus Salkeld <asalkeld@redhat.com>
  9. #
  10. # This software licensed under BSD license, the text of which follows:
  11. #
  12. # Redistribution and use in source and binary forms, with or without
  13. # modification, are permitted provided that the following conditions are met:
  14. #
  15. # - Redistributions of source code must retain the above copyright notice,
  16. # this list of conditions and the following disclaimer.
  17. # - Redistributions in binary form must reproduce the above copyright notice,
  18. # this list of conditions and the following disclaimer in the documentation
  19. # and/or other materials provided with the distribution.
  20. # - Neither the name of the MontaVista Software, Inc. nor the names of its
  21. # contributors may be used to endorse or promote products derived from this
  22. # software without specific prior written permission.
  23. #
  24. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  25. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  28. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  29. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  30. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  31. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  32. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  33. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  34. # THE POSSIBILITY OF SUCH DAMAGE.
  35. import os
  36. import sys
  37. import time
  38. import socket
  39. import shutil
  40. import string
  41. import augeas
  42. from cts.CTS import ClusterManager
  43. from cts.CTSscenarios import ScenarioComponent
  44. from cts.CTS import RemoteExec
  45. from cts.CTSvars import CTSvars
  46. ###################################################################
  47. class CoroConfig(object):
  48. def __init__(self, corobase=None):
  49. self.base = "/files/etc/corosync/corosync.conf/"
  50. self.new_root = "/tmp/aug-root/"
  51. if corobase == None:
  52. self.corobase = os.getcwd() + "/.."
  53. else:
  54. self.corobase = corobase
  55. example = self.corobase + "/conf/corosync.conf.example"
  56. if os.path.isdir(self.new_root):
  57. shutil.rmtree (self.new_root)
  58. os.makedirs (self.new_root + "/etc/corosync")
  59. shutil.copy (example, self.new_root + "/etc/corosync/corosync.conf")
  60. self.aug = augeas.Augeas (root=self.new_root,
  61. loadpath=self.corobase + "/conf/lenses")
  62. self.original = {}
  63. # store the original values (of totem), so we can restore them in
  64. # apply_default_config()
  65. totem = self.aug.match('/files/etc/corosync/corosync.conf/totem/*')
  66. for c in totem:
  67. # /files/etc/corosync/corosync.conf/
  68. short_name = c[len(self.base):]
  69. self.original[short_name] = self.aug.get(c)
  70. interface = self.aug.match('/files/etc/corosync/corosync.conf/totem/interface/*')
  71. for c in interface:
  72. short_name = c[len(self.base):]
  73. self.original[short_name] = self.aug.get(c)
  74. def get (self, name):
  75. return self.aug.get (self.base + name)
  76. def set (self, name, value):
  77. token = self.aug.set (self.base + name, str(value))
  78. def save (self):
  79. self.aug.save()
  80. def get_filename(self):
  81. return self.new_root + "/etc/corosync/corosync.conf"
  82. ###################################################################
  83. class corosync_flatiron(ClusterManager):
  84. '''
  85. bla
  86. '''
  87. def __init__(self, Environment, randseed=None):
  88. ClusterManager.__init__(self, Environment, randseed)
  89. self.update({
  90. "Name" : "corosync(flatiron)",
  91. "StartCmd" : CTSvars.INITDIR+"/corosync start",
  92. "StopCmd" : CTSvars.INITDIR+"/corosync stop",
  93. "RereadCmd" : CTSvars.INITDIR+"/corosync reload",
  94. "StatusCmd" : CTSvars.INITDIR+"/corosync status %s",
  95. "DeadTime" : 30,
  96. "StartTime" : 15, # Max time to start up
  97. "StableTime" : 10,
  98. "BreakCommCmd" : "/usr/share/corosync/tests/net_breaker.sh BreakCommCmd %s",
  99. "FixCommCmd" : "/usr/share/corosync/tests/net_breaker.sh FixCommCmd %s",
  100. "Pat:We_stopped" : "%s.*Corosync Cluster Engine exiting with status.*",
  101. "Pat:They_stopped" : "%s.*Member left:.*%s.*",
  102. "Pat:They_dead" : "corosync:.*Node %s is now: lost",
  103. "Pat:Local_starting" : "%s.*started and ready to provide service.",
  104. "Pat:Local_started" : "%s.*started and ready to provide service.",
  105. "Pat:Master_started" : "%s.*Completed service synchronization, ready to provide service.",
  106. "Pat:Slave_started" : "%s.*Completed service synchronization, ready to provide service.",
  107. "Pat:ChildKilled" : "%s corosync.*Child process %s terminated with signal 9",
  108. "Pat:ChildRespawn" : "%s corosync.*Respawning failed child process: %s",
  109. "Pat:ChildExit" : "Child process .* exited",
  110. "Pat:DC_IDLE" : ".*A processor joined or left the membership and a new membership was formed.",
  111. # Bad news Regexes. Should never occur.
  112. "BadRegexes" : (
  113. r"ERROR:",
  114. r"CRIT:",
  115. r"Shutting down\.",
  116. r"Forcing shutdown\.",
  117. r"core dump",
  118. r"Could not bind AF_UNIX",
  119. ),
  120. "LogFileName" : Environment["LogFileName"],
  121. })
  122. self.start_cpg = True
  123. self.cpg_agent = {}
  124. self.confdb_agent = {}
  125. self.sam_agent = {}
  126. self.votequorum_agent = {}
  127. self.config = CoroConfig ()
  128. self.node_to_ip = {}
  129. self.new_config = {}
  130. self.new_config['service[1]/name'] = 'corosync_tst_sv2'
  131. self.new_config['service[1]/ver'] = '0'
  132. self.applied_config = {}
  133. for n in self.Env["nodes"]:
  134. ip = socket.gethostbyname(n)
  135. ips = ip.split('.')
  136. ips[3] = '0'
  137. ip_mask = '.'.join(ips)
  138. self.new_config['totem/interface/bindnetaddr'] = str(ip_mask)
  139. return
  140. def apply_default_config(self):
  141. for c in self.applied_config:
  142. if 'bindnetaddr' in c:
  143. continue
  144. elif not self.config.original.has_key(c):
  145. # new config option (non default)
  146. pass
  147. elif self.applied_config[c] is not self.config.original[c]:
  148. # reset to the original
  149. self.new_config[c] = self.config.original[c]
  150. if len(self.new_config) > 0:
  151. self.debug('applying default config')
  152. self.stopall()
  153. def apply_new_config(self):
  154. if len(self.new_config) > 0:
  155. self.debug('applying new config')
  156. self.stopall()
  157. self.startall()
  158. def install_all_config(self):
  159. tmp1 = {}
  160. for c in self.new_config:
  161. self.log('configuring: ' + c + ' = '+ str(self.new_config[c]))
  162. self.config.set (c, self.new_config[c])
  163. self.applied_config[c] = self.new_config[c]
  164. tmp1[c] = self.new_config[c]
  165. for c in tmp1:
  166. del self.new_config[c]
  167. self.config.save()
  168. src_file = self.config.get_filename()
  169. for node in self.Env["nodes"]:
  170. self.rsh.cp(src_file, "%s:%s" % (node, "/etc/corosync/"))
  171. def install_config(self, node):
  172. # install gets new_config and installs it, then moves the
  173. # config to applied_config
  174. if len(self.new_config) > 0:
  175. self.install_all_config()
  176. def key_for_node(self, node):
  177. if not self.node_to_ip.has_key(node):
  178. self.node_to_ip[node] = socket.gethostbyname (node)
  179. return self.node_to_ip[node]
  180. def StartaCM(self, node):
  181. if not self.ShouldBeStatus.has_key(node):
  182. self.ShouldBeStatus[node] = "down"
  183. if self.ShouldBeStatus[node] != "down":
  184. return 1
  185. self.debug('starting corosync on : ' + node)
  186. ret = ClusterManager.StartaCM(self, node)
  187. if self.start_cpg:
  188. if self.cpg_agent.has_key(node):
  189. self.cpg_agent[node].restart()
  190. else:
  191. self.cpg_agent[node] = CpgTestAgent(node, self.Env)
  192. self.cpg_agent[node].start()
  193. if self.confdb_agent.has_key(node):
  194. self.confdb_agent[node].restart()
  195. if self.sam_agent.has_key(node):
  196. self.sam_agent[node].restart()
  197. # votequorum agent started as needed.
  198. if self.applied_config.has_key('quorum/provider'):
  199. if self.votequorum_agent.has_key(node):
  200. self.votequorum_agent[node].restart()
  201. else:
  202. self.votequorum_agent[node] = VoteQuorumTestAgent(node, self.Env)
  203. self.votequorum_agent[node].start()
  204. return ret
  205. def StopaCM(self, node):
  206. if self.ShouldBeStatus[node] != "up":
  207. return 1
  208. self.debug('stoping corosync on : ' + node)
  209. if self.cpg_agent.has_key(node):
  210. self.cpg_agent[node].stop()
  211. if self.sam_agent.has_key(node):
  212. self.sam_agent[node].stop()
  213. if self.votequorum_agent.has_key(node):
  214. self.votequorum_agent[node].stop()
  215. return ClusterManager.StopaCM(self, node)
  216. def test_node_CM(self, node):
  217. # 2 - up and stable
  218. # 1 - unstable
  219. # 0 - down
  220. out = self.rsh(node, self["StatusCmd"], 1)
  221. is_stopped = string.find(out, 'stopped')
  222. is_dead = string.find(out, 'dead')
  223. ret = (is_dead is -1 and is_stopped is -1)
  224. try:
  225. if ret:
  226. ret = 2
  227. if self.ShouldBeStatus[node] == "down":
  228. self.log(
  229. "Node status for %s is %s but we think it should be %s"
  230. % (node, "up", self.ShouldBeStatus[node]))
  231. else:
  232. if self.ShouldBeStatus[node] == "up":
  233. self.log(
  234. "Node status for %s is %s but we think it should be %s"
  235. % (node, "down", self.ShouldBeStatus[node]))
  236. except KeyError: pass
  237. if ret: self.ShouldBeStatus[node] = "up"
  238. else: self.ShouldBeStatus[node] = "down"
  239. return ret
  240. def StataCM(self, node):
  241. '''Report the status of corosync on a given node'''
  242. if self.test_node_CM(node) > 0:
  243. return 1
  244. else:
  245. return None
  246. def RereadCM(self, node):
  247. self.log('reloading corosync on : ' + node)
  248. return ClusterManager.RereadCM(self, node)
  249. def find_partitions(self):
  250. ccm_partitions = []
  251. return ccm_partitions
  252. def prepare(self):
  253. '''Finish the Initialization process. Prepare to test...'''
  254. self.partitions_expected = 1
  255. for node in self.Env["nodes"]:
  256. self.ShouldBeStatus[node] = ""
  257. self.unisolate_node(node)
  258. self.StataCM(node)
  259. def HasQuorum(self, node_list):
  260. # If we are auditing a partition, then one side will
  261. # have quorum and the other not.
  262. # So the caller needs to tell us which we are checking
  263. # If no value for node_list is specified... assume all nodes
  264. if not node_list:
  265. node_list = self.Env["nodes"]
  266. for node in node_list:
  267. if self.ShouldBeStatus[node] == "up":
  268. quorum = self.rsh(node, self["QuorumCmd"], 1)
  269. if string.find(quorum, "1") != -1:
  270. return 1
  271. elif string.find(quorum, "0") != -1:
  272. return 0
  273. else:
  274. self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum)
  275. return 0
  276. def Components(self):
  277. return None
  278. ###################################################################
  279. class TestAgentComponent(ScenarioComponent):
  280. def __init__(self, Env):
  281. self.Env = Env
  282. def IsApplicable(self):
  283. '''Return TRUE if the current ScenarioComponent is applicable
  284. in the given LabEnvironment given to the constructor.
  285. '''
  286. return True
  287. def SetUp(self, CM):
  288. '''Set up the given ScenarioComponent'''
  289. self.CM = CM
  290. for node in self.Env["nodes"]:
  291. if not CM.StataCM(node):
  292. raise RuntimeError ("corosync not up")
  293. if self.CM.start_cpg:
  294. self.CM.cpg_agent[node] = CpgTestAgent(node, CM.Env)
  295. self.CM.cpg_agent[node].start()
  296. self.CM.confdb_agent[node] = ConfdbTestAgent(node, CM.Env)
  297. self.CM.confdb_agent[node].start()
  298. self.CM.sam_agent[node] = SamTestAgent(node, CM.Env)
  299. self.CM.sam_agent[node].start()
  300. # votequorum agent started as needed.
  301. if CM.applied_config.has_key('quorum/provider'):
  302. self.CM.votequorum_agent[node] = VoteQuorumTestAgent(node, CM.Env)
  303. self.CM.votequorum_agent[node].start()
  304. return 1
  305. def TearDown(self, CM):
  306. '''Tear down (undo) the given ScenarioComponent'''
  307. self.CM = CM
  308. for node in self.Env["nodes"]:
  309. if self.CM.cpg_agent.has_key(node):
  310. self.CM.cpg_agent[node].stop()
  311. self.CM.confdb_agent[node].stop()
  312. self.CM.sam_agent[node].stop()
  313. if self.CM.votequorum_agent.has_key(node):
  314. self.CM.votequorum_agent[node].stop()
  315. ###################################################################
  316. class TestAgent(object):
  317. def __init__(self, binary, node, port, env=None):
  318. self.node = node
  319. self.node_address = None
  320. self.port = port
  321. self.sock = None
  322. self.binary = binary
  323. self.started = False
  324. self.rsh = RemoteExec(Env=env)
  325. self.func_name = None
  326. self.used = False
  327. self.env = env
  328. self.send_recv = False
  329. def restart(self):
  330. self.stop()
  331. self.start()
  332. def clean_start(self):
  333. if self.used or not self.status():
  334. self.env.debug('test agent: cleaning %s on node %s' % (self.binary, self.node))
  335. self.stop()
  336. self.start()
  337. def status(self):
  338. if not self.started:
  339. return False
  340. try:
  341. self.send (["are_you_ok_dude"])
  342. self.read ()
  343. self.started = True
  344. return True
  345. except RuntimeError, msg:
  346. self.started = False
  347. return False
  348. def start(self):
  349. '''Set up the given ScenarioComponent'''
  350. self.env.debug('test agent: starting %s on node %s' % (self.binary, self.node))
  351. self.sock = socket.socket (socket.AF_INET, socket.SOCK_STREAM)
  352. ip = socket.gethostbyname(self.node)
  353. self.rsh(self.node, self.binary, blocking=0)
  354. is_connected = False
  355. retries = 0
  356. while not is_connected:
  357. try:
  358. retries = retries + 1
  359. self.sock.connect ((ip, self.port))
  360. is_connected = True
  361. except socket.error, msg:
  362. if retries > 5:
  363. self.env.log("Retried " + str(retries) + " times. Error: " + str(msg))
  364. time.sleep(1)
  365. self.started = True
  366. self.used = False
  367. def stop(self):
  368. '''Tear down (undo) the given ScenarioComponent'''
  369. self.env.debug('test agent: stopping %s on node %s' % (self.binary, self.node))
  370. self.sock.close ()
  371. self.rsh(self.node, "killall " + self.binary + " 2>/dev/null")
  372. self.started = False
  373. def send (self, args):
  374. if not self.started:
  375. self.start()
  376. real_msg = str (len (args))
  377. for a in args:
  378. a_str = str(a)
  379. real_msg += ":" + str (len (a_str)) + ":" + a_str
  380. real_msg += ";"
  381. sent = 0
  382. try:
  383. sent = self.sock.send (real_msg)
  384. except socket.error, msg:
  385. self.env.debug("send(%s): %s; error: %s" % (self.node, real_msg, msg))
  386. if sent == 0:
  387. raise RuntimeError ("socket connection broken")
  388. self.used = True
  389. def __getattribute__(self,name):
  390. try:
  391. return object.__getattribute__(self, name)
  392. except:
  393. self.func_name = name
  394. if self.send_recv:
  395. return self.send_recv_dynamic
  396. else:
  397. return self.send_dynamic
  398. def send_recv_dynamic (self, *args):
  399. self.send_dynamic (args)
  400. try:
  401. res = self.read ()
  402. except RuntimeError, msg:
  403. self.env.log("send_recv_dynamic: %s; error: %s" % (str(real_msg), msg))
  404. return res
  405. def send_dynamic (self, *args):
  406. if not self.started:
  407. self.start()
  408. # number of args+func
  409. real_msg = str (len (args) + 1) + ":" + str(len(self.func_name)) + ":" + self.func_name
  410. for a in args:
  411. a_str = str(a)
  412. real_msg += ":" + str (len (a_str)) + ":" + a_str
  413. real_msg += ";"
  414. sent = 0
  415. try:
  416. sent = self.sock.send (real_msg)
  417. except socket.error, msg:
  418. self.env.debug("send_dynamic(%s): %s; error: %s" % (self.node, real_msg, msg))
  419. if sent == 0:
  420. raise RuntimeError ("socket connection broken")
  421. self.used = True
  422. def read (self):
  423. msg = self.sock.recv (4096)
  424. if msg == '':
  425. raise RuntimeError("socket connection broken")
  426. return msg
  427. class CpgConfigEvent:
  428. def __init__(self, msg):
  429. info = msg.split(',')
  430. self.group_name = info[0]
  431. self.node_id = info[1]
  432. self.node = None
  433. self.pid = info[2]
  434. if "left" in info[3]:
  435. self.is_member = False
  436. else:
  437. self.is_member = True
  438. def __str__ (self):
  439. str = self.group_name + "," + self.node_id + "," + self.pid + ","
  440. if self.is_member:
  441. return str + "joined"
  442. else:
  443. return str + "left"
  444. ###################################################################
  445. class CpgTestAgent(TestAgent):
  446. def __init__(self, node, Env=None):
  447. TestAgent.__init__(self, "cpg_test_agent", node, 9034, env=Env)
  448. self.initialized = False
  449. self.nodeid = None
  450. def start(self):
  451. if not self.started:
  452. TestAgent.start(self)
  453. self.cpg_initialize()
  454. self.used = False
  455. def stop(self):
  456. try:
  457. if self.started:
  458. self.cpg_finalize()
  459. except RuntimeError, msg:
  460. # if cpg_agent is down, we are not going to stress
  461. self.env.debug("CpgTestAgent::cpg_finalize() - %s" % msg)
  462. TestAgent.stop(self)
  463. def cpg_local_get(self):
  464. if self.nodeid == None:
  465. self.send (["cpg_local_get"])
  466. self.nodeid = self.read ()
  467. return self.nodeid
  468. def record_config_events(self, truncate=True):
  469. if truncate:
  470. self.send (["record_config_events", "truncate"])
  471. else:
  472. self.send (["record_config_events", "append"])
  473. return self.read ()
  474. def read_config_event(self):
  475. self.send (["read_config_event"])
  476. msg = self.read ()
  477. if "None" in msg:
  478. return None
  479. else:
  480. return CpgConfigEvent(msg)
  481. def read_messages(self, atmost):
  482. self.send (["read_messages", atmost])
  483. msg = self.read ()
  484. if "None" in msg:
  485. return None
  486. else:
  487. return msg
  488. def context_test(self):
  489. self.send (["context_test"])
  490. return self.read ()
  491. ###################################################################
  492. class ConfdbTestAgent(TestAgent):
  493. def __init__(self, node, Env=None):
  494. TestAgent.__init__(self, "confdb_test_agent", node, 9035, env=Env)
  495. self.initialized = False
  496. self.nodeid = None
  497. self.send_recv = True
  498. ###################################################################
  499. class SamTestAgent(TestAgent):
  500. def __init__(self, node, Env=None):
  501. TestAgent.__init__(self, "sam_test_agent", node, 9036, env=Env)
  502. self.initialized = False
  503. self.nodeid = None
  504. self.send_recv = True
  505. ###################################################################
  506. class VoteQuorumTestAgent(TestAgent):
  507. def __init__(self, node, Env=None):
  508. TestAgent.__init__(self, "votequorum_test_agent", node, 9037, env=Env)
  509. self.initialized = False
  510. self.nodeid = None
  511. self.send_recv = True
  512. def start(self):
  513. if not self.started:
  514. TestAgent.start(self)
  515. self.init()
  516. self.used = False