corotests.py 54 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615
  1. __copyright__='''
  2. Copyright (c) 2010 Red Hat, Inc.
  3. '''
  4. # All rights reserved.
  5. #
  6. # Author: Angus Salkeld <asalkeld@redhat.com>
  7. #
  8. # This software licensed under BSD license, the text of which follows:
  9. #
  10. # Redistribution and use in source and binary forms, with or without
  11. # modification, are permitted provided that the following conditions are met:
  12. #
  13. # - Redistributions of source code must retain the above copyright notice,
  14. # this list of conditions and the following disclaimer.
  15. # - Redistributions in binary form must reproduce the above copyright notice,
  16. # this list of conditions and the following disclaimer in the documentation
  17. # and/or other materials provided with the distribution.
  18. # - Neither the name of the MontaVista Software, Inc. nor the names of its
  19. # contributors may be used to endorse or promote products derived from this
  20. # software without specific prior written permission.
  21. #
  22. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  23. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  26. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  27. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  28. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  29. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  30. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  31. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  32. # THE POSSIBILITY OF SUCH DAMAGE.
  33. import random
  34. import socket
  35. from UserDict import UserDict
  36. from cts.CTStests import *
  37. from corosync import CpgTestAgent
  38. ###################################################################
  class CoroTest(CTSTest):
      '''
      Base class for the corosync tests: applies the test's configuration
      before the test runs and restores the default configuration afterwards.
      '''
      def __init__(self, cm):
          '''
          cm -- cluster manager object, passed on to CTSTest, StartTest and
                StopTest (it is used as self.CM below, presumably set by
                CTSTest.__init__).
          '''
          CTSTest.__init__(self,cm)
          self.start = StartTest(cm)
          self.stop = StopTest(cm)
          # config keys that setup() copies into CM.new_config
          self.config = {}
          self.config['logging/logger_subsys[1]/subsys'] = 'MAIN'
          self.config['logging/logger_subsys[1]/debug'] = 'on'
          # True: setup() starts every node; False: setup() stops every node
          self.need_all_up = True
          self.CM.start_cpg = True
          self.cpg_name = 'cts_group'

      def setup(self, node):
          '''
          Distribute the authkey, apply this test's config and bring every
          node into the state the test needs.

          node -- node used to generate the authkey when no local copy exists.
          Returns whatever CTSTest.setup() returned.
          '''
          ret = CTSTest.setup(self, node)

          # setup the authkey
          localauthkey = '/tmp/authkey'
          remoteauthkey = "/etc/corosync/authkey"
          if not os.path.exists(localauthkey):
              self.CM.rsh(node, 'corosync-keygen -l')
              self.CM.rsh.cp("%s:%s" % (node, remoteauthkey), localauthkey)

          for n in self.CM.Env["nodes"]:
              # compare node names by value: 'is not' only worked when both
              # names happened to be the very same string object
              if n != node:
                  # copy key onto other nodes
                  self.CM.rsh.cp(localauthkey, "%s:%s" % (n, remoteauthkey))

          # copy over any new config
          for c in self.config:
              self.CM.new_config[c] = self.config[c]

          # apply the config
          self.CM.apply_new_config(self.need_all_up)

          # start/stop all corosyncs
          for n in self.CM.Env["nodes"]:
              if self.need_all_up and not self.CM.StataCM(n):
                  self.incr("started")
                  self.start(n)
              if self.need_all_up and self.CM.start_cpg:
                  self.CM.cpg_agent[n].clean_start()
                  self.CM.cpg_agent[n].cpg_join(self.cpg_name)
                  self.CM.cpg_agent[n].cfg_initialize()
              if not self.need_all_up and self.CM.StataCM(n):
                  self.incr("stopped")
                  self.stop(n)
          return ret

      def config_valid(self, config):
          '''Every configuration is acceptable for the base test.'''
          return True

      def teardown(self, node):
          '''Restore the default config, then run CTSTest.teardown().'''
          self.CM.apply_default_config()
          return CTSTest.teardown(self, node)
  88. ###################################################################
  class CpgContextTest(CoroTest):
      '''
      Runs context_test() on the target node's cpg agent and passes when
      the reply contains 'OK'.
      '''
      def __init__(self, cm):
          # the name is assigned before the base initialiser runs
          self.name = "CpgContextTest"
          CoroTest.__init__(self, cm)
          self.CM.start_cpg = True

      def __call__(self, node):
          '''Run the context test on node; return success() or failure().'''
          self.incr("calls")
          reply = self.CM.cpg_agent[node].context_test()
          if 'OK' not in reply:
              return self.failure('context_test failed')
          return self.success()
  101. ###################################################################
  class CpgConfigChangeBase(CoroTest):
      '''
      join a cpg group on each node, and test that the following
      causes a leave event:
      - a call to cpg_leave()
      - app exit
      - node leave
      - node leave (with large token timeout)

      Subclasses trigger the event; wait_for_config_change() then polls the
      listener node's cpg agent for the matching leave event.
      '''
      def setup(self, node):
          '''
          Run CoroTest.setup(), then pick the first node in Env["nodes"] as
          the "wobbly" node (the one that will leave) and the second one as
          the listener that records config events.

          Returns whatever CoroTest.setup() returned.
          '''
          ret = CoroTest.setup(self, node)

          self.listener = None
          self.wobbly = None
          for n in self.CM.Env["nodes"]:
              if self.wobbly is None:
                  self.wobbly = n
              elif self.listener is None:
                  self.listener = n

          # 'in' replaces dict.has_key(), which no longer exists in Python 3
          if self.wobbly in self.CM.cpg_agent:
              self.wobbly_id = self.CM.cpg_agent[self.wobbly].cpg_local_get()
          if self.listener in self.CM.cpg_agent:
              self.CM.cpg_agent[self.listener].record_config_events(truncate=True)

          return ret

      def wait_for_config_change(self):
          '''
          Poll the listener's cpg agent until it reports that the wobbly node
          is no longer a member.

          Returns self.success() when the event is seen, self.failure() when
          the agent cannot be read or no event arrives within 15 minutes of
          accumulated waiting.
          '''
          max_timeout = 60 * 15
          waited = 0
          since_report = 0
          self.CM.log("Waiting for config change on " + self.listener)
          while True:
              try:
                  event = self.CM.cpg_agent[self.listener].read_config_event()
              except Exception:
                  # narrowed from a bare except so Ctrl-C / SystemExit are
                  # not turned into a test failure
                  return self.failure('connection to test cpg_agent failed.')

              if event is None:
                  if waited >= max_timeout:
                      return self.failure("timedout(" + str(waited) + " sec) == no event!")
                  time.sleep(1)
                  waited = waited + 1
                  since_report = since_report + 1
                  # progress note once a minute ('==', not 'is', for ints)
                  if since_report == 60:
                      print('waited ' + str(waited) + ' seconds')
                      since_report = 0
                  continue

              self.CM.debug("RECEIVED: " + str(event))
              # NOTE(review): this is a substring match, so node id "1" also
              # matches a wobbly id of "10" -- confirm the id format.
              if str(event.node_id) in str(self.wobbly_id) and not event.is_member:
                  self.CM.log("Got the config change in " + str(waited) + " seconds")
                  return self.success()

              self.CM.debug("No match")
              self.CM.debug("wobbly nodeid:" + str(self.wobbly_id))
              self.CM.debug("event nodeid:" + str(event.node_id))
              self.CM.debug("event.is_member:" + str(event.is_member))
  158. ###################################################################
  class CpgCfgChgOnGroupLeave(CpgConfigChangeBase):
      '''
      Config-change test triggered by a cpg_leave() on the wobbly node.
      '''
      def __init__(self, cm):
          CpgConfigChangeBase.__init__(self, cm)
          self.name = "CpgCfgChgOnGroupLeave"

      def failure_action(self):
          '''Make the wobbly node's cpg agent leave the test group.'''
          victim = self.wobbly
          self.CM.log("calling cpg_leave() on " + victim)
          agent = self.CM.cpg_agent[victim]
          agent.cpg_leave(self.cpg_name)

      def __call__(self, node):
          '''Trigger the leave and wait for the listener to report it.'''
          self.incr("calls")
          self.failure_action()
          outcome = self.wait_for_config_change()
          return outcome
  170. ###################################################################
  class CpgCfgChgOnNodeLeave(CpgConfigChangeBase):
      '''
      Config-change test triggered by stopping corosync on the wobbly node.
      '''
      def __init__(self, cm):
          CpgConfigChangeBase.__init__(self, cm)
          self.name = "CpgCfgChgOnNodeLeave"

      def failure_action(self):
          '''Stop corosync on the wobbly node via the StopTest helper.'''
          victim = self.wobbly
          self.CM.log("stopping corosync on " + victim)
          self.stop(victim)

      def __call__(self, node):
          '''Stop the node and wait for the listener to report the leave.'''
          self.incr("calls")
          self.failure_action()
          outcome = self.wait_for_config_change()
          return outcome
  182. ###################################################################
  class CpgCfgChgOnLowestNodeJoin(CTSTest):
      '''
      1) stop all nodes
      2) start all but the node with the smallest ip address
      3) start recording events
      4) start the last node
      '''
      def __init__(self, cm):
          CTSTest.__init__(self, cm)
          self.name="CpgCfgChgOnLowestNodeJoin"
          self.start = StartTest(cm)
          self.stop = StopTest(cm)
          self.config = {}
          self.need_all_up = False
          # setup() and __call__() join this group, but the attribute was
          # never assigned here (this class does not derive from CoroTest).
          # Same value CoroTest uses.
          self.cpg_name = 'cts_group'

      def config_valid(self, config):
          '''Every configuration is acceptable for this test.'''
          return True

      def lowest_ip_set(self):
          '''
          Record the "lowest" node in self.lowest and log it.

          The code simply takes the first entry of Env["nodes"]; no address
          comparison is done (presumably the list is already ordered --
          TODO confirm).
          '''
          self.lowest = None
          for n in self.CM.Env["nodes"]:
              self.lowest = n
              break

          self.CM.log("lowest node is " + self.lowest)

      def setup(self, node):
          '''
          Stop every node, install the config, start every node except the
          lowest one and arm a log watch for the sync messages on the
          listener (the first node that was started).

          Returns whatever CTSTest.setup() returns.
          '''
          # stop all nodes
          for n in self.CM.Env["nodes"]:
              self.CM.StopaCM(n)

          self.lowest_ip_set()

          # copy over any new config
          for c in self.config:
              self.CM.new_config[c] = self.config[c]

          # install the config
          self.CM.install_all_config()

          # start all but lowest
          self.listener = None
          for n in self.CM.Env["nodes"]:
              # compare node names by value rather than by object identity
              if n == self.lowest:
                  continue
              if self.listener is None:
                  self.listener = n
              self.incr("started")
              self.CM.log("starting " + n)
              self.start(n)
              self.CM.cpg_agent[n].clean_start()
              self.CM.cpg_agent[n].cpg_join(self.cpg_name)

          # start recording events
          pats = [
              "%s .*sync: node joined.*" % self.listener,
              "%s .*sync: activate correctly.*" % self.listener,
          ]
          self.sync_log = self.create_watch(pats, 60)
          self.sync_log.setwatch()

          self.CM.log("setup done")

          return CTSTest.setup(self, node)

      def __call__(self, node):
          '''
          Start the lowest node, join it to the group and pass when every
          watched sync pattern shows up in the log.
          '''
          self.incr("calls")

          self.start(self.lowest)
          self.CM.cpg_agent[self.lowest].clean_start()
          self.CM.cpg_agent[self.lowest].cpg_join(self.cpg_name)
          self.wobbly_id = self.CM.cpg_agent[self.lowest].cpg_local_get()

          self.CM.log("waiting for sync events")
          if not self.sync_log.lookforall():
              return self.failure("Patterns not found: " + repr(self.sync_log.unmatched))
          return self.success()
  245. ###################################################################
  class CpgCfgChgOnExecCrash(CpgConfigChangeBase):
      '''
      Config-change test triggered by killing corosync on the wobbly node.
      '''
      def __init__(self, cm):
          CpgConfigChangeBase.__init__(self, cm)
          self.name = "CpgCfgChgOnExecCrash"

      def failure_action(self):
          '''Kill corosync, remove its pid file and mark the node down.'''
          victim = self.wobbly
          self.CM.log("sending KILL to corosync on " + victim)
          for command in ("killall -9 corosync", "rm -f /var/run/corosync.pid"):
              self.CM.rsh(victim, command)
          self.CM.ShouldBeStatus[victim] = "down"

      def __call__(self, node):
          '''Crash the node and wait for the listener to report the leave.'''
          self.incr("calls")
          self.failure_action()
          outcome = self.wait_for_config_change()
          return outcome
  259. ###################################################################
  class CpgCfgChgOnNodeIsolate(CpgConfigChangeBase):
      '''
      Config-change test triggered by isolating the wobbly node.
      '''
      def __init__(self, cm):
          CpgConfigChangeBase.__init__(self,cm)
          self.name="CpgCfgChgOnNodeIsolate"

      def config_valid(self, config):
          '''Reject any config that sets 'totem/rrp_mode'.'''
          # 'in' replaces dict.has_key(), which no longer exists in Python 3
          return 'totem/rrp_mode' not in config

      def failure_action(self):
          '''Isolate the wobbly node through the cluster manager.'''
          self.CM.log("isolating node " + self.wobbly)
          self.CM.isolate_node(self.wobbly)

      def __call__(self, node):
          '''Isolate the node and wait for the listener to report the leave.'''
          self.incr("calls")
          self.failure_action()
          return self.wait_for_config_change()

      def teardown(self, node):
          '''Undo the isolation before the normal teardown runs.'''
          self.CM.unisolate_node (self.wobbly)
          return CpgConfigChangeBase.teardown(self, node)
  279. ###################################################################
  class CpgCfgChgOnNodeRestart(CpgConfigChangeBase):
      '''
      Config-change test that crashes and restarts corosync on the wobbly
      node twice, starting the cpg agents in between, and then waits for
      the listener to report the leave.
      '''
      def __init__(self, cm):
          CpgConfigChangeBase.__init__(self,cm)
          self.name="CpgCfgChgOnNodeRestart"
          # cpg agents are started by failure_action(), not by setup()
          self.CM.start_cpg = False

      def config_valid(self, config):
          '''
          Reject configs with 'totem/secauth' set to 'on'; otherwise reject
          configs that set 'totem/rrp_mode'.
          '''
          # 'in' / '==' replace has_key() and the identity test "is 'on'",
          # which only worked when the value was the interned literal.
          if 'totem/secauth' in config:
              # NOTE(review): when secauth is present but not 'on' the
              # rrp_mode check below is skipped, as in the original code --
              # confirm that this is intended.
              return config['totem/secauth'] != 'on'
          return 'totem/rrp_mode' not in config

      def _crash_and_restart(self, step):
          '''
          Isolate the wobbly node, kill corosync on it, unisolate it and
          start corosync again.  step is the number used in the first of the
          four consecutive log messages.
          '''
          self.CM.log("%d: isolating node " % step + self.wobbly)
          self.CM.isolate_node(self.wobbly)
          self.CM.log("%d: Killing corosync on " % (step + 1) + self.wobbly)
          self.CM.rsh(self.wobbly, "killall -9 corosync")
          self.CM.rsh(self.wobbly, "rm -f /var/run/corosync.pid")
          self.CM.ShouldBeStatus[self.wobbly] = "down"
          self.CM.log("%d: unisolating node " % (step + 2) + self.wobbly)
          self.CM.unisolate_node (self.wobbly)
          self.CM.log("%d: starting corosync on " % (step + 3) + self.wobbly)
          self.CM.StartaCM(self.wobbly)

      def failure_action(self):
          '''
          Crash/restart the wobbly node, start a fresh cpg agent on every
          node, begin recording config events, then crash/restart again.
          '''
          # log steps 2-5
          self._crash_and_restart(2)

          time.sleep(5)
          self.CM.log("6: starting cpg on all nodes")
          self.CM.start_cpg = True
          for node in self.CM.Env["nodes"]:
              self.CM.cpg_agent[node] = CpgTestAgent(node, self.CM.Env)
              self.CM.cpg_agent[node].start()
              self.CM.cpg_agent[node].cpg_join(self.cpg_name)

          self.wobbly_id = self.CM.cpg_agent[self.wobbly].cpg_local_get()
          self.CM.cpg_agent[self.listener].record_config_events(truncate=True)

          # log steps 7-10
          self._crash_and_restart(7)

      def __call__(self, node):
          '''Run the restart sequence and wait for the leave event.'''
          self.incr("calls")
          self.failure_action()
          return self.wait_for_config_change()

      def teardown(self, node):
          '''Make sure the node is unisolated before the normal teardown.'''
          self.CM.unisolate_node (self.wobbly)
          return CpgConfigChangeBase.teardown(self, node)
  332. ###################################################################
  class CpgMsgOrderBase(CoroTest):
      '''
      Base for the message-ordering tests: every node sends
      num_msgs_per_node messages and every node must record all of them in
      the same order.
      '''
      def __init__(self, cm):
          CoroTest.__init__(self,cm)
          self.num_msgs_per_node = 0
          self.total_num_msgs = 0

      def setup(self, node):
          '''
          Run CoroTest.setup(), then restart each cpg agent, join the group
          and start recording messages.  Returns CoroTest.setup()'s result.
          '''
          ret = CoroTest.setup(self, node)

          for n in self.CM.Env["nodes"]:
              self.CM.cpg_agent[n].clean_start()
              self.CM.cpg_agent[n].cpg_join(self.cpg_name)
              self.CM.cpg_agent[n].record_messages()

          # NOTE(review): placed after the loop (one pause in total); the
          # pasted source lost its indentation -- confirm against the file.
          time.sleep(1)
          return ret

      def cpg_msg_blaster(self):
          '''Ask every node's agent to send num_msgs_per_node messages.'''
          for n in self.CM.Env["nodes"]:
              self.CM.cpg_agent[n].msg_blaster(self.num_msgs_per_node)

      def wait_and_validate_order(self):
          '''
          Collect the recorded messages from every node, then check that
          each message reports 'OK' and that all nodes saw the same sequence.

          Returns self.success(), or self.failure() when an agent cannot be
          read, a node delivers too few messages, a message carries an
          error, or the order differs between nodes.
          '''
          nodes = self.CM.Env["nodes"]
          msgs = {}
          self.total_num_msgs = self.num_msgs_per_node * len(nodes)

          for n in nodes:
              msgs[n] = []
              waited = 0

              while len(msgs[n]) < self.total_num_msgs and waited < 360:
                  try:
                      msg = self.CM.cpg_agent[n].read_messages(50)
                  except Exception:
                      # narrowed from a bare except so Ctrl-C / SystemExit
                      # are not reported as a test failure
                      return self.failure('connection to test cpg_agent failed.')

                  if msg is None:
                      time.sleep(2)
                      waited = waited + 2
                  else:
                      # split the batch and drop empty entries
                      msgs[n].extend([m for m in msg.split(";") if m != ''])

              if len(msgs[n]) < self.total_num_msgs:
                  return self.failure("expected %d messages from %s got %d" % (self.total_num_msgs, n, len(msgs[n])))

          fail = False
          error_message = ''
          for i in range(0, self.total_num_msgs):
              first = None
              for n in nodes:
                  # first test for errors
                  params = msgs[n][i].split(":")
                  # a message with fewer than four fields is reported as a
                  # failure instead of raising IndexError
                  status = params[3] if len(params) > 3 else '<malformed>'
                  if 'OK' not in status:
                      fail = True
                      error_message = 'error: ' + status + ' in received message'
                      self.CM.log(str(params))

                  # then look for out of order messages
                  if first is None:
                      first = n
                  elif msgs[first][i] != msgs[n][i]:
                      # message order not the same!
                      fail = True
                      error_message = 'message out of order'
                      self.CM.log(msgs[first][i] + " != " + msgs[n][i])

          if fail:
              return self.failure(error_message)
          return self.success()
  402. ###################################################################
  class CpgMsgOrderBasic(CpgMsgOrderBase):
      '''
      each sends & logs lots of messages
      (sent with the agents' msg_blaster call)
      '''
      def __init__(self, cm):
          CpgMsgOrderBase.__init__(self, cm)
          self.name = "CpgMsgOrderBasic"
          self.num_msgs_per_node = 9000

      def __call__(self, node):
          '''Blast messages from every node, then validate their order.'''
          self.incr("calls")
          count = self.num_msgs_per_node
          for member in self.CM.Env["nodes"]:
              agent = self.CM.cpg_agent[member]
              agent.msg_blaster(count)
          return self.wait_and_validate_order()
  416. ###################################################################
  class CpgMsgOrderZcb(CpgMsgOrderBase):
      '''
      each sends & logs lots of messages
      (sent with the agents' msg_blaster_zcb call)
      '''
      def __init__(self, cm):
          CpgMsgOrderBase.__init__(self, cm)
          self.name = "CpgMsgOrderZcb"
          self.num_msgs_per_node = 9000

      def __call__(self, node):
          '''Blast messages from every node, then validate their order.'''
          self.incr("calls")
          count = self.num_msgs_per_node
          for member in self.CM.Env["nodes"]:
              agent = self.CM.cpg_agent[member]
              agent.msg_blaster_zcb(count)
          return self.wait_and_validate_order()
  430. ###################################################################
  class MemLeakObject(CoroTest):
      '''
      run mem_leak_test.sh -1
      '''
      def __init__(self, cm):
          CoroTest.__init__(self,cm)
          self.name="MemLeakObject"

      def __call__(self, node):
          '''
          Run the leak script on node; pass when rsh reports 0, otherwise
          fail and include the reported value in the message.
          '''
          self.incr("calls")

          mem_leaked = self.CM.rsh(node, "/usr/share/corosync/tests/mem_leak_test.sh -1")
          # '==' instead of 'is': integer identity is an implementation
          # detail and a SyntaxWarning on Python 3.8+
          if mem_leaked == 0:
              return self.success()
          return self.failure(str(mem_leaked) + 'kB memory leaked.')
  445. ###################################################################
  class MemLeakSession(CoroTest):
      '''
      run mem_leak_test.sh -2
      '''
      def __init__(self, cm):
          CoroTest.__init__(self,cm)
          self.name="MemLeakSession"

      def __call__(self, node):
          '''
          Run the leak script on node; pass when rsh reports 0, otherwise
          fail and include the reported value in the message.
          '''
          self.incr("calls")

          mem_leaked = self.CM.rsh(node, "/usr/share/corosync/tests/mem_leak_test.sh -2")
          # '==' instead of 'is': integer identity is an implementation
          # detail and a SyntaxWarning on Python 3.8+
          if mem_leaked == 0:
              return self.success()
          return self.failure(str(mem_leaked) + 'kB memory leaked.')
  460. ###################################################################
  class CMapDispatchDeadlock(CoroTest):
      '''
      run cmap-dispatch-deadlock.sh
      '''
      def __init__(self, cm):
          CoroTest.__init__(self,cm)
          self.name="CMapDispatchDeadlock"

      def __call__(self, node):
          '''Run the deadlock script on node; pass when rsh reports 0.'''
          self.incr("calls")

          result = self.CM.rsh(node, "/usr/share/corosync/tests/cmap-dispatch-deadlock.sh")
          # '==' instead of 'is': integer identity is an implementation
          # detail and a SyntaxWarning on Python 3.8+
          if result == 0:
              return self.success()
          return self.failure('Deadlock detected')
  475. ###################################################################
  class SamTest1(CoroTest):
      '''
      Calls test1() on the node's sam agent; passes when the reply
      contains 'OK'.
      '''
      def __init__(self, cm):
          CoroTest.__init__(self, cm)
          self.name = "SamTest1"

      def __call__(self, node):
          self.incr("calls")
          reply = self.CM.sam_agent[node].test1()
          if 'OK' not in reply:
              return self.failure(self.name + ' failed')
          return self.success()
  487. ###################################################################
  class SamTest2(CoroTest):
      '''
      Calls test2() on the node's sam agent; passes when the reply
      contains 'OK'.
      '''
      def __init__(self, cm):
          CoroTest.__init__(self, cm)
          self.name = "SamTest2"

      def __call__(self, node):
          self.incr("calls")
          reply = self.CM.sam_agent[node].test2()
          if 'OK' not in reply:
              return self.failure(self.name + ' failed')
          return self.success()
  499. ###################################################################
  class SamTest4(CoroTest):
      '''
      Calls test4() on the node's sam agent; passes when the reply
      contains 'OK'.
      '''
      def __init__(self, cm):
          CoroTest.__init__(self, cm)
          self.name = "SamTest4"

      def __call__(self, node):
          self.incr("calls")
          reply = self.CM.sam_agent[node].test4()
          if 'OK' not in reply:
              return self.failure(self.name + ' failed')
          return self.success()
  511. ###################################################################
  class SamTest5(CoroTest):
      '''
      Calls test5() on the node's sam agent; passes when the reply
      contains 'OK'.
      '''
      def __init__(self, cm):
          CoroTest.__init__(self, cm)
          self.name = "SamTest5"

      def __call__(self, node):
          self.incr("calls")
          reply = self.CM.sam_agent[node].test5()
          if 'OK' not in reply:
              return self.failure(self.name + ' failed')
          return self.success()
  523. ###################################################################
  class SamTest6(CoroTest):
      '''
      Calls test6() on the node's sam agent; passes when the reply
      contains 'OK'.
      '''
      def __init__(self, cm):
          CoroTest.__init__(self, cm)
          self.name = "SamTest6"

      def __call__(self, node):
          self.incr("calls")
          reply = self.CM.sam_agent[node].test6()
          if 'OK' not in reply:
              return self.failure(self.name + ' failed')
          return self.success()
  535. ###################################################################
  class SamTest8(CoroTest):
      '''
      Calls test8() on the node's sam agent; passes when the reply
      contains 'OK'.
      '''
      def __init__(self, cm):
          CoroTest.__init__(self, cm)
          self.name = "SamTest8"

      def __call__(self, node):
          self.incr("calls")
          reply = self.CM.sam_agent[node].test8()
          if 'OK' not in reply:
              return self.failure(self.name + ' failed')
          return self.success()
  547. ###################################################################
  548. class SamTest9(CoroTest):
  549. def __init__(self, cm):
  550. CoroTest.__init__(self, cm)
  551. self.name="SamTest9"
  552. def __call__(self, node):
  553. self.incr("calls")
  554. res = self.CM.sam_agent[node].test9()
  555. if 'OK' in res:
  556. return self.success()
  557. else:
  558. return self.failure(self.name + ' failed')
  class QuorumState(object):
      '''
      Holds the vote/quorum figures reported by one node's votequorum
      test agent.  Call refresh() to (re)load them.
      '''
      def __init__(self, cm, node):
          '''
          cm   -- cluster manager exposing votequorum_agent[node]
          node -- node whose agent is queried
          '''
          self.node = node
          self.CM = cm
          # filled in by refresh(); defined here so they can be read safely
          # before the first refresh
          self.node_votes = None
          self.expected_votes = None
          self.highest_expected = None
          self.total_votes = None
          self.quorum = None
          self.quorate = None
          self.CM.votequorum_agent[self.node].init()

      def refresh(self):
          '''
          Reload the figures from the agent.

          votequorum_getinfo() is parsed as five ':'-separated integers:
          node_votes, expected_votes, highest_expected, total_votes, quorum.
          Raises AssertionError when the agent answers 'FAIL' or
          'NOT_SUPPORTED' (raised explicitly so the check also runs
          under python -O).
          '''
          agent = self.CM.votequorum_agent[self.node]

          info = agent.votequorum_getinfo()
          if info in ('FAIL', 'NOT_SUPPORTED'):
              raise AssertionError('votequorum_getinfo returned ' + str(info))

          #self.CM.log('refresh: ' + info)
          params = info.split(':')
          # parse everything first so a bad reply cannot leave the object
          # half-updated
          values = [int(params[i]) for i in range(5)]
          (self.node_votes,
           self.expected_votes,
           self.highest_expected,
           self.total_votes,
           self.quorum) = values

          self.quorate = agent.quorum_getquorate()
          if self.quorate in ('FAIL', 'NOT_SUPPORTED'):
              raise AssertionError('quorum_getquorate returned ' + str(self.quorate))
          #self.CM.log('quorate: ' + str(self.quorate))
  579. ###################################################################
  class VoteQuorumBase(CoroTest):
      '''
      Common setup for the votequorum tests: picks the first node of
      Env["nodes"] as the listener.
      '''
      def setup(self, node):
          '''Run CoroTest.setup() and choose the listener node.'''
          ret = CoroTest.setup(self, node)
          self.listener = None
          for n in self.CM.Env["nodes"]:
              if self.listener is None:
                  self.listener = n

          return ret

      def config_valid(self, config):
          '''
          Reject configs that already set 'totem/rrp_mode' or
          'quorum/provider'.
          '''
          # 'in' replaces dict.has_key(), which no longer exists in Python 3
          if 'totem/rrp_mode' in config:
              return False
          if 'quorum/provider' in config:
              return False
          return True
  594. ###################################################################
  class VoteQuorumGoDown(VoteQuorumBase):
      '''
      all up
      calc min expected votes to get Q
      bring nodes down one-by-one
      confirm cluster loses Q when V < EV
      '''
      def __init__(self, cm):
          VoteQuorumBase.__init__(self, cm)
          self.name="VoteQuorumGoDown"
          self.victims = []
          self.expected = len(self.CM.Env["nodes"])
          self.config['quorum/provider'] = 'corosync_votequorum'
          self.config['quorum/expected_votes'] = self.expected
          #self.CM.log('set expected to %d' % (self.expected))

      def __call__(self, node):
          '''
          Stop every node except the listener, checking the listener's
          quorum state after each stop, then require the "quorate: 0"
          notifications in the log.

          Returns self.failure() on the first inconsistent state (the
          original recorded the failure but still returned success()),
          otherwise self.success().
          '''
          self.incr("calls")

          self.victims = []
          pats = [
              "%s .*VQ notification quorate: 0" % self.listener,
              "%s .*NQ notification quorate: 0" % self.listener,
          ]
          quorum = self.create_watch(pats, 30)
          quorum.setwatch()

          state = QuorumState(self.CM, self.listener)
          state.refresh()

          num_nodes = len(self.CM.Env["nodes"])
          # floor division keeps this an int on Python 3 as well
          min_quorum = (num_nodes + 2) // 2

          for n in self.CM.Env["nodes"]:
              # compare node names by value rather than by object identity
              if n == self.listener:
                  continue

              self.victims.append(n)
              self.CM.StopaCM(n)

              #if not self.wait_for_quorum_change():
              #    return self.failure(self.error_message)
              nodes_alive = num_nodes - len(self.victims)
              state.refresh()
              #self.expected = self.expected - 1

              if state.node_votes != 1:
                  return self.failure('unexpected number of node_votes')

              if state.expected_votes != self.expected:
                  self.CM.log('nev: %d != exp %d' % (state.expected_votes, self.expected))
                  return self.failure('unexpected number of expected_votes')

              if state.total_votes != nodes_alive:
                  return self.failure('unexpected number of total votes:%d, nodes_alive:%d' % (state.total_votes, nodes_alive))

              if min_quorum != state.quorum:
                  return self.failure('we should have %d (not %d) as quorum' % (min_quorum, state.quorum))

              if nodes_alive < state.quorum:
                  if state.quorate == 1:
                      return self.failure('we should NOT have quorum(%d) %d > %d' % (state.quorate, state.quorum, nodes_alive))
              else:
                  if state.quorate == 0:
                      return self.failure('we should have quorum(%d) %d <= %d' % (state.quorate, state.quorum, nodes_alive))

          if not quorum.lookforall():
              self.CM.log("Patterns not found: " + repr(quorum.unmatched))
              return self.failure('quorm event not found')

          return self.success()
  649. ###################################################################
  class VoteQuorumGoUp(VoteQuorumBase):
      '''
      all down
      calc min expected votes to get Q
      bring nodes up one-by-one
      confirm cluster gains Q when V >= EV
      '''
      def __init__(self, cm):
          VoteQuorumBase.__init__(self, cm)
          self.name="VoteQuorumGoUp"
          self.need_all_up = False
          self.expected = len(self.CM.Env["nodes"])
          self.config['quorum/provider'] = 'corosync_votequorum'
          self.config['quorum/expected_votes'] = self.expected
          #self.CM.log('set expected to %d' % (self.expected))

      def __call__(self, node):
          '''
          Start the listener, then start the remaining nodes one at a time,
          checking the listener's quorum state before each start, and
          finally require the "quorate: 1" notifications in the log.

          Returns self.failure() on the first inconsistent state (the
          original recorded the failure but still returned success()),
          otherwise self.success().
          '''
          self.incr("calls")

          pats = [
              "%s .*VQ notification quorate: 1" % self.listener,
              "%s .*NQ notification quorate: 1" % self.listener,
          ]
          quorum = self.create_watch(pats, 30)
          quorum.setwatch()

          self.CM.StartaCM(self.listener)
          nodes_alive = 1
          state = QuorumState(self.CM, self.listener)
          state.refresh()

          # floor division keeps this an int on Python 3 as well
          min_quorum = (len(self.CM.Env["nodes"]) + 2) // 2

          for n in self.CM.Env["nodes"]:
              # compare node names by value rather than by object identity
              if n == self.listener:
                  continue

              #if not self.wait_for_quorum_change():
              #    return self.failure(self.error_message)

              if state.node_votes != 1:
                  return self.failure('unexpected number of node_votes')

              if state.expected_votes != self.expected:
                  self.CM.log('nev: %d != exp %d' % (state.expected_votes, self.expected))
                  return self.failure('unexpected number of expected_votes')

              if state.total_votes != nodes_alive:
                  return self.failure('unexpected number of total votes')

              if min_quorum != state.quorum:
                  return self.failure('we should have %d (not %d) as quorum' % (min_quorum, state.quorum))

              if nodes_alive < state.quorum:
                  if state.quorate == 1:
                      return self.failure('we should NOT have quorum(%d) %d > %d' % (state.quorate, state.quorum, nodes_alive))
              else:
                  if state.quorate == 0:
                      return self.failure('we should have quorum(%d) %d <= %d' % (state.quorate, state.quorum, nodes_alive))

              # the state seen after this start is checked on the next
              # iteration; the state after the last start is not checked
              self.CM.StartaCM(n)
              nodes_alive = nodes_alive + 1
              state.refresh()

          if not quorum.lookforall():
              self.CM.log("Patterns not found: " + repr(quorum.unmatched))
              return self.failure('quorm event not found')

          return self.success()
  702. ###################################################################
  703. class VoteQuorumWaitForAll(VoteQuorumBase):
  704. # all down
  705. # bring nodes up one-by-one
  706. # confirm cluster gains Q when V == num nodes
  707. def __init__(self, cm):
  708. VoteQuorumBase.__init__(self, cm)
  709. self.name="VoteQuorumWaitForAll"
  710. self.need_all_up = False
  711. self.expected = len(self.CM.Env["nodes"])
  712. self.config['quorum/provider'] = 'corosync_votequorum'
  713. self.config['quorum/expected_votes'] = self.expected
  714. self.config['quorum/wait_for_all'] = '1'
  715. def __call__(self, node):
  716. self.incr("calls")
  717. pats = []
  718. pats.append("%s .*VQ notification quorate: 1" % self.listener)
  719. pats.append("%s .*NQ notification quorate: 1" % self.listener)
  720. quorum = self.create_watch(pats, 30)
  721. quorum.setwatch()
  722. # make absolutly all are stopped
  723. for n in self.CM.Env["nodes"]:
  724. self.CM.StopaCM(n)
  725. # start the listener
  726. self.CM.StartaCM(self.listener)
  727. nodes_alive = 1
  728. state = QuorumState(self.CM, self.listener)
  729. state.refresh()
  730. for n in self.CM.Env["nodes"]:
  731. if n is self.listener:
  732. continue
  733. self.CM.StartaCM(n)
  734. nodes_alive = nodes_alive + 1
  735. state.refresh()
  736. if state.node_votes != 1:
  737. self.failure('unexpected number of node_votes')
  738. if state.expected_votes != self.expected:
  739. self.CM.log('nev: %d != exp %d' % (state.expected_votes, self.expected))
  740. self.failure('unexpected number of expected_votes')
  741. if state.total_votes != nodes_alive:
  742. self.failure('unexpected number of total votes')
  743. if nodes_alive < len(self.CM.Env["nodes"]):
  744. if state.quorate == 1:
  745. self.failure('we should NOT have quorum(%d) %d > %d' % (state.quorate,
  746. len(self.CM.Env["nodes"]), nodes_alive))
  747. else:
  748. if state.quorate == 0:
  749. self.failure('we should have quorum(%d) %d <= %d' % (state.quorate,
  750. len(self.CM.Env["nodes"]), nodes_alive))
  751. if not quorum.lookforall():
  752. self.CM.log("Patterns not found: " + repr(quorum.unmatched))
  753. return self.failure('quorm event not found')
  754. return self.success()
  755. ###################################################################
  756. class VoteQuorumContextTest(CoroTest):
  757. def __init__(self, cm):
  758. CoroTest.__init__(self, cm)
  759. self.name="VoteQuorumContextTest"
  760. self.expected = len(self.CM.Env["nodes"])
  761. self.config['quorum/provider'] = 'corosync_votequorum'
  762. self.config['quorum/expected_votes'] = self.expected
  763. def __call__(self, node):
  764. self.incr("calls")
  765. res = self.CM.votequorum_agent[node].context_test()
  766. if 'OK' in res:
  767. return self.success()
  768. else:
  769. return self.failure('context_test failed')
  770. ###################################################################
  771. class GenSimulStart(CoroTest):
  772. '''Start all the nodes ~ simultaneously'''
  773. def __init__(self, cm):
  774. CoroTest.__init__(self,cm)
  775. self.name="GenSimulStart"
  776. self.need_all_up = False
  777. self.stopall = SimulStopLite(cm)
  778. self.startall = SimulStartLite(cm)
  779. def __call__(self, dummy):
  780. '''Perform the 'SimulStart' test. '''
  781. self.incr("calls")
  782. # We ignore the "node" parameter...
  783. # Shut down all the nodes...
  784. ret = self.stopall(None)
  785. if not ret:
  786. return self.failure("Setup failed")
  787. self.CM.clear_all_caches()
  788. if not self.startall(None):
  789. return self.failure("Startall failed")
  790. return self.success()
  791. ###################################################################
  792. class GenSimulStop(CoroTest):
  793. '''Stop all the nodes ~ simultaneously'''
  794. def __init__(self, cm):
  795. CoroTest.__init__(self,cm)
  796. self.name="GenSimulStop"
  797. self.startall = SimulStartLite(cm)
  798. self.stopall = SimulStopLite(cm)
  799. self.need_all_up = True
  800. def __call__(self, dummy):
  801. '''Perform the 'GenSimulStop' test. '''
  802. self.incr("calls")
  803. # We ignore the "node" parameter...
  804. # Start up all the nodes...
  805. ret = self.startall(None)
  806. if not ret:
  807. return self.failure("Setup failed")
  808. if not self.stopall(None):
  809. return self.failure("Stopall failed")
  810. return self.success()
  811. class GenFlipTest(CoroTest):
  812. def __init__(self, cm):
  813. CoroTest.__init__(self,cm)
  814. self.name="GenFlipTest"
  815. self.test = FlipTest(cm)
  816. def __call__(self, dummy):
  817. '''Perform the test. '''
  818. self.incr("calls")
  819. return self.test.__call__(dummy)
  820. class GenRestartTest(CoroTest):
  821. def __init__(self, cm):
  822. CoroTest.__init__(self,cm)
  823. self.name="GenRestartTest"
  824. self.test = RestartTest(cm)
  825. def __call__(self, dummy):
  826. '''Perform the test. '''
  827. self.incr("calls")
  828. return self.test.__call__(dummy)
  829. class GenStartOnebyOne(CoroTest):
  830. def __init__(self, cm):
  831. CoroTest.__init__(self,cm)
  832. self.name="GenStartOnebyOne"
  833. self.test = RestartOnebyOne(cm)
  834. def __call__(self, dummy):
  835. '''Perform the test. '''
  836. self.incr("calls")
  837. return self.test.__call__(dummy)
  838. class GenStopOnebyOne(CoroTest):
  839. def __init__(self, cm):
  840. CoroTest.__init__(self,cm)
  841. self.name="GenStopOnebyOne"
  842. self.test = StopOnebyOne(cm)
  843. def __call__(self, dummy):
  844. '''Perform the test. '''
  845. self.incr("calls")
  846. return self.test.__call__(dummy)
  847. class GenRestartOnebyOne(CoroTest):
  848. def __init__(self, cm):
  849. CoroTest.__init__(self,cm)
  850. self.name="GenRestartOnebyOne"
  851. self.test = RestartOnebyOne(cm)
  852. def __call__(self, dummy):
  853. '''Perform the test. '''
  854. self.incr("calls")
  855. return self.test.__call__(dummy)
  856. ###################################################################
  857. class GenStopAllBeekhof(CoroTest):
  858. '''Stop all the nodes ~ simultaneously'''
  859. def __init__(self, cm):
  860. CoroTest.__init__(self,cm)
  861. self.name="GenStopAllBeekhof"
  862. self.need_all_up = True
  863. self.config['logging/logger_subsys[2]/subsys'] = 'CFG'
  864. self.config['logging/logger_subsys[2]/debug'] = 'on'
  865. def __call__(self, node):
  866. '''Perform the 'GenStopAllBeekhof' test. '''
  867. self.incr("calls")
  868. stopping = int(time.time())
  869. for n in self.CM.Env["nodes"]:
  870. self.CM.cpg_agent[n].pcmk_test()
  871. for n in self.CM.Env["nodes"]:
  872. self.CM.cpg_agent[n].msg_blaster(1000)
  873. for n in self.CM.Env["nodes"]:
  874. self.CM.cpg_agent[n].cfg_shutdown()
  875. self.CM.ShouldBeStatus[n] = "down"
  876. waited = 0
  877. max_wait = 60 * 15
  878. still_up = list(self.CM.Env["nodes"])
  879. while len(still_up) > 0:
  880. waited = int(time.time()) - stopping
  881. self.CM.log("%s still up %s; waited %d secs" % (self.name, str(still_up), waited))
  882. if waited > max_wait:
  883. break
  884. time.sleep(3)
  885. for v in self.CM.Env["nodes"]:
  886. if v in still_up:
  887. self.CM.ShouldBeStatus[n] = "down"
  888. if not self.CM.StataCM(v):
  889. still_up.remove(v)
  890. waited = int(time.time()) - stopping
  891. if waited > max_wait:
  892. return self.failure("Waited %d secs for nodes: %s to stop" % (waited, str(still_up)))
  893. self.CM.log("%s ALL good (waited %d secs)" % (self.name, waited))
  894. return self.success()
  895. ###################################################################
  896. class NoWDConfig(CoroTest):
  897. '''Assertion: no config == no watchdog
  898. Setup: no config, kmod inserted
  899. 1] make sure watchdog is not enabled
  900. '''
  901. def __init__(self, cm):
  902. CoroTest.__init__(self,cm)
  903. self.name="NoWDConfig"
  904. self.need_all_up = False
  905. def config_valid(self, config):
  906. return not config.has_key('resources')
  907. def __call__(self, node):
  908. '''Perform the 'NoWDConfig' test. '''
  909. self.incr("calls")
  910. self.CM.StopaCM(node)
  911. pats = []
  912. pats.append("%s .*no resources configured." % node)
  913. w = self.create_watch(pats, 60)
  914. w.setwatch()
  915. self.CM.StartaCM(node)
  916. if not w.lookforall():
  917. return self.failure("Patterns not found: " + repr(w.unmatched))
  918. else:
  919. return self.success()
  920. ###################################################################
  921. class WDConfigNoWd(CoroTest):
  922. '''Assertion: watchdog config but no watchdog kmod will emit a log
  923. Setup: config watchdog, but no kmod
  924. 1] look in the log for warning that there is no kmod
  925. '''
  926. def __init__(self, cm):
  927. CoroTest.__init__(self,cm)
  928. self.name="WDConfigNoWd"
  929. self.need_all_up = False
  930. def __call__(self, node):
  931. '''Perform the 'WDConfigNoWd' test. '''
  932. self.incr("calls")
  933. self.CM.StopaCM(node)
  934. self.CM.rsh(node, 'rmmod softdog')
  935. pats = []
  936. pats.append("%s .*No Watchdog, try modprobe.*" % node)
  937. w = self.create_watch(pats, 60)
  938. w.setwatch()
  939. self.CM.StartaCM(node)
  940. if not w.lookforall():
  941. return self.failure("Patterns not found: " + repr(w.unmatched))
  942. else:
  943. return self.success()
  944. ###################################################################
  945. class NoWDOnCorosyncStop(CoroTest):
  946. '''Configure WD then /etc/init.d/corosync stop
  947. must stay up for > 60 secs
  948. '''
  949. def __init__(self, cm):
  950. CoroTest.__init__(self,cm)
  951. self.name="NoWDOnCorosyncStop"
  952. self.need_all_up = False
  953. def __call__(self, node):
  954. '''Perform the test. '''
  955. self.incr("calls")
  956. self.CM.StopaCM(node)
  957. self.CM.rsh(node, 'modprobe softdog')
  958. self.CM.StartaCM(node)
  959. pats = []
  960. pats.append("%s .*Unexpected close, not stopping watchdog.*" % node)
  961. w = self.create_watch(pats, 60)
  962. w.setwatch()
  963. self.CM.StopaCM(node)
  964. if w.lookforall():
  965. return self.failure("Should have closed the WD better: " + repr(w.matched))
  966. else:
  967. return self.success()
  968. ###################################################################
  969. class WDOnForkBomb(CoroTest):
  970. '''Configure memory resource
  971. run memory leaker / forkbomb
  972. confirm watchdog action
  973. '''
  974. def __init__(self, cm):
  975. CoroTest.__init__(self,cm)
  976. self.name="WDOnForkBomb"
  977. self.need_all_up = False
  978. self.config['logging/logger_subsys[2]/subsys'] = 'WD'
  979. self.config['logging/logger_subsys[2]/debug'] = 'on'
  980. self.config['resources/system/memory_used/recovery'] = 'watchdog'
  981. self.config['resources/system/memory_used/max'] = '80'
  982. self.config['resources/system/memory_used/poll_period'] = '800'
  983. def __call__(self, node):
  984. '''Perform the test. '''
  985. self.incr("calls")
  986. # get the uptime
  987. up_before = self.CM.rsh(node, 'cut -d. -f1 /proc/uptime', 1).rstrip()
  988. self.CM.StopaCM(node)
  989. self.CM.rsh(node, 'modprobe softdog')
  990. self.CM.StartaCM(node)
  991. self.CM.rsh(node, ':(){ :|:& };:', synchronous=0)
  992. self.CM.log("wait for it to watchdog")
  993. time.sleep(60 * 5)
  994. ping_able = False
  995. while not ping_able:
  996. if self.CM.rsh("localhost", "ping -nq -c10 -w10 %s" % node) == 0:
  997. ping_able = True
  998. self.CM.log("can ping 10 in 10secs.")
  999. else:
  1000. self.CM.log("not yet responding to pings.")
  1001. self.CM.ShouldBeStatus[node] = "down"
  1002. # wait for the node to come back up
  1003. self.CM.log("waiting for node to come back up.")
  1004. if self.CM.ns.WaitForNodeToComeUp(node):
  1005. up_after = self.CM.rsh(node, 'cut -d. -f1 /proc/uptime', 1).rstrip()
  1006. if int(up_after) < int(up_before):
  1007. return self.success()
  1008. else:
  1009. return self.failure("node didn't seem to watchdog uptime 1 %s; 2 %s" %(up_before, up_after))
  1010. else:
  1011. return self.failure("node didn't seem to come back up")
  1012. ###################################################################
  1013. class SamWdIntegration1(CoroTest):
  1014. '''start sam hc
  1015. kill agent
  1016. confirm action
  1017. '''
  1018. def __init__(self, cm):
  1019. CoroTest.__init__(self,cm)
  1020. self.name="SamWdIntegration1"
  1021. self.need_all_up = True
  1022. self.config['logging/logger_subsys[2]/subsys'] = 'WD'
  1023. self.config['logging/logger_subsys[2]/debug'] = 'on'
  1024. def __call__(self, node):
  1025. '''Perform the test. '''
  1026. self.incr("calls")
  1027. self.CM.sam_agent[node].setup_hc()
  1028. pids = self.CM.sam_agent[node].getpid().rstrip().split(" ")
  1029. pats = []
  1030. for pid in pids:
  1031. pats.append('%s .*resource "%s" failed!' % (node, pid))
  1032. w = self.create_watch(pats, 60)
  1033. w.setwatch()
  1034. self.CM.sam_agent[node].kill()
  1035. look_result = w.look()
  1036. if not look_result:
  1037. return self.failure("Patterns not found: " + repr(w.regexes))
  1038. else:
  1039. return self.success()
  1040. ###################################################################
  1041. class SamWdIntegration2(CoroTest):
  1042. '''start sam hc
  1043. call sam_stop()
  1044. confirm resource "stopped" and no watchdog action.
  1045. '''
  1046. def __init__(self, cm):
  1047. CoroTest.__init__(self,cm)
  1048. self.name="SamWdIntegration2"
  1049. self.need_all_up = True
  1050. self.config['logging/logger_subsys[2]/subsys'] = 'WD'
  1051. self.config['logging/logger_subsys[2]/debug'] = 'on'
  1052. def __call__(self, node):
  1053. '''Perform the test. '''
  1054. self.incr("calls")
  1055. self.CM.sam_agent[node].setup_hc()
  1056. pids = self.CM.sam_agent[node].getpid().rstrip().split(" ")
  1057. no_pats = []
  1058. yes_pats = []
  1059. for pid in pids:
  1060. no_pats.append('%s .*resource "%s" failed!' % (node, pid))
  1061. yes_pats.append('%s .*Fsm:%s event "config_changed", state "running" --> "stopped"' % (node, pid))
  1062. yes_w = self.create_watch(yes_pats, 10)
  1063. no_w = self.create_watch(no_pats, 10)
  1064. yes_w.setwatch()
  1065. no_w.setwatch()
  1066. time.sleep(2)
  1067. self.CM.sam_agent[node].sam_stop()
  1068. yes_matched = yes_w.look()
  1069. no_matched = no_w.look()
  1070. if no_matched:
  1071. return self.failure("Patterns found: " + repr(no_matched))
  1072. else:
  1073. if not yes_matched:
  1074. return self.failure("Patterns NOT found: " + repr(yes_w.regexes))
  1075. return self.success()
  1076. ###################################################################
  1077. class WdDeleteResource(CoroTest):
  1078. '''config resource & start corosync
  1079. check that it is getting checked
  1080. delete the object resource object
  1081. check that we do NOT get watchdog'ed
  1082. '''
  1083. def __init__(self, cm):
  1084. CoroTest.__init__(self,cm)
  1085. self.name="WdDeleteResource"
  1086. self.need_all_up = True
  1087. self.config['logging/logger_subsys[2]/subsys'] = 'MON'
  1088. self.config['logging/logger_subsys[2]/debug'] = 'on'
  1089. self.config['logging/logger_subsys[3]/subsys'] = 'WD'
  1090. self.config['logging/logger_subsys[3]/debug'] = 'on'
  1091. self.config['resources/system/memory_used/recovery'] = 'watchdog'
  1092. self.config['resources/system/memory_used/max'] = '80'
  1093. self.config['resources/system/memory_used/poll_period'] = '800'
  1094. def __call__(self, node):
  1095. '''Perform the test. '''
  1096. self.incr("calls")
  1097. no_pats = []
  1098. yes_pats = []
  1099. no_pats.append('%s .*resource "memory_used" failed!' % node)
  1100. yes_pats.append('%s .*resource "memory_used" deleted from cmap!' % node)
  1101. yes_w = self.create_watch(yes_pats, 10)
  1102. no_w = self.create_watch(no_pats, 10)
  1103. yes_w.setwatch()
  1104. no_w.setwatch()
  1105. time.sleep(2)
  1106. self.CM.rsh(node, 'corosync-cmapctl -D resources.system.memory_used')
  1107. yes_matched = yes_w.look()
  1108. no_matched = no_w.look()
  1109. if no_matched:
  1110. return self.failure("Patterns found: " + repr(no_matched))
  1111. else:
  1112. if not yes_matched:
  1113. return self.failure("Patterns NOT found: " + repr(yes_w.regexes))
  1114. return self.success()
  1115. ###################################################################
  1116. class ResourcePollAdjust(CoroTest):
  1117. '''config resource & start corosync
  1118. change the poll_period
  1119. check that we do NOT get watchdog'ed
  1120. '''
  1121. def __init__(self, cm):
  1122. CoroTest.__init__(self,cm)
  1123. self.name="ResourcePollAdjust"
  1124. self.need_all_up = True
  1125. self.config['logging/logger_subsys[2]/subsys'] = 'MON'
  1126. self.config['logging/logger_subsys[2]/debug'] = 'on'
  1127. self.config['logging/logger_subsys[3]/subsys'] = 'WD'
  1128. self.config['logging/logger_subsys[3]/debug'] = 'on'
  1129. self.config['resources/system/memory_used/recovery'] = 'none'
  1130. self.config['resources/system/memory_used/max'] = '80'
  1131. self.config['resources/system/memory_used/poll_period'] = '800'
  1132. def __call__(self, node):
  1133. '''Perform the test. '''
  1134. self.incr("calls")
  1135. no_pats = []
  1136. no_pats.append('%s .*resource "memory_used" failed!' % node)
  1137. no_pats.append('%s .*Could NOT use poll_period.*' % node)
  1138. no_w = self.create_watch(no_pats, 10)
  1139. no_w.setwatch()
  1140. changes = 0
  1141. while changes < 50:
  1142. changes = changes + 1
  1143. poll_period = int(random.random() * 5000)
  1144. if poll_period < 500:
  1145. poll_period = 500
  1146. self.CM.log("setting poll_period to: %d" % poll_period)
  1147. self.CM.rsh(node, 'corosync-cmapctl -s resources.system.memory_used.poll_period str %d' % poll_period)
  1148. sleep_time = poll_period * 2 / 1000
  1149. if sleep_time < 1:
  1150. sleep_time = 1
  1151. time.sleep(sleep_time)
  1152. no_matched = no_w.look()
  1153. if no_matched:
  1154. return self.failure("Patterns found: " + repr(no_matched))
  1155. return self.success()
  1156. ###################################################################
  1157. class RebootOnHighMem(CoroTest):
  1158. '''Configure memory resource
  1159. run memory leaker / forkbomb
  1160. confirm reboot action
  1161. '''
  1162. def __init__(self, cm):
  1163. CoroTest.__init__(self,cm)
  1164. self.name="RebootOnHighMem"
  1165. self.need_all_up = True
  1166. self.config['logging/logger_subsys[2]/subsys'] = 'WD'
  1167. self.config['logging/logger_subsys[2]/debug'] = 'on'
  1168. self.config['resources/system/memory_used/recovery'] = 'reboot'
  1169. self.config['resources/system/memory_used/max'] = '80'
  1170. self.config['resources/system/memory_used/poll_period'] = '800'
  1171. def __call__(self, node):
  1172. '''Perform the test. '''
  1173. self.incr("calls")
  1174. # get the uptime
  1175. up_before = self.CM.rsh(node, 'cut -d. -f1 /proc/uptime', 1).rstrip()
  1176. cmd = 'corosync-cmapctl resources.system.memory_used. | grep current | cut -d= -f2'
  1177. mem_current_str = self.CM.rsh(node, cmd, 1).rstrip()
  1178. mem_new_max = int(mem_current_str) + 5
  1179. self.CM.log("current mem usage: %s, new max:%d" % (mem_current_str, mem_new_max))
  1180. cmd = 'corosync-cmapctl -s resources.system.memory_used.max str ' + str(mem_new_max)
  1181. self.CM.rsh(node, cmd)
  1182. self.CM.rsh(node, 'memhog -r10000 200m', synchronous=0)
  1183. self.CM.log("wait for it to reboot")
  1184. time.sleep(60 * 3)
  1185. cmd = 'corosync-cmapctl resources.system.memory_used. | grep current | cut -d= -f2'
  1186. mem_current_str = self.CM.rsh(node, cmd, 1).rstrip()
  1187. self.CM.log("current mem usage: %s" % (mem_current_str))
  1188. ping_able = False
  1189. while not ping_able:
  1190. if self.CM.rsh("localhost", "ping -nq -c10 -w10 %s" % node) == 0:
  1191. ping_able = True
  1192. self.CM.log("can ping 10 in 10secs.")
  1193. else:
  1194. self.CM.log("not yet responding to pings.")
  1195. self.CM.ShouldBeStatus[node] = "down"
  1196. # wait for the node to come back up
  1197. self.CM.log("waiting for node to come back up.")
  1198. if self.CM.ns.WaitForNodeToComeUp(node):
  1199. up_after = self.CM.rsh(node, 'cut -d. -f1 /proc/uptime', 1).rstrip()
  1200. if int(up_after) < int(up_before):
  1201. return self.success()
  1202. else:
  1203. return self.failure("node didn't seem to watchdog uptime 1 %s; 2 %s" %(up_before, up_after))
  1204. else:
  1205. return self.failure("node didn't seem to come back up")
  1206. GenTestClasses = []
  1207. GenTestClasses.append(GenSimulStart)
  1208. GenTestClasses.append(GenSimulStop)
  1209. GenTestClasses.append(GenFlipTest)
  1210. GenTestClasses.append(GenRestartTest)
  1211. GenTestClasses.append(GenStartOnebyOne)
  1212. GenTestClasses.append(GenStopOnebyOne)
  1213. GenTestClasses.append(GenRestartOnebyOne)
  1214. GenTestClasses.append(GenStopAllBeekhof)
  1215. GenTestClasses.append(CpgMsgOrderBasic)
  1216. GenTestClasses.append(CpgMsgOrderZcb)
  1217. GenTestClasses.append(CpgCfgChgOnExecCrash)
  1218. GenTestClasses.append(CpgCfgChgOnGroupLeave)
  1219. GenTestClasses.append(CpgCfgChgOnNodeLeave)
  1220. GenTestClasses.append(CpgCfgChgOnNodeIsolate)
  1221. #GenTestClasses.append(CpgCfgChgOnNodeRestart)
  1222. AllTestClasses = []
  1223. AllTestClasses.append(CpgContextTest)
  1224. AllTestClasses.append(SamTest1)
  1225. AllTestClasses.append(SamTest2)
  1226. AllTestClasses.append(SamTest4)
  1227. AllTestClasses.append(SamTest5)
  1228. AllTestClasses.append(SamTest6)
  1229. AllTestClasses.append(SamTest8)
  1230. AllTestClasses.append(SamTest9)
  1231. AllTestClasses.append(SamWdIntegration1)
  1232. AllTestClasses.append(SamWdIntegration2)
  1233. AllTestClasses.append(NoWDConfig)
  1234. AllTestClasses.append(WDConfigNoWd)
  1235. AllTestClasses.append(NoWDOnCorosyncStop)
  1236. #AllTestClasses.append(WDOnForkBomb)
  1237. AllTestClasses.append(WdDeleteResource)
  1238. #AllTestClasses.append(RebootOnHighMem)
  1239. AllTestClasses.append(ResourcePollAdjust)
  1240. AllTestClasses.append(MemLeakObject)
  1241. AllTestClasses.append(MemLeakSession)
  1242. #AllTestClasses.append(CMapDispatchDeadlock)
  1243. # quorum tests
  1244. AllTestClasses.append(VoteQuorumContextTest)
  1245. GenTestClasses.append(VoteQuorumGoDown)
  1246. GenTestClasses.append(VoteQuorumGoUp)
  1247. GenTestClasses.append(VoteQuorumWaitForAll)
  1248. # FIXME need log messages in sync
  1249. #GenTestClasses.append(CpgCfgChgOnLowestNodeJoin)
  1250. class ConfigContainer(UserDict):
  1251. def __init__ (self, name):
  1252. self.name = name
  1253. UserDict.__init__(self)
  1254. def CoroTestList(cm, audits):
  1255. result = []
  1256. configs = []
  1257. for testclass in AllTestClasses:
  1258. bound_test = testclass(cm)
  1259. if bound_test.is_applicable():
  1260. bound_test.Audits = audits
  1261. result.append(bound_test)
  1262. default = ConfigContainer('default')
  1263. default['logging/fileline'] = 'on'
  1264. default['logging/function_name'] = 'off'
  1265. default['logging/logfile_priority'] = 'info'
  1266. default['logging/syslog_priority'] = 'info'
  1267. default['logging/syslog_facility'] = 'daemon'
  1268. default['uidgid/uid'] = '0'
  1269. default['uidgid/gid'] = '0'
  1270. configs.append(default)
  1271. a = ConfigContainer('none_5min')
  1272. a['totem/token'] = (5 * 60 * 1000)
  1273. a['totem/consensus'] = int(5 * 60 * 1000 * 1.2) + 1
  1274. configs.append(a)
  1275. b = ConfigContainer('pcmk_basic')
  1276. b['totem/token'] = 5000
  1277. b['totem/token_retransmits_before_loss_const'] = 10
  1278. b['totem/join'] = 1000
  1279. b['totem/consensus'] = 7500
  1280. configs.append(b)
  1281. c = ConfigContainer('pcmk_sec_nss')
  1282. c['totem/secauth'] = 'on'
  1283. c['totem/crypto_accept'] = 'new'
  1284. c['totem/crypto_type'] = 'nss'
  1285. c['totem/token'] = 5000
  1286. c['totem/token_retransmits_before_loss_const'] = 10
  1287. c['totem/join'] = 1000
  1288. c['totem/consensus'] = 7500
  1289. configs.append(c)
  1290. #
  1291. # s = ConfigContainer('pcmk_vq')
  1292. # s['quorum/provider'] = 'corosync_votequorum'
  1293. # s['quorum/expected_votes'] = len(cm.Env["nodes"])
  1294. # s['totem/token'] = 5000
  1295. # s['totem/token_retransmits_before_loss_const'] = 10
  1296. # s['totem/join'] = 1000
  1297. # s['totem/vsftype'] = 'none'
  1298. # s['totem/consensus'] = 7500
  1299. # s['totem/max_messages'] = 20
  1300. # configs.append(s)
  1301. #
  1302. d = ConfigContainer('sec_sober')
  1303. d['totem/secauth'] = 'on'
  1304. d['totem/crypto_type'] = 'sober'
  1305. configs.append(d)
  1306. if not cm.Env["RrpBindAddr"] is None:
  1307. g = ConfigContainer('rrp_passive')
  1308. g['totem/rrp_mode'] = 'passive'
  1309. g['totem/interface[2]/ringnumber'] = '1'
  1310. g['totem/interface[2]/bindnetaddr'] = cm.Env["RrpBindAddr"]
  1311. g['totem/interface[2]/mcastaddr'] = '226.94.1.2'
  1312. g['totem/interface[2]/mcastport'] = '5405'
  1313. configs.append(g)
  1314. h = ConfigContainer('rrp_active')
  1315. h['totem/rrp_mode'] = 'active'
  1316. h['totem/interface[2]/ringnumber'] = '1'
  1317. h['totem/interface[2]/bindnetaddr'] = cm.Env["RrpBindAddr"]
  1318. h['totem/interface[2]/mcastaddr'] = '226.94.1.2'
  1319. h['totem/interface[2]/mcastport'] = '5405'
  1320. configs.append(h)
  1321. else:
  1322. print 'Not including rrp tests. Use --rrp-binaddr to enable them.'
  1323. num=1
  1324. for cfg in configs:
  1325. for testclass in GenTestClasses:
  1326. bound_test = testclass(cm)
  1327. if bound_test.is_applicable() and bound_test.config_valid(cfg):
  1328. bound_test.Audits = audits
  1329. for c in cfg.keys():
  1330. bound_test.config[c] = cfg[c]
  1331. bound_test.name = bound_test.name + '_' + cfg.name
  1332. result.append(bound_test)
  1333. num = num + 1
  1334. return result