pacemaker  1.1.15-e174ec8
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
election.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License as published by the Free Software Foundation; either
7  * version 2 of the License, or (at your option) any later version.
8  *
9  * This software is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this library; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 #include <crm_internal.h>
19 
20 #include <sys/time.h>
21 #include <sys/resource.h>
22 
23 #include <crm/msg_xml.h>
24 #include <crm/common/xml.h>
25 
26 #include <crm/common/mainloop.h>
27 #include <crm/cluster/internal.h>
28 #include <crm/cluster/election.h>
29 #include <crm/crm.h>
30 
/* Length, in seconds, of the window used for election-storm detection and
 * for how long crm_uptime() keeps reusing its last CPU-usage sample. */
#define STORM_INTERVAL 2

/* Intended to be multiplied by the number of nodes; not referenced by the
 * code visible in this file. */
#define STORM_MULTIPLIER 5
33 
34 struct election_s
35 {
36  enum election_result state;
37  guint count;
38  char *name;
39  char *uname;
40  GSourceFunc cb;
41  GHashTable *voted;
42  mainloop_timer_t *timeout; /* When to stop if not everyone casts a vote */
43 };
44 
45 static void election_complete(election_t *e)
46 {
47  crm_info("Election %s complete", e->name);
48  e->state = election_won;
49 
50  if(e->cb) {
51  e->cb(e);
52  }
53 
54  election_reset(e);
55 }
56 
57 static gboolean election_timer_cb(gpointer user_data)
58 {
59  election_t *e = user_data;
60 
61  crm_info("Election %s %p timed out", e->name, e);
62  election_complete(e);
63  return FALSE;
64 }
65 
66 enum election_result
68 {
69  if(e) {
70  return e->state;
71  }
72  return election_error;
73 }
74 
75 election_t *
76 election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
77 {
78  static guint count = 0;
79  election_t *e = calloc(1, sizeof(election_t));
80 
81  if(e != NULL) {
82  if(name) {
83  e->name = crm_strdup_printf("election-%s", name);
84  } else {
85  e->name = crm_strdup_printf("election-%u", count++);
86  }
87 
88  e->cb = cb;
89  e->uname = strdup(uname);
90  e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, election_timer_cb, e);
91  crm_trace("Created %s %p", e->name, e);
92  }
93  return e;
94 }
95 
96 void
98 {
99  if(e && uname && e->voted) {
100  g_hash_table_remove(e->voted, uname);
101  }
102 }
103 
104 void
106 {
107  crm_trace("Resetting election %s", e->name);
108  if(e) {
109  mainloop_timer_stop(e->timeout);
110  }
111  if (e && e->voted) {
112  crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
113  g_hash_table_destroy(e->voted);
114  e->voted = NULL;
115  }
116 }
117 
118 void
120 {
121  if(e) {
122  election_reset(e);
123  crm_trace("Destroying %s", e->name);
124  mainloop_timer_del(e->timeout);
125  free(e->uname);
126  free(e->name);
127  free(e);
128  }
129 }
130 
131 static void
132 election_timeout_start(election_t *e)
133 {
134  if(e) {
135  mainloop_timer_start(e->timeout);
136  }
137 }
138 
139 void
141 {
142  if(e) {
143  mainloop_timer_stop(e->timeout);
144  }
145 }
146 
147 void
149 {
150  if(e) {
151  mainloop_timer_set_period(e->timeout, period);
152  } else {
153  crm_err("No election defined");
154  }
155 }
156 
157 static int
158 crm_uptime(struct timeval *output)
159 {
160  static time_t expires = 0;
161  static struct rusage info;
162 
163  time_t tm_now = time(NULL);
164 
165  if (expires < tm_now) {
166  int rc = 0;
167 
168  info.ru_utime.tv_sec = 0;
169  info.ru_utime.tv_usec = 0;
170  rc = getrusage(RUSAGE_SELF, &info);
171 
172  output->tv_sec = 0;
173  output->tv_usec = 0;
174 
175  if (rc < 0) {
176  crm_perror(LOG_ERR, "Could not calculate the current uptime");
177  expires = 0;
178  return -1;
179  }
180 
181  crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
182  (long)info.ru_utime.tv_usec);
183  }
184 
185  expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */
186  output->tv_sec = info.ru_utime.tv_sec;
187  output->tv_usec = info.ru_utime.tv_usec;
188 
189  return 1;
190 }
191 
/*
 * Compare our age (as reported by crm_uptime()) with a peer's.
 *
 * your_age: the peer's age
 *
 * Returns 1 if we are older, -1 if the peer is older, 0 if both are equal.
 * Seconds are compared first; microseconds only break a tie.
 */
static int
crm_compare_age(struct timeval your_age)
{
    struct timeval our_age;

    crm_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */

    if (our_age.tv_sec > your_age.tv_sec) {
        crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
        return 1;
    } else if (our_age.tv_sec < your_age.tv_sec) {
        crm_debug("Loose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
        return -1;
    } else if (our_age.tv_usec > your_age.tv_usec) {
        /* %06ld: zero-pad the microseconds so the value reads as a decimal fraction */
        crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)",
                  (long)our_age.tv_sec, (long)our_age.tv_usec,
                  (long)your_age.tv_sec, (long)your_age.tv_usec);
        return 1;
    } else if (our_age.tv_usec < your_age.tv_usec) {
        crm_debug("Loose: %ld.%06ld vs %ld.%06ld (usec)",
                  (long)our_age.tv_sec, (long)our_age.tv_usec,
                  (long)your_age.tv_sec, (long)your_age.tv_usec);
        return -1;
    }

    return 0;
}
217 
218 void
220 {
221  struct timeval age;
222  xmlNode *vote = NULL;
223  crm_node_t *our_node;
224 
225  if(e == NULL) {
226  crm_trace("Not voting in election: not initialized");
227  return;
228  }
229 
230  our_node = crm_get_peer(0, e->uname);
231  if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
232  crm_trace("Cannot vote yet: %p", our_node);
233  return;
234  }
235 
236  e->state = election_in_progress;
237  vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
238 
239  e->count++;
240  crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid);
241  crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count);
242 
243  crm_uptime(&age);
244  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec);
245  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_US, age.tv_usec);
246 
247  send_cluster_message(NULL, crm_msg_crmd, vote, TRUE);
248  free_xml(vote);
249 
250  crm_debug("Started election %d", e->count);
251  if (e->voted) {
252  g_hash_table_destroy(e->voted);
253  e->voted = NULL;
254  }
255 
256  election_timeout_start(e);
257  return;
258 }
259 
260 bool
262 {
263  int voted_size = 0;
264  int num_members = crm_active_peers();
265 
266  if(e == NULL) {
267  crm_trace("not initialized");
268  return FALSE;
269  }
270 
271  if (e->voted) {
272  voted_size = g_hash_table_size(e->voted);
273  }
274  /* in the case of #voted > #members, it is better to
275  * wait for the timeout and give the cluster time to
276  * stabilize
277  */
278  if (voted_size >= num_members) {
279  /* we won and everyone has voted */
281  if (voted_size > num_members) {
282  GHashTableIter gIter;
283  const crm_node_t *node;
284  char *key = NULL;
285 
286  g_hash_table_iter_init(&gIter, crm_peer_cache);
287  while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
288  if (crm_is_peer_active(node)) {
289  crm_err("member: %s proc=%.32x", node->uname, node->processes);
290  }
291  }
292 
293  g_hash_table_iter_init(&gIter, e->voted);
294  while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
295  crm_err("voted: %s", key);
296  }
297 
298  }
299 
300  election_complete(e);
301  return TRUE;
302 
303  } else {
304  crm_debug("Still waiting on %d non-votes (%d total)",
305  num_members - voted_size, num_members);
306  }
307 
308  return FALSE;
309 }
310 
311 #define loss_dampen 2 /* in seconds */
312 
313 /* A_ELECTION_COUNT */
314 enum election_result
315 election_count_vote(election_t *e, xmlNode *vote, bool can_win)
316 {
317  int age = 0;
318  int election_id = -1;
319  int log_level = LOG_INFO;
320  gboolean use_born_on = FALSE;
321  gboolean done = FALSE;
322  gboolean we_loose = FALSE;
323  const char *op = NULL;
324  const char *from = NULL;
325  const char *reason = "unknown";
326  const char *election_owner = NULL;
327  crm_node_t *our_node = NULL, *your_node = NULL;
328 
329  static int election_wins = 0;
330 
331  xmlNode *novote = NULL;
332  time_t tm_now = time(NULL);
333  static time_t expires = 0;
334  static time_t last_election_loss = 0;
335 
336  /* if the membership copy is NULL we REALLY shouldn't be voting
337  * the question is how we managed to get here.
338  */
339 
340  CRM_CHECK(vote != NULL, return election_error);
341 
342  if(e == NULL) {
343  crm_info("Not voting in election: not initialized");
344  return election_lost;
345 
346  } else if(crm_peer_cache == NULL) {
347  crm_info("Not voting in election: no peer cache");
348  return election_lost;
349  }
350 
351  op = crm_element_value(vote, F_CRM_TASK);
352  from = crm_element_value(vote, F_CRM_HOST_FROM);
353  election_owner = crm_element_value(vote, F_CRM_ELECTION_OWNER);
354  crm_element_value_int(vote, F_CRM_ELECTION_ID, &election_id);
355 
356  your_node = crm_get_peer(0, from);
357  our_node = crm_get_peer(0, e->uname);
358 
359  if (e->voted == NULL) {
360  crm_debug("Created voted hash");
361  e->voted = g_hash_table_new_full(crm_str_hash, g_str_equal,
363  }
364 
365  if (is_heartbeat_cluster()) {
366  use_born_on = TRUE;
367  } else if (is_classic_ais_cluster()) {
368  use_born_on = TRUE;
369  }
370 
371  if(can_win == FALSE) {
372  reason = "Not eligible";
373  we_loose = TRUE;
374 
375  } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
376  reason = "We are not part of the cluster";
377  log_level = LOG_ERR;
378  we_loose = TRUE;
379 
380  } else if (election_id != e->count && crm_str_eq(our_node->uuid, election_owner, TRUE)) {
381  log_level = LOG_TRACE;
382  reason = "Superseded";
383  done = TRUE;
384 
385  } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
386  /* Possibly we cached the message in the FSA queue at a point that it wasn't */
387  reason = "Peer is not part of our cluster";
388  log_level = LOG_WARNING;
389  done = TRUE;
390 
391  } else if (crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) {
392  char *op_copy = strdup(op);
393  char *uname_copy = strdup(from);
394 
395  CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
396 
397  /* update the list of nodes that have voted */
398  g_hash_table_replace(e->voted, uname_copy, op_copy);
399  reason = "Recorded";
400  done = TRUE;
401 
402  } else {
403  struct timeval your_age;
404  const char *your_version = crm_element_value(vote, F_CRM_VERSION);
405  int tv_sec = 0;
406  int tv_usec = 0;
407 
410 
411  your_age.tv_sec = tv_sec;
412  your_age.tv_usec = tv_usec;
413 
414  age = crm_compare_age(your_age);
415  if (crm_str_eq(from, e->uname, TRUE)) {
416  char *op_copy = strdup(op);
417  char *uname_copy = strdup(from);
418 
419  CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
420 
421  /* update ourselves in the list of nodes that have voted */
422  g_hash_table_replace(e->voted, uname_copy, op_copy);
423  reason = "Recorded";
424  done = TRUE;
425 
426  } else if (compare_version(your_version, CRM_FEATURE_SET) < 0) {
427  reason = "Version";
428  we_loose = TRUE;
429 
430  } else if (compare_version(your_version, CRM_FEATURE_SET) > 0) {
431  reason = "Version";
432 
433  } else if (age < 0) {
434  reason = "Uptime";
435  we_loose = TRUE;
436 
437  } else if (age > 0) {
438  reason = "Uptime";
439 
440  /* TODO: Check for y(our) born < 0 */
441  } else if (use_born_on && your_node->born < our_node->born) {
442  reason = "Born";
443  we_loose = TRUE;
444 
445  } else if (use_born_on && your_node->born > our_node->born) {
446  reason = "Born";
447 
448  } else if (e->uname == NULL) {
449  reason = "Unknown host name";
450  we_loose = TRUE;
451 
452  } else if (strcasecmp(e->uname, from) > 0) {
453  reason = "Host name";
454  we_loose = TRUE;
455 
456  } else {
457  reason = "Host name";
458  CRM_ASSERT(strcasecmp(e->uname, from) < 0);
459 /* can't happen...
460  * } else if(strcasecmp(e->uname, from) == 0) {
461  *
462  */
463  }
464  }
465 
466  if (expires < tm_now) {
467  election_wins = 0;
468  expires = tm_now + STORM_INTERVAL;
469 
470  } else if (done == FALSE && we_loose == FALSE) {
471  int peers = 1 + g_hash_table_size(crm_peer_cache);
472 
473  /* If every node has to vote down every other node, thats N*(N-1) total elections
474  * Allow some leway before _really_ complaining
475  */
476  election_wins++;
477  if (election_wins > (peers * peers)) {
478  crm_warn("Election storm detected: %d elections in %d seconds", election_wins,
480  election_wins = 0;
481  expires = tm_now + STORM_INTERVAL;
482  crm_write_blackbox(0, NULL);
483  }
484  }
485 
486  if (done) {
487  do_crm_log(log_level + 1, "Election %d (current: %d, owner: %s): Processed %s from %s (%s)",
488  election_id, e->count, election_owner, op, from, reason);
489  return e->state;
490 
491  } else if(we_loose == FALSE) {
492  do_crm_log(log_level, "Election %d (owner: %s) pass: %s from %s (%s)",
493  election_id, election_owner, op, from, reason);
494 
495  if (last_election_loss == 0
496  || tm_now - last_election_loss > (time_t) loss_dampen) {
497 
498  last_election_loss = 0;
500 
501  /* Start a new election by voting down this, and other, peers */
502  e->state = election_start;
503  return e->state;
504  }
505 
506  crm_info("Election %d ignore: We already lost an election less than %ds ago (%s)",
507  election_id, loss_dampen, ctime(&last_election_loss));
508  }
509 
510  novote = create_request(CRM_OP_NOVOTE, NULL, from,
512 
513  do_crm_log(log_level, "Election %d (owner: %s) lost: %s from %s (%s)",
514  election_id, election_owner, op, from, reason);
515 
517 
518  crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner);
519  crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id);
520 
521  send_cluster_message(your_node, crm_msg_crmd, novote, TRUE);
522  free_xml(novote);
523 
524  last_election_loss = tm_now;
525  e->state = election_lost;
526  return e->state;
527 }
#define F_CRM_TASK
Definition: msg_xml.h:56
#define LOG_TRACE
Definition: logging.h:29
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:164
void crm_write_blackbox(int nsig, struct qb_log_callsite *callsite)
Definition: logging.c:407
A dumping ground.
void mainloop_timer_start(mainloop_timer_t *t)
Definition: mainloop.c:1184
guint mainloop_timer_set_period(mainloop_timer_t *t, guint period_ms)
Definition: mainloop.c:1202
void mainloop_timer_del(mainloop_timer_t *t)
Definition: mainloop.c:1240
gboolean is_heartbeat_cluster(void)
Definition: cluster.c:645
gboolean crm_is_peer_active(const crm_node_t *node)
Definition: membership.c:291
uint64_t born
Definition: cluster.h:74
char * uuid
Definition: cluster.h:83
#define STORM_INTERVAL
Definition: election.c:31
#define CRM_FEATURE_SET
Definition: crm.h:38
#define F_CRM_HOST_FROM
Definition: msg_xml.h:61
struct mainloop_timer_s mainloop_timer_t
Definition: mainloop.h:37
crm_node_t * crm_get_peer(unsigned int id, const char *uname)
Definition: membership.c:672
void g_hash_destroy_str(gpointer data)
Definition: utils.c:615
void election_timeout_stop(election_t *e)
Definition: election.c:140
#define CRM_OP_NOVOTE
Definition: crm.h:105
guint crm_active_peers(void)
Definition: membership.c:389
void mainloop_timer_stop(mainloop_timer_t *t)
Definition: mainloop.c:1193
#define F_CRM_ELECTION_AGE_S
Definition: msg_xml.h:68
Wrappers for and extensions to glib mainloop.
struct election_s election_t
Definition: election.h:27
enum election_result election_count_vote(election_t *e, xmlNode *vote, bool can_win)
Definition: election.c:315
char uname[MAX_NAME]
Definition: internal.h:53
#define crm_warn(fmt, args...)
Definition: logging.h:249
uint32_t processes
Definition: cluster.h:79
#define crm_debug(fmt, args...)
Definition: logging.h:253
election_result
Definition: election.h:29
void election_vote(election_t *e)
Definition: election.c:219
#define crm_trace(fmt, args...)
Definition: logging.h:254
#define do_crm_log(level, fmt, args...)
Log a message.
Definition: logging.h:129
Wrappers for and extensions to libxml2.
int crm_element_value_int(xmlNode *data, const char *name, int *dest)
Definition: xml.c:4009
const char * crm_element_value(xmlNode *data, const char *name)
Definition: xml.c:5842
void free_xml(xmlNode *child)
Definition: xml.c:2851
gboolean crm_str_eq(const char *a, const char *b, gboolean use_case)
Definition: utils.c:1441
void election_timeout_set_period(election_t *e, guint period)
Definition: election.c:148
election_t * election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
Definition: election.c:76
void election_fini(election_t *e)
Definition: election.c:119
#define CRM_SYSTEM_CRMD
Definition: crm.h:84
#define CRM_OP_VOTE
Definition: crm.h:104
const char * crm_xml_add(xmlNode *node, const char *name, const char *value)
Definition: xml.c:2698
const char * crm_xml_add_int(xmlNode *node, const char *name, int value)
Definition: xml.c:2786
#define F_CRM_ELECTION_AGE_US
Definition: msg_xml.h:69
#define loss_dampen
Definition: election.c:311
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:226
void election_reset(election_t *e)
Definition: election.c:105
#define crm_err(fmt, args...)
Definition: logging.h:248
int compare_version(const char *version1, const char *version2)
Definition: utils.c:536
#define CRM_ASSERT(expr)
Definition: error.h:35
mainloop_timer_t * mainloop_timer_add(const char *name, guint period_ms, bool repeat, GSourceFunc cb, void *userdata)
Definition: mainloop.c:1219
char * uname
Definition: cluster.h:82
bool election_check(election_t *e)
Definition: election.c:261
#define F_CRM_ELECTION_ID
Definition: msg_xml.h:67
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
#define crm_str_hash
Definition: crm.h:198
gboolean send_cluster_message(crm_node_t *node, enum crm_ais_msg_types service, xmlNode *data, gboolean ordered)
Definition: cluster.c:271
#define create_request(task, xml_data, host_to, sys_to, sys_from, uuid_from)
Definition: ipc.h:34
GHashTable * crm_peer_cache
Definition: membership.c:42
#define crm_info(fmt, args...)
Definition: logging.h:251
#define F_CRM_VERSION
Definition: msg_xml.h:63
void election_remove(election_t *e, const char *uname)
Definition: election.c:97
enum election_result election_state(election_t *e)
Definition: election.c:67
gboolean is_classic_ais_cluster(void)
Definition: cluster.c:624
Functions for conducting elections.
#define F_CRM_ELECTION_OWNER
Definition: msg_xml.h:70