client_supervisor.c 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346
  1. /*
  2. * Copyright (c) 2019 Clementine Computing LLC.
  3. *
  4. * This file is part of PopuFare.
  5. *
  6. * PopuFare is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU Affero General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * PopuFare is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU Affero General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Affero General Public License
  17. * along with PopuFare. If not, see <https://www.gnu.org/licenses/>.
  18. *
  19. */
  20. #include <linux/limits.h>
  21. #include <sys/types.h>
  22. #include <sys/wait.h>
  23. #include <sysexits.h>
  24. #include <stdio.h>
  25. #include <stdlib.h>
  26. #include <unistd.h>
  27. #include <signal.h>
  28. #include <string.h>
  29. #include <strings.h>
  30. #include <errno.h>
  31. #include <time.h>
  32. #include <poll.h>
  33. #include <syslog.h>
  34. #include <stdarg.h>
  35. #include "../common/common_defs.h"
  36. #include "../commhub/commhub.h"
  37. #include "../commhub/client_utils.h"
  // How many modules are being tracked; handle_command_line() counts them.
  //
  int NUM_MODULES = 0;

  // Descriptor of our comm hub connection; negative whenever we are not connected.
  //
  int commhub_fd = -1;

  // Raised by the SIGCHLD handler and lowered again by reap_zombies().
  //
  volatile sig_atomic_t zombie_alert = 0;

  // Raised by the SIGHUP handler; cleared once a HUP has been delivered to the client modules.
  //
  volatile sig_atomic_t hup_alert = 0;

  // time() of the most recent process creation, for any module.
  //
  int last_global_spawn = 0;

  // time() of the most recent system status check.
  //
  int last_system_stat = 0;

  // Free system memory in KB, as last sampled by monitor_system_status().
  //
  int system_stat_freemem = 0;

  // Number of processes in the system, as last sampled by monitor_system_status().
  //
  int system_stat_numproc = 0;

  // System load average as last sampled by monitor_system_status(). Originally described
  // as the 5 minute figure; note that the sampler parses the first field of /proc/loadavg.
  //
  float system_stat_loadavg = 0;

  // Index of the error logging module, i.e. the module whose file name matches
  // SUPERVISOR_SPECIAL_BILLDB; -1 when no such module was given.
  //
  int error_logging_module = -1;
  // Escalation stages for a module that is not answering its PINGs.
  // enforce_ping_policy() walks a module's ping_punish value through these in order.
  //
  #define PING_PUNISH_NONE (0)
  #define PING_PUNISH_WARN (1)
  #define PING_PUNISH_TERM (2)
  #define PING_PUNISH_KILL (3)

  // Supervision state kept for one module. module_status[] holds one record per
  // module and is indexed the same way as module_name[].
  //
  struct module_record {
      // time() of the last start of this module; expected_exit() zeroes it to mark
      // the module as "clean" for respawn purposes
      //
      time_t last_start;

      // time() recorded by unexpected_exit() when the process was reaped
      // (expected_exit() zeroes it)
      //
      time_t last_exit;

      // Nonzero: the module is subject to PING monitoring.
      // Zero: the module is immune to PING monitoring.
      //
      int ping_requested;

      // Nonzero while the module has a running process.
      // When this is zero, every field below it is invalid.
      //
      int active;

      //-------------------------

      // Nonzero when we expect the process to exit (so its exit is not an error)
      //
      int exit_expected;

      // Process ID of the child running this module
      //
      pid_t pid;

      // Our end of the child's standard input (write here to feed the child)
      //
      int stdin_fd;

      // Our end of the child's standard output (read here to collect its output)
      //
      int stdout_fd;

      // Our end of the child's standard error (read here to collect its error output)
      //
      int stderr_fd;

      // Microsecond timestamp of the last PING sent to this process
      //
      long long int last_ping;

      // Microsecond timestamp of the last PONG received from this process
      //
      long long int last_pong;

      // Most recent PING->PONG lag measured for this process
      //
      long long int ping_lag;

      // Current position in the PING non-response escalation (a PING_PUNISH_* value)
      //
      int ping_punish;

  // The array holding the state of every module
  //
  } module_status[SUPERVISOR_MAX_MODULES] = {{0}};
  118. // This array contains pointers to the names of each module
  119. //
  120. char *module_name[SUPERVISOR_MAX_MODULES] = {0};
  121. #define PIPE_R (0)
  122. #define PIPE_W (1)
  123. #define STDIN_H (0)
  124. #define STDOUT_H (1)
  125. #define STDERR_H (2)
  // Log a printf-style message at DEBUG level.
  //
  // When SUPERVISOR_LOG_DEBUG_TO_BILLDB is defined and we have both a comm hub
  // connection and a live error logging module, the message is sent to
  // MAILBOX_BILLING_LOG; if that is not possible (or the send fails) it goes to
  // syslog instead.
  //
  // fmt - printf-style format string, followed by its arguments
  //
  // Always returns 0.
  //
  int route_debug_message(char *fmt, ...) {
      va_list ap;

  #ifdef SUPERVISOR_LOG_DEBUG_TO_BILLDB
      // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
      //
      if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
          struct message_record outgoing_msg;
          char payload[MAX_PAYLOAD_LENGTH] = {0};
          int len = 0;
          int room;
          int written;

          payload[len++] = LOGLEVEL_DEBUG;
          len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);

          room = MAX_PAYLOAD_LENGTH - len;

          if (room > 0) {
              va_start(ap, fmt);
              written = vsnprintf(payload + len, room, fmt, ap);
              va_end(ap);

              // vsnprintf() reports the length the full text WOULD have had, so on
              // truncation only count the bytes that actually fit in the buffer.
              //
              if (written >= room) {
                  written = room - 1;
              }

              // A negative return is an encoding error; add nothing in that case.
              //
              if (written > 0) {
                  len += written;
              }
          }

          prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);

          if (send_message(commhub_fd, &outgoing_msg) == 0) {
              return 0;
          }
      }
  #endif

      // IF either that didn't work, or the conditions were not met, send it to syslog...
      //
      va_start(ap, fmt);
      vsyslog(LOG_DEBUG, fmt, ap);
      va_end(ap);

      return 0;
  }
  // Log a printf-style message at WARNING level.
  //
  // When SUPERVISOR_LOG_WARNING_TO_BILLDB is defined and we have both a comm hub
  // connection and a live error logging module, the message is sent to
  // MAILBOX_BILLING_LOG; if that is not possible (or the send fails) it goes to
  // syslog instead.
  //
  // fmt - printf-style format string, followed by its arguments
  //
  // Always returns 0.
  //
  int route_warning_message(char *fmt, ...) {
      va_list ap;

  #ifdef SUPERVISOR_LOG_WARNING_TO_BILLDB
      // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
      //
      if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
          struct message_record outgoing_msg;
          char payload[MAX_PAYLOAD_LENGTH] = {0};
          int len = 0;
          int room;
          int written;

          payload[len++] = LOGLEVEL_WARN;
          len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);

          room = MAX_PAYLOAD_LENGTH - len;

          if (room > 0) {
              va_start(ap, fmt);
              written = vsnprintf(payload + len, room, fmt, ap);
              va_end(ap);

              // vsnprintf() reports the length the full text WOULD have had, so on
              // truncation only count the bytes that actually fit in the buffer.
              //
              if (written >= room) {
                  written = room - 1;
              }

              // A negative return is an encoding error; add nothing in that case.
              //
              if (written > 0) {
                  len += written;
              }
          }

          prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);

          if (send_message(commhub_fd, &outgoing_msg) == 0) {
              return 0;
          }
      }
  #endif

      // IF either that didn't work, or the conditions were not met, send it to syslog...
      //
      va_start(ap, fmt);
      vsyslog(LOG_WARNING, fmt, ap);
      va_end(ap);

      return 0;
  }
  // Log a printf-style message at ERROR level.
  //
  // When SUPERVISOR_LOG_ERROR_TO_BILLDB is defined and we have both a comm hub
  // connection and a live error logging module, the message is sent to
  // MAILBOX_BILLING_LOG; if that is not possible (or the send fails) it goes to
  // syslog instead.
  //
  // fmt - printf-style format string, followed by its arguments
  //
  // Always returns 0.
  //
  int route_error_message(char *fmt, ...) {
      va_list ap;

  #ifdef SUPERVISOR_LOG_ERROR_TO_BILLDB
      // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
      //
      if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
          struct message_record outgoing_msg;
          char payload[MAX_PAYLOAD_LENGTH] = {0};
          int len = 0;
          int room;
          int written;

          payload[len++] = LOGLEVEL_ERROR;
          len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);

          room = MAX_PAYLOAD_LENGTH - len;

          if (room > 0) {
              va_start(ap, fmt);
              written = vsnprintf(payload + len, room, fmt, ap);
              va_end(ap);

              // vsnprintf() reports the length the full text WOULD have had, so on
              // truncation only count the bytes that actually fit in the buffer.
              //
              if (written >= room) {
                  written = room - 1;
              }

              // A negative return is an encoding error; add nothing in that case.
              //
              if (written > 0) {
                  len += written;
              }
          }

          prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);

          if (send_message(commhub_fd, &outgoing_msg) == 0) {
              return 0;
          }
      }
  #endif

      // IF either that didn't work, or the conditions were not met, send it to syslog...
      //
      va_start(ap, fmt);
      vsyslog(LOG_ERR, fmt, ap);
      va_end(ap);

      return 0;
  }
  220. void warn_num_proc() {
  221. route_warning_message("System has %d processes running!\n", system_stat_numproc);
  222. }
  223. void warn_load_avg() {
  224. route_warning_message("System load average is %.1f!\n", system_stat_loadavg);
  225. }
  226. void warn_mem_free() {
  227. route_warning_message("System is down to %d KB of free RAM\n", system_stat_freemem);
  228. }
  229. void monitor_system_status() {
  230. int retval;
  231. int kb_free = 0;
  232. int n_procs = 0;
  233. float la5 = 0;
  234. char line[LINE_BUFFER_SIZE] = {0};
  235. FILE *f;
  236. f = fopen("/proc/loadavg", "rb");
  237. if (f) {
  238. fgets(line, sizeof(line), f);
  239. retval = sscanf(line, "%f %*f %*f %*i/%i", &la5, &n_procs);
  240. if (retval == 2) {
  241. system_stat_loadavg = la5;
  242. system_stat_numproc = n_procs;
  243. }
  244. fclose(f);
  245. }
  246. else {
  247. route_error_message("Cannot read /proc/loadavg\n");
  248. }
  249. f = fopen("/proc/meminfo", "rb");
  250. if (f) {
  251. while(!feof(f)) {
  252. fgets(line, sizeof(line), f);
  253. retval = sscanf(line, "MemFree: %i kB\n", &kb_free);
  254. if (retval == 1) {
  255. system_stat_freemem = kb_free;
  256. break;
  257. }
  258. }
  259. fclose(f);
  260. }
  261. else {
  262. route_error_message("Cannot read /proc/meminfo\n");
  263. }
  264. if (system_stat_numproc > SUPERVISOR_WARN_NUM_PROC) {
  265. warn_num_proc();
  266. }
  267. if (system_stat_loadavg > SUPERVISOR_WARN_LOAD_AVG) {
  268. warn_load_avg();
  269. }
  270. if (system_stat_freemem < SUPERVISOR_WARN_LOW_MEM) {
  271. warn_mem_free();
  272. }
  273. }
  // Emit a "heartbeat" line through the debug log route.
  //
  void send_heartbeat() {
      (void)route_debug_message("heartbeat\n");
  }
  277. int launch_module(int n) {
  278. pid_t fork_ret;
  279. int retval;
  280. int my_stdin, my_stdout, my_stderr;
  281. int new_stdin[2], new_stdout[2], new_stderr[2];
  282. int record_confirm_pipe[2];
  283. char record_confirm_msg[4] = {'O','K','\n','\0'};
  284. int record_confirm_expect;
  285. if ( (n < 0) || (n >= NUM_MODULES) ) {
  286. return -1;
  287. }
  288. RESET_WATCHDOG();
  289. // Save our actual standard I/O fd's
  290. //
  291. my_stdin = dup(STDIN_H);
  292. my_stdout = dup(STDOUT_H);
  293. my_stderr = dup(STDERR_H);
  294. // Generate pipes to be said handles for the child process
  295. //
  296. pipe(new_stdin);
  297. pipe(new_stdout);
  298. pipe(new_stderr);
  299. // close our actual handles in preparation for forking
  300. //
  301. close(STDIN_H);
  302. close(STDOUT_H);
  303. close(STDERR_H);
  304. // Assign the "far" end of the pipes to our handles so that the child process will use them
  305. //
  306. dup2(new_stdin[PIPE_R], STDIN_H);
  307. dup2(new_stdout[PIPE_W], STDOUT_H);
  308. dup2(new_stderr[PIPE_W], STDERR_H);
  309. // Create a pipe for record-keeping confirmation purposes (this prevents a race condition with SIGCHLD
  310. //
  311. pipe(record_confirm_pipe);
  312. record_confirm_expect = strlen(record_confirm_msg);
  313. // Ask the kernel to create a new process to hold our new module
  314. //
  315. fork_ret = fork();
  316. // If we are the child process
  317. //
  318. if (fork_ret == 0) {
  319. // We don't need this end of the confirmation pipe
  320. //
  321. close(record_confirm_pipe[PIPE_W]);
  322. // Wait for the parent process to confirm that we have been recorded before proceeding
  323. // with the exec attempt so that if it fails, the parent process will know what to do with the
  324. // SIGCHLD signal.
  325. //
  326. retval = read(record_confirm_pipe[PIPE_R], record_confirm_msg, record_confirm_expect);
  327. // If the parent process can't confirm our existence, we're in deep shit...
  328. //
  329. if (retval != record_confirm_expect) {
  330. fprintf(stderr, "Never got recordkeeping confirmation!\n");
  331. exit(EX_UNAVAILABLE);
  332. }
  333. // We are now done with our confirmation pipe
  334. //
  335. close(record_confirm_pipe[PIPE_R]);
  336. // Close the child's copy of the parent's standard I/O fds
  337. //
  338. close(my_stdin);
  339. close(my_stdout);
  340. close(my_stderr);
  341. // Close the parent's end of the pipes connecting the two processes
  342. //
  343. close(new_stdin[PIPE_W]);
  344. close(new_stdout[PIPE_R]);
  345. close(new_stderr[PIPE_R]);
  346. // Attempt to exec the child process now...
  347. //
  348. retval = execl(module_name[n], module_name[n], NULL);
  349. if (retval)
  350. {
  351. fprintf(stderr, "Cannot execl(\"%s\", \"%s\", NULL)!\n", module_name[n], module_name[n]);
  352. exit(EX_UNAVAILABLE);
  353. }
  354. exit(0);
  355. }
  356. // If we are the parent process
  357. //
  358. else if (fork_ret != -1) {
  359. // Record our child PID
  360. //
  361. module_status[n].pid = fork_ret;
  362. // Record our end of the child's standard file descriptors
  363. //
  364. module_status[n].stdin_fd = new_stdin[PIPE_W];
  365. module_status[n].stdout_fd = new_stdout[PIPE_R];
  366. module_status[n].stderr_fd = new_stderr[PIPE_R];
  367. // Reset our list ping/pong times to "never"
  368. //
  369. module_status[n].last_ping = 0;
  370. module_status[n].last_pong = 0;
  371. module_status[n].ping_lag = 0;
  372. module_status[n].ping_punish = PING_PUNISH_NONE;
  373. // Record our process start time
  374. //
  375. module_status[n].last_start = time(NULL);
  376. // This process is not currently expected to die
  377. //
  378. module_status[n].exit_expected = 0;
  379. // Flag this child module as active
  380. //
  381. module_status[n].active = 1;
  382. // Close the opposite end of all of our pipes
  383. //
  384. close(record_confirm_pipe[PIPE_R]);
  385. close(new_stdin[PIPE_R]);
  386. close(new_stdout[PIPE_W]);
  387. close(new_stderr[PIPE_W]);
  388. // Close our copy of the child's standard I/O file descriptors
  389. //
  390. close(STDIN_H);
  391. close(STDOUT_H);
  392. close(STDERR_H);
  393. // Restore the parent's original standard I/O file descriptors
  394. //
  395. dup2(my_stdin, STDIN_H);
  396. dup2(my_stdout, STDOUT_H);
  397. dup2(my_stderr, STDERR_H);
  398. // Unblock the child process by transmitting our confirmation message
  399. //
  400. retval = write(record_confirm_pipe[PIPE_W], record_confirm_msg, record_confirm_expect);
  401. if (retval != record_confirm_expect) {
  402. route_error_message("Cannot transmit recordkeeping confirmation!\n");
  403. }
  404. // We are now done with our recordkeeping confirmation pipe
  405. //
  406. close(record_confirm_pipe[PIPE_W]);
  407. }
  408. // Otherwise we have somehow FAILED to fork()
  409. //
  410. else {
  411. // restore our stderr file descriptor
  412. //
  413. close(STDERR_H);
  414. dup2(my_stderr, STDERR_H);
  415. route_error_message("Cannot fork()!\n");
  416. exit(EX_OSERR);
  417. }
  418. return 0;
  419. }
  420. void expected_exit(int n) {
  421. route_debug_message("Module %s [%d] has finished as expected.\n", module_name[n], (int)module_status[n].pid);
  422. // flag this module as "clean" for the purpose of respawn
  423. // (so we won't count the short runtime of a process we asked to terminate against it)
  424. //
  425. module_status[n].last_start = 0;
  426. module_status[n].last_exit = 0;
  427. }
  428. void unexpected_exit(int n) {
  429. int age;
  430. // Record our last stop time
  431. //
  432. module_status[n].last_exit = time(NULL);
  433. age = (int)module_status[n].last_exit - (int)module_status[n].last_start;
  434. route_warning_message("Module %s [%d] has stopped running unexpectedly after %d seconds.\n", module_name[n], (int)module_status[n].pid, age);
  435. }
  436. void reap_zombies() {
  437. int status = 0;
  438. pid_t retval = 0;
  439. int i;
  440. do {
  441. // Clear the zombie alert HERE so that if things terminate after the loop finishes,
  442. // we'll get flagged. This could result in an extra flag, but we'll never miss one.
  443. // and since we're using the WNOHANG option, there is no harm in going through the waitpid()
  444. // loop one time extra.
  445. zombie_alert = 0;
  446. retval = waitpid(-1, &status, WNOHANG); // Wait for ANY child process
  447. // If waitpid() actually waited for the specified process...
  448. //
  449. if (retval > 0) {
  450. for (i = 0; i < NUM_MODULES; i++) {
  451. // Go through our table of modules looking for an active module with a
  452. //
  453. // matching PID...
  454. //
  455. if ( module_status[i].active && (module_status[i].pid == retval) ) {
  456. // Give us an opportunity to do some clever cleanup if it is merited
  457. // These functions also set the last_exit time (and may zero the last_start
  458. // to flag a process as "clean" for respawn purposes).
  459. //
  460. if (module_status[i].exit_expected) {
  461. expected_exit(i);
  462. }
  463. else {
  464. unexpected_exit(i);
  465. }
  466. // Set this module inactive
  467. //
  468. module_status[i].active = 0;
  469. // Set this module's last ping/pong times to "never"
  470. //
  471. module_status[i].last_ping = 0;
  472. module_status[i].last_pong = 0;
  473. module_status[i].ping_lag = 0;
  474. module_status[i].ping_punish = PING_PUNISH_NONE;
  475. // No more process
  476. //
  477. module_status[i].pid = 0;
  478. // Close our end of this module's standard I/O file descriptors
  479. //
  480. close(module_status[i].stdin_fd);
  481. close(module_status[i].stdout_fd);
  482. close(module_status[i].stderr_fd);
  483. module_status[i].stdin_fd = -1;
  484. module_status[i].stdout_fd = -1;
  485. module_status[i].stderr_fd = -1;
  486. break;
  487. }
  488. }
  489. }
  490. } while(retval > 0);
  491. }
  492. void sigchld_handler(int signum, siginfo_t *info, void *data) {
  493. zombie_alert = 1;
  494. }
  495. void sighup_handler(int signum, siginfo_t *into, void *data) {
  496. hup_alert = 1;
  497. }
  498. void setup_child_handler() {
  499. struct sigaction sa = {{0}};
  500. sa.sa_sigaction = sigchld_handler;
  501. sa.sa_flags = SA_NOCLDSTOP | SA_SIGINFO;
  502. sigfillset(&sa.sa_mask);
  503. sigaction(SIGCHLD, &sa, NULL);
  504. }
  505. void setup_hup_handler() {
  506. struct sigaction sa = {{0}};
  507. sa.sa_sigaction = sighup_handler;
  508. sa.sa_flags = SA_SIGINFO;
  509. sigfillset(&sa.sa_mask);
  510. sigaction(SIGHUP, &sa, NULL);
  511. }
  512. void monitor_spawn_list() {
  513. int i;
  514. int module_runtime;
  515. time_t now;
  516. for (i = 0; i < NUM_MODULES; i++) {
  517. now = time(NULL);
  518. // If it is too soon to attempt spawning any new process, then we're done for the moment
  519. //
  520. if ( (now - last_global_spawn) < SUPERVISOR_GLOBAL_SPAWN_RATE_LIMIT ) break;
  521. // If this module is still active, we don't need to spawn it...
  522. if (module_status[i].active) continue;
  523. // This number will only be valid if last_start != 0
  524. module_runtime = module_status[i].last_exit - module_status[i].last_start;
  525. // If this module either made an expected termination or has never been launcher
  526. //
  527. if (module_status[i].last_start == 0) {
  528. // then update our last global spawn time
  529. //
  530. last_global_spawn = now;
  531. // and launch it...
  532. //
  533. launch_module(i);
  534. route_debug_message("Spawned module %s, PID = %d\n", module_name[i], (int)module_status[i].pid);
  535. }
  536. // otherwise, if it has run before, but terminated unexpectedly in a very short time
  537. //
  538. else if ( module_runtime < SUPERVISOR_RESPAWN_DELAY_THRESHOLD ) {
  539. // and it has been long enough to try again
  540. //
  541. if ( (now - module_status[i].last_exit) >= SUPERVISOR_RESPAWN_RATE_LIMIT) {
  542. // then update our last global spawn time
  543. //
  544. last_global_spawn = now;
  545. // and launch it...
  546. //
  547. launch_module(i);
  548. route_warning_message("Respawned module %s, PID = %d [rate limited]\n", module_name[i], (int)module_status[i].pid);
  549. }
  550. }
  551. // otherwise, the module has terminated unexpectedly, but it had run for a long time, so we can respawn it immediately
  552. //
  553. else {
  554. // then update our last global spawn time
  555. //
  556. last_global_spawn = now;
  557. // and launch it...
  558. //
  559. launch_module(i);
  560. route_debug_message("Respawned module %s, PID = %d\n", module_name[i], (int)module_status[i].pid);
  561. }
  562. }
  563. }
  564. void handle_dead_fd(int fd) {
  565. int i;
  566. close(fd);
  567. if (fd == commhub_fd) {
  568. commhub_fd = -1;
  569. return;
  570. }
  571. for (i = 0; i < NUM_MODULES; i++) {
  572. if (module_status[i].stdin_fd == fd) {
  573. module_status[i].stdin_fd = -1;
  574. return;
  575. }
  576. if (module_status[i].stdout_fd == fd) {
  577. module_status[i].stdout_fd = -1;
  578. return;
  579. }
  580. if (module_status[i].stderr_fd == fd) {
  581. module_status[i].stderr_fd = -1;
  582. return;
  583. }
  584. }
  585. }
  586. message_callback_return handle_pong_message(struct message_record *msg, void *param) {
  587. int i;
  588. // printf("Got PONG from PID %d\n", (int)msg->header.sender);
  589. // Iterate through all of the modules
  590. //
  591. for (i = 0; i < NUM_MODULES; i++) {
  592. // If this module is active, and its PID matches the PID of the sender of this PONG message
  593. //
  594. if ( module_status[i].active && (msg->header.sender == module_status[i].pid) ) {
  595. module_status[i].last_pong = get_usec_time();
  596. module_status[i].ping_lag = (module_status[i].last_pong - module_status[i].last_ping);
  597. // printf("PING->PONG latency %lld microseconds.\n", module_status[i].ping_lag);
  598. }
  599. }
  600. return MESSAGE_HANDLED_CONT;
  601. }
  602. void clear_ping_attempt() {
  603. int i;
  604. for (i = 0; i < NUM_MODULES; i++) {
  605. if (module_status[i].active) {
  606. module_status[i].last_ping = module_status[i].last_pong = 0;
  607. module_status[i].ping_punish = PING_PUNISH_NONE;
  608. }
  609. }
  610. }
  611. void send_ping_message()
  612. {
  613. struct message_record outgoing_msg;
  614. long long int usec_now;
  615. char target[MAILBOX_NAME_MAX + 1];
  616. int i;
  617. if (commhub_fd < 0) { return; }
  618. for (i = 0; i < NUM_MODULES; i++) {
  619. // If this module is active AND included in PING monitoring
  620. //
  621. if ( module_status[i].active && module_status[i].ping_requested ) {
  622. usec_now = get_usec_time();
  623. // If it is not yet time to re-ping this module
  624. //
  625. if ( (usec_now - module_status[i].last_ping) < SUPERVISOR_PING_INTERVAL) {
  626. // skip it for now
  627. //
  628. continue;
  629. }
  630. // If we have PING'd this host but are still waiting for a PONG back
  631. //
  632. if (module_status[i].last_pong < module_status[i].last_ping) {
  633. // Don't flood it with pings
  634. //
  635. continue;
  636. }
  637. // If we've never PING'd this host, set it's PONG time to the same as its PING
  638. // time so it doesn't get killed for not having responded to its first PING.
  639. //
  640. if (module_status[i].last_ping == 0) {
  641. module_status[i].last_pong = usec_now;
  642. }
  643. module_status[i].last_ping = usec_now;
  644. // Turn our PID into a magic mailbox name that addresses that PID.
  645. //
  646. sprintf(target, ">%d", (int)module_status[i].pid);
  647. // Prepare a message to that magic mailbox with a payload consisting of the mailbox
  648. // name of the PING notification
  649. //
  650. prepare_message(&outgoing_msg, target, MAILBOX_PING, strlen(MAILBOX_PING));
  651. // And send it!
  652. //
  653. send_message(commhub_fd, &outgoing_msg);
  654. }
  655. }
  656. }
  657. void send_hup_message() {
  658. struct message_record outgoing_msg;
  659. if (commhub_fd < 0) {
  660. return;
  661. }
  662. printf("Sending HUP\n");
  663. prepare_message(&outgoing_msg, MAILBOX_HUP, "", 0);
  664. send_message(commhub_fd, &outgoing_msg);
  665. }
  666. void ping_punish_warn(int n, long long int delta) {
  667. route_debug_message("Module %s [%d] hasn't responded to a PING in %d seconds...\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
  668. }
  669. void ping_punish_term(int n, long long int delta) {
  670. route_warning_message("Module %s [%d] hasn't responded to a PING in %d seconds... Sending SIGTERM\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
  671. kill(module_status[n].pid, SIGTERM);
  672. }
  673. void ping_punish_kill(int n, long long int delta) {
  674. route_error_message("Module %s [%d] hasn't responded to a PING in %d seconds... Sending SIGKILL\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
  675. kill(module_status[n].pid, SIGKILL);
  676. }
  677. void enforce_ping_policy()
  678. {
  679. int i;
  680. long long int delta = 0;
  681. for (i = 0; i < NUM_MODULES; i++) {
  682. //If this module is both active AND included in the ping monitoring
  683. //
  684. if ( module_status[i].active && module_status[i].ping_requested ) {
  685. // if we haven't sent a PING to this one...
  686. //
  687. if (module_status[i].last_ping == 0) {
  688. //clear any punishment that would otherwise be pending and get the next module
  689. //
  690. module_status[i].ping_punish = PING_PUNISH_NONE;
  691. continue;
  692. }
  693. //compute just how late this module is...
  694. //
  695. delta = get_usec_time() - module_status[i].last_pong;
  696. switch(module_status[i].ping_punish) {
  697. case PING_PUNISH_NONE:
  698. if (delta > SUPERVISOR_PING_WARN_TIME) {
  699. ping_punish_warn(i, delta);
  700. module_status[i].ping_punish++;
  701. }
  702. break;
  703. case PING_PUNISH_WARN:
  704. if (delta > SUPERVISOR_PING_TERM_TIME) {
  705. ping_punish_term(i, delta);
  706. module_status[i].ping_punish++;
  707. }
  708. break;
  709. case PING_PUNISH_TERM:
  710. if (delta > SUPERVISOR_PING_KILL_TIME) {
  711. ping_punish_kill(i, delta);
  712. module_status[i].ping_punish++;
  713. }
  714. break;
  715. default:
  716. route_error_message("At our wits end with module %s [%d], just plain won't die!\n", module_name[i], (int)module_status[i].pid);
  717. break;
  718. }
  719. }
  720. }
  721. }
  722. void maintain_ipc_hub_connect(char *progname) {
  723. struct message_record outgoing_msg;
  724. // if we have no connection to the communication hub
  725. //
  726. if (commhub_fd < 0) {
  727. // try and get one
  728. //
  729. commhub_fd = connect_to_message_server(progname);
  730. // printf("commhub_fd = %d\n", commhub_fd);
  731. // if it worked
  732. //
  733. if (commhub_fd >= 0) {
  734. //Subscribe to the command mailboxes we act on
  735. //
  736. prepare_message(&outgoing_msg, MAILBOX_SUBSCRIBE, MAILBOX_PONG, strlen(MAILBOX_TOKEN_MAG));
  737. send_message(commhub_fd,&outgoing_msg);
  738. //Subscribe to the relevant status management mailboxes
  739. //
  740. subscribe_to_default_messages(commhub_fd);
  741. }
  742. }
  743. }
  744. void usage(char *progname) {
  745. fprintf(stderr,"%s: %d modules given to be supervised, minimum = 1, maximum = %d\n", progname, NUM_MODULES, SUPERVISOR_MAX_MODULES - 1);
  746. fprintf(stderr,"%s: Usage: %s [module ...] [--[no-]ping] [module ...]\n", progname, progname);
  747. fprintf(stderr,"%s: By default modules will be subjet to PING monitoring through\n", progname);
  748. fprintf(stderr,"%s: the IPC hub, --no-ping will effect all modules following it until a --ping\n", progname);
  749. fprintf(stderr,"\n");
  750. exit(EX_USAGE);
  751. }
  752. void handle_command_line(int argc, char **argv) {
  753. int i;
  754. int needs_ping = 1;
  755. NUM_MODULES = 0;
  756. // Walk all of the given command line arguments and add each one as a module we have been asked to supervice
  757. //
  758. for (i = 1; i < argc; i++) {
  759. if ( !strcmp(argv[i], "--no-ping") ) {
  760. needs_ping = 0;
  761. }
  762. else if ( !strcmp(argv[i], "--ping") ) {
  763. needs_ping = 1;
  764. }
  765. else {
  766. // Here we look for the last / in the module name
  767. //
  768. char *foo = rindex(argv[i], '/');
  769. // if we found one, advance past it
  770. //
  771. if (foo) { foo++; }
  772. // otherwise
  773. //
  774. else {
  775. // use our whole argument
  776. //
  777. foo = argv[i];
  778. }
  779. // make sure it is at the end
  780. //
  781. if ( !strcmp(foo, SUPERVISOR_SPECIAL_BILLDB) ) {
  782. //if it is, this one is our guy!
  783. //
  784. error_logging_module = NUM_MODULES;
  785. }
  786. module_status[NUM_MODULES].ping_requested = needs_ping;
  787. module_name[NUM_MODULES++] = argv[i];
  788. }
  789. if (NUM_MODULES >= SUPERVISOR_MAX_MODULES) {
  790. usage(argv[0]);
  791. }
  792. }
  793. if (NUM_MODULES == 0) {
  794. usage(argv[0]);
  795. }
  796. }
  797. int main(int argc, char **argv) {
  798. int poll_ret;
  799. int nfds;
  800. int i;
  801. struct message_record incoming_msg;
  802. char linebuffer[LINE_BUFFER_SIZE] = {0};
  803. int read_ret;
  804. int modnum;
  805. struct pollfd fds[1 + (2 * SUPERVISOR_MAX_MODULES)] = {{0}};
  806. int poll_to_module[1 + (2 * SUPERVISOR_MAX_MODULES)] = {0};
  807. // Parse our command line to figure out what modules we need to supervise
  808. //
  809. handle_command_line(argc, argv);
  810. // Configure our default signal handlers
  811. //
  812. configure_signal_handlers(argv[0]);
  813. // As well as our module-specific zombie-reaping flag setter
  814. //
  815. setup_child_handler();
  816. // This listens for SIGHUP messages and flags us when one has come it.
  817. //
  818. setup_hup_handler();
  819. // Register our default keep-up-with-system status callbacks
  820. //
  821. register_system_status_callbacks();
  822. // Except we want to exempt ourselves from the MAILBOX_EXIT and
  823. // MAILBOX_PING so as not to start ping storms or kill ourselves...
  824. //
  825. register_dispatch_callback(MAILBOX_EXIT, CALLBACK_PREPROCESS, ignore_message, NULL);
  826. register_dispatch_callback(MAILBOX_PING, CALLBACK_PREPROCESS, ignore_message, NULL);
  827. // Add our specific handlers
  828. //
  829. register_dispatch_callback(MAILBOX_PONG, CALLBACK_USER(1), handle_pong_message, NULL);
  830. // This is the main processing loop which monitors system status and pumps I/O from
  831. // all of the client modules (debug and error messages) and from the IPC hub when it is up
  832. // as indicated by poll().
  833. //
  834. while( exit_request_status == EXIT_REQUEST_NONE ) {
  835. RESET_WATCHDOG();
  836. //-------------- SYSTEM STATE MAINTENANCE CODE HERE
  837. if ( (time(NULL) - last_system_stat) >= SUPERVISOR_SYSTEM_STAT_INTERVAL) {
  838. monitor_system_status();
  839. send_heartbeat();
  840. last_system_stat = time(NULL);
  841. }
  842. // Do any maintenance on our spawn list as needed
  843. //
  844. monitor_spawn_list();
  845. // Try and keep in touch with the IPC hub if possible
  846. //
  847. maintain_ipc_hub_connect(argv[0]);
  848. // If the communications hub is up, we may need to maintain our PING states
  849. //
  850. if (commhub_fd >= 0) {
  851. // Send out any PING messages that are required
  852. //
  853. send_ping_message();
  854. // Deal with any delinquents who don't answer their damn pings
  855. //
  856. enforce_ping_policy();
  857. if (hup_alert) {
  858. hup_alert = 0;
  859. send_hup_message();
  860. }
  861. }
  862. // If there is no connection to the IPC hub
  863. //
  864. else {
  865. // clear any PING attempt records we may have active
  866. //
  867. clear_ping_attempt();
  868. }
  869. // If our SIGCHLD handler has notified us that there is at least 1 zombie to reap
  870. //
  871. if (zombie_alert) {
  872. // Try and reap any zombies that may exist
  873. //
  874. reap_zombies();
  875. }
  876. //-------------- POLL FILE DESCRIPTOR SET POPULATION HERE
  877. //
  878. nfds = 0;
  879. // If we have a valid communication hub connection, add it to the poll set
  880. //
  881. if (commhub_fd >= 0) {
  882. fds[nfds].fd = commhub_fd;
  883. fds[nfds].events = POLLIN;
  884. // but flag it as NOT belonging to a child module
  885. //
  886. poll_to_module[nfds] = -1;
  887. nfds++;
  888. }
  889. // Then iterate through all of our modules and see if they need service
  890. //
  891. for (i = 0; i < NUM_MODULES; i++) {
  892. // If this child module is flagged as active
  893. //
  894. if (module_status[i].active) {
  895. // and we have a stdout file descriptor for this child
  896. //
  897. if (module_status[i].stdout_fd >= 0) {
  898. // add it to the poll set
  899. //
  900. fds[nfds].fd = module_status[i].stdout_fd;
  901. fds[nfds].events = POLLIN;
  902. // and remember which module it goes to
  903. //
  904. poll_to_module[nfds] = i;
  905. nfds++;
  906. }
  907. // if we have a stderr file descriptor for this child
  908. //
  909. if (module_status[i].stderr_fd >= 0) {
  910. // add it to the poll set
  911. //
  912. fds[nfds].fd = module_status[i].stderr_fd;
  913. fds[nfds].events = POLLIN;
  914. // and remember which module it goes to
  915. //
  916. poll_to_module[nfds] = i;
  917. nfds++;
  918. }
  919. }
  920. }
  921. // If we have any file descriptors to poll
  922. //
  923. if (nfds > 0) {
  924. // poll them
  925. //
  926. poll_ret = poll(fds, nfds, POLL_TIMEOUT);
  927. // if poll returns error
  928. //
  929. if (poll_ret < 0) {
  930. // and it's just EINTR
  931. //
  932. if (errno == EINTR) {
  933. // try the body of the while again
  934. //
  935. continue;
  936. }
  937. // if it is some other error
  938. //
  939. else {
  940. // scream bloody murder
  941. //
  942. route_debug_message("Poll returned %d, errno = %d (%s)\n", poll_ret, errno, strerror(errno));
  943. sleep(1);
  944. }
  945. }
  946. }
  947. // If we have zero functional file descriptors
  948. //
  949. else {
  950. // Pretend that we called poll and it timed out
  951. //
  952. sleep(1);
  953. poll_ret = 0;
  954. }
  955. //-------------- I/O PUMPING CODE BELOW
  956. // if there are no file descriptors flagged as needing any action
  957. //
  958. if (poll_ret == 0) { continue; }
  959. for (i = 0; i < nfds; i++) {
  960. modnum = poll_to_module[i];
  961. // If we have input and it is from a module...
  962. //
  963. if ( (fds[i].revents & POLLIN) && (modnum >= 0) ) {
  964. read_ret = read(fds[i].fd, linebuffer, sizeof(linebuffer) - 1);
  965. if (read_ret == 0) {
  966. // clean up
  967. // and skip this fd for now
  968. //
  969. handle_dead_fd(fds[i].fd);
  970. continue;
  971. }
  972. else if ( (read_ret < 0) && (errno != EINTR) ) {
  973. // clean up
  974. // and skip this fd for now
  975. //
  976. handle_dead_fd(fds[i].fd);
  977. continue;
  978. }
  979. // Terminate our read...
  980. //
  981. linebuffer[read_ret] = '\0';
  982. if (fds[i].fd == module_status[modnum].stdout_fd) {
  983. //route_debug_message("%s [%d]: %s", module_name[modnum], (int)module_status[modnum].pid, linebuffer);
  984. }
  985. else if (fds[i].fd == module_status[modnum].stderr_fd) {
  986. route_error_message("%s [%d]: %s", module_name[modnum], (int)module_status[modnum].pid, linebuffer);
  987. }
  988. else {
  989. route_debug_message("File descriptor %d does not belong to module %d (%s)!\n", fds[i].fd, modnum, module_name[modnum]);
  990. }
  991. }
  992. // If we have input and it is from the commhub...
  993. //
  994. else if ( (fds[i].revents & POLLIN) && (fds[i].fd == commhub_fd) ) {
  995. read_ret = get_message(commhub_fd, &incoming_msg);
  996. if (read_ret < 0) {
  997. // clean up
  998. // and skip this fd for now
  999. //
  1000. handle_dead_fd(fds[i].fd);
  1001. continue;
  1002. }
  1003. else {
  1004. // This passes the received message through the callback list
  1005. //
  1006. process_message(&incoming_msg);
  1007. }
  1008. }
  1009. // If it looks like we have a closed or invalid descriptor
  1010. //
  1011. if (fds[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
  1012. // clean up
  1013. // and skip this fd for now
  1014. //
  1015. handle_dead_fd(fds[i].fd);
  1016. continue;
  1017. }
  1018. }
  1019. }
  1020. route_debug_message("Attempting graceful exit...\n");
  1021. for (i = 0; i < NUM_MODULES; i++) {
  1022. if (module_status[i].active) {
  1023. route_debug_message("Sending SIGTERM to module %s [%d]\n", module_name[i], module_status[i].pid);
  1024. kill(module_status[i].pid, SIGTERM);
  1025. }
  1026. }
  1027. sleep(1);
  1028. return 0;
  1029. }