client_supervisor.c 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427
  1. /*
  2. * Copyright (c) 2019 Clementine Computing LLC.
  3. *
  4. * This file is part of PopuFare.
  5. *
  6. * PopuFare is free software: you can redistribute it and/or modify
  7. * it under the terms of the GNU Affero General Public License as published by
  8. * the Free Software Foundation, either version 3 of the License, or
  9. * (at your option) any later version.
  10. *
  11. * PopuFare is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. * GNU Affero General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Affero General Public License
  17. * along with PopuFare. If not, see <https://www.gnu.org/licenses/>.
  18. *
  19. */
  20. #include <linux/limits.h>
  21. #include <sys/types.h>
  22. #include <sys/wait.h>
  23. #include <sysexits.h>
  24. #include <stdio.h>
  25. #include <stdlib.h>
  26. #include <unistd.h>
  27. #include <signal.h>
  28. #include <string.h>
  29. #include <strings.h>
  30. #include <errno.h>
  31. #include <time.h>
  32. #include <poll.h>
  33. #include <syslog.h>
  34. #include <stdarg.h>
  35. #include "../common/common_defs.h"
  36. #include "../commhub/commhub.h"
  37. #include "../commhub/client_utils.h"
  // Version string reported by this build of the supervisor
  //
  #define CLIENT_SUPERVISOR_VERSION "2.1.2"

  // How many modules we are currently keeping track of
  //
  int NUM_MODULES = 0;

  // Descriptor of our comm hub connection (negative while we have none)
  //
  int commhub_fd = -1;

  // Raised by the SIGCHLD handler; lowered again by the reaper function
  //
  volatile sig_atomic_t zombie_alert = 0;

  // Raised by the SIGHUP handler; lowered once the HUP has been passed on to the client modules
  //
  volatile sig_atomic_t hup_alert = 0;

  // Time at which we most recently created a new process
  //
  int last_global_spawn = 0;

  // Time at which we most recently checked the system status
  //
  int last_system_stat = 0;

  // Free memory in the system, in KB, as last parsed from /proc/meminfo
  //
  int system_stat_freemem = 0;

  // Process count as last parsed from /proc/loadavg
  //
  int system_stat_numproc = 0;

  // Load average as last parsed from /proc/loadavg
  //
  float system_stat_loadavg = 0.0f;

  // Module number of the error logging module
  // (that is, the module whose name matches SUPERVISOR_SPECIAL_BILLDB);
  // negative when there is no such module
  //
  int error_logging_module = -1;

  // Escalation levels applied to a module that is not answering PINGs
  //
  #define PING_PUNISH_NONE    (0)
  #define PING_PUNISH_WARN    (1)
  #define PING_PUNISH_TERM    (2)
  #define PING_PUNISH_KILL    (3)
  // Everything the supervisor remembers about one child module.
  //
  struct module_record {
      // When this process was most recently started (zero marks a module that has
      // never run, or one whose last exit was expected)
      //
      time_t last_start;

      // When this process was most recently seen to exit unexpectedly
      //
      time_t last_exit;

      // Nonzero means this module takes part in PING monitoring
      //
      int ping_requested;

      // Nonzero while the child process exists. A module that is not active is
      // never PINGed, and none of the fields below are meaningful for it.
      //
      int active;

      //-------------------------

      // Nonzero when we expect the process to exit (so its exit is not an error)
      //
      int exit_expected;

      // Process ID of the child
      //
      pid_t pid;

      // Our end of the child's standard input (write here to feed the child)
      //
      int stdin_fd;

      // Our end of the child's standard output (read here to get its output)
      //
      int stdout_fd;

      // Our end of the child's standard error (read here to get its error output)
      //
      int stderr_fd;

      // Microsecond timestamp of the last PING sent to this process
      //
      long long int last_ping;

      // Microsecond timestamp of the last PONG received from this process
      //
      long long int last_pong;

      // Most recent PING->PONG lag for this process
      //
      long long int ping_lag;

      // Current step in the PING non-response escalation (one of PING_PUNISH_*)
      //
      int ping_punish;
  };

  // State table holding one record per module
  //
  struct module_record module_status[SUPERVISOR_MAX_MODULES] = {{0}};
  119. // This array contains pointers to the names of each module
  120. //
  121. char *module_name[SUPERVISOR_MAX_MODULES] = {0};
  122. #define PIPE_R (0)
  123. #define PIPE_W (1)
  124. #define STDIN_H (0)
  125. #define STDOUT_H (1)
  126. #define STDERR_H (2)
  // Route a printf-style debug message to the billing log, falling back to syslog.
  //
  // When SUPERVISOR_LOG_DEBUG_TO_BILLDB is defined, and we have a comm hub
  // connection plus an active error logging module, the formatted text is sent to
  // MAILBOX_BILLING_LOG with a LOGLEVEL_DEBUG tag byte in front. If that path is
  // compiled out, its conditions are not met, or send_message() does not return 0,
  // the message is written to syslog at LOG_DEBUG instead.
  //
  // Always returns 0.
  //
  int route_debug_message(char *fmt, ...) {
      va_list ap;

  #ifdef SUPERVISOR_LOG_DEBUG_TO_BILLDB
      // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
      //
      if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
          struct message_record outgoing_msg;
          char payload[MAX_PAYLOAD_LENGTH] = {0};
          int len = 0;
          int room;
          int written;

          payload[len++] = LOGLEVEL_DEBUG;

          // NOTE(review): assumes make_log_prefix() returns the number of bytes it
          // produced -- confirm. Either way, keep len inside the buffer.
          //
          len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);
          if (len < 1) {
              len = 1;
          }
          if (len > MAX_PAYLOAD_LENGTH - 1) {
              len = MAX_PAYLOAD_LENGTH - 1;
          }

          room = MAX_PAYLOAD_LENGTH - len;

          va_start(ap, fmt);
          written = vsnprintf(payload + len, room, fmt, ap);
          va_end(ap);

          // vsnprintf() returns the length the text WOULD have had, which is larger
          // than what was stored when the output was truncated (and negative on
          // error). Count only the bytes that are really in the buffer.
          //
          if (written < 0) {
              written = 0;
          }
          else if (written >= room) {
              written = room - 1;
          }
          len += written;

          prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);

          if (send_message(commhub_fd, &outgoing_msg) == 0) {
              return 0;
          }
      }
  #endif

      // IF either that didn't work, or the conditions were not met, send it to syslog...
      //
      va_start(ap, fmt);
      vsyslog(LOG_DEBUG, fmt, ap);
      va_end(ap);

      return 0;
  }
  // Route a printf-style warning message to the billing log, falling back to syslog.
  //
  // When SUPERVISOR_LOG_WARNING_TO_BILLDB is defined, and we have a comm hub
  // connection plus an active error logging module, the formatted text is sent to
  // MAILBOX_BILLING_LOG with a LOGLEVEL_WARN tag byte in front. If that path is
  // compiled out, its conditions are not met, or send_message() does not return 0,
  // the message is written to syslog at LOG_WARNING instead.
  //
  // Always returns 0.
  //
  int route_warning_message(char *fmt, ...) {
      va_list ap;

  #ifdef SUPERVISOR_LOG_WARNING_TO_BILLDB
      // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
      //
      if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
          struct message_record outgoing_msg;
          char payload[MAX_PAYLOAD_LENGTH] = {0};
          int len = 0;
          int room;
          int written;

          payload[len++] = LOGLEVEL_WARN;

          // NOTE(review): assumes make_log_prefix() returns the number of bytes it
          // produced -- confirm. Either way, keep len inside the buffer.
          //
          len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);
          if (len < 1) {
              len = 1;
          }
          if (len > MAX_PAYLOAD_LENGTH - 1) {
              len = MAX_PAYLOAD_LENGTH - 1;
          }

          room = MAX_PAYLOAD_LENGTH - len;

          va_start(ap, fmt);
          written = vsnprintf(payload + len, room, fmt, ap);
          va_end(ap);

          // vsnprintf() returns the length the text WOULD have had, which is larger
          // than what was stored when the output was truncated (and negative on
          // error). Count only the bytes that are really in the buffer.
          //
          if (written < 0) {
              written = 0;
          }
          else if (written >= room) {
              written = room - 1;
          }
          len += written;

          prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);

          if (send_message(commhub_fd, &outgoing_msg) == 0) {
              return 0;
          }
      }
  #endif

      // IF either that didn't work, or the conditions were not met, send it to syslog...
      //
      va_start(ap, fmt);
      vsyslog(LOG_WARNING, fmt, ap);
      va_end(ap);

      return 0;
  }
  // Route a printf-style error message to the billing log, falling back to syslog.
  //
  // When SUPERVISOR_LOG_ERROR_TO_BILLDB is defined, and we have a comm hub
  // connection plus an active error logging module, the formatted text is sent to
  // MAILBOX_BILLING_LOG with a LOGLEVEL_ERROR tag byte in front. If that path is
  // compiled out, its conditions are not met, or send_message() does not return 0,
  // the message is written to syslog at LOG_ERR instead.
  //
  // Always returns 0.
  //
  int route_error_message(char *fmt, ...) {
      va_list ap;

  #ifdef SUPERVISOR_LOG_ERROR_TO_BILLDB
      // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
      //
      if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
          struct message_record outgoing_msg;
          char payload[MAX_PAYLOAD_LENGTH] = {0};
          int len = 0;
          int room;
          int written;

          payload[len++] = LOGLEVEL_ERROR;

          // NOTE(review): assumes make_log_prefix() returns the number of bytes it
          // produced -- confirm. Either way, keep len inside the buffer.
          //
          len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);
          if (len < 1) {
              len = 1;
          }
          if (len > MAX_PAYLOAD_LENGTH - 1) {
              len = MAX_PAYLOAD_LENGTH - 1;
          }

          room = MAX_PAYLOAD_LENGTH - len;

          va_start(ap, fmt);
          written = vsnprintf(payload + len, room, fmt, ap);
          va_end(ap);

          // vsnprintf() returns the length the text WOULD have had, which is larger
          // than what was stored when the output was truncated (and negative on
          // error). Count only the bytes that are really in the buffer.
          //
          if (written < 0) {
              written = 0;
          }
          else if (written >= room) {
              written = room - 1;
          }
          len += written;

          prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);

          if (send_message(commhub_fd, &outgoing_msg) == 0) {
              return 0;
          }
      }
  #endif

      // IF either that didn't work, or the conditions were not met, send it to syslog...
      //
      va_start(ap, fmt);
      vsyslog(LOG_ERR, fmt, ap);
      va_end(ap);

      return 0;
  }
  221. void warn_num_proc() {
  222. route_warning_message("System has %d processes running!\n", system_stat_numproc);
  223. }
  224. void warn_load_avg() {
  225. route_warning_message("System load average is %.1f!\n", system_stat_loadavg);
  226. }
  227. void warn_mem_free() {
  228. route_warning_message("System is down to %d KB of free RAM\n", system_stat_freemem);
  229. }
  230. void monitor_system_status() {
  231. int retval;
  232. int kb_free = 0;
  233. int n_procs = 0;
  234. float la5 = 0;
  235. char line[LINE_BUFFER_SIZE] = {0};
  236. FILE *f;
  237. f = fopen("/proc/loadavg", "rb");
  238. if (f) {
  239. fgets(line, sizeof(line), f);
  240. retval = sscanf(line, "%f %*f %*f %*i/%i", &la5, &n_procs);
  241. if (retval == 2) {
  242. system_stat_loadavg = la5;
  243. system_stat_numproc = n_procs;
  244. }
  245. fclose(f);
  246. }
  247. else {
  248. route_error_message("Cannot read /proc/loadavg\n");
  249. }
  250. f = fopen("/proc/meminfo", "rb");
  251. if (f) {
  252. while(!feof(f)) {
  253. fgets(line, sizeof(line), f);
  254. retval = sscanf(line, "MemFree: %i kB\n", &kb_free);
  255. if (retval == 1) {
  256. system_stat_freemem = kb_free;
  257. break;
  258. }
  259. }
  260. fclose(f);
  261. }
  262. else {
  263. route_error_message("Cannot read /proc/meminfo\n");
  264. }
  265. if (system_stat_numproc > SUPERVISOR_WARN_NUM_PROC) {
  266. warn_num_proc();
  267. }
  268. if (system_stat_loadavg > SUPERVISOR_WARN_LOAD_AVG) {
  269. warn_load_avg();
  270. }
  271. if (system_stat_freemem < SUPERVISOR_WARN_LOW_MEM) {
  272. warn_mem_free();
  273. }
  274. }
  // Log a "heartbeat" line through the debug message route.
  //
  void send_heartbeat()
  {
      route_debug_message("heartbeat\n");
  }
  278. int launch_module(int n) {
  279. pid_t fork_ret;
  280. int retval;
  281. int my_stdin, my_stdout, my_stderr;
  282. int new_stdin[2], new_stdout[2], new_stderr[2];
  283. int record_confirm_pipe[2];
  284. char record_confirm_msg[4] = {'O','K','\n','\0'};
  285. int record_confirm_expect;
  286. if ( (n < 0) || (n >= NUM_MODULES) ) {
  287. return -1;
  288. }
  289. RESET_WATCHDOG();
  290. // Save our actual standard I/O fd's
  291. //
  292. my_stdin = dup(STDIN_H);
  293. my_stdout = dup(STDOUT_H);
  294. my_stderr = dup(STDERR_H);
  295. // Generate pipes to be said handles for the child process
  296. //
  297. pipe(new_stdin);
  298. pipe(new_stdout);
  299. pipe(new_stderr);
  300. // close our actual handles in preparation for forking
  301. //
  302. close(STDIN_H);
  303. close(STDOUT_H);
  304. close(STDERR_H);
  305. // Assign the "far" end of the pipes to our handles so that the child process will use them
  306. //
  307. dup2(new_stdin[PIPE_R], STDIN_H);
  308. dup2(new_stdout[PIPE_W], STDOUT_H);
  309. dup2(new_stderr[PIPE_W], STDERR_H);
  310. // Create a pipe for record-keeping confirmation purposes (this prevents a race condition with SIGCHLD
  311. //
  312. pipe(record_confirm_pipe);
  313. record_confirm_expect = strlen(record_confirm_msg);
  314. // Ask the kernel to create a new process to hold our new module
  315. //
  316. fork_ret = fork();
  317. // If we are the child process
  318. //
  319. if (fork_ret == 0) {
  320. // We don't need this end of the confirmation pipe
  321. //
  322. close(record_confirm_pipe[PIPE_W]);
  323. // Wait for the parent process to confirm that we have been recorded before proceeding
  324. // with the exec attempt so that if it fails, the parent process will know what to do with the
  325. // SIGCHLD signal.
  326. //
  327. retval = read(record_confirm_pipe[PIPE_R], record_confirm_msg, record_confirm_expect);
  328. // If the parent process can't confirm our existence, we're in deep shit...
  329. //
  330. if (retval != record_confirm_expect) {
  331. fprintf(stderr, "Never got recordkeeping confirmation!\n");
  332. exit(EX_UNAVAILABLE);
  333. }
  334. // We are now done with our confirmation pipe
  335. //
  336. close(record_confirm_pipe[PIPE_R]);
  337. // Close the child's copy of the parent's standard I/O fds
  338. //
  339. close(my_stdin);
  340. close(my_stdout);
  341. close(my_stderr);
  342. // Close the parent's end of the pipes connecting the two processes
  343. //
  344. close(new_stdin[PIPE_W]);
  345. close(new_stdout[PIPE_R]);
  346. close(new_stderr[PIPE_R]);
  347. // Attempt to exec the child process now...
  348. //
  349. retval = execl(module_name[n], module_name[n], NULL);
  350. if (retval)
  351. {
  352. fprintf(stderr, "Cannot execl(\"%s\", \"%s\", NULL)!\n", module_name[n], module_name[n]);
  353. exit(EX_UNAVAILABLE);
  354. }
  355. exit(0);
  356. }
  357. // If we are the parent process
  358. //
  359. else if (fork_ret != -1) {
  360. // Record our child PID
  361. //
  362. module_status[n].pid = fork_ret;
  363. // Record our end of the child's standard file descriptors
  364. //
  365. module_status[n].stdin_fd = new_stdin[PIPE_W];
  366. module_status[n].stdout_fd = new_stdout[PIPE_R];
  367. module_status[n].stderr_fd = new_stderr[PIPE_R];
  368. // Reset our list ping/pong times to "never"
  369. //
  370. module_status[n].last_ping = 0;
  371. module_status[n].last_pong = 0;
  372. module_status[n].ping_lag = 0;
  373. module_status[n].ping_punish = PING_PUNISH_NONE;
  374. // Record our process start time
  375. //
  376. module_status[n].last_start = time(NULL);
  377. // This process is not currently expected to die
  378. //
  379. module_status[n].exit_expected = 0;
  380. // Flag this child module as active
  381. //
  382. module_status[n].active = 1;
  383. // Close the opposite end of all of our pipes
  384. //
  385. close(record_confirm_pipe[PIPE_R]);
  386. close(new_stdin[PIPE_R]);
  387. close(new_stdout[PIPE_W]);
  388. close(new_stderr[PIPE_W]);
  389. // Close our copy of the child's standard I/O file descriptors
  390. //
  391. close(STDIN_H);
  392. close(STDOUT_H);
  393. close(STDERR_H);
  394. // Restore the parent's original standard I/O file descriptors
  395. //
  396. dup2(my_stdin, STDIN_H);
  397. dup2(my_stdout, STDOUT_H);
  398. dup2(my_stderr, STDERR_H);
  399. // Unblock the child process by transmitting our confirmation message
  400. //
  401. retval = write(record_confirm_pipe[PIPE_W], record_confirm_msg, record_confirm_expect);
  402. if (retval != record_confirm_expect) {
  403. route_error_message("Cannot transmit recordkeeping confirmation!\n");
  404. }
  405. // We are now done with our recordkeeping confirmation pipe
  406. //
  407. close(record_confirm_pipe[PIPE_W]);
  408. }
  409. // Otherwise we have somehow FAILED to fork()
  410. //
  411. else {
  412. // restore our stderr file descriptor
  413. //
  414. close(STDERR_H);
  415. dup2(my_stderr, STDERR_H);
  416. route_error_message("Cannot fork()!\n");
  417. exit(EX_OSERR);
  418. }
  419. return 0;
  420. }
  421. void expected_exit(int n) {
  422. route_debug_message("Module %s [%d] has finished as expected.\n", module_name[n], (int)module_status[n].pid);
  423. // flag this module as "clean" for the purpose of respawn
  424. // (so we won't count the short runtime of a process we asked to terminate against it)
  425. //
  426. module_status[n].last_start = 0;
  427. module_status[n].last_exit = 0;
  428. }
  429. void unexpected_exit(int n) {
  430. int age;
  431. // Record our last stop time
  432. //
  433. module_status[n].last_exit = time(NULL);
  434. age = (int)module_status[n].last_exit - (int)module_status[n].last_start;
  435. route_warning_message("Module %s [%d] has stopped running unexpectedly after %d seconds.\n", module_name[n], (int)module_status[n].pid, age);
  436. }
  437. void reap_zombies() {
  438. int status = 0;
  439. pid_t retval = 0;
  440. int i;
  441. do {
  442. // Clear the zombie alert HERE so that if things terminate after the loop finishes,
  443. // we'll get flagged. This could result in an extra flag, but we'll never miss one.
  444. // and since we're using the WNOHANG option, there is no harm in going through the waitpid()
  445. // loop one time extra.
  446. zombie_alert = 0;
  447. retval = waitpid(-1, &status, WNOHANG); // Wait for ANY child process
  448. // If waitpid() actually waited for the specified process...
  449. //
  450. if (retval > 0) {
  451. for (i = 0; i < NUM_MODULES; i++) {
  452. // Go through our table of modules looking for an active module with a
  453. //
  454. // matching PID...
  455. //
  456. if ( module_status[i].active && (module_status[i].pid == retval) ) {
  457. // Give us an opportunity to do some clever cleanup if it is merited
  458. // These functions also set the last_exit time (and may zero the last_start
  459. // to flag a process as "clean" for respawn purposes).
  460. //
  461. if (module_status[i].exit_expected) {
  462. expected_exit(i);
  463. }
  464. else {
  465. unexpected_exit(i);
  466. }
  467. // Set this module inactive
  468. //
  469. module_status[i].active = 0;
  470. // Set this module's last ping/pong times to "never"
  471. //
  472. module_status[i].last_ping = 0;
  473. module_status[i].last_pong = 0;
  474. module_status[i].ping_lag = 0;
  475. module_status[i].ping_punish = PING_PUNISH_NONE;
  476. // No more process
  477. //
  478. module_status[i].pid = 0;
  479. // Close our end of this module's standard I/O file descriptors
  480. //
  481. close(module_status[i].stdin_fd);
  482. close(module_status[i].stdout_fd);
  483. close(module_status[i].stderr_fd);
  484. module_status[i].stdin_fd = -1;
  485. module_status[i].stdout_fd = -1;
  486. module_status[i].stderr_fd = -1;
  487. break;
  488. }
  489. }
  490. }
  491. } while(retval > 0);
  492. }
  493. void sigchld_handler(int signum, siginfo_t *info, void *data) {
  494. zombie_alert = 1;
  495. }
  496. void sighup_handler(int signum, siginfo_t *into, void *data) {
  497. hup_alert = 1;
  498. }
  499. void setup_child_handler() {
  500. struct sigaction sa = {{0}};
  501. sa.sa_sigaction = sigchld_handler;
  502. sa.sa_flags = SA_NOCLDSTOP | SA_SIGINFO;
  503. sigfillset(&sa.sa_mask);
  504. sigaction(SIGCHLD, &sa, NULL);
  505. }
  506. void setup_hup_handler() {
  507. struct sigaction sa = {{0}};
  508. sa.sa_sigaction = sighup_handler;
  509. sa.sa_flags = SA_SIGINFO;
  510. sigfillset(&sa.sa_mask);
  511. sigaction(SIGHUP, &sa, NULL);
  512. }
  513. void monitor_spawn_list() {
  514. int i;
  515. int module_runtime;
  516. time_t now;
  517. for (i = 0; i < NUM_MODULES; i++) {
  518. now = time(NULL);
  519. // If it is too soon to attempt spawning any new process, then we're done for the moment
  520. //
  521. if ( (now - last_global_spawn) < SUPERVISOR_GLOBAL_SPAWN_RATE_LIMIT ) break;
  522. // If this module is still active, we don't need to spawn it...
  523. if (module_status[i].active) continue;
  524. // This number will only be valid if last_start != 0
  525. module_runtime = module_status[i].last_exit - module_status[i].last_start;
  526. // If this module either made an expected termination or has never been launcher
  527. //
  528. if (module_status[i].last_start == 0) {
  529. // then update our last global spawn time
  530. //
  531. last_global_spawn = now;
  532. // and launch it...
  533. //
  534. launch_module(i);
  535. route_debug_message("Spawned module %s, PID = %d\n", module_name[i], (int)module_status[i].pid);
  536. }
  537. // otherwise, if it has run before, but terminated unexpectedly in a very short time
  538. //
  539. else if ( module_runtime < SUPERVISOR_RESPAWN_DELAY_THRESHOLD ) {
  540. // and it has been long enough to try again
  541. //
  542. if ( (now - module_status[i].last_exit) >= SUPERVISOR_RESPAWN_RATE_LIMIT) {
  543. // then update our last global spawn time
  544. //
  545. last_global_spawn = now;
  546. // and launch it...
  547. //
  548. launch_module(i);
  549. route_warning_message("Respawned module %s, PID = %d [rate limited]\n", module_name[i], (int)module_status[i].pid);
  550. }
  551. }
  552. // otherwise, the module has terminated unexpectedly, but it had run for a long time, so we can respawn it immediately
  553. //
  554. else {
  555. // then update our last global spawn time
  556. //
  557. last_global_spawn = now;
  558. // and launch it...
  559. //
  560. launch_module(i);
  561. route_debug_message("Respawned module %s, PID = %d\n", module_name[i], (int)module_status[i].pid);
  562. }
  563. }
  564. }
  565. void handle_dead_fd(int fd) {
  566. int i;
  567. close(fd);
  568. if (fd == commhub_fd) {
  569. commhub_fd = -1;
  570. return;
  571. }
  572. for (i = 0; i < NUM_MODULES; i++) {
  573. if (module_status[i].stdin_fd == fd) {
  574. module_status[i].stdin_fd = -1;
  575. return;
  576. }
  577. if (module_status[i].stdout_fd == fd) {
  578. module_status[i].stdout_fd = -1;
  579. return;
  580. }
  581. if (module_status[i].stderr_fd == fd) {
  582. module_status[i].stderr_fd = -1;
  583. return;
  584. }
  585. }
  586. }
  587. message_callback_return handle_pong_message(struct message_record *msg, void *param) {
  588. int i;
  589. // printf("Got PONG from PID %d\n", (int)msg->header.sender);
  590. // Iterate through all of the modules
  591. //
  592. for (i = 0; i < NUM_MODULES; i++) {
  593. // If this module is active, and its PID matches the PID of the sender of this PONG message
  594. //
  595. if ( module_status[i].active && (msg->header.sender == module_status[i].pid) ) {
  596. module_status[i].last_pong = get_usec_time();
  597. module_status[i].ping_lag = (module_status[i].last_pong - module_status[i].last_ping);
  598. // printf("PING->PONG latency %lld microseconds.\n", module_status[i].ping_lag);
  599. }
  600. }
  601. return MESSAGE_HANDLED_CONT;
  602. }
  603. //----
  604. // Client_supervisor is now in charge of managing the state_info file,
  605. // so we need to have gps, stop and driver messages handled explicitely
  606. // to be able to write to the state_info file.
  607. // `client_supervisor` should be the sole process writing to the state_info
  608. // file while any other client should be able to read.
  609. //
  610. message_callback_return handle_gps_message(struct message_record *msg, void *param) {
  611. // guard against version mismatch
  612. //
  613. if (msg->header.payload_length != sizeof(gps_status)) {
  614. return MESSAGE_HANDLED_STOP;
  615. }
  616. memcpy(&gps_stat, msg->payload, sizeof(gps_status)); //otherwise, update our structure
  617. update_state_info_with_gps(&state_info, &gps_stat);
  618. memcpy(state_info.comment, "client_supervisor gps", strlen("client_supervisor gps")+1);
  619. save_state_info(&state_info);
  620. return MESSAGE_HANDLED_CONT;
  621. }
  622. static message_callback_return handle_stop_message(struct message_record *msg, void *param) {
  623. // guard against version mismatch
  624. //
  625. if (msg->header.payload_length != sizeof(stop_status)) {
  626. return MESSAGE_HANDLED_STOP;
  627. }
  628. memcpy(&stop_stat, msg->payload, sizeof(stop_status));
  629. update_state_info_with_stop(&state_info, &stop_stat);
  630. memcpy(state_info.comment, "client_supervisor stop", strlen("client_supervisor stop")+1);
  631. save_state_info(&state_info);
  632. return MESSAGE_HANDLED_CONT;
  633. }
  634. static message_callback_return handle_driver_message(struct message_record *msg, void *param) {
  635. // guard against version mismatch
  636. //
  637. if (msg->header.payload_length != sizeof(driver_status)) {
  638. return MESSAGE_HANDLED_STOP;
  639. }
  640. memcpy(&driver_stat, msg->payload, sizeof(driver_stat));
  641. update_state_info_with_driver(&state_info, &driver_stat);
  642. memcpy(state_info.comment, "client_supervisor driver", strlen("client_supervisor driver")+1);
  643. save_state_info(&state_info);
  644. return MESSAGE_HANDLED_CONT;
  645. }
  646. //----
  647. void clear_ping_attempt() {
  648. int i;
  649. for (i = 0; i < NUM_MODULES; i++) {
  650. if (module_status[i].active) {
  651. module_status[i].last_ping = module_status[i].last_pong = 0;
  652. module_status[i].ping_punish = PING_PUNISH_NONE;
  653. }
  654. }
  655. }
  656. void send_ping_message()
  657. {
  658. struct message_record outgoing_msg;
  659. long long int usec_now;
  660. char target[MAILBOX_NAME_MAX + 1];
  661. int i;
  662. if (commhub_fd < 0) { return; }
  663. for (i = 0; i < NUM_MODULES; i++) {
  664. // If this module is active AND included in PING monitoring
  665. //
  666. if ( module_status[i].active && module_status[i].ping_requested ) {
  667. usec_now = get_usec_time();
  668. // If it is not yet time to re-ping this module
  669. //
  670. if ( (usec_now - module_status[i].last_ping) < SUPERVISOR_PING_INTERVAL) {
  671. // skip it for now
  672. //
  673. continue;
  674. }
  675. // If we have PING'd this host but are still waiting for a PONG back
  676. //
  677. if (module_status[i].last_pong < module_status[i].last_ping) {
  678. // Don't flood it with pings
  679. //
  680. continue;
  681. }
  682. // If we've never PING'd this host, set it's PONG time to the same as its PING
  683. // time so it doesn't get killed for not having responded to its first PING.
  684. //
  685. if (module_status[i].last_ping == 0) {
  686. module_status[i].last_pong = usec_now;
  687. }
  688. module_status[i].last_ping = usec_now;
  689. // Turn our PID into a magic mailbox name that addresses that PID.
  690. //
  691. sprintf(target, ">%d", (int)module_status[i].pid);
  692. // Prepare a message to that magic mailbox with a payload consisting of the mailbox
  693. // name of the PING notification
  694. //
  695. prepare_message(&outgoing_msg, target, MAILBOX_PING, strlen(MAILBOX_PING));
  696. // And send it!
  697. //
  698. send_message(commhub_fd, &outgoing_msg);
  699. }
  700. }
  701. }
  702. void send_hup_message() {
  703. struct message_record outgoing_msg;
  704. if (commhub_fd < 0) {
  705. return;
  706. }
  707. printf("Sending HUP\n");
  708. prepare_message(&outgoing_msg, MAILBOX_HUP, "", 0);
  709. send_message(commhub_fd, &outgoing_msg);
  710. }
  711. void ping_punish_warn(int n, long long int delta) {
  712. route_debug_message("Module %s [%d] hasn't responded to a PING in %d seconds...\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
  713. }
  714. void ping_punish_term(int n, long long int delta) {
  715. route_warning_message("Module %s [%d] hasn't responded to a PING in %d seconds... Sending SIGTERM\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
  716. kill(module_status[n].pid, SIGTERM);
  717. }
  718. void ping_punish_kill(int n, long long int delta) {
  719. route_error_message("Module %s [%d] hasn't responded to a PING in %d seconds... Sending SIGKILL\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
  720. kill(module_status[n].pid, SIGKILL);
  721. }
  722. void enforce_ping_policy()
  723. {
  724. int i;
  725. long long int delta = 0;
  726. for (i = 0; i < NUM_MODULES; i++) {
  727. //If this module is both active AND included in the ping monitoring
  728. //
  729. if ( module_status[i].active && module_status[i].ping_requested ) {
  730. // if we haven't sent a PING to this one...
  731. //
  732. if (module_status[i].last_ping == 0) {
  733. //clear any punishment that would otherwise be pending and get the next module
  734. //
  735. module_status[i].ping_punish = PING_PUNISH_NONE;
  736. continue;
  737. }
  738. //compute just how late this module is...
  739. //
  740. delta = get_usec_time() - module_status[i].last_pong;
  741. switch(module_status[i].ping_punish) {
  742. case PING_PUNISH_NONE:
  743. if (delta > SUPERVISOR_PING_WARN_TIME) {
  744. ping_punish_warn(i, delta);
  745. module_status[i].ping_punish++;
  746. }
  747. break;
  748. case PING_PUNISH_WARN:
  749. if (delta > SUPERVISOR_PING_TERM_TIME) {
  750. ping_punish_term(i, delta);
  751. module_status[i].ping_punish++;
  752. }
  753. break;
  754. case PING_PUNISH_TERM:
  755. if (delta > SUPERVISOR_PING_KILL_TIME) {
  756. ping_punish_kill(i, delta);
  757. module_status[i].ping_punish++;
  758. }
  759. break;
  760. default:
  761. route_error_message("At our wits end with module %s [%d], just plain won't die!\n", module_name[i], (int)module_status[i].pid);
  762. break;
  763. }
  764. }
  765. }
  766. }
  767. void maintain_ipc_hub_connect(char *progname) {
  768. struct message_record outgoing_msg;
  769. // if we have no connection to the communication hub
  770. //
  771. if (commhub_fd < 0) {
  772. // try and get one
  773. //
  774. commhub_fd = connect_to_message_server(progname);
  775. // printf("commhub_fd = %d\n", commhub_fd);
  776. // if it worked
  777. //
  778. if (commhub_fd >= 0) {
  779. //Subscribe to the command mailboxes we act on
  780. //
  781. prepare_message(&outgoing_msg, MAILBOX_SUBSCRIBE, MAILBOX_PONG, strlen(MAILBOX_TOKEN_MAG));
  782. send_message(commhub_fd,&outgoing_msg);
  783. //Subscribe to the relevant status management mailboxes
  784. //
  785. subscribe_to_default_messages(commhub_fd);
  786. }
  787. }
  788. }
  789. void print_version(FILE *fp) {
  790. fprintf(fp, "version %s\n", CLIENT_SUPERVISOR_VERSION);
  791. }
  792. void usage(FILE *fp, char *progname) {
  793. fprintf(fp,"%s: %d modules given to be supervised, minimum = 1, maximum = %d\n", progname, NUM_MODULES, SUPERVISOR_MAX_MODULES - 1);
  794. fprintf(fp,"%s: Usage: %s [module ...] [--[no-]ping] [module ...]\n", progname, progname);
  795. fprintf(fp,"%s: By default modules will be subjet to PING monitoring through\n", progname);
  796. fprintf(fp,"%s: the IPC hub, --no-ping will effect all modules following it until a --ping\n", progname);
  797. fprintf(fp,"\n");
  798. exit(EX_USAGE);
  799. }
  800. void handle_command_line(int argc, char **argv) {
  801. int i;
  802. int needs_ping = 1;
  803. NUM_MODULES = 0;
  804. // Walk all of the given command line arguments and add each one as a module we have been asked to supervice
  805. //
  806. for (i = 1; i < argc; i++) {
  807. if ( !strcmp(argv[i], "--no-ping") ) {
  808. needs_ping = 0;
  809. }
  810. else if ( !strcmp(argv[i], "--ping") ) {
  811. needs_ping = 1;
  812. }
  813. else if ( !strcmp(argv[i], "-v") ) {
  814. print_version(stdout);
  815. exit(0);
  816. }
  817. else if ( !strcmp(argv[i], "-h") ) {
  818. usage(stdout, argv[0]);
  819. exit(EX_USAGE);
  820. }
  821. else {
  822. // Here we look for the last / in the module name
  823. //
  824. char *foo = rindex(argv[i], '/');
  825. // if we found one, advance past it
  826. //
  827. if (foo) { foo++; }
  828. // otherwise
  829. //
  830. else {
  831. // use our whole argument
  832. //
  833. foo = argv[i];
  834. }
  835. // make sure it is at the end
  836. //
  837. if ( !strcmp(foo, SUPERVISOR_SPECIAL_BILLDB) ) {
  838. //if it is, this one is our guy!
  839. //
  840. error_logging_module = NUM_MODULES;
  841. }
  842. module_status[NUM_MODULES].ping_requested = needs_ping;
  843. module_name[NUM_MODULES++] = argv[i];
  844. }
  845. if (NUM_MODULES >= SUPERVISOR_MAX_MODULES) {
  846. usage(stderr, argv[0]);
  847. }
  848. }
  849. if (NUM_MODULES == 0) {
  850. usage(stderr, argv[0]);
  851. }
  852. }
  853. int main(int argc, char **argv) {
  854. int poll_ret;
  855. int nfds;
  856. int i;
  857. struct message_record incoming_msg;
  858. char linebuffer[LINE_BUFFER_SIZE] = {0};
  859. int read_ret;
  860. int modnum;
  861. struct pollfd fds[1 + (2 * SUPERVISOR_MAX_MODULES)] = {{0}};
  862. int poll_to_module[1 + (2 * SUPERVISOR_MAX_MODULES)] = {0};
  863. init_state_info();
  864. // Parse our command line to figure out what modules we need to supervise
  865. //
  866. handle_command_line(argc, argv);
  867. // Configure our default signal handlers
  868. //
  869. configure_signal_handlers(argv[0]);
  870. // As well as our module-specific zombie-reaping flag setter
  871. //
  872. setup_child_handler();
  873. // This listens for SIGHUP messages and flags us when one has come it.
  874. //
  875. setup_hup_handler();
  876. // Register our default keep-up-with-system status callbacks
  877. //
  878. register_system_status_callbacks();
  879. // Except we want to exempt ourselves from the MAILBOX_EXIT and
  880. // MAILBOX_PING so as not to start ping storms or kill ourselves...
  881. //
  882. register_dispatch_callback(MAILBOX_EXIT, CALLBACK_PREPROCESS, ignore_message, NULL);
  883. register_dispatch_callback(MAILBOX_PING, CALLBACK_PREPROCESS, ignore_message, NULL);
  884. // Add our specific handlers
  885. //
  886. register_dispatch_callback(MAILBOX_PONG, CALLBACK_USER(1), handle_pong_message, NULL);
  887. register_dispatch_callback(MAILBOX_GPS_STATUS, CALLBACK_USER(1), handle_gps_message, NULL);
  888. register_dispatch_callback(MAILBOX_STOP_STATUS, CALLBACK_USER(1), handle_stop_message, NULL);
  889. register_dispatch_callback(MAILBOX_DRIVER_STATUS, CALLBACK_USER(1), handle_driver_message, NULL);
  890. // This is the main processing loop which monitors system status and pumps I/O from
  891. // all of the client modules (debug and error messages) and from the IPC hub when it is up
  892. // as indicated by poll().
  893. //
  894. while( exit_request_status == EXIT_REQUEST_NONE ) {
  895. RESET_WATCHDOG();
  896. //-------------- SYSTEM STATE MAINTENANCE CODE HERE
  897. if ( (time(NULL) - last_system_stat) >= SUPERVISOR_SYSTEM_STAT_INTERVAL) {
  898. monitor_system_status();
  899. send_heartbeat();
  900. last_system_stat = time(NULL);
  901. }
  902. // Do any maintenance on our spawn list as needed
  903. //
  904. monitor_spawn_list();
  905. // Try and keep in touch with the IPC hub if possible
  906. //
  907. maintain_ipc_hub_connect(argv[0]);
  908. // If the communications hub is up, we may need to maintain our PING states
  909. //
  910. if (commhub_fd >= 0) {
  911. // Send out any PING messages that are required
  912. //
  913. send_ping_message();
  914. // Deal with any delinquents who don't answer their damn pings
  915. //
  916. enforce_ping_policy();
  917. if (hup_alert) {
  918. hup_alert = 0;
  919. send_hup_message();
  920. }
  921. }
  922. // If there is no connection to the IPC hub
  923. //
  924. else {
  925. // clear any PING attempt records we may have active
  926. //
  927. clear_ping_attempt();
  928. }
  929. // If our SIGCHLD handler has notified us that there is at least 1 zombie to reap
  930. //
  931. if (zombie_alert) {
  932. // Try and reap any zombies that may exist
  933. //
  934. reap_zombies();
  935. }
  936. //-------------- POLL FILE DESCRIPTOR SET POPULATION HERE
  937. //
  938. nfds = 0;
  939. // If we have a valid communication hub connection, add it to the poll set
  940. //
  941. if (commhub_fd >= 0) {
  942. fds[nfds].fd = commhub_fd;
  943. fds[nfds].events = POLLIN;
  944. // but flag it as NOT belonging to a child module
  945. //
  946. poll_to_module[nfds] = -1;
  947. nfds++;
  948. }
  949. // Then iterate through all of our modules and see if they need service
  950. //
  951. for (i = 0; i < NUM_MODULES; i++) {
  952. // If this child module is flagged as active
  953. //
  954. if (module_status[i].active) {
  955. // and we have a stdout file descriptor for this child
  956. //
  957. if (module_status[i].stdout_fd >= 0) {
  958. // add it to the poll set
  959. //
  960. fds[nfds].fd = module_status[i].stdout_fd;
  961. fds[nfds].events = POLLIN;
  962. // and remember which module it goes to
  963. //
  964. poll_to_module[nfds] = i;
  965. nfds++;
  966. }
  967. // if we have a stderr file descriptor for this child
  968. //
  969. if (module_status[i].stderr_fd >= 0) {
  970. // add it to the poll set
  971. //
  972. fds[nfds].fd = module_status[i].stderr_fd;
  973. fds[nfds].events = POLLIN;
  974. // and remember which module it goes to
  975. //
  976. poll_to_module[nfds] = i;
  977. nfds++;
  978. }
  979. }
  980. }
  981. // If we have any file descriptors to poll
  982. //
  983. if (nfds > 0) {
  984. // poll them
  985. //
  986. poll_ret = poll(fds, nfds, POLL_TIMEOUT);
  987. // if poll returns error
  988. //
  989. if (poll_ret < 0) {
  990. // and it's just EINTR
  991. //
  992. if (errno == EINTR) {
  993. // try the body of the while again
  994. //
  995. continue;
  996. }
  997. // if it is some other error
  998. //
  999. else {
  1000. // scream bloody murder
  1001. //
  1002. route_debug_message("Poll returned %d, errno = %d (%s)\n", poll_ret, errno, strerror(errno));
  1003. sleep(1);
  1004. }
  1005. }
  1006. }
  1007. // If we have zero functional file descriptors
  1008. //
  1009. else {
  1010. // Pretend that we called poll and it timed out
  1011. //
  1012. sleep(1);
  1013. poll_ret = 0;
  1014. }
  1015. //-------------- I/O PUMPING CODE BELOW
  1016. // if there are no file descriptors flagged as needing any action
  1017. //
  1018. if (poll_ret == 0) { continue; }
  1019. for (i = 0; i < nfds; i++) {
  1020. modnum = poll_to_module[i];
  1021. // If we have input and it is from a module...
  1022. //
  1023. if ( (fds[i].revents & POLLIN) && (modnum >= 0) ) {
  1024. read_ret = read(fds[i].fd, linebuffer, sizeof(linebuffer) - 1);
  1025. if (read_ret == 0) {
  1026. // clean up
  1027. // and skip this fd for now
  1028. //
  1029. handle_dead_fd(fds[i].fd);
  1030. continue;
  1031. }
  1032. else if ( (read_ret < 0) && (errno != EINTR) ) {
  1033. // clean up
  1034. // and skip this fd for now
  1035. //
  1036. handle_dead_fd(fds[i].fd);
  1037. continue;
  1038. }
  1039. // Terminate our read...
  1040. //
  1041. linebuffer[read_ret] = '\0';
  1042. if (fds[i].fd == module_status[modnum].stdout_fd) {
  1043. //route_debug_message("%s [%d]: %s", module_name[modnum], (int)module_status[modnum].pid, linebuffer);
  1044. }
  1045. else if (fds[i].fd == module_status[modnum].stderr_fd) {
  1046. route_error_message("%s [%d]: %s", module_name[modnum], (int)module_status[modnum].pid, linebuffer);
  1047. }
  1048. else {
  1049. route_debug_message("File descriptor %d does not belong to module %d (%s)!\n", fds[i].fd, modnum, module_name[modnum]);
  1050. }
  1051. }
  1052. // If we have input and it is from the commhub...
  1053. //
  1054. else if ( (fds[i].revents & POLLIN) && (fds[i].fd == commhub_fd) ) {
  1055. read_ret = get_message(commhub_fd, &incoming_msg);
  1056. if (read_ret < 0) {
  1057. // clean up
  1058. // and skip this fd for now
  1059. //
  1060. handle_dead_fd(fds[i].fd);
  1061. continue;
  1062. }
  1063. else {
  1064. // This passes the received message through the callback list
  1065. //
  1066. process_message(&incoming_msg);
  1067. }
  1068. }
  1069. // If it looks like we have a closed or invalid descriptor
  1070. //
  1071. if (fds[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
  1072. // clean up
  1073. // and skip this fd for now
  1074. //
  1075. handle_dead_fd(fds[i].fd);
  1076. continue;
  1077. }
  1078. }
  1079. }
  1080. route_debug_message("Attempting graceful exit...\n");
  1081. for (i = 0; i < NUM_MODULES; i++) {
  1082. if (module_status[i].active) {
  1083. route_debug_message("Sending SIGTERM to module %s [%d]\n", module_name[i], module_status[i].pid);
  1084. kill(module_status[i].pid, SIGTERM);
  1085. }
  1086. }
  1087. sleep(1);
  1088. return 0;
  1089. }