| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427 |
- /*
- * Copyright (c) 2019 Clementine Computing LLC.
- *
- * This file is part of PopuFare.
- *
- * PopuFare is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * PopuFare is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with PopuFare. If not, see <https://www.gnu.org/licenses/>.
- *
- */
- #include <linux/limits.h>
- #include <sys/types.h>
- #include <sys/wait.h>
- #include <sysexits.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <unistd.h>
- #include <signal.h>
- #include <string.h>
- #include <strings.h>
- #include <errno.h>
- #include <time.h>
- #include <poll.h>
- #include <syslog.h>
- #include <stdarg.h>
- #include "../common/common_defs.h"
- #include "../commhub/commhub.h"
- #include "../commhub/client_utils.h"
- #define CLIENT_SUPERVISOR_VERSION "2.1.2"
- // Number of modules we are actively tracking
- //
- int NUM_MODULES = 0;
- // File descriptor of our connection to the comm hub
- //
- int commhub_fd = -1;
- // This flag is set by the SIGCHLD handler, and cleared by the reaper function
- //
- volatile sig_atomic_t zombie_alert = 0;
- // This is our flag that the SIGHUP handler sets and is cleared when a HUP has been delivered to the client modules
- //
- volatile sig_atomic_t hup_alert = 0;
- // This is the time of the last new process creation event
- //
- int last_global_spawn = 0;
- // This is the time of our last system status check
- //
- int last_system_stat = 0;
- // This is the number of KB of free memory in the system
- //
- int system_stat_freemem = 0;
- // This is the number of running processes in the system
- //
- int system_stat_numproc = 0;
- // This is the 5 minute load average
- //
- float system_stat_loadavg = 0;
- // This is the error logging module's module number
- // (this is the module who's name matches SUPERVISOR_SPECIAL_BILLDB)
- //
- int error_logging_module = -1;
- #define PING_PUNISH_NONE (0)
- #define PING_PUNISH_WARN (1)
- #define PING_PUNISH_TERM (2)
- #define PING_PUNISH_KILL (3)
- struct module_record {
- // last time this process was started
- //
- time_t last_start;
- // last time this process has reaped
- //
- time_t last_exit;
- // If this flag is nonzero, the specified module is subject to PING monitoring
- //
- int ping_requested;
-
- // however, if this flag is zero, the specified module is immune to PING monitoring.
- // If this is zero, all further fields are invalid
- //
- int active;
-
- //-------------------------
- // If this flag is set, we expect the process to exit (i.e. it's not an error if it does...)
- //
- int exit_expected;
- // Process ID of this child module
- //
- pid_t pid;
- // File descriptor of this child's standard input (write to this to feed child)
- //
- int stdin_fd;
- // File descriptor of this child's standard output (read from this to get output)
- //
- int stdout_fd;
- // File descriptor of this child's standard error (read from this to get error output)
- //
- int stderr_fd;
- // timestamp in microseconds of the last PING message sent to this process
- //
- long long int last_ping;
- // timestamp in microseconds of the last PONG message received from this process
- //
- long long int last_pong;
- // Latest PING->PONG lag for this process
- //
- long long int ping_lag;
- // Used to keep track of PING non-response action sequence
- //
- int ping_punish;
- // This array contains the state for each module
- //
- } module_status[SUPERVISOR_MAX_MODULES] = {{0}};
- // This array contains pointers to the names of each module
- //
- char *module_name[SUPERVISOR_MAX_MODULES] = {0};
- #define PIPE_R (0)
- #define PIPE_W (1)
- #define STDIN_H (0)
- #define STDOUT_H (1)
- #define STDERR_H (2)
- int route_debug_message(char *fmt, ...) {
- va_list ap;
- #ifdef SUPERVISOR_LOG_DEBUG_TO_BILLDB
- // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
- //
- if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
- int retval;
- int len = 0;
- struct message_record outgoing_msg;
- char payload[MAX_PAYLOAD_LENGTH] = {0};
- payload[len++] = LOGLEVEL_DEBUG;
- len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);
- va_start(ap, fmt); /* Initialize the va_list */
- len += vsnprintf(payload + len, MAX_PAYLOAD_LENGTH - len, fmt, ap);
- va_end(ap);
- prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);
- retval = send_message(commhub_fd, &outgoing_msg);
- if (retval == 0) {
- return 0;
- }
- }
- #endif
- // IF either that didn't work, or the conditions were not met, send it to syslog...
- //
- // Initialize the va_list
- //
- va_start(ap, fmt);
- vsyslog(LOG_DEBUG, fmt, ap);
- va_end(ap);
- return 0;
- }
- int route_warning_message(char *fmt, ...) {
- va_list ap;
- #ifdef SUPERVISOR_LOG_WARNING_TO_BILLDB
- // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
- //
- if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
- int retval;
- int len = 0;
- struct message_record outgoing_msg;
- char payload[MAX_PAYLOAD_LENGTH] = {0};
- payload[len++] = LOGLEVEL_WARN;
- len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);
- va_start(ap, fmt); /* Initialize the va_list */
- len += vsnprintf(payload + len, MAX_PAYLOAD_LENGTH - len, fmt, ap);
- va_end(ap);
- prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);
- retval = send_message(commhub_fd, &outgoing_msg);
- if (retval == 0) {
- return 0;
- }
- }
- #endif
- // IF either that didn't work, or the conditions were not met, send it to syslog...
- //
- va_start(ap, fmt); /* Initialize the va_list */
- vsyslog(LOG_WARNING, fmt, ap);
- va_end(ap);
- return 0;
- }
- int route_error_message(char *fmt, ...) {
- va_list ap;
- #ifdef SUPERVISOR_LOG_ERROR_TO_BILLDB
- // IF... we have a connection to the commhub, and we have an instance of billdb under our care, and it is alive...
- //
- if ( (commhub_fd >= 0) && (error_logging_module >= 0) && module_status[error_logging_module].active ) {
- int retval;
- int len = 0;
- struct message_record outgoing_msg;
- char payload[MAX_PAYLOAD_LENGTH] = {0};
- payload[len++] = LOGLEVEL_ERROR;
- len += make_log_prefix(payload + len, MAX_PAYLOAD_LENGTH - len);
- // Initialize the va_list
- //
- va_start(ap, fmt);
- len += vsnprintf(payload + len, MAX_PAYLOAD_LENGTH - len, fmt, ap);
- va_end(ap);
- prepare_message(&outgoing_msg, MAILBOX_BILLING_LOG, payload, len);
- retval = send_message(commhub_fd, &outgoing_msg);
- if (retval == 0)
- {
- return 0;
- }
- }
- #endif
- // IF either that didn't work, or the conditions were not met, send it to syslog...
- //
- // Initialize the va_list
- //
- va_start(ap, fmt);
- vsyslog(LOG_ERR, fmt, ap);
- va_end(ap);
- return 0;
- }
- void warn_num_proc() {
- route_warning_message("System has %d processes running!\n", system_stat_numproc);
- }
- void warn_load_avg() {
- route_warning_message("System load average is %.1f!\n", system_stat_loadavg);
- }
- void warn_mem_free() {
- route_warning_message("System is down to %d KB of free RAM\n", system_stat_freemem);
- }
- void monitor_system_status() {
- int retval;
- int kb_free = 0;
- int n_procs = 0;
- float la5 = 0;
- char line[LINE_BUFFER_SIZE] = {0};
- FILE *f;
- f = fopen("/proc/loadavg", "rb");
- if (f) {
- fgets(line, sizeof(line), f);
- retval = sscanf(line, "%f %*f %*f %*i/%i", &la5, &n_procs);
- if (retval == 2) {
- system_stat_loadavg = la5;
- system_stat_numproc = n_procs;
- }
- fclose(f);
- }
- else {
- route_error_message("Cannot read /proc/loadavg\n");
- }
- f = fopen("/proc/meminfo", "rb");
- if (f) {
- while(!feof(f)) {
- fgets(line, sizeof(line), f);
- retval = sscanf(line, "MemFree: %i kB\n", &kb_free);
- if (retval == 1) {
- system_stat_freemem = kb_free;
- break;
- }
- }
- fclose(f);
- }
- else {
- route_error_message("Cannot read /proc/meminfo\n");
- }
- if (system_stat_numproc > SUPERVISOR_WARN_NUM_PROC) {
- warn_num_proc();
- }
- if (system_stat_loadavg > SUPERVISOR_WARN_LOAD_AVG) {
- warn_load_avg();
- }
- if (system_stat_freemem < SUPERVISOR_WARN_LOW_MEM) {
- warn_mem_free();
- }
- }
- void send_heartbeat() {
- route_debug_message("heartbeat\n");
- }
- int launch_module(int n) {
- pid_t fork_ret;
- int retval;
- int my_stdin, my_stdout, my_stderr;
- int new_stdin[2], new_stdout[2], new_stderr[2];
- int record_confirm_pipe[2];
- char record_confirm_msg[4] = {'O','K','\n','\0'};
- int record_confirm_expect;
- if ( (n < 0) || (n >= NUM_MODULES) ) {
- return -1;
- }
- RESET_WATCHDOG();
- // Save our actual standard I/O fd's
- //
- my_stdin = dup(STDIN_H);
- my_stdout = dup(STDOUT_H);
- my_stderr = dup(STDERR_H);
- // Generate pipes to be said handles for the child process
- //
- pipe(new_stdin);
- pipe(new_stdout);
- pipe(new_stderr);
- // close our actual handles in preparation for forking
- //
- close(STDIN_H);
- close(STDOUT_H);
- close(STDERR_H);
- // Assign the "far" end of the pipes to our handles so that the child process will use them
- //
- dup2(new_stdin[PIPE_R], STDIN_H);
- dup2(new_stdout[PIPE_W], STDOUT_H);
- dup2(new_stderr[PIPE_W], STDERR_H);
- // Create a pipe for record-keeping confirmation purposes (this prevents a race condition with SIGCHLD
- //
- pipe(record_confirm_pipe);
- record_confirm_expect = strlen(record_confirm_msg);
- // Ask the kernel to create a new process to hold our new module
- //
- fork_ret = fork();
- // If we are the child process
- //
- if (fork_ret == 0) {
- // We don't need this end of the confirmation pipe
- //
- close(record_confirm_pipe[PIPE_W]);
- // Wait for the parent process to confirm that we have been recorded before proceeding
- // with the exec attempt so that if it fails, the parent process will know what to do with the
- // SIGCHLD signal.
- //
- retval = read(record_confirm_pipe[PIPE_R], record_confirm_msg, record_confirm_expect);
- // If the parent process can't confirm our existence, we're in deep shit...
- //
- if (retval != record_confirm_expect) {
- fprintf(stderr, "Never got recordkeeping confirmation!\n");
- exit(EX_UNAVAILABLE);
- }
- // We are now done with our confirmation pipe
- //
- close(record_confirm_pipe[PIPE_R]);
- // Close the child's copy of the parent's standard I/O fds
- //
- close(my_stdin);
- close(my_stdout);
- close(my_stderr);
- // Close the parent's end of the pipes connecting the two processes
- //
- close(new_stdin[PIPE_W]);
- close(new_stdout[PIPE_R]);
- close(new_stderr[PIPE_R]);
- // Attempt to exec the child process now...
- //
- retval = execl(module_name[n], module_name[n], NULL);
- if (retval)
- {
- fprintf(stderr, "Cannot execl(\"%s\", \"%s\", NULL)!\n", module_name[n], module_name[n]);
- exit(EX_UNAVAILABLE);
- }
- exit(0);
- }
- // If we are the parent process
- //
- else if (fork_ret != -1) {
- // Record our child PID
- //
- module_status[n].pid = fork_ret;
- // Record our end of the child's standard file descriptors
- //
- module_status[n].stdin_fd = new_stdin[PIPE_W];
- module_status[n].stdout_fd = new_stdout[PIPE_R];
- module_status[n].stderr_fd = new_stderr[PIPE_R];
- // Reset our list ping/pong times to "never"
- //
- module_status[n].last_ping = 0;
- module_status[n].last_pong = 0;
- module_status[n].ping_lag = 0;
- module_status[n].ping_punish = PING_PUNISH_NONE;
- // Record our process start time
- //
- module_status[n].last_start = time(NULL);
- // This process is not currently expected to die
- //
- module_status[n].exit_expected = 0;
- // Flag this child module as active
- //
- module_status[n].active = 1;
- // Close the opposite end of all of our pipes
- //
- close(record_confirm_pipe[PIPE_R]);
- close(new_stdin[PIPE_R]);
- close(new_stdout[PIPE_W]);
- close(new_stderr[PIPE_W]);
- // Close our copy of the child's standard I/O file descriptors
- //
- close(STDIN_H);
- close(STDOUT_H);
- close(STDERR_H);
- // Restore the parent's original standard I/O file descriptors
- //
- dup2(my_stdin, STDIN_H);
- dup2(my_stdout, STDOUT_H);
- dup2(my_stderr, STDERR_H);
- // Unblock the child process by transmitting our confirmation message
- //
- retval = write(record_confirm_pipe[PIPE_W], record_confirm_msg, record_confirm_expect);
- if (retval != record_confirm_expect) {
- route_error_message("Cannot transmit recordkeeping confirmation!\n");
- }
- // We are now done with our recordkeeping confirmation pipe
- //
- close(record_confirm_pipe[PIPE_W]);
- }
- // Otherwise we have somehow FAILED to fork()
- //
- else {
- // restore our stderr file descriptor
- //
- close(STDERR_H);
- dup2(my_stderr, STDERR_H);
- route_error_message("Cannot fork()!\n");
- exit(EX_OSERR);
- }
- return 0;
- }
- void expected_exit(int n) {
- route_debug_message("Module %s [%d] has finished as expected.\n", module_name[n], (int)module_status[n].pid);
- // flag this module as "clean" for the purpose of respawn
- // (so we won't count the short runtime of a process we asked to terminate against it)
- //
- module_status[n].last_start = 0;
- module_status[n].last_exit = 0;
- }
- void unexpected_exit(int n) {
- int age;
- // Record our last stop time
- //
- module_status[n].last_exit = time(NULL);
- age = (int)module_status[n].last_exit - (int)module_status[n].last_start;
- route_warning_message("Module %s [%d] has stopped running unexpectedly after %d seconds.\n", module_name[n], (int)module_status[n].pid, age);
- }
- void reap_zombies() {
- int status = 0;
- pid_t retval = 0;
- int i;
- do {
- // Clear the zombie alert HERE so that if things terminate after the loop finishes,
- // we'll get flagged. This could result in an extra flag, but we'll never miss one.
- // and since we're using the WNOHANG option, there is no harm in going through the waitpid()
- // loop one time extra.
- zombie_alert = 0;
- retval = waitpid(-1, &status, WNOHANG); // Wait for ANY child process
- // If waitpid() actually waited for the specified process...
- //
- if (retval > 0) {
- for (i = 0; i < NUM_MODULES; i++) {
- // Go through our table of modules looking for an active module with a
- //
- // matching PID...
- //
- if ( module_status[i].active && (module_status[i].pid == retval) ) {
- // Give us an opportunity to do some clever cleanup if it is merited
- // These functions also set the last_exit time (and may zero the last_start
- // to flag a process as "clean" for respawn purposes).
- //
- if (module_status[i].exit_expected) {
- expected_exit(i);
- }
- else {
- unexpected_exit(i);
- }
- // Set this module inactive
- //
- module_status[i].active = 0;
- // Set this module's last ping/pong times to "never"
- //
- module_status[i].last_ping = 0;
- module_status[i].last_pong = 0;
- module_status[i].ping_lag = 0;
- module_status[i].ping_punish = PING_PUNISH_NONE;
- // No more process
- //
- module_status[i].pid = 0;
- // Close our end of this module's standard I/O file descriptors
- //
- close(module_status[i].stdin_fd);
- close(module_status[i].stdout_fd);
- close(module_status[i].stderr_fd);
- module_status[i].stdin_fd = -1;
- module_status[i].stdout_fd = -1;
- module_status[i].stderr_fd = -1;
- break;
- }
- }
- }
- } while(retval > 0);
- }
- void sigchld_handler(int signum, siginfo_t *info, void *data) {
- zombie_alert = 1;
- }
- void sighup_handler(int signum, siginfo_t *into, void *data) {
- hup_alert = 1;
- }
- void setup_child_handler() {
- struct sigaction sa = {{0}};
- sa.sa_sigaction = sigchld_handler;
- sa.sa_flags = SA_NOCLDSTOP | SA_SIGINFO;
- sigfillset(&sa.sa_mask);
- sigaction(SIGCHLD, &sa, NULL);
- }
- void setup_hup_handler() {
- struct sigaction sa = {{0}};
- sa.sa_sigaction = sighup_handler;
- sa.sa_flags = SA_SIGINFO;
- sigfillset(&sa.sa_mask);
- sigaction(SIGHUP, &sa, NULL);
- }
- void monitor_spawn_list() {
- int i;
- int module_runtime;
- time_t now;
- for (i = 0; i < NUM_MODULES; i++) {
- now = time(NULL);
- // If it is too soon to attempt spawning any new process, then we're done for the moment
- //
- if ( (now - last_global_spawn) < SUPERVISOR_GLOBAL_SPAWN_RATE_LIMIT ) break;
- // If this module is still active, we don't need to spawn it...
- if (module_status[i].active) continue;
- // This number will only be valid if last_start != 0
- module_runtime = module_status[i].last_exit - module_status[i].last_start;
- // If this module either made an expected termination or has never been launcher
- //
- if (module_status[i].last_start == 0) {
- // then update our last global spawn time
- //
- last_global_spawn = now;
- // and launch it...
- //
- launch_module(i);
- route_debug_message("Spawned module %s, PID = %d\n", module_name[i], (int)module_status[i].pid);
- }
- // otherwise, if it has run before, but terminated unexpectedly in a very short time
- //
- else if ( module_runtime < SUPERVISOR_RESPAWN_DELAY_THRESHOLD ) {
- // and it has been long enough to try again
- //
- if ( (now - module_status[i].last_exit) >= SUPERVISOR_RESPAWN_RATE_LIMIT) {
- // then update our last global spawn time
- //
- last_global_spawn = now;
- // and launch it...
- //
- launch_module(i);
- route_warning_message("Respawned module %s, PID = %d [rate limited]\n", module_name[i], (int)module_status[i].pid);
- }
- }
- // otherwise, the module has terminated unexpectedly, but it had run for a long time, so we can respawn it immediately
- //
- else {
- // then update our last global spawn time
- //
- last_global_spawn = now;
- // and launch it...
- //
- launch_module(i);
- route_debug_message("Respawned module %s, PID = %d\n", module_name[i], (int)module_status[i].pid);
- }
- }
- }
- void handle_dead_fd(int fd) {
- int i;
- close(fd);
- if (fd == commhub_fd) {
- commhub_fd = -1;
- return;
- }
- for (i = 0; i < NUM_MODULES; i++) {
- if (module_status[i].stdin_fd == fd) {
- module_status[i].stdin_fd = -1;
- return;
- }
- if (module_status[i].stdout_fd == fd) {
- module_status[i].stdout_fd = -1;
- return;
- }
- if (module_status[i].stderr_fd == fd) {
- module_status[i].stderr_fd = -1;
- return;
- }
- }
- }
- message_callback_return handle_pong_message(struct message_record *msg, void *param) {
- int i;
- // printf("Got PONG from PID %d\n", (int)msg->header.sender);
- // Iterate through all of the modules
- //
- for (i = 0; i < NUM_MODULES; i++) {
- // If this module is active, and its PID matches the PID of the sender of this PONG message
- //
- if ( module_status[i].active && (msg->header.sender == module_status[i].pid) ) {
- module_status[i].last_pong = get_usec_time();
- module_status[i].ping_lag = (module_status[i].last_pong - module_status[i].last_ping);
- // printf("PING->PONG latency %lld microseconds.\n", module_status[i].ping_lag);
- }
- }
- return MESSAGE_HANDLED_CONT;
- }
- //----
- // Client_supervisor is now in charge of managing the state_info file,
- // so we need to have gps, stop and driver messages handled explicitely
- // to be able to write to the state_info file.
- // `client_supervisor` should be the sole process writing to the state_info
- // file while any other client should be able to read.
- //
- message_callback_return handle_gps_message(struct message_record *msg, void *param) {
- // guard against version mismatch
- //
- if (msg->header.payload_length != sizeof(gps_status)) {
- return MESSAGE_HANDLED_STOP;
- }
- memcpy(&gps_stat, msg->payload, sizeof(gps_status)); //otherwise, update our structure
- update_state_info_with_gps(&state_info, &gps_stat);
- memcpy(state_info.comment, "client_supervisor gps", strlen("client_supervisor gps")+1);
- save_state_info(&state_info);
- return MESSAGE_HANDLED_CONT;
- }
- static message_callback_return handle_stop_message(struct message_record *msg, void *param) {
- // guard against version mismatch
- //
- if (msg->header.payload_length != sizeof(stop_status)) {
- return MESSAGE_HANDLED_STOP;
- }
- memcpy(&stop_stat, msg->payload, sizeof(stop_status));
- update_state_info_with_stop(&state_info, &stop_stat);
- memcpy(state_info.comment, "client_supervisor stop", strlen("client_supervisor stop")+1);
- save_state_info(&state_info);
- return MESSAGE_HANDLED_CONT;
- }
- static message_callback_return handle_driver_message(struct message_record *msg, void *param) {
- // guard against version mismatch
- //
- if (msg->header.payload_length != sizeof(driver_status)) {
- return MESSAGE_HANDLED_STOP;
- }
- memcpy(&driver_stat, msg->payload, sizeof(driver_stat));
- update_state_info_with_driver(&state_info, &driver_stat);
- memcpy(state_info.comment, "client_supervisor driver", strlen("client_supervisor driver")+1);
- save_state_info(&state_info);
- return MESSAGE_HANDLED_CONT;
- }
- //----
- void clear_ping_attempt() {
- int i;
- for (i = 0; i < NUM_MODULES; i++) {
- if (module_status[i].active) {
- module_status[i].last_ping = module_status[i].last_pong = 0;
- module_status[i].ping_punish = PING_PUNISH_NONE;
- }
- }
- }
- void send_ping_message()
- {
- struct message_record outgoing_msg;
- long long int usec_now;
- char target[MAILBOX_NAME_MAX + 1];
- int i;
- if (commhub_fd < 0) { return; }
- for (i = 0; i < NUM_MODULES; i++) {
- // If this module is active AND included in PING monitoring
- //
- if ( module_status[i].active && module_status[i].ping_requested ) {
- usec_now = get_usec_time();
- // If it is not yet time to re-ping this module
- //
- if ( (usec_now - module_status[i].last_ping) < SUPERVISOR_PING_INTERVAL) {
- // skip it for now
- //
- continue;
- }
- // If we have PING'd this host but are still waiting for a PONG back
- //
- if (module_status[i].last_pong < module_status[i].last_ping) {
- // Don't flood it with pings
- //
- continue;
- }
- // If we've never PING'd this host, set it's PONG time to the same as its PING
- // time so it doesn't get killed for not having responded to its first PING.
- //
- if (module_status[i].last_ping == 0) {
- module_status[i].last_pong = usec_now;
- }
- module_status[i].last_ping = usec_now;
- // Turn our PID into a magic mailbox name that addresses that PID.
- //
- sprintf(target, ">%d", (int)module_status[i].pid);
- // Prepare a message to that magic mailbox with a payload consisting of the mailbox
- // name of the PING notification
- //
- prepare_message(&outgoing_msg, target, MAILBOX_PING, strlen(MAILBOX_PING));
- // And send it!
- //
- send_message(commhub_fd, &outgoing_msg);
- }
- }
- }
- void send_hup_message() {
- struct message_record outgoing_msg;
- if (commhub_fd < 0) {
- return;
- }
- printf("Sending HUP\n");
- prepare_message(&outgoing_msg, MAILBOX_HUP, "", 0);
- send_message(commhub_fd, &outgoing_msg);
- }
- void ping_punish_warn(int n, long long int delta) {
- route_debug_message("Module %s [%d] hasn't responded to a PING in %d seconds...\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
- }
- void ping_punish_term(int n, long long int delta) {
- route_warning_message("Module %s [%d] hasn't responded to a PING in %d seconds... Sending SIGTERM\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
- kill(module_status[n].pid, SIGTERM);
- }
- void ping_punish_kill(int n, long long int delta) {
- route_error_message("Module %s [%d] hasn't responded to a PING in %d seconds... Sending SIGKILL\n", module_name[n], (int)module_status[n].pid, (int)USEC_TO_SEC(delta));
- kill(module_status[n].pid, SIGKILL);
- }
- void enforce_ping_policy()
- {
- int i;
- long long int delta = 0;
- for (i = 0; i < NUM_MODULES; i++) {
- //If this module is both active AND included in the ping monitoring
- //
- if ( module_status[i].active && module_status[i].ping_requested ) {
- // if we haven't sent a PING to this one...
- //
- if (module_status[i].last_ping == 0) {
- //clear any punishment that would otherwise be pending and get the next module
- //
- module_status[i].ping_punish = PING_PUNISH_NONE;
- continue;
- }
- //compute just how late this module is...
- //
- delta = get_usec_time() - module_status[i].last_pong;
- switch(module_status[i].ping_punish) {
- case PING_PUNISH_NONE:
- if (delta > SUPERVISOR_PING_WARN_TIME) {
- ping_punish_warn(i, delta);
- module_status[i].ping_punish++;
- }
- break;
- case PING_PUNISH_WARN:
- if (delta > SUPERVISOR_PING_TERM_TIME) {
- ping_punish_term(i, delta);
- module_status[i].ping_punish++;
- }
- break;
- case PING_PUNISH_TERM:
- if (delta > SUPERVISOR_PING_KILL_TIME) {
- ping_punish_kill(i, delta);
- module_status[i].ping_punish++;
- }
- break;
- default:
- route_error_message("At our wits end with module %s [%d], just plain won't die!\n", module_name[i], (int)module_status[i].pid);
- break;
- }
- }
- }
- }
- void maintain_ipc_hub_connect(char *progname) {
- struct message_record outgoing_msg;
- // if we have no connection to the communication hub
- //
- if (commhub_fd < 0) {
- // try and get one
- //
- commhub_fd = connect_to_message_server(progname);
- // printf("commhub_fd = %d\n", commhub_fd);
- // if it worked
- //
- if (commhub_fd >= 0) {
- //Subscribe to the command mailboxes we act on
- //
- prepare_message(&outgoing_msg, MAILBOX_SUBSCRIBE, MAILBOX_PONG, strlen(MAILBOX_TOKEN_MAG));
- send_message(commhub_fd,&outgoing_msg);
- //Subscribe to the relevant status management mailboxes
- //
- subscribe_to_default_messages(commhub_fd);
- }
- }
- }
- void print_version(FILE *fp) {
- fprintf(fp, "version %s\n", CLIENT_SUPERVISOR_VERSION);
- }
- void usage(FILE *fp, char *progname) {
- fprintf(fp,"%s: %d modules given to be supervised, minimum = 1, maximum = %d\n", progname, NUM_MODULES, SUPERVISOR_MAX_MODULES - 1);
- fprintf(fp,"%s: Usage: %s [module ...] [--[no-]ping] [module ...]\n", progname, progname);
- fprintf(fp,"%s: By default modules will be subjet to PING monitoring through\n", progname);
- fprintf(fp,"%s: the IPC hub, --no-ping will effect all modules following it until a --ping\n", progname);
- fprintf(fp,"\n");
- exit(EX_USAGE);
- }
- void handle_command_line(int argc, char **argv) {
- int i;
- int needs_ping = 1;
- NUM_MODULES = 0;
- // Walk all of the given command line arguments and add each one as a module we have been asked to supervice
- //
- for (i = 1; i < argc; i++) {
- if ( !strcmp(argv[i], "--no-ping") ) {
- needs_ping = 0;
- }
- else if ( !strcmp(argv[i], "--ping") ) {
- needs_ping = 1;
- }
- else if ( !strcmp(argv[i], "-v") ) {
- print_version(stdout);
- exit(0);
- }
- else if ( !strcmp(argv[i], "-h") ) {
- usage(stdout, argv[0]);
- exit(EX_USAGE);
- }
- else {
- // Here we look for the last / in the module name
- //
- char *foo = rindex(argv[i], '/');
- // if we found one, advance past it
- //
- if (foo) { foo++; }
- // otherwise
- //
- else {
- // use our whole argument
- //
- foo = argv[i];
- }
- // make sure it is at the end
- //
- if ( !strcmp(foo, SUPERVISOR_SPECIAL_BILLDB) ) {
- //if it is, this one is our guy!
- //
- error_logging_module = NUM_MODULES;
- }
- module_status[NUM_MODULES].ping_requested = needs_ping;
- module_name[NUM_MODULES++] = argv[i];
- }
- if (NUM_MODULES >= SUPERVISOR_MAX_MODULES) {
- usage(stderr, argv[0]);
- }
- }
- if (NUM_MODULES == 0) {
- usage(stderr, argv[0]);
- }
- }
- int main(int argc, char **argv) {
- int poll_ret;
- int nfds;
- int i;
- struct message_record incoming_msg;
- char linebuffer[LINE_BUFFER_SIZE] = {0};
- int read_ret;
- int modnum;
- struct pollfd fds[1 + (2 * SUPERVISOR_MAX_MODULES)] = {{0}};
- int poll_to_module[1 + (2 * SUPERVISOR_MAX_MODULES)] = {0};
- init_state_info();
- // Parse our command line to figure out what modules we need to supervise
- //
- handle_command_line(argc, argv);
- // Configure our default signal handlers
- //
- configure_signal_handlers(argv[0]);
- // As well as our module-specific zombie-reaping flag setter
- //
- setup_child_handler();
- // This listens for SIGHUP messages and flags us when one has come it.
- //
- setup_hup_handler();
- // Register our default keep-up-with-system status callbacks
- //
- register_system_status_callbacks();
- // Except we want to exempt ourselves from the MAILBOX_EXIT and
- // MAILBOX_PING so as not to start ping storms or kill ourselves...
- //
- register_dispatch_callback(MAILBOX_EXIT, CALLBACK_PREPROCESS, ignore_message, NULL);
- register_dispatch_callback(MAILBOX_PING, CALLBACK_PREPROCESS, ignore_message, NULL);
- // Add our specific handlers
- //
- register_dispatch_callback(MAILBOX_PONG, CALLBACK_USER(1), handle_pong_message, NULL);
- register_dispatch_callback(MAILBOX_GPS_STATUS, CALLBACK_USER(1), handle_gps_message, NULL);
- register_dispatch_callback(MAILBOX_STOP_STATUS, CALLBACK_USER(1), handle_stop_message, NULL);
- register_dispatch_callback(MAILBOX_DRIVER_STATUS, CALLBACK_USER(1), handle_driver_message, NULL);
- // This is the main processing loop which monitors system status and pumps I/O from
- // all of the client modules (debug and error messages) and from the IPC hub when it is up
- // as indicated by poll().
- //
- while( exit_request_status == EXIT_REQUEST_NONE ) {
- RESET_WATCHDOG();
- //-------------- SYSTEM STATE MAINTENANCE CODE HERE
- if ( (time(NULL) - last_system_stat) >= SUPERVISOR_SYSTEM_STAT_INTERVAL) {
- monitor_system_status();
- send_heartbeat();
- last_system_stat = time(NULL);
- }
- // Do any maintenance on our spawn list as needed
- //
- monitor_spawn_list();
- // Try and keep in touch with the IPC hub if possible
- //
- maintain_ipc_hub_connect(argv[0]);
- // If the communications hub is up, we may need to maintain our PING states
- //
- if (commhub_fd >= 0) {
- // Send out any PING messages that are required
- //
- send_ping_message();
- // Deal with any delinquents who don't answer their damn pings
- //
- enforce_ping_policy();
- if (hup_alert) {
- hup_alert = 0;
- send_hup_message();
- }
- }
- // If there is no connection to the IPC hub
- //
- else {
- // clear any PING attempt records we may have active
- //
- clear_ping_attempt();
- }
- // If our SIGCHLD handler has notified us that there is at least 1 zombie to reap
- //
- if (zombie_alert) {
- // Try and reap any zombies that may exist
- //
- reap_zombies();
- }
- //-------------- POLL FILE DESCRIPTOR SET POPULATION HERE
- //
- nfds = 0;
- // If we have a valid communication hub connection, add it to the poll set
- //
- if (commhub_fd >= 0) {
- fds[nfds].fd = commhub_fd;
- fds[nfds].events = POLLIN;
- // but flag it as NOT belonging to a child module
- //
- poll_to_module[nfds] = -1;
- nfds++;
- }
- // Then iterate through all of our modules and see if they need service
- //
- for (i = 0; i < NUM_MODULES; i++) {
- // If this child module is flagged as active
- //
- if (module_status[i].active) {
- // and we have a stdout file descriptor for this child
- //
- if (module_status[i].stdout_fd >= 0) {
- // add it to the poll set
- //
- fds[nfds].fd = module_status[i].stdout_fd;
- fds[nfds].events = POLLIN;
- // and remember which module it goes to
- //
- poll_to_module[nfds] = i;
- nfds++;
- }
- // if we have a stderr file descriptor for this child
- //
- if (module_status[i].stderr_fd >= 0) {
- // add it to the poll set
- //
- fds[nfds].fd = module_status[i].stderr_fd;
- fds[nfds].events = POLLIN;
- // and remember which module it goes to
- //
- poll_to_module[nfds] = i;
- nfds++;
- }
- }
- }
- // If we have any file descriptors to poll
- //
- if (nfds > 0) {
- // poll them
- //
- poll_ret = poll(fds, nfds, POLL_TIMEOUT);
- // if poll returns error
- //
- if (poll_ret < 0) {
- // and it's just EINTR
- //
- if (errno == EINTR) {
- // try the body of the while again
- //
- continue;
- }
- // if it is some other error
- //
- else {
- // scream bloody murder
- //
- route_debug_message("Poll returned %d, errno = %d (%s)\n", poll_ret, errno, strerror(errno));
- sleep(1);
- }
- }
- }
- // If we have zero functional file descriptors
- //
- else {
- // Pretend that we called poll and it timed out
- //
- sleep(1);
- poll_ret = 0;
- }
- //-------------- I/O PUMPING CODE BELOW
- // if there are no file descriptors flagged as needing any action
- //
- if (poll_ret == 0) { continue; }
- for (i = 0; i < nfds; i++) {
- modnum = poll_to_module[i];
- // If we have input and it is from a module...
- //
- if ( (fds[i].revents & POLLIN) && (modnum >= 0) ) {
- read_ret = read(fds[i].fd, linebuffer, sizeof(linebuffer) - 1);
- if (read_ret == 0) {
- // clean up
- // and skip this fd for now
- //
- handle_dead_fd(fds[i].fd);
- continue;
- }
- else if ( (read_ret < 0) && (errno != EINTR) ) {
- // clean up
- // and skip this fd for now
- //
- handle_dead_fd(fds[i].fd);
- continue;
- }
- // Terminate our read...
- //
- linebuffer[read_ret] = '\0';
- if (fds[i].fd == module_status[modnum].stdout_fd) {
- //route_debug_message("%s [%d]: %s", module_name[modnum], (int)module_status[modnum].pid, linebuffer);
- }
- else if (fds[i].fd == module_status[modnum].stderr_fd) {
- route_error_message("%s [%d]: %s", module_name[modnum], (int)module_status[modnum].pid, linebuffer);
- }
- else {
- route_debug_message("File descriptor %d does not belong to module %d (%s)!\n", fds[i].fd, modnum, module_name[modnum]);
- }
- }
- // If we have input and it is from the commhub...
- //
- else if ( (fds[i].revents & POLLIN) && (fds[i].fd == commhub_fd) ) {
- read_ret = get_message(commhub_fd, &incoming_msg);
- if (read_ret < 0) {
- // clean up
- // and skip this fd for now
- //
- handle_dead_fd(fds[i].fd);
- continue;
- }
- else {
- // This passes the received message through the callback list
- //
- process_message(&incoming_msg);
- }
- }
- // If it looks like we have a closed or invalid descriptor
- //
- if (fds[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
- // clean up
- // and skip this fd for now
- //
- handle_dead_fd(fds[i].fd);
- continue;
- }
- }
- }
- route_debug_message("Attempting graceful exit...\n");
- for (i = 0; i < NUM_MODULES; i++) {
- if (module_status[i].active) {
- route_debug_message("Sending SIGTERM to module %s [%d]\n", module_name[i], module_status[i].pid);
- kill(module_status[i].pid, SIGTERM);
- }
- }
- sleep(1);
- return 0;
- }
|