/*~ Welcome, wonderful reader! * * This is the Core of um, Core Lightning: the main file of the master daemon * `lightningd`. It's mainly cluttered with the miscellany of setup, * and a few startup sanity checks. * * The role of this daemon is to start the subdaemons, shuffle peers * between them, handle the JSON RPC requests, bitcoind, the database * and centralize logging. In theory, it doesn't trust the other * daemons, though we expect `hsmd` (which holds secret keys) to be * responsive. * * Comments beginning with a ~ (like this one!) are part of our shared * adventure through the source, so they're more meta than normal code * comments, and meant to be read in a certain order. */ /*~ Notice how includes are in ASCII order: this is actually enforced by * the build system under `make check-source`. It avoids merge conflicts * and keeps things consistent. It also makes sure you include "config.h" * before anything else. */ #include "config.h" /*~ This is Ian Lance Taylor's libbacktrace. It turns out that it's * horrifically difficult to obtain a decent backtrace in C; the standard * backtrace function is useless in most programs. */ /*~ These headers are from CCAN: http://ccodearchive.net. * * It's another one of Rusty's projects, and we copy and paste it * automatically into the source tree here, so you should never edit * it. There's a Makefile target update-ccan to update it (and add modules * if CCAN_NEW is specified). * * The most used of these are `ccan/tal` and `ccan/take`, which we'll describe * in detail below. */ #include #include #include #include #include #include #include #include #include /*~ This is common code: routines shared by one or more executables * (separate daemons, or the lightning-cli program). 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void destroy_alt_subdaemons(struct lightningd *ld); #if DEVELOPER static void memleak_help_alt_subdaemons(struct htable *memtable, struct lightningd *ld); #endif /* DEVELOPER */ /*~ The core lightning object: it's passed everywhere, and is basically a * global variable. This new_xxx pattern is something we'll see often: * it allocates and initializes a new structure, using *tal*, the hierarchical * allocator. */ static struct lightningd *new_lightningd(const tal_t *ctx) { /*~ tal: each allocation is a child of an existing object (or NULL, * the top-level object). When an object is freed, all the objects * `tallocated` off it are also freed. We use it in place of malloc * and free. For the technically inclined: tal allocations usually * build a tree, and tal_freeing any node in the tree will result in * the entire subtree rooted at that node to be freed. * * It's incredibly useful for grouping object lifetimes, as we'll see. * For example, a `struct lightningd` has a pointer to a `log_book` * which is allocated off the `struct lightningd`, and has its own * internal members allocated off `log_book`: freeing `struct * lightningd` frees them all. * * In this case, freeing `ctx` will free `ld`: */ struct lightningd *ld = tal(ctx, struct lightningd); /*~ Style note: `ctx` is declared `const`, yet we can `tallocate` from * it. Adding/removing children is not considered to change an * object; nor, in fact, is freeing it with tal_free(). This allows * us to use const more liberally: the style rule here is that you * should use 'const' on pointers if you can. */ /*~ Note that we generally EXPLICITLY #if-wrap DEVELOPER code. 
This * is a nod to keeping it minimal and explicit: we need this code for * testing, but its existence means we're not actually testing the * same exact code users will be running. */ #if DEVELOPER ld->dev_debug_subprocess = NULL; ld->dev_no_plugin_checksum = false; ld->dev_disconnect_fd = -1; ld->dev_subdaemon_fail = false; ld->dev_allow_localhost = false; ld->dev_gossip_time = 0; ld->dev_fast_gossip = false; ld->dev_fast_gossip_prune = false; ld->dev_fast_reconnect = false; ld->dev_force_privkey = NULL; ld->dev_force_bip32_seed = NULL; ld->dev_force_channel_secrets = NULL; ld->dev_force_channel_secrets_shaseed = NULL; ld->dev_force_tmp_channel_id = NULL; ld->dev_no_htlc_timeout = false; ld->dev_no_version_checks = false; ld->dev_max_funding_unconfirmed = 2016; ld->dev_ignore_modern_onion = false; ld->dev_disable_commit = -1; ld->dev_no_ping_timer = false; #endif /*~ This is a CCAN list: an embedded double-linked list. It's not * really typesafe, but relies on convention to access the contents. * It's inspired by the closely-related Linux kernel list.h. * * You declare them as a `struct list_head` (or use the LIST_HEAD() * macro which doesn't work on dynamically-allocated objects like `ld` * here). The item which will go into the list must be declared * a `struct list_node` for each list it can be in. * * The most common operations are list_head_init(), list_add(), * list_del() and list_for_each(). * * This method of manually declaring the list hooks avoids dynamic * allocations to put things into a list. */ list_head_init(&ld->subds); /*~ These are hash tables of incoming and outgoing HTLCs (contracts), * defined as `struct htlc_in` and `struct htlc_out` in htlc_end.h. * The hash tables are declared there using the very ugly * HTABLE_DEFINE_TYPE macro. The key is the channel the HTLC is in * and the 64-bit htlc-id which is unique for that channel and * direction. That htlc-id is used in the inter-peer wire protocol, * so it is the logical key. 
* * There aren't usually many HTLCs, so we could have just used a linked * list attached to the channel structure itself, or even left them in * the database rather than making an in-memory version. Obviously * I was in a premature optimization mood when I wrote this: */ ld->htlcs_in = tal(ld, struct htlc_in_map); htlc_in_map_init(ld->htlcs_in); /*~ Note also: we didn't need to use an allocation here! We could * have simply made the `struct htlc_out_map` a member. But we * override the htable allocation routines to use tal(), and they * want a tal parent, so we always make our hash table a tallocated * object. */ ld->htlcs_out = tal(ld, struct htlc_out_map); htlc_out_map_init(ld->htlcs_out); /*~ This is the hash table of peers: converted from a * linked-list as part of the 100k-peers project! */ ld->peers = tal(ld, struct peer_node_id_map); peer_node_id_map_init(ld->peers); /*~ And this was done at the same time, for db lookups at startup */ ld->peers_by_dbid = tal(ld, struct peer_dbid_map); peer_dbid_map_init(ld->peers_by_dbid); /*~ For multi-part payments, we need to keep some incoming payments * in limbo until we get all the parts, or we time them out. */ ld->htlc_sets = tal(ld, struct htlc_set_map); htlc_set_map_init(ld->htlc_sets); /*~ We have a multi-entry log-book infrastructure: we define a 10MB log * book to hold all the entries (and trims as necessary), and multiple * log objects which each can write into it, each with a unique * prefix. */ ld->log_book = new_log_book(ld, 10*1024*1024); /*~ Note the tal context arg (by convention, the first argument to any * allocation function): ld->log will be implicitly freed when ld * is. */ ld->log = new_log(ld, ld->log_book, NULL, "lightningd"); ld->logfiles = NULL; /*~ We explicitly set these to NULL: if they're still NULL after option * parsing, we know they're to be set to the defaults. 
*/ ld->alias = NULL; ld->rgb = NULL; list_head_init(&ld->connects); list_head_init(&ld->waitsendpay_commands); list_head_init(&ld->sendpay_commands); list_head_init(&ld->close_commands); list_head_init(&ld->ping_commands); list_head_init(&ld->disconnect_commands); list_head_init(&ld->waitblockheight_commands); /*~ Tal also explicitly supports arrays: it stores the number of * elements, which can be accessed with tal_count() (or tal_bytelen() * for raw bytecount). It's common for simple arrays to use * tal_resize() (or tal_arr_expand) to expand, which does not work on * NULL. So we start with a zero-length array. */ ld->proposed_wireaddr = tal_arr(ld, struct wireaddr_internal, 0); ld->proposed_listen_announce = tal_arr(ld, enum addr_listen_announce, 0); /*~ The network is not yet ready for DNS names inside node_announcements, * so we disable this by default for now. */ ld->announce_dns = false; ld->remote_addr_v4 = NULL; ld->remote_addr_v6 = NULL; ld->discovered_ip_v4 = NULL; ld->discovered_ip_v6 = NULL; ld->listen = true; ld->autolisten = true; ld->reconnect = true; ld->try_reexec = false; ld->db_upgrade_ok = NULL; /*~ This is from ccan/timer: it is efficient for the case where timers * are deleted before expiry (as is common with timeouts) using an * ingenious bucket system which more precisely sorts timers as they * approach expiry. It's a fascinating implementation you should read * if you have a spare few hours. */ ld->timers = tal(ld, struct timers); timers_init(ld->timers, time_mono()); /*~ This is detailed in chaintopology.c */ ld->topology = new_topology(ld, ld->log); ld->gossip_blockheight = 0; ld->daemon_parent_fd = -1; ld->proxyaddr = NULL; ld->always_use_proxy = false; ld->pure_tor_setup = false; ld->tor_service_password = NULL; ld->websocket_port = 0; /*~ This is initialized later, but the plugin loop examines this, * so set it to NULL explicitly now. 
*/ ld->wallet = NULL; /*~ Behavioral options */ ld->accept_extra_tlv_types = tal_arr(ld, u64, 0); /*~ In the next step we will initialize the plugins. This will * also populate the JSON-RPC with passthrough methods, hence * lightningd needs to have something to put those in. This * is that :-) */ jsonrpc_setup(ld); /*~ We run a number of plugins (subprocesses that we talk JSON-RPC with) * alongside this process. This allows us to have an easy way for users * to add their own tools without having to modify the Core Lightning source * code. Here we initialize the context that will keep track and control * the plugins. */ ld->plugins = plugins_new(ld, ld->log_book, ld); ld->plugins->startup = true; /*~ This is set when a JSON RPC command comes in to shut us down. */ ld->stop_conn = NULL; /*~ This is used to signal that `hsm_secret` is encrypted, and will * be set to `true` if the `--encrypted-hsm` option is passed at startup. */ ld->encrypted_hsm = false; /* This is used to override subdaemons */ strmap_init(&ld->alt_subdaemons); tal_add_destructor(ld, destroy_alt_subdaemons); memleak_add_helper(ld, memleak_help_alt_subdaemons); /*~ We change umask if we daemonize, but not if we don't. Initialize the * initial_umask anyway as we might rely on it later (`plugin start`). */ ld->initial_umask = umask(0); umask(ld->initial_umask); /*~ This is the mode of the created JSON-RPC socket file, in * traditional Unix octal. 0600 means only the user that ran * lightningd can invoke RPC on it. Changing it to 0660 may * be sensible if you run lightningd in its own system user, * and just let specific users (add the group of the * lightningd runner as an ancillary group) access its * RPC. Can be overridden with `--rpc-file-mode`. */ ld->rpc_filemode = 0600; /*~ This is the exit code to use on exit. * Set to NULL meaning we are not interested in exiting yet. */ ld->exit_code = NULL; /*~ We maintain a round-robin list of channels. 
* This round-robin list of channels is used to ensure that * each invoice we generate has a different set of channels. */ ld->rr_counter = 0; /*~ Because fee estimates on testnet and regtest are unreliable, * we allow overriding them with --force-feerates, in which * case this is a pointer to an enum feerate-indexed array of values */ ld->force_feerates = NULL; return ld; } /*~ We list our daemons here so on startup we can test they're the * correct versions and that they exist. */ static const char *subdaemons[] = { "lightning_channeld", "lightning_closingd", "lightning_connectd", "lightning_gossipd", "lightning_hsmd", "lightning_onchaind", "lightning_openingd" }; /* Return true if called with a recognized subdaemon e.g. "hsmd" */ bool is_subdaemon(const char *sdname) { for (size_t i = 0; i < ARRAY_SIZE(subdaemons); i++) /* Skip the "lightning_" prefix in the table */ if (streq(sdname, subdaemons[i] + strlen("lightning_"))) return true; return false; } static void destroy_alt_subdaemons(struct lightningd *ld) { strmap_clear(&ld->alt_subdaemons); } #if DEVELOPER static void memleak_help_alt_subdaemons(struct htable *memtable, struct lightningd *ld) { memleak_scan_strmap(memtable, &ld->alt_subdaemons); } #endif /* DEVELOPER */ const char *subdaemon_path(const tal_t *ctx, const struct lightningd *ld, const char *name) { /* Strip the leading "lightning_" before looking in alt_subdaemons. */ size_t pfxlen = strlen("lightning_"); assert(strlen(name) > pfxlen); const char *short_name = tal_strdup(ctx, name + pfxlen); /* Is there an alternate path for this subdaemon? */ const char *dpath; const char *alt = strmap_get(&ld->alt_subdaemons, short_name); if (alt) { /* path_join will honor absolute paths as well. */ dpath = path_join(ctx, ld->daemon_dir, alt); } else { /* This subdaemon is found in the standard place. 
*/ dpath = path_join(ctx, ld->daemon_dir, name); } return dpath; } /*~ Check we can run them, and check their versions */ void test_subdaemons(const struct lightningd *ld) { size_t i; /*~ CCAN's ARRAY_SIZE() should always be used on defined arrays like * the subdaemons array above. You can calculate the number of * elements it has using `sizeof(subdaemons)/sizeof(subdaemons[0])` * but if `subdaemons` were refactored into a pointer (eg. to make * it a dynamic array) that would erroneously evaluate to `1`. * * ARRAY_SIZE will cause a compiler error if the argument is actually * a pointer, not an array. */ for (i = 0; i < ARRAY_SIZE(subdaemons); i++) { /*~ CCAN's path module uses tal, so wants a context to * allocate from. We have a magic convenience context * `tmpctx` for temporary allocations like this. * * Because all our daemons at their core are of form `while * (!stopped) handle_events();` (an event loop pattern), we * can free `tmpctx` in that top-level loop after each event * is handled. */ int outfd; const char *dpath = subdaemon_path(tmpctx, ld, subdaemons[i]); const char *verstring; /*~ CCAN's pipecmd module is like popen for grownups: it * takes pointers to fill in stdin, stdout and stderr file * descriptors if desired, and the remainder of arguments * are the command and its argument. */ pid_t pid = pipecmd(NULL, &outfd, &outfd, dpath, "--version", NULL); /*~ Our logging system: spam goes in at log_debug level, but * logging is mainly added by developer necessity and removed * by developer/user complaints. The only strong convention * is that log_broken() is used for "should never happen". * * Note, however, that logging takes care to preserve the * global `errno` which is set above. */ log_debug(ld->log, "testing %s", dpath); /*~ ccan/err is a wrapper around BSD's err.h, which defines * the convenience functions err() (error with message * followed by a string based on errno) and errx() (same,x * but no errno string). 
*/ if (pid == -1) err(EXITCODE_SUBDAEMON_FAIL, "Could not run %s", dpath); /*~ CCAN's grab_file module contains a routine to read into a * tallocated buffer until EOF */ verstring = grab_fd(tmpctx, outfd); /*~ Like many CCAN modules, it set errno on failure, which * err (ccan/err, but usually just the BSD ) prints */ if (!verstring) err(1, "Could not get output from %s", dpath); /*~ strstarts is from CCAN/str. */ if (!strstarts(verstring, version()) || verstring[strlen(version())] != '\n') errx(EXITCODE_SUBDAEMON_FAIL, "%s: bad version '%s'", subdaemons[i], verstring); /*~ The child will be reaped by sigchld_rfd_in, so we don't * need to waitpid() here. */ } } /* Check if all subdaemons exist in specified directory. */ static bool has_all_subdaemons(const char *daemon_dir) { size_t i; bool missing_daemon = false; for (i = 0; i < ARRAY_SIZE(subdaemons); ++i) { if (!path_is_file(path_join(tmpctx, daemon_dir, subdaemons[i]))) { missing_daemon = true; break; } } return !missing_daemon; } /* Returns the directory this executable is running from */ static const char *find_my_directory(const tal_t *ctx, const char *argv0) { /* find_my_abspath simply exits on failure, so never returns NULL. */ const char *me = find_my_abspath(NULL, argv0); /*~ The caller just wants the directory we're in. * * Note the magic `take()` macro here: it annotates a pointer as "to * be taken", and the recipient is expected to take ownership of the * pointer. This improves efficiency because the recipient might * choose to use or even keep it rather than make a copy (or it * might just free it). * * Many CCAN and our own routines support this, but if you hand a * `take()` to a routine which *doesn't* expect it, unfortunately you * don't get a compile error (we have runtime detection for this * case, however). */ return path_dirname(ctx, take(me)); } /*~ This returns the PKGLIBEXEC path which is where binaries get installed. 
* Note the `TAKES` annotation which indicates that the `my_path` parameter * can be take(); in which case, this function will handle freeing it. * * TAKES is only a convention unfortunately, and ignored by the compiler. */ static const char *find_my_pkglibexec_path(struct lightningd *ld, const char *my_path TAKES) { const char *pkglibexecdir; /*~`path_join` is declared in ccan/path/path.h as: * * char *path_join(const tal_t *ctx, * const char *base TAKES, const char *a TAKES); * * So, as we promised with 'TAKES' in our own declaration, if the * caller has called `take()` the `my_path` parameter, path_join() * will free it. */ pkglibexecdir = path_join(NULL, my_path, BINTOPKGLIBEXECDIR); /*~ The plugin dir is in ../libexec/c-lightning/plugins, which (unlike * those given on the command line) does not need to exist. */ plugins_set_builtin_plugins_dir(ld->plugins, path_join(tmpctx, pkglibexecdir, "plugins")); /*~ Sometimes take() can be more efficient, since the routine can * manipulate the string in place. This is the case here. */ return path_simplify(ld, take(pkglibexecdir)); } /* Determine the correct daemon dir. */ static const char *find_daemon_dir(struct lightningd *ld, const char *argv0) { const char *my_path = find_my_directory(ld, argv0); /* If we're running in-tree, all the subdaemons are with lightningd. */ if (has_all_subdaemons(my_path)) { /* In this case, look for built-in plugins in ../plugins */ plugins_set_builtin_plugins_dir(ld->plugins, path_join(tmpctx, my_path, "../plugins")); return my_path; } /* Otherwise we assume they're in the installed dir. */ return find_my_pkglibexec_path(ld, take(my_path)); } /*~ We like to free everything on exit, so valgrind doesn't complain (valgrind * is an awesome runtime memory usage detector for C and C++ programs). In * some ways it would be neater not to do this, but it turns out some * transient objects still need cleaning. 
*/ static void free_all_channels(struct lightningd *ld) { struct peer *p; struct peer_node_id_map_iter it; /*~ tal supports *destructors* using `tal_add_destructor()`; the most * common use is for an object to delete itself from a linked list * when it's freed. * * As a result, freeing an object (which frees any tal objects * allocated off it, and any allocated off them, etc) may cause * callbacks; in this case, some objects freed here can cause database * writes, which must be inside a transaction. */ db_begin_transaction(ld->wallet->db); /* Now we free all the HTLCs */ free_htlcs(ld, NULL); /*~ For every peer, we free every channel. On allocation the peer was * given a destructor (`destroy_peer`) which removes itself from the * hashtable. * * Deletion from a hashtable is allowed, but it does mean we could * skip entries in iteration. Hence we repeat until empty! */ again: for (p = peer_node_id_map_first(ld->peers, &it); p; p = peer_node_id_map_next(ld->peers, &it)) { struct channel *c; /*~ A peer can have multiple channels. */ while ((c = list_top(&p->channels, struct channel, list)) != NULL) { /* Removes itself from list as we free it */ tal_free(c); } /* A peer may have a channel in the process of opening. */ if (p->uncommitted_channel) { struct uncommitted_channel *uc = p->uncommitted_channel; /* Setting to NULL stops destroy_uncommitted_channel * from trying to remove peer from db! */ p->uncommitted_channel = NULL; tal_free(uc); } /* Removes itself from htable as we free it */ tal_free(p); } if (peer_node_id_map_first(ld->peers, &it)) goto again; /*~ Commit the transaction. Note that the db is actually * single-threaded, so commits never fail and we don't need * spin-and-retry logic everywhere. */ db_commit_transaction(ld->wallet->db); } static void shutdown_global_subdaemons(struct lightningd *ld) { /* Let everyone shutdown cleanly. 
*/ close(ld->hsm_fd); /*~ The three "global" daemons, which we shutdown explicitly: we * give them 10 seconds to exit gracefully before killing them. */ ld->connectd = subd_shutdown(ld->connectd, 10); ld->gossip = subd_shutdown(ld->gossip, 10); ld->hsm = subd_shutdown(ld->hsm, 10); } /*~ Our wallet logic needs to know what outputs we might be interested in. We * use BIP32 (a.k.a. "HD wallet") to generate keys from a single seed, so we * keep the maximum-ever-used key index in the db, and add them all to the * filter here. */ static void init_txfilter(struct wallet *w, const struct ext_key *bip32_base, struct txfilter *filter) { /*~ This is defined in libwally, so we didn't have to reimplement */ struct ext_key ext; /*~ Note the use of ccan/short_types u64 rather than uint64_t. * Thank me later. */ u64 bip32_max_index; bip32_max_index = db_get_intvar(w->db, "bip32_max_index", 0); /*~ One of the C99 things I unequivocally approve: for-loop scope. */ for (u64 i = 0; i <= bip32_max_index + w->keyscan_gap; i++) { if (bip32_key_from_parent(bip32_base, i, BIP32_FLAG_KEY_PUBLIC, &ext) != WALLY_OK) { abort(); } txfilter_add_derkey(filter, ext.pub_key); } } /*~ The normal advice for daemons is to move into the root directory, so you * don't prevent unmounting whatever filesystem you happen to start in. * * But we define every path relative to our (~/.lightning) data dir, so we * make sure we stay there. The rest of this is taken from ccan/daemonize, * which was based on W. Richard Stevens' advice in Programming in The Unix * Environment. */ static void complete_daemonize(struct lightningd *ld) { int ok_status = 0; /* Don't hold files open. */ close(STDIN_FILENO); close(STDOUT_FILENO); close(STDERR_FILENO); /* Many routines write to stderr; that can cause chaos if used * for something else, so set it here. 
*/ if (open("/dev/null", O_WRONLY) != 0) fatal("Could not open /dev/null: %s", strerror(errno)); if (dup2(0, STDERR_FILENO) != STDERR_FILENO) fatal("Could not dup /dev/null for stderr: %s", strerror(errno)); close(0); /* Session leader so ^C doesn't whack us. */ if (setsid() == (pid_t)-1) fatal("Could not setsid: %s", strerror(errno)); /* Discard our parent's old-fashioned umask prejudices. */ ld->initial_umask = umask(0); /* OK, parent, you can exit(0) now. */ write_all(ld->daemon_parent_fd, &ok_status, sizeof(ok_status)); close(ld->daemon_parent_fd); } /*~ It's pretty standard behaviour (especially for daemons) to create and * file-lock a pidfile. This not only prevents accidentally running multiple * daemons on the same database at once, but lets nosy sysadmins see what pid * the currently-running daemon is supposed to be. */ static void pidfile_create(const struct lightningd *ld) { int pid_fd; char *pid; /* Create PID file: relative to .config dir. */ pid_fd = open(ld->pidfile, O_WRONLY|O_CREAT, 0640); if (pid_fd < 0) err(1, "Failed to open PID file"); /* Lock PID file, so future lockf will fail. */ if (lockf(pid_fd, F_TLOCK, 0) < 0) /* Problem locking file */ err(EXITCODE_PIDFILE_LOCK, "lightningd already running? Error locking PID file"); /*~ As closing the file will remove the lock, we need to keep it open; * the OS will close it implicitly when we exit for any reason. */ /*~ Note that tal_fmt() is what asprintf() dreams of being. */ pid = tal_fmt(tmpctx, "%d\n", getpid()); /*~ CCAN's write_all writes to a file descriptor, looping if necessary * (which, on a file unlike a socket, is never, for historical UNIX * reasons). It also isn't declared with GCC's warn_unused_result * which write() is when FORTIFY_SOURCE is defined, so we're allowed * to ignore the result without jumping through hoops. */ write_all(pid_fd, pid, strlen(pid)); } /*~ ccan/io allows overriding the poll() function that is the very core * of the event loop it runs for us. 
We override it so that we can do * extra sanity checks, and it's also a good point to free the tmpctx. */ static int io_poll_lightningd(struct pollfd *fds, nfds_t nfds, int timeout) { /* These checks and freeing tmpctx are common to all daemons. */ return daemon_poll(fds, nfds, timeout); } /*~ Ever had one of those functions which doesn't quite fit anywhere? Me too. * Implementing a generic notifier framework is overkill in a static codebase * like this, and it's always better to have compile-time calls than runtime, * as it makes the code more explicit. But pasting in direct calls is also an * abstraction violation, so we use this middleman function. */ void notify_new_block(struct lightningd *ld, u32 block_height) { /* Inform our subcomponents individually. */ htlcs_notify_new_block(ld, block_height); channel_notify_new_block(ld, block_height); gossip_notify_new_block(ld, block_height); waitblockheight_notify_new_block(ld, block_height); } static void on_sigint(int _ UNUSED) { static const char *msg = "lightningd: SIGINT caught, exiting.\n"; write_all(STDERR_FILENO, msg, strlen(msg)); _exit(1); } static void on_sigterm(int _ UNUSED) { static const char *msg = "lightningd: SIGTERM caught, exiting.\n"; write_all(STDERR_FILENO, msg, strlen(msg)); _exit(1); } /* Globals are terrible, but we all do it. */ static int sigchld_wfd; static void on_sigchild(int _ UNUSED) { /*~ UNIX signals are async, which is usually terrible. The usual * trick, which we use here, it to write a byte to a pipe, and * then handle it in the main event loop. * * This can fail if we get flooded by signals but that's OK; * we made it non-blocking, and the reader will loop until * there are no more children. But glibc's overzealous use of * __attribute__((warn_unused_result)) means we have to * "catch" the return value. */ if (write(sigchld_wfd, "", 1) != 1) { if (errno != EAGAIN && errno != EWOULDBLOCK) { /* Should not call this in a signal handler, but we're * already messed up! 
*/ fatal("on_sigchild: write errno %s", strerror(errno)); } } } /*~ We only need to handle SIGTERM and SIGINT for the case we are PID 1 of * docker container since Linux makes special this PID and requires that * some handler exist. * * We also want to catch SIGCHLD, so we can report on such children and * avoid zombies. */ static int setup_sig_handlers(void) { struct sigaction sigint, sigterm, sigchild; int fds[2]; memset(&sigint, 0, sizeof(struct sigaction)); memset(&sigterm, 0, sizeof(struct sigaction)); memset(&sigchild, 0, sizeof(struct sigaction)); sigint.sa_handler = on_sigint; sigterm.sa_handler = on_sigterm; sigchild.sa_handler = on_sigchild; sigchild.sa_flags = SA_RESTART; if (1 == getpid()) { sigaction(SIGINT, &sigint, NULL); sigaction(SIGTERM, &sigterm, NULL); } if (pipe(fds) != 0) err(1, "creating sigchild pipe"); sigchld_wfd = fds[1]; if (fcntl(sigchld_wfd, F_SETFL, fcntl(sigchld_wfd, F_GETFL)|O_NONBLOCK) != 0) err(1, "setting sigchild pip nonblock"); sigaction(SIGCHLD, &sigchild, NULL); return fds[0]; } /*~ This removes the SIGCHLD handler, so we don't try to write * to a broken pipe. */ static void remove_sigchild_handler(struct io_conn *sigchld_conn) { struct sigaction sigchild; memset(&sigchild, 0, sizeof(struct sigaction)); sigchild.sa_handler = SIG_DFL; sigaction(SIGCHLD, &sigchild, NULL); io_close(sigchld_conn); } /*~ This is the routine which sets up the sigchild handling. We just * reap them for now so they don't become zombies, but our subd * handling calls waitpid() synchronously, so we can't simply do this * in the signal handler or set SIGCHLD to be ignored, which has the * same effect. * * We can usually ignore these because we keep pipes to our children, * and use the closure of those to indicate termination. */ static struct io_plan *sigchld_rfd_in(struct io_conn *conn, struct lightningd *ld) { /* We don't actually care what we read, so we stuff things here. 
*/ static u8 ignorebuf; static size_t len; pid_t childpid; int wstatus; /* Reap the plugins, since we otherwise ignore them. */ while ((childpid = waitpid(-1, &wstatus, WNOHANG)) != 0) { maybe_subd_child(ld, childpid, wstatus); } return io_read_partial(conn, &ignorebuf, 1, &len, sigchld_rfd_in, ld); } /*~ We actually keep more than one set of features, used in different * contexts. common/features.c knows how each standard feature is * presented, so we have it generate the set for each one at a time, and * combine them. * * This is inefficient, but the primitives are useful for adding single * features later, or adding them when supplied by plugins. */ static struct feature_set *default_features(const tal_t *ctx) { struct feature_set *ret = NULL; static const u32 features[] = { OPTIONAL_FEATURE(OPT_DATA_LOSS_PROTECT), OPTIONAL_FEATURE(OPT_UPFRONT_SHUTDOWN_SCRIPT), OPTIONAL_FEATURE(OPT_GOSSIP_QUERIES), COMPULSORY_FEATURE(OPT_VAR_ONION), COMPULSORY_FEATURE(OPT_PAYMENT_SECRET), OPTIONAL_FEATURE(OPT_BASIC_MPP), OPTIONAL_FEATURE(OPT_GOSSIP_QUERIES_EX), OPTIONAL_FEATURE(OPT_STATIC_REMOTEKEY), OPTIONAL_FEATURE(OPT_SHUTDOWN_ANYSEGWIT), OPTIONAL_FEATURE(OPT_PAYMENT_METADATA), OPTIONAL_FEATURE(OPT_SCID_ALIAS), OPTIONAL_FEATURE(OPT_ZEROCONF), OPTIONAL_FEATURE(OPT_CHANNEL_TYPE), OPTIONAL_FEATURE(OPT_ROUTE_BLINDING), #if EXPERIMENTAL_FEATURES OPTIONAL_FEATURE(OPT_ANCHOR_OUTPUTS), OPTIONAL_FEATURE(OPT_QUIESCE), OPTIONAL_FEATURE(OPT_ONION_MESSAGES), #endif }; for (size_t i = 0; i < ARRAY_SIZE(features); i++) { struct feature_set *f = feature_set_for_feature(NULL, features[i]); if (!ret) ret = tal_steal(ctx, f); else feature_set_or(ret, take(f)); } return ret; } /*~ We need this function style to hand to ecdh_hsmd_setup, but it's just a thin * wrapper around fatal() */ static void hsm_ecdh_failed(enum status_failreason fail, const char *fmt, ...) { fatal("hsm failure: %s", fmt); } /*~ This signals to the mainloop that some part wants to cleanly exit now. 
*/
void lightningd_exit(struct lightningd *ld, int exit_code)
{
	/* The code is allocated off `ld`; a non-NULL ld->exit_code is how
	 * main() recognizes that we left the loop this way. */
	ld->exit_code = tal(ld, int);
	*ld->exit_code = exit_code;
	log_debug(ld->log, "io_break: %s", __func__);
	/* main() asserts that its io loop returns `ld`. */
	io_break(ld);
}

/* Entry point of lightningd: performs the (order-sensitive) startup
 * sequence, runs the main event loop until something calls io_break(ld),
 * then tears everything down.  Returns the code handed to
 * lightningd_exit(), or 0 if the loop was not left that way; if a re-exec
 * was requested it execs orig_argv instead of returning. */
int main(int argc, char *argv[])
{
	struct lightningd *ld;
	u32 min_blockheight, max_blockheight;
	int connectd_gossipd_fd;
	int stop_fd;
	struct timers *timers;
	const char *stop_response;
	struct htlc_in_map *unconnected_htlcs_in;
	int sigchld_rfd;
	/* Stays NULL if we `goto stop` before the SIGCHLD connection is set
	 * up; NOTE(review): presumably remove_sigchild_handler() copes with
	 * NULL -- confirm. */
	struct io_conn *sigchld_conn = NULL;
	/* Only changed if lightningd_exit() supplied a code. */
	int exit_code = 0;
	char **orig_argv;
	bool try_reexec;

	/*~ We fork out new processes very very often; every channel gets its
	 * own process, for example, and we have `hsmd` and `gossipd` and
	 * the plugins as well.
	 * Now, we also keep around several file descriptors (`fd`s), including
	 * file descriptors to communicate with `hsmd` which is a privileged
	 * process with access to private keys and is therefore very sensitive.
	 * Thus, we need to close all file descriptors other than what the
	 * forked-out new process should have ASAP.
	 *
	 * We do this by using the `ccan/closefrom` module, which implements
	 * an emulation for the `closefrom` syscall on BSD and Solaris.
	 * This emulation tries to use the fastest facility available on the
	 * system (`close_range` syscall on Linux 5.9+, snooping through
	 * `/proc/$PID/fd` on many OSs (but requires procfs to be mounted),
	 * the actual `closefrom` call if available, etc.).
	 * As a fallback if none of those are available on the system, however,
	 * it just iterates over the theoretical range of possible file
	 * descriptors.
	 *
	 * On some systems, that theoretical range can be very high, up to
	 * `INT_MAX` in the worst case.
	 * If the `closefrom` emulation has to fall back to this loop, it
	 * can be very slow; fortunately, the emulation will also inform
	 * us of that via the `closefrom_may_be_slow` function, and also has
	 * `closefrom_limit` to limit the number of allowed file descriptors
	 * *IF AND ONLY IF* `closefrom_may_be_slow()` is true.
	 *
	 * On systems with a fast `closefrom` then `closefrom_limit` does
	 * nothing.
	 *
	 * Previously we always imposed a limit of 1024 file descriptors
	 * (because we used to always iterate up to limit instead of using
	 * some OS facility, because those were non-portable and needed
	 * code for each OS), until @whitslack went and made >1000 channels
	 * and hit the 1024 limit.
	 */
	closefrom_limit(4096);

	/*~ What happens in strange locales should stay there. */
	setup_locale();

	/*~ This sets up SIGCHLD to make sigchld_rfd readable. */
	sigchld_rfd = setup_sig_handlers();

	/*~ This checks that the system-installed libraries (usually
	 * dynamically linked) actually are compatible with the ones we
	 * compiled with.
	 *
	 * The header itself is auto-generated every time the version of the
	 * installed libraries changes, as we had an sqlite3 version update
	 * which broke people, and "make" didn't think there was any work to
	 * do, so rebuilding didn't fix it. */
	check_linked_library_versions();

	/*~ Every daemon calls this in some form: the hooks are for dumping
	 * backtraces when we crash (if supported on this platform). */
	daemon_setup(argv[0], log_backtrace_print, log_backtrace_exit);

	/*~ There's always a battle between what a constructor like this
	 * should do, and what should be added later by the caller.  In
	 * general, because we use valgrind heavily for testing, we prefer not
	 * to initialize unused fields which we expect the caller to set:
	 * valgrind will warn us if we make decisions based on uninitialized
	 * variables. */
	ld = new_lightningd(NULL);
	ld->state = LD_STATE_INITIALIZING;

	/*~ We store a copy of our arguments before parsing mangles them, so
	 * we can re-exec if versions of subdaemons change.  Note the use of
	 * notleak() since our leak-detector can't find orig_argv on the
	 * stack. */
	orig_argv = notleak(tal_arr(ld, char *, argc + 1));
	/* Slot 0 is filled in separately just below, hence starting at 1. */
	for (size_t i = 1; i < argc; i++)
		orig_argv[i] = tal_strdup(orig_argv, argv[i]);

	/*~ Turn argv[0] into an absolute path (if not already) */
	orig_argv[0] = path_join(orig_argv, take(path_cwd(NULL)), argv[0]);
	/* execv() at the very end needs a NULL-terminated array. */
	orig_argv[argc] = NULL;

	/* Figure out where our daemons are first. */
	ld->daemon_dir = find_daemon_dir(ld, argv[0]);
	if (!ld->daemon_dir)
		errx(EXITCODE_SUBDAEMON_FAIL, "Could not find daemons");

	/* Set up the feature bits for what we support */
	ld->our_features = default_features(ld);

	/*~ Handle early options; this moves us into --lightning-dir.
	 * Plugins may add new options, which is why we are splitting
	 * between early args (including --plugin registration) and
	 * non-early opts.  This also forks if they say --daemon. */
	handle_early_opts(ld, argc, argv);

	/*~ Set the default portnum according to the used network
	 * similarly to what Bitcoin Core does to ports by default. */
	ld->portnum = chainparams_get_ln_port(chainparams);

	/*~ Initialize all the plugins we just registered, so they can
	 * do their thing and tell us about themselves (including
	 * options registration). */
	plugins_init(ld->plugins);

	/*~ If the plugins are misconfigured we don't want to proceed. A
	 * misconfiguration could for example be a plugin marked as important
	 * not working correctly or a plugin squatting something an important
	 * plugin needs to register, such as a method or CLI option. If we are
	 * going to shut down immediately again, we shouldn't spend too much
	 * effort in starting up.
	 */
	if (ld->exit_code)
		fatal("Could not initialize the plugins, see above for details.");

	/*~ Handle options and config. */
	handle_opts(ld, argc, argv);

	/*~ Now create the PID file: this errors out if there's already a
	 * daemon running, so we call before doing almost anything else. */
	pidfile_create(ld);

	/*~ Make sure we can reach the subdaemons, and versions match.
	 * This can be turned off in DEVELOPER builds with --dev-skip-version-checks,
	 * but the `dev_no_version_checks` field of `ld` doesn't even exist
	 * if DEVELOPER isn't defined, so we use IFDEV(devoption,non-devoption):
	 */
	if (IFDEV(!ld->dev_no_version_checks, 1))
		test_subdaemons(ld);

	/*~ Set up the HSM daemon, which knows our node secret key, so tells
	 * us who we are.
	 *
	 * HSM stands for Hardware Security Module, which is the industry
	 * standard of key storage; ours is in software for now, so the name
	 * doesn't really make sense, but we can't call it the Badly-named
	 * Daemon Software Module. */
	ld->bip32_base = hsm_init(ld);

	/*~ Our "wallet" code really wraps the db, which is more than a simple
	 * bitcoin wallet (though it's that too).  It also stores channel
	 * states, invoices, payments, blocks and bitcoin transactions. */
	ld->wallet = wallet_new(ld, ld->timers);

	/*~ We keep a filter of scriptpubkeys we're interested in. */
	ld->owned_txfilter = txfilter_new(ld);

	/*~ This is the ccan/io central poll override from above. */
	io_poll_override(io_poll_lightningd);

	/*~ If hsm_secret is encrypted, we don't need its encryption key
	 * anymore. Note that sodium_munlock() also zeroes the memory.*/
	if (ld->config.keypass)
		discard_key(take(ld->config.keypass));

	/*~ Our default color and alias are derived from our node id, so we
	 * can only set those now (if not set by config options). */
	setup_color_and_alias(ld);

	/*~ Set up connect daemon: this manages receiving and making
	 * TCP connections.  It needs to talk to the gossip daemon
	 * which knows (via node_announcement messages) the public
	 * addresses of nodes, so connectd_init hands it one end of a
	 * socket pair, and gives us the other */
	connectd_gossipd_fd = connectd_init(ld);

	/*~ We do every database operation within a transaction; usually this
	 * is covered by the infrastructure (eg. opening a transaction before
	 * handling a message or expiring a timer), but for startup we do this
	 * explicitly. */
	db_begin_transaction(ld->wallet->db);

	/*~ Our default names, eg. for the database file, are not dependent on
	 * the network.  Instead, the db knows what chain it belongs to, and we
	 * simply barf here if it's wrong.
	 *
	 * We also check that our node_id is what we expect: otherwise a change
	 * in hsm_secret will have strange consequences! */
	if (!wallet_sanity_check(ld->wallet))
		errx(EXITCODE_WALLET_DB_MISMATCH, "Wallet sanity check failed.");

	/*~ Initialize the transaction filter with our pubkeys. */
	init_txfilter(ld->wallet, ld->bip32_base, ld->owned_txfilter);

	/*~ Get the blockheight we are currently at, UINT32_MAX is used to signal
	 * an uninitialized wallet and that we should start off of bitcoind's
	 * current height */
	wallet_blocks_heights(ld->wallet, UINT32_MAX,
			      &min_blockheight, &max_blockheight);

	/*~ If we were asked to rescan from an absolute height (--rescan < 0)
	 * then just go there. Otherwise compute the diff to our current height,
	 * lowerbounded by 0. */
	if (ld->config.rescan < 0)
		max_blockheight = -ld->config.rescan;
	else if (max_blockheight < (u32)ld->config.rescan)
		max_blockheight = 0;
	/* Leave the UINT32_MAX "uninitialized wallet" marker untouched. */
	else if (max_blockheight != UINT32_MAX)
		max_blockheight -= ld->config.rescan;

	/*~ That's all of the wallet db operations for now. */
	db_commit_transaction(ld->wallet->db);

	/*~ Initialize block topology.  This does its own io_loop to
	 * talk to bitcoind, so does its own db transactions. */
	setup_topology(ld->topology,
		       min_blockheight, max_blockheight);

	db_begin_transaction(ld->wallet->db);

	/*~ Pull peers, channels and HTLCs from db. Needs to happen after the
	 * topology is initialized since some decisions rely on being able to
	 * know the blockheight. */
	unconnected_htlcs_in = notleak(load_channels_from_wallet(ld));
	db_commit_transaction(ld->wallet->db);

	/*~ The gossip daemon looks after the routing gossip;
	 * channel_announcement, channel_update, node_announcement and gossip
	 * queries.   It also hands us the latest channel_updates for our
	 * channels. */
	gossip_init(ld, connectd_gossipd_fd);

	/*~ Create RPC socket: now lightning-cli can send us JSON RPC commands
	 * over a UNIX domain socket specified by `ld->rpc_filename`. */
	jsonrpc_listen(ld->jsonrpc, ld);

	/*~ Now that the rpc path exists, we can start the plugins and they
	 * can start talking to us. */
	if (!plugins_config(ld->plugins)) {
		/* Valgrind can complain about this leak! */
		tal_free(unconnected_htlcs_in);
		/* Skip the rest of startup and the main loop entirely. */
		goto stop;
	}

	/*~ Process any HTLCs we were in the middle of when we exited, now
	 * that plugins (who might want to know via htlc_accepted hook) are
	 * active.  These will immediately fail, since no peers are connected,
	 * however partial payments may still be absorbed into htlc_set. */
	db_begin_transaction(ld->wallet->db);
	htlcs_resubmit(ld, unconnected_htlcs_in);
	db_commit_transaction(ld->wallet->db);

	/*~ Activate connect daemon.  Needs to be after the initialization of
	 * chaintopology, otherwise peers may connect and ask for
	 * uninitialized data. */
	connectd_activate(ld);

	/*~ "onchaind" is a dumb daemon which tries to get our funds back: it
	 * doesn't handle reorganizations, but it's idempotent, so we can
	 * simply just restart it if the chain moves.  Similarly, we replay it
	 * chain events from the database on restart, beginning with the
	 * "funding transaction spent" event which creates it. */
	onchaind_replay_channels(ld);

	/*~ Now handle sigchld, so we can clean up appropriately. */
	sigchld_conn = notleak(io_new_conn(ld, sigchld_rfd,
					   sigchld_rfd_in, ld));

	/*~ Mark ourselves live.
	 *
	 * Note the use of type_to_string() here: it's a typesafe formatter,
	 * often handed 'tmpctx' like here to allocate a throwaway string for
	 * formatting.  json_escape() avoids printing weird characters in our
	 * log.  And tal_hex() is a helper from utils which returns a hex string;
	 * it's assumed that the argument was allocated with tal or tal_arr
	 * so it can use tal_bytelen() to get the length. */
	log_info(ld->log, "--------------------------------------------------");
	log_info(ld->log, "Server started with public key %s, alias %s (color #%s) and lightningd %s",
		 type_to_string(tmpctx, struct node_id, &ld->id),
		 json_escape(tmpctx, (const char *)ld->alias)->s,
		 tal_hex(tmpctx, ld->rgb), version());
	ld->state = LD_STATE_RUNNING;

	/*~ If `closefrom_may_be_slow`, we limit ourselves to 4096 file
	 * descriptors; tell the user about it as that limits the number
	 * of channels they can have.
	 * We do not really expect most users to ever reach that many,
	 * but: https://github.com/ElementsProject/lightning/issues/4868
	 */
	if (closefrom_may_be_slow())
		log_info(ld->log,
			 "We have self-limited number of open file "
			 "descriptors to 4096, but that will result in a "
			 "'Too many open files' error if you ever reach "
			 ">4000 channels. Please upgrade your OS kernel "
			 "(Linux 5.9+, FreeBSD 8.0+), or mount proc or "
			 "/dev/fd (if running in chroot) if you are "
			 "approaching that many channels.");

	/*~ If we have channels closing, make sure we re-xmit the last
	 * transaction, in case bitcoind lost it. */
	db_begin_transaction(ld->wallet->db);
	resend_closing_transactions(ld);
	db_commit_transaction(ld->wallet->db);

	/*~ This is where we ask connectd to reconnect to any peers who have
	 * live channels with us, and makes sure we're watching the funding
	 * tx. */
	setup_peers(ld);

	/*~ Now that all the notifications for transactions are in place, we
	 * can start the poll loop which queries bitcoind for new blocks. */
	begin_topology(ld->topology);

	/*~ To handle --daemon, we fork the daemon early (otherwise we hit
	 * issues with our pid changing), but keep the parent around until
	 * we've completed most initialization: that way we'll exit with an
	 * error rather than silently exiting 0, then realizing we can't start
	 * and forcing the confused user to read the logs.
	 *
	 * But we're all initialized, so detach and have parent exit now. */
	if (ld->daemon_parent_fd != -1)
		complete_daemonize(ld);

	/*~ Setting this (global) activates the crash log: we don't usually need
	 * a backtrace if we fail during startup. */
	crashlog = ld->log;

	/*~ This sets up the ecdh() function in ecdh_hsmd to talk to hsmd */
	ecdh_hsmd_setup(ld->hsm_fd, hsm_ecdh_failed);

	/*~ The root of every backtrace (almost).  This is our main event
	 * loop.  We don't even call it if they've already called `stop` */
	if (!ld->stop_conn) {
		void *io_loop_ret = io_loop_with_timers(ld);
		/*~ io_loop_with_timers will only exit if we call io_break.
		 * At this point in code, we should use io_break(ld) to
		 * shut down.
		 */
		assert(io_loop_ret == ld);
		log_debug(ld->log, "io_loop_with_timers: %s", __func__);
	}

stop:
	/* Stop *new* JSON RPC requests. */
	jsonrpc_stop_listening(ld->jsonrpc);

	/* Stop new connectd requests */
	connectd_start_shutdown(ld->connectd);

	/* Give permission for things to get destroyed without getting upset. */
	ld->state = LD_STATE_SHUTDOWN;

	/* Defaults for when we were not stopped via a `stop` connection. */
	stop_fd = -1;
	stop_response = NULL;

	/* Were we exited via `lightningd_exit`? */
	if (ld->exit_code) {
		exit_code = *ld->exit_code;
	} else if (ld->stop_conn) {
		/* Keep this fd around, to write final response at the end. */
		stop_fd = io_conn_fd(ld->stop_conn);
		io_close_taken_fd(ld->stop_conn);
		/* Reparent onto NULL so it outlives tal_free(ld) below. */
		stop_response = tal_steal(NULL, ld->stop_response);
	}

	/* Stop topology callbacks. */
	stop_topology(ld->topology);

	/* We're not going to collect our children. */
	remove_sigchild_handler(sigchld_conn);

	/* Get rid of per-channel subdaemons. */
	subd_shutdown_nonglobals(ld);

	/* Tell plugins we're shutting down, use force if necessary. */
	shutdown_plugins(ld);

	/* Now kill any remaining connections */
	jsonrpc_stop_all(ld);

	/* Get rid of major subdaemons. */
	shutdown_global_subdaemons(ld);

	/* Clean up internal peer/channel/htlc structures. */
	free_all_channels(ld);

	/* Now close database */
	ld->wallet->db = tal_free(ld->wallet->db);

	remove(ld->pidfile);

	/* FIXME: pay can have children off tmpctx which unlink from
	 * ld->payments, so clean that up. */
	clean_tmpctx();

	/* Gather these before we free ld! */
	try_reexec = ld->try_reexec;
	/* orig_argv was allocated off ld, so detach it if we still need it
	 * for execv() after ld is gone. */
	if (try_reexec)
		tal_steal(NULL, orig_argv);

	/* Free this last: other things may clean up timers. */
	timers = tal_steal(NULL, ld->timers);
	tal_free(ld);

	timers_cleanup(timers);
	tal_free(timers);
	opt_free_table();

	daemon_shutdown();

	/* Finally, send response to shutdown command if appropriate. */
	if (stop_fd >= 0) {
		write_all(stop_fd, stop_response, strlen(stop_response));
		close(stop_fd);
		tal_free(stop_response);
	}

	/* Were we supposed to restart ourselves? */
	if (try_reexec) {
		/* Give a reasonable chance for the install to finish. */
		sleep(5);

		/* Close all filedescriptors except stdin/stdout/stderr */
		closefrom(STDERR_FILENO + 1);
		execv(orig_argv[0], orig_argv);
		/* Only reached if execv() itself failed. */
		err(1, "Failed to re-exec ourselves after version change");
	}

	/*~ Farewell.  Next stop: hsmd/hsmd.c. */
	return exit_code;
}