channeld: don't read from gossipfd while we're reconnecting.

That was the cause of the bad gossip order failures: gossipd thought our
channel was live, but the other end didn't receive message last time.

Now gossipd doesn't use fd to kill us (connectd tells master to do so), we
can implement read_peer_msg_nogossip().

Fixes: #1706
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
Rusty Russell 2018-07-24 15:48:59 +09:30
parent d241bd762c
commit b5fcd54ef0
4 changed files with 23 additions and 12 deletions

View File

@ -1884,8 +1884,13 @@ static void peer_reconnect(struct peer *peer)
peer_billboard(false, "Sent reestablish, waiting for theirs");
/* Read until they say something interesting */
while ((msg = channeld_read_peer_msg(peer)) == NULL)
/* Read until they say something interesting (don't forward
* gossip to them yet: we might try sending channel_update
* before we've reestablished channel). */
while ((msg = read_peer_msg_nogossip(peer, &peer->cs,
&peer->channel_id,
channeld_send_reply,
peer)) == NULL)
clean_tmpctx();
if (!fromwire_channel_reestablish(msg, &channel_id,

View File

@ -56,19 +56,18 @@ u8 *read_peer_msg_(const tal_t *ctx,
FD_ZERO(&readfds);
FD_SET(peer_fd, &readfds);
FD_SET(gossip_fd, &readfds);
if (gossip_fd >= 0)
FD_SET(gossip_fd, &readfds);
select(peer_fd > gossip_fd ? peer_fd + 1 : gossip_fd + 1,
&readfds, NULL, NULL, NULL);
if (FD_ISSET(gossip_fd, &readfds)) {
/* gossipd uses this to kill us, so not a surprise if it
happens. */
if (gossip_fd >= 0 && FD_ISSET(gossip_fd, &readfds)) {
msg = wire_sync_read(NULL, gossip_fd);
if (!msg) {
status_debug("Error reading gossip msg");
peer_failed_connection_lost();
}
if (!msg)
status_failed(STATUS_FAIL_GOSSIP_IO,
"Error reading gossip msg: %s",
strerror(errno));
handle_gossip_msg_(msg, peer_fd, cs, send_reply, arg);
return NULL;

View File

@ -26,6 +26,15 @@ struct channel_id;
const u8 *), \
arg)
/* Like the above, but don't read from GOSSIP_FD */
#define read_peer_msg_nogossip(ctx, cs, chanid, send_reply, arg) \
read_peer_msg_((ctx), PEER_FD, -1, (cs), \
(chanid), \
typesafe_cb_preargs(bool, void *, (send_reply), (arg), \
struct crypto_state *, int, \
const u8 *), \
arg)
/* Helper: sync_crypto_write, with extra args it ignores */
bool sync_crypto_write_arg(struct crypto_state *cs, int fd, const u8 *TAKES,
void *unused);

View File

@ -3565,8 +3565,6 @@ class LightningDTests(BaseLightningDTests):
# Just to be sure, second openingd hand over to channeld.
l2.daemon.wait_for_log('lightning_openingd.*REPLY WIRE_OPENING_FUNDEE_REPLY with 2 fds')
# FIXME: bad gossip order fix is wrapped up in gossipd/welcomed: see #1706
@flaky
@unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
def test_reconnect_normal(self):
# Should reconnect fine even if locked message gets lost.