channeld: don't read from gossipfd while we're reconnecting.

That was the cause of the bad gossip order failures: gossipd thought our channel was live, but the other end didn't receive message last time. Now gossipd doesn't use fd to kill us (connectd tells master to do so), we can implement read_peer_msg_nogossip(). Fixes: #1706 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2018-07-24 15:48:59 +09:30 · 2018-07-24 15:48:59 +09:30 · b5fcd54ef0
parent d241bd762c
commit b5fcd54ef0
4 changed files with 23 additions and 12 deletions
--- a/channeld/channel.c
+++ b/channeld/channel.c
@ -1884,8 +1884,13 @@ static void peer_reconnect(struct peer *peer)

 	peer_billboard(false, "Sent reestablish, waiting for theirs");

-	/* Read until they say something interesting */
-	while ((msg = channeld_read_peer_msg(peer)) == NULL)
+	/* Read until they say something interesting (don't forward
+	 * gossip to them yet: we might try sending channel_update
+	 * before we've reestablished channel). */
+	while ((msg = read_peer_msg_nogossip(peer, &peer->cs,
+					     &peer->channel_id,
+					     channeld_send_reply,
+					     peer)) == NULL)
 		clean_tmpctx();

 	if (!fromwire_channel_reestablish(msg, &channel_id,
--- a/common/read_peer_msg.c
+++ b/common/read_peer_msg.c
@ -56,19 +56,18 @@ u8 *read_peer_msg_(const tal_t *ctx,

 	FD_ZERO(&readfds);
 	FD_SET(peer_fd, &readfds);
-	FD_SET(gossip_fd, &readfds);
+	if (gossip_fd >= 0)
+		FD_SET(gossip_fd, &readfds);

 	select(peer_fd > gossip_fd ? peer_fd + 1 : gossip_fd + 1,
 	       &readfds, NULL, NULL, NULL);

-	if (FD_ISSET(gossip_fd, &readfds)) {
-		/* gossipd uses this to kill us, so not a surprise if it
-		   happens. */
+	if (gossip_fd >= 0 && FD_ISSET(gossip_fd, &readfds)) {
 		msg = wire_sync_read(NULL, gossip_fd);
-		if (!msg) {
-			status_debug("Error reading gossip msg");
-			peer_failed_connection_lost();
-		}
+		if (!msg)
+			status_failed(STATUS_FAIL_GOSSIP_IO,
+				      "Error reading gossip msg: %s",
+				      strerror(errno));

 		handle_gossip_msg_(msg, peer_fd, cs, send_reply, arg);
 		return NULL;
--- a/common/read_peer_msg.h
+++ b/common/read_peer_msg.h
@ -26,6 +26,15 @@ struct channel_id;
 					   const u8 *),			\
 		       arg)

+/* Like the above, but don't read from GOSSIP_FD */
+#define read_peer_msg_nogossip(ctx, cs, chanid, send_reply, arg)	\
+	read_peer_msg_((ctx), PEER_FD, -1, (cs),			\
+		       (chanid),					\
+		       typesafe_cb_preargs(bool, void *, (send_reply), (arg), \
+					   struct crypto_state *, int,	\
+					   const u8 *),			\
+		       arg)
+
 /* Helper: sync_crypto_write, with extra args it ignores */
 bool sync_crypto_write_arg(struct crypto_state *cs, int fd, const u8 *TAKES,
 			   void *unused);
--- a/tests/test_lightningd.py
+++ b/tests/test_lightningd.py
@ -3565,8 +3565,6 @@ class LightningDTests(BaseLightningDTests):
        # Just to be sure, second openingd hand over to channeld.
        l2.daemon.wait_for_log('lightning_openingd.*REPLY WIRE_OPENING_FUNDEE_REPLY with 2 fds')

-    # FIXME: bad gossip order fix is wrapped up in gossipd/welcomed: see #1706
-    @flaky
    @unittest.skipIf(not DEVELOPER, "needs DEVELOPER=1")
    def test_reconnect_normal(self):
        # Should reconnect fine even if locked message gets lost.