Implement a better clock skew estimator.

This time, our estimator discards outliers, takes the mean of what's left, and uses the standard deviation to try to figure out how seriously to take our report of skew/not-skew. These estimates are still not actually used.
2022-04-07 16:03:25 -04:00 · 2022-04-07 16:03:25 -04:00 · cf362fac9f
parent eedee51899
commit cf362fac9f
5 changed files with 390 additions and 24 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3462,6 +3462,7 @@ dependencies = [
 "derive_builder",
 "derive_more",
 "educe",
+ "float_eq",
 "futures",
 "humantime 2.1.0",
 "humantime-serde",
--- a/crates/tor-guardmgr/Cargo.toml
+++ b/crates/tor-guardmgr/Cargo.toml
@ -49,3 +49,5 @@ tor-netdoc = { path = "../tor-netdoc", version = "0.2.0" }
 tor-persist = { path = "../tor-persist", version = "0.2.0", features = ["testing"] }
 tor-rtcompat = { path = "../tor-rtcompat", version = "0.2.0", features = ["tokio", "native-tls"] }
 tor-rtmock = { path = "../tor-rtmock", version = "0.2.0" }
+
+float_eq = "0.7"
--- a/crates/tor-guardmgr/src/skew.rs
+++ b/crates/tor-guardmgr/src/skew.rs
@ -6,7 +6,6 @@ use tor_proto::ClockSkew;

 /// A single observation related to reported clock skew.
 #[derive(Debug, Clone)]
-#[allow(dead_code)] //XXXX Nothing reads these yet.
 pub(crate) struct SkewObservation {
    /// The reported clock skew
    pub(crate) skew: ClockSkew,
@ -23,26 +22,62 @@ impl SkewObservation {
 }

 /// An estimate of how skewed our clock is, plus a summary of why we think so.
-//
-// XXXX This is a placeholder for now.
 #[derive(Clone, Debug)]
 pub struct SkewEstimate {
    /// Our best guess for the magnitude of the skew.
    estimate: ClockSkew,
    /// The number of observations leading to this estimate.
    n_observations: usize,
+    /// A description of how confident we are.
+    confidence: Confidence,
 }

+/// Subjective description of how sure we are that our clock is/isn't skewed.
+#[derive(Clone, Debug)]
+enum Confidence {
+    /// We aren't very sure about our estimate.
+    None,
+    /// It seems plausible that our clock is skewed
+    Low,
+    /// We are pretty confident that our clock is skewed.
+    High,
+}
+
+/// How bad does clock skew need to be before we'll tell the user that it's a
+/// problem?
+const SIGNIFICANCE_THRESHOLD: Duration = Duration::from_secs(15 * 60);
+
 impl std::fmt::Display for SkewEstimate {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use humantime::format_duration;
+        /// Format the whole-second part of `d`.
+        ///
+        /// We don't care about fractions here, since skew only mattes if it's
+        /// on the order of many minutes.
+        fn fmt_secs(d: Duration) -> humantime::FormattedDuration {
+            humantime::format_duration(Duration::from_secs(d.as_secs()))
+        }
+
        match self.estimate {
-            ClockSkew::Slow(d) => write!(f, "slow by {}", format_duration(d)),
-            ClockSkew::None => write!(f, "not skewed"),
-            ClockSkew::Fast(d) => write!(f, "fast by {}", format_duration(d)),
+            ClockSkew::Slow(d) => write!(f, "slow by around {}", fmt_secs(d)),
+            ClockSkew::None => write!(
+                f,
+                "not skewed by more than {}",
+                fmt_secs(SIGNIFICANCE_THRESHOLD)
+            ),
+            ClockSkew::Fast(d) => write!(f, "fast by around {}", fmt_secs(d)),
        }?;

-        write!(f, " (based on {} recent observations)", self.n_observations)
+        let confidence = match self.confidence {
+            Confidence::None => "very little confidence",
+            Confidence::Low => "some confidence",
+            Confidence::High => "high confidence",
+        };
+
+        write!(
+            f,
+            " (based on {} recent observations, with {})",
+            self.n_observations, confidence
+        )
    }
 }

@ -61,8 +96,8 @@ impl SkewEstimate {
        // Only consider skew observations reported at least this recently.
        let cutoff = now - Duration::from_secs(3600);

-        // Don't believe observations unless we have at least this many. (This
-        // value is chosen somewhat arbitrarily.)
+        // Don't even look at our observations unless we  have at least this
+        // many. (This value is chosen somewhat arbitrarily.)
        //
        // Note that under normal client operation, we won't connect to this
        // many guards or fallbacks.  That's fine: clock skew is only a problem
@ -71,19 +106,274 @@ impl SkewEstimate {
        // fallbacks.
        let min_observations = 8;

-        let mut skews: Vec<_> = skews
+        let skews: Vec<_> = skews
            .filter_map(|obs| obs.more_recent_than(cutoff).then(|| obs.skew))
            .collect();
-        let n_observations = skews.len();
-        if n_observations < min_observations {
+        if skews.len() < min_observations {
            return None;
        }

-        let (_, median, _) = skews.select_nth_unstable(n_observations / 2);
-        // TODO: Consider the quartiles as well, as a rough estimate of confidence.
+        // Throw away all the members of `skews` that are too eccentric, and
+        // convert the rest to f64s.
+        let skews: Vec<f64> = discard_outliers(skews);
+        let n_observations = skews.len();
+        debug_assert!(n_observations >= 3);
+
+        // Use the mean of the remaining observations to determine our estimate.
+        let (mean, standard_deviation) = mean_and_standard_deviation(&skews[..]);
+        let estimate = ClockSkew::from_secs_f64(mean)
+            .expect("Somehow generated NaN clock skew‽")
+            .if_above(SIGNIFICANCE_THRESHOLD);
+
+        // Use the standard deviation to determine how confident we should be in
+        // our estimate.
+        //
+        // TODO: probably we should be using a real statistical test instead,
+        // but that seems like overkill.
+        let confidence = if standard_deviation < 1.0 {
+            // Avoid divide-by-zero below: if the standard deviation is less
+            // than 1 second then the mean is probably right.
+            Confidence::High
+        } else {
+            let distance = if estimate.is_skewed() {
+                // If we're saying that we are skewed, look at how many standard
+                // deviations we are from zero.
+                estimate.magnitude().as_secs_f64() / standard_deviation
+            } else {
+                // If we're saying that we're not skewed, look at how many
+                // standard deviations zero is from "skewed".
+                SIGNIFICANCE_THRESHOLD.as_secs_f64() / standard_deviation
+            };
+            if distance >= 3.0 {
+                Confidence::High
+            } else if distance >= 2.0 {
+                Confidence::Low
+            } else {
+                Confidence::None
+            }
+        };
+
        Some(SkewEstimate {
-            estimate: *median,
+            estimate: estimate.if_above(SIGNIFICANCE_THRESHOLD),
            n_observations,
+            confidence,
        })
    }
 }
+
+/// Remove all outliers from `values`, and convert the resulting times into
+/// `f64`s.
+///
+/// We guarantee that no more than 1/2 of the input will be discarded.
+///
+/// # Panics
+///
+/// Panics if values is empty.
+fn discard_outliers(mut values: Vec<ClockSkew>) -> Vec<f64> {
+    // Compute the quartiles  of our observations.
+    let (q1, q3) = {
+        let n = values.len();
+        let (low, _median, high) = values.select_nth_unstable(n / 2);
+        let n_low = low.len();
+        let n_high = high.len();
+        debug_assert!(n_low >= 1);
+        debug_assert!(n_high >= 1);
+        let (_, q1, _) = low.select_nth_unstable(n_low / 2);
+        let (_, q3, _) = high.select_nth_unstable(n_high / 2);
+
+        (q1, q3)
+    };
+
+    // Compute the inter-quartile range (IQR) and use this to discard outliers.
+    //
+    // (Define IRQ = Q3-Q1. We'll allow all values that are no more than 1.5 IQR
+    // outside the quartiles.)
+    let iqr = (q1.as_secs_f64() - q3.as_secs_f64()).abs();
+    let permissible_range = (q1.as_secs_f64() - iqr * 1.5)..=(q3.as_secs_f64() + iqr * 1.5);
+    values
+        .into_iter()
+        .filter_map(|skew| Some(skew.as_secs_f64()).filter(|v| permissible_range.contains(v)))
+        .collect()
+}
+
+/// Compute and return the the mean and standard deviation of `values`.
+///
+/// Returns `(Nan,Nan)` if `values` is empty.
+fn mean_and_standard_deviation(values: &[f64]) -> (f64, f64) {
+    let n = values.len() as f64;
+    let mean = values.iter().sum::<f64>() / n;
+    let variance = values
+        .iter()
+        .map(|v| {
+            let diff = v - mean;
+            diff * diff
+        })
+        .sum::<f64>()
+        / n;
+
+    (mean, variance.sqrt())
+}
+
+#[cfg(test)]
+mod test {
+    #![allow(clippy::unwrap_used)]
+    use super::*;
+    use float_eq::assert_float_eq;
+
+    /// Tolerance for float comparison.
+    const TOL: f64 = 0.00001;
+
+    #[test]
+    fn mean_stddev() {
+        // This case is trivial.
+        let a = [17.0];
+        let (m, s) = mean_and_standard_deviation(&a[..]);
+        assert_float_eq!(m, 17.0, abs <= TOL);
+        assert_float_eq!(s, 0.0, abs <= TOL);
+
+        // Computed these by hand using a calculator.
+        let a = [1.0, 2.0, 3.0, 4.0];
+        let (m, s) = mean_and_standard_deviation(&a[..]);
+        assert_float_eq!(m, 2.5, abs <= TOL);
+        assert_float_eq!(s, 1.11803398, abs <= TOL);
+
+        // Generated these using numpy from normal distribution with stddev=1,
+        // mean=0.
+        let a = [
+            1.34528777,
+            0.17855632,
+            -0.08147599,
+            0.14845672,
+            0.6838537,
+            -1.59034826,
+            0.06777352,
+            -2.42469117,
+            -0.12017179,
+            0.47098959,
+        ];
+        let (m, s) = mean_and_standard_deviation(&a[..]);
+        assert_float_eq!(m, -0.132176959, abs <= TOL);
+        assert_float_eq!(s, 1.0398321132, abs <= TOL);
+    }
+
+    #[test]
+    fn outliers() {
+        use ClockSkew::{Fast, Slow};
+        let hour = Duration::from_secs(3600);
+        // median will be 0. quartiles will ± 2 hours.  That makes
+        // the IQR 4 hours, so nothing will be discarded.
+        let a = vec![
+            Slow(hour * 3),
+            Slow(hour * 2),
+            Slow(hour),
+            ClockSkew::None,
+            Fast(hour),
+            Fast(hour * 2),
+            Fast(hour * 3),
+        ];
+        let mut b = discard_outliers(a.clone());
+        b.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        assert_eq!(b.len(), 7);
+        for (ai, bi) in a.iter().zip(b.iter()) {
+            assert_float_eq!(ai.as_secs_f64(), bi, abs <= TOL);
+        }
+
+        // Now try with a case that does have some outliers. This time, the IQR
+        // will be 1 hour, so the first and last times will be discarded as
+        // outliers.
+        let a = vec![
+            Slow(hour * 4),
+            Slow(hour / 2),
+            Slow(hour / 3),
+            ClockSkew::None,
+            Fast(hour / 3),
+            Fast(hour / 2),
+            Fast(hour * 4),
+        ];
+        let mut b = discard_outliers(a.clone());
+        b.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        assert_eq!(b.len(), 5);
+        for (ai, bi) in a[1..=5].iter().zip(b.iter()) {
+            assert_float_eq!(ai.as_secs_f64(), bi, abs <= TOL);
+        }
+    }
+
+    #[test]
+    fn estimate_with_no_data() {
+        // zero inputs -> output is none.
+        let now = Instant::now();
+        let est = SkewEstimate::estimate_skew([].iter(), now);
+        assert!(est.is_none());
+
+        // Same with fewer than min_observations.
+        let year = Duration::from_secs(365 * 60 * 60);
+        let obs = vec![
+            SkewObservation {
+                skew: ClockSkew::Fast(year),
+                when: now
+            };
+            5
+        ];
+        let est = SkewEstimate::estimate_skew(obs.iter(), now);
+        assert!(est.is_none());
+
+        // Same with many observations all of which are obsolete.
+        let obs = vec![
+            SkewObservation {
+                skew: ClockSkew::Fast(year),
+                when: now - year
+            };
+            100
+        ];
+        let est = SkewEstimate::estimate_skew(obs.iter(), now);
+        assert!(est.is_none());
+    }
+
+    /// Construct a vector of SkewObservations from a slice of skew magnitudes
+    /// expressed in minutes.
+    fn from_minutes(mins: &[f64]) -> Vec<SkewObservation> {
+        mins.iter()
+            .map(|m| SkewObservation {
+                skew: ClockSkew::from_secs_f64(m * 60.0).unwrap(),
+                when: Instant::now(),
+            })
+            .collect()
+    }
+
+    #[test]
+    fn estimate_skewed() {
+        // The quartiles here are -22 and -10.  The IQR is therefore 12, so we
+        // won't discard anything.
+        //
+        // The mean is -17.125: That's more than 15 minutes from zero, so we'll say
+        // we're slow.
+        //
+        // The standard deviation is 7.67: that puts the mean between 2 and 3
+        // stddevs from zero, so we'll say we're skewed with "low" confidence.
+        let obs = from_minutes(&[-20.0, -10.0, -20.0, -25.0, 0.0, -18.0, -22.0, -22.0]);
+
+        let est = SkewEstimate::estimate_skew(obs.iter(), Instant::now()).unwrap();
+        assert_eq!(
+            est.to_string(),
+            "slow by around 17m 7s (based on 8 recent observations, with some confidence)"
+        );
+    }
+
+    #[test]
+    fn estimate_not_skewed() {
+        // The quartiles here are -2 and 6: IRQ is 8, so we'll discard all the
+        // huge values, leaving 8.
+        //
+        // Mean of the remaining items is 0.75 and standard deviation is 2.62:
+        // we'll be pretty sure we're not much skewed.
+        let obs = from_minutes(&[
+            -100.0, 100.0, -3.0, -2.0, 0.0, 1.0, 0.5, 6.0, 3.0, 0.5, 99.0,
+        ]);
+
+        let est = SkewEstimate::estimate_skew(obs.iter(), Instant::now()).unwrap();
+        assert_eq!(
+            est.to_string(),
+            "not skewed by more than 15m (based on 8 recent observations, with high confidence)"
+        );
+    }
+}
--- a/crates/tor-proto/src/util/skew.rs
+++ b/crates/tor-proto/src/util/skew.rs
@ -9,7 +9,7 @@ use std::time::{Duration, SystemTime};
 ///
 /// The skews reported here are _minimum_ amounts; the actual skew may
 /// be a little higher, depending on latency.
-#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
 #[allow(clippy::exhaustive_enums)]
 pub enum ClockSkew {
    /// Our own clock is "running slow": the relay's clock is at least this far
@ -22,6 +22,13 @@ pub enum ClockSkew {
    Fast(Duration),
 }

+/// We treat clock skew as "zero" if it less than this long.
+///
+/// (Since the relay only reports its time to the nearest second, we
+/// can't reasonably infer that differences less than this much reflect
+/// accurate differences in our clocks.)
+const MIN: Duration = Duration::from_secs(2);
+
 impl ClockSkew {
    /// Construct a ClockSkew from a set of channel handshake timestamps.
    ///
@ -34,13 +41,6 @@ impl ClockSkew {
        theirs: SystemTime,
        delay: Duration,
    ) -> Self {
-        /// We treat clock skew as "zero" if it less than this long.
-        ///
-        /// (Since the relay only reports its time to the nearest second, we
-        /// can't reasonably infer that differences less than this much reflect
-        /// accurate differences in our clocks.)
-        const MIN: Duration = Duration::from_secs(2);
-
        // The relay may have generated its own timestamp any time between when
        // we sent the handshake, and when we got the reply.  Therefore, at the
        // time we started, it was between these values.
@ -66,6 +66,41 @@ impl ClockSkew {
        }
    }

+    /// Return this clock skew as a signed number of seconds, with slow values
+    /// treated as negative and fast values treated as positive.
+    pub fn as_secs_f64(&self) -> f64 {
+        match self {
+            ClockSkew::Slow(d) => -d.as_secs_f64(),
+            ClockSkew::None => 0.0,
+            ClockSkew::Fast(d) => d.as_secs_f64(),
+        }
+    }
+
+    /// Return a clock skew computed from a signed number of seconds.
+    ///
+    /// Returns None if the value is degenerate.
+    pub fn from_secs_f64(seconds: f64) -> Option<Self> {
+        use std::num::FpCategory;
+        let max_seconds = Duration::MAX.as_secs_f64();
+
+        // I dislike working with floating point, and I dislike the current lack
+        // of Duration::try_from_secs_f64() in stable Rust.  Look what they made
+        // me do!
+        match seconds.classify() {
+            FpCategory::Nan => None,
+            FpCategory::Zero | FpCategory::Subnormal => Some(ClockSkew::None),
+            FpCategory::Normal | FpCategory::Infinite => Some(if seconds <= -max_seconds {
+                ClockSkew::Slow(Duration::MAX)
+            } else if seconds < 0.0 {
+                ClockSkew::Slow(Duration::from_secs_f64(-seconds)).if_above(MIN)
+            } else if seconds < max_seconds {
+                ClockSkew::Fast(Duration::from_secs_f64(seconds)).if_above(MIN)
+            } else {
+                ClockSkew::Fast(Duration::MAX)
+            }),
+        }
+    }
+
    /// Return this value if it is greater than `min`; otherwise return None.
    pub fn if_above(self, min: Duration) -> Self {
        if self.magnitude() > min {
@ -74,6 +109,38 @@ impl ClockSkew {
            ClockSkew::None
        }
    }
+
+    /// Return true if we're estimating any skew.
+    pub fn is_skewed(&self) -> bool {
+        !matches!(self, ClockSkew::None)
+    }
+}
+
+impl Ord for ClockSkew {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        use std::cmp::Ordering::*;
+        use ClockSkew::*;
+        match (self, other) {
+            // This is the reason we need to define this ordering rather than
+            // deriving it: we want clock skews to sort by their signed distance
+            // from the current time.
+            (Slow(a), Slow(b)) => a.cmp(b).reverse(),
+            (Slow(_), _) => Less,
+
+            (None, None) => Equal,
+            (None, Slow(_)) => Greater,
+            (None, Fast(_)) => Less,
+
+            (Fast(a), Fast(b)) => a.cmp(b),
+            (Fast(_), _) => Greater,
+        }
+    }
+}
+
+impl PartialOrd for ClockSkew {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
 }

 #[cfg(test)]
--- a/doc/semver_status.md
+++ b/doc/semver_status.md
@ -31,10 +31,16 @@ BREAKING: Added `ChanProvenance` to `ChanMgr::get_or_launch`.

 MODIFIED: Added a new variant in tor_circmgr::Error.

+### tor-guardmgr
+
+MODIFIED: New functions to get estimated clock skew.
+MODIFIED: New functions to report observed clock skew.
+
 ### tor-proto

 MODIFIED: New accessors in tor_proto::Channel.
 BREAKING: Removed clock skew from Error::HandshakeCertsExpired.
+MODIFIED: New functions on ClockSkew.

 ### tor-rtmock