GuardMgr: revise handling of "all guards are down".

When all guards are down, we would previously mark them all as up,
and retry aggressively.  But that's far too aggressive: if there's
something wrong with our ability to connect to guards, it makes us
hammer the network over and over, ignoring all the guard retry
timeouts in practice.

Instead,
  * We now allow the `pick_guard()` function to fail without
    automatically retrying.
  * We give different errors in the cases when all our guards are
    down, and when all of the guards selected by our active usage
    are down.
  * Our "guards are down" error includes the time at which a guard
    will next be retriable.

This is part of #407.
This commit is contained in:
Nick Mathewson 2022-03-16 14:07:52 -04:00
parent cb103e04cf
commit eed1f06662
3 changed files with 61 additions and 20 deletions

View File

@ -230,6 +230,13 @@ impl Guard {
self.reachable
}
/// Return the next time at which this guard will be retriable.
///
/// (Return None if we think this guard might be reachable right now.)
pub(crate) fn next_retry(&self) -> Option<Instant> {
self.retry_at
}
/// Return true if this guard is listed in the latest NetDir, and hasn't
/// been turned off for some other reason.
pub(crate) fn usable(&self) -> bool {

View File

@ -341,6 +341,13 @@ impl<R: Runtime> GuardMgr<R> {
== 0
}
/// Mark every guard as potentially retriable, regardless of how recently we
/// failed to connect to it.
pub fn mark_all_guards_retriable(&self) {
let mut inner = self.inner.lock().expect("Poisoned lock");
inner.guards.active_guards_mut().mark_all_guards_retriable();
}
/// Update the state of this [`GuardMgr`] based on a new or modified
/// [`NetDir`] object.
///
@ -773,7 +780,8 @@ impl GuardMgrInner {
now: SystemTime,
) -> Result<(sample::ListKind, GuardId), PickGuardError> {
// Try to find a guard.
if let Ok(s) = self.guards.active_guards().pick_guard(usage, &self.params) {
let res1 = self.guards.active_guards().pick_guard(usage, &self.params);
if let Ok(s) = res1 {
return Ok(s);
}
@ -789,16 +797,12 @@ impl GuardMgrInner {
self.guards
.active_guards_mut()
.select_primary_guards(&self.params);
if let Ok(s) = self.guards.active_guards().pick_guard(usage, &self.params) {
return Ok(s);
}
return self.guards.active_guards().pick_guard(usage, &self.params);
}
}
// That didn't work either. Mark everybody as potentially retriable.
info!("All guards seem down. Marking them retriable and trying again.");
self.guards.active_guards_mut().mark_all_guards_retriable();
self.guards.active_guards().pick_guard(usage, &self.params)
// Couldn't extend the sample; return the original error.
res1
}
}

View File

@ -524,6 +524,11 @@ impl GuardSet {
}
}
/// Return the earliest time at which any guard will be retriable.
pub(crate) fn next_retry(&self) -> Option<Instant> {
self.guards.values().filter_map(Guard::next_retry).min()
}
/// Mark every `Unreachable` primary guard as `Unknown`.
pub(crate) fn mark_primary_guards_retriable(&mut self) {
for id in &self.primary {
@ -690,13 +695,16 @@ impl GuardSet {
GuardUsageKind::Data => params.data_parallelism,
};
// count the number of running options, distinct from the total options.
let mut n_running: usize = 0;
let mut options: Vec<_> = self
.preference_order()
.filter(|(_, g)| g.usable() && g.reachable() != Reachable::Unreachable)
.inspect(|_| n_running += 1)
.filter(|(_, g)| {
g.usable()
!g.exploratory_circ_pending()
&& self.active_filter.permits(*g)
&& g.reachable() != Reachable::Unreachable
&& !g.exploratory_circ_pending()
&& g.conforms_to_usage(usage)
})
.take(n_options)
@ -712,7 +720,14 @@ impl GuardSet {
match options.choose(&mut rand::thread_rng()) {
Some((src, g)) => Ok((*src, g.guard_id().clone())),
None => Err(PickGuardError::EveryoneIsDown),
None => {
if n_running == 0 {
let retry_at = self.next_retry();
Err(PickGuardError::AllGuardsDown { retry_at })
} else {
Err(PickGuardError::NoGuardsUsable)
}
}
}
}
}
@ -757,14 +772,29 @@ impl<'a> From<GuardSample<'a>> for GuardSet {
#[derive(Clone, Debug, thiserror::Error)]
#[non_exhaustive]
pub enum PickGuardError {
/// All members of the current sample were down, or waiting for
/// other circuits to finish.
#[error("Everybody is either down or pending")]
EveryoneIsDown,
/// All members of the current sample were down.
#[error("All guards are down")]
AllGuardsDown {
/// The next time at which any guard will be retriable.
retry_at: Option<Instant>,
},
/// We had no members in the current sample.
#[error("The current sample is empty")]
SampleIsEmpty,
/// Some guards were running, but all of them were either blocked on pending
/// circuits at other guards, unusable for the provided purpose, or filtered
/// out.
#[error("No running guards were usable for the selected purpose")]
NoGuardsUsable,
}
impl tor_error::HasKind for PickGuardError {
fn kind(&self) -> tor_error::ErrorKind {
use tor_error::ErrorKind as EK;
use PickGuardError as E;
match self {
E::AllGuardsDown { .. } => EK::TorAccessFailed,
E::NoGuardsUsable => EK::NoPath,
}
}
}
#[cfg(test)]
@ -1121,7 +1151,7 @@ mod test {
}
let e = guards.pick_guard(&usage, &params);
assert!(matches!(e, Err(PickGuardError::EveryoneIsDown)));
assert!(matches!(e, Err(PickGuardError::AllGuardsDown { .. })));
// Now in theory we should re-grow when we extend.
guards.extend_sample_as_needed(st, &params, &netdir);