GuardMgr: revise handling of "all guards are down".
When all guards are down, we would previously mark them all as up, and retry aggressively. But that's far too aggressive: if there's something wrong with our ability to connect to guards, it makes us hammer the network over and over, ignoring all the guard retry timeouts in practice. Instead, * We now allow the `pick_guard()` function to fail without automatically retrying. * We give different errors in the cases when all our guards are down, and when all of the guards selected by our active usage are down. * Our "guards are down" error includes the time at which a guard will next be retriable. This is part of #407.
This commit is contained in:
parent
cb103e04cf
commit
eed1f06662
|
@ -230,6 +230,13 @@ impl Guard {
|
|||
self.reachable
|
||||
}
|
||||
|
||||
/// Return the next time at which this guard will be retriable.
|
||||
///
|
||||
/// (Return None if we think this guard might be reachable right now.)
|
||||
pub(crate) fn next_retry(&self) -> Option<Instant> {
|
||||
self.retry_at
|
||||
}
|
||||
|
||||
/// Return true if this guard is listed in the latest NetDir, and hasn't
|
||||
/// been turned off for some other reason.
|
||||
pub(crate) fn usable(&self) -> bool {
|
||||
|
|
|
@ -341,6 +341,13 @@ impl<R: Runtime> GuardMgr<R> {
|
|||
== 0
|
||||
}
|
||||
|
||||
/// Mark every guard as potentially retriable, regardless of how recently we
|
||||
/// failed to connect to it.
|
||||
pub fn mark_all_guards_retriable(&self) {
|
||||
let mut inner = self.inner.lock().expect("Poisoned lock");
|
||||
inner.guards.active_guards_mut().mark_all_guards_retriable();
|
||||
}
|
||||
|
||||
/// Update the state of this [`GuardMgr`] based on a new or modified
|
||||
/// [`NetDir`] object.
|
||||
///
|
||||
|
@ -773,7 +780,8 @@ impl GuardMgrInner {
|
|||
now: SystemTime,
|
||||
) -> Result<(sample::ListKind, GuardId), PickGuardError> {
|
||||
// Try to find a guard.
|
||||
if let Ok(s) = self.guards.active_guards().pick_guard(usage, &self.params) {
|
||||
let res1 = self.guards.active_guards().pick_guard(usage, &self.params);
|
||||
if let Ok(s) = res1 {
|
||||
return Ok(s);
|
||||
}
|
||||
|
||||
|
@ -789,16 +797,12 @@ impl GuardMgrInner {
|
|||
self.guards
|
||||
.active_guards_mut()
|
||||
.select_primary_guards(&self.params);
|
||||
if let Ok(s) = self.guards.active_guards().pick_guard(usage, &self.params) {
|
||||
return Ok(s);
|
||||
}
|
||||
return self.guards.active_guards().pick_guard(usage, &self.params);
|
||||
}
|
||||
}
|
||||
|
||||
// That didn't work either. Mark everybody as potentially retriable.
|
||||
info!("All guards seem down. Marking them retriable and trying again.");
|
||||
self.guards.active_guards_mut().mark_all_guards_retriable();
|
||||
self.guards.active_guards().pick_guard(usage, &self.params)
|
||||
// Couldn't extend the sample; return the original error.
|
||||
res1
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -524,6 +524,11 @@ impl GuardSet {
|
|||
}
|
||||
}
|
||||
|
||||
/// Return the earliest time at which any guard will be retriable.
|
||||
pub(crate) fn next_retry(&self) -> Option<Instant> {
|
||||
self.guards.values().filter_map(Guard::next_retry).min()
|
||||
}
|
||||
|
||||
/// Mark every `Unreachable` primary guard as `Unknown`.
|
||||
pub(crate) fn mark_primary_guards_retriable(&mut self) {
|
||||
for id in &self.primary {
|
||||
|
@ -690,13 +695,16 @@ impl GuardSet {
|
|||
GuardUsageKind::Data => params.data_parallelism,
|
||||
};
|
||||
|
||||
// count the number of running options, distinct from the total options.
|
||||
let mut n_running: usize = 0;
|
||||
|
||||
let mut options: Vec<_> = self
|
||||
.preference_order()
|
||||
.filter(|(_, g)| g.usable() && g.reachable() != Reachable::Unreachable)
|
||||
.inspect(|_| n_running += 1)
|
||||
.filter(|(_, g)| {
|
||||
g.usable()
|
||||
!g.exploratory_circ_pending()
|
||||
&& self.active_filter.permits(*g)
|
||||
&& g.reachable() != Reachable::Unreachable
|
||||
&& !g.exploratory_circ_pending()
|
||||
&& g.conforms_to_usage(usage)
|
||||
})
|
||||
.take(n_options)
|
||||
|
@ -712,7 +720,14 @@ impl GuardSet {
|
|||
|
||||
match options.choose(&mut rand::thread_rng()) {
|
||||
Some((src, g)) => Ok((*src, g.guard_id().clone())),
|
||||
None => Err(PickGuardError::EveryoneIsDown),
|
||||
None => {
|
||||
if n_running == 0 {
|
||||
let retry_at = self.next_retry();
|
||||
Err(PickGuardError::AllGuardsDown { retry_at })
|
||||
} else {
|
||||
Err(PickGuardError::NoGuardsUsable)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -757,14 +772,29 @@ impl<'a> From<GuardSample<'a>> for GuardSet {
|
|||
#[derive(Clone, Debug, thiserror::Error)]
|
||||
#[non_exhaustive]
|
||||
pub enum PickGuardError {
|
||||
/// All members of the current sample were down, or waiting for
|
||||
/// other circuits to finish.
|
||||
#[error("Everybody is either down or pending")]
|
||||
EveryoneIsDown,
|
||||
/// All members of the current sample were down.
|
||||
#[error("All guards are down")]
|
||||
AllGuardsDown {
|
||||
/// The next time at which any guard will be retriable.
|
||||
retry_at: Option<Instant>,
|
||||
},
|
||||
|
||||
/// We had no members in the current sample.
|
||||
#[error("The current sample is empty")]
|
||||
SampleIsEmpty,
|
||||
/// Some guards were running, but all of them were either blocked on pending
|
||||
/// circuits at other guards, unusable for the provided purpose, or filtered
|
||||
/// out.
|
||||
#[error("No running guards were usable for the selected purpose")]
|
||||
NoGuardsUsable,
|
||||
}
|
||||
|
||||
impl tor_error::HasKind for PickGuardError {
|
||||
fn kind(&self) -> tor_error::ErrorKind {
|
||||
use tor_error::ErrorKind as EK;
|
||||
use PickGuardError as E;
|
||||
match self {
|
||||
E::AllGuardsDown { .. } => EK::TorAccessFailed,
|
||||
E::NoGuardsUsable => EK::NoPath,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -1121,7 +1151,7 @@ mod test {
|
|||
}
|
||||
|
||||
let e = guards.pick_guard(&usage, ¶ms);
|
||||
assert!(matches!(e, Err(PickGuardError::EveryoneIsDown)));
|
||||
assert!(matches!(e, Err(PickGuardError::AllGuardsDown { .. })));
|
||||
|
||||
// Now in theory we should re-grow when we extend.
|
||||
guards.extend_sample_as_needed(st, ¶ms, &netdir);
|
||||
|
|
Loading…
Reference in New Issue