feat(sync): add safety lag to prevent race conditions in sync operations

+2

src/cli/cmd_sync.rs

··· 118 118 shutdown_rx: None, 119 119 shutdown_tx: None, 120 120 fetch_log: cmd.fetch_log, 121 + safety_lag: Duration::from_millis(constants::DEFAULT_SAFETY_LAG_MS), 121 122 }; 122 123 123 124 let quiet = global_quiet; ··· 137 138 shutdown_rx: Some(shutdown_signal), 138 139 shutdown_tx: Some(shutdown_sender), 139 140 fetch_log: cmd.fetch_log, 141 + safety_lag: Duration::from_millis(constants::DEFAULT_SAFETY_LAG_MS), 140 142 }; 141 143 142 144 let logger = SyncLoggerImpl::new_server(global_verbose, cmd.interval);

+5

src/constants.rs

··· 76 76 /// Minimum time between bundle creation attempts (60 seconds) 77 77 pub const MIN_BUNDLE_CREATION_INTERVAL_SECS: i64 = 60; 78 78 79 + /// Default safety lag for sync operations (in milliseconds) 80 + /// This buffer prevents race conditions where the indexer might miss operations 81 + /// that were committed just before the fetch but not yet visible or indexed. 82 + pub const DEFAULT_SAFETY_LAG_MS: u64 = 1000; 83 + 79 84 // ============================================================================ 80 85 // File and Directory Constants 81 86 // ============================================================================

+81 -13

src/manager.rs

··· 1757 1757 shutdown_rx: Option<tokio::sync::watch::Receiver<bool>>, 1758 1758 update_did_index: bool, 1759 1759 fetch_log: bool, 1760 + safety_lag: Option<std::time::Duration>, 1760 1761 ) -> Result<SyncResult> { 1761 1762 use crate::sync::{get_boundary_cids, strip_boundary_duplicates}; 1762 1763 use std::time::Instant; ··· 1858 1859 let mut total_wait = std::time::Duration::from_secs(0); 1859 1860 let mut total_http = std::time::Duration::from_secs(0); 1860 1861 1862 + // Cutoff time will be calculated per-request based on server time 1863 + // (removed static cutoff calculation) 1864 + 1861 1865 while fetch_num < MAX_ATTEMPTS { 1862 1866 let stats = self.get_mempool_stats()?; 1863 1867 ··· 1893 1897 { 1894 1898 anyhow::bail!("Shutdown requested"); 1895 1899 } 1896 - let (plc_ops, wait_dur, http_dur, raw_capture_opt) = if fetch_log { 1900 + let (plc_ops, wait_dur, http_dur, raw_capture_opt, server_time) = if fetch_log { 1897 1901 if let Some(rx) = shutdown_rx.clone() { 1898 - let (ops, w, h, capture_opt) = client 1902 + let (ops, w, h, capture_opt, st) = client 1899 1903 .fetch_operations(&after_time, request_count, Some(rx), true) 1900 1904 .await?; 1901 - (ops, w, h, capture_opt) 1905 + (ops, w, h, capture_opt, st) 1902 1906 } else { 1903 - let (ops, w, h, capture_opt) = client 1907 + let (ops, w, h, capture_opt, st) = client 1904 1908 .fetch_operations(&after_time, request_count, None, true) 1905 1909 .await?; 1906 - (ops, w, h, capture_opt) 1910 + (ops, w, h, capture_opt, st) 1907 1911 } 1908 1912 } else { 1909 1913 if let Some(rx) = shutdown_rx.clone() { 1910 - let (ops, w, h, _) = client 1914 + let (ops, w, h, _, st) = client 1911 1915 .fetch_operations(&after_time, request_count, Some(rx), false) 1912 1916 .await?; 1913 - (ops, w, h, None) 1917 + (ops, w, h, None, st) 1914 1918 } else { 1915 - let (ops, w, h, _) = 1919 + let (ops, w, h, _, st) = 1916 1920 client.fetch_operations(&after_time, request_count, None, false).await?; 1917 - (ops, w, h, None) 1921 + (ops, w, h, None, st) 1918 1922 } 1919 1923 }; 1920 1924 total_wait += wait_dur; ··· 1937 1941 1938 1942 total_fetched += fetched_count; 1939 1943 1944 + // Calculate cutoff time based on server time if available, otherwise local time 1945 + let cutoff_time = if let Some(lag) = safety_lag { 1946 + let base_time = server_time.unwrap_or_else(chrono::Utc::now); 1947 + let cutoff = base_time - chrono::Duration::from_std(lag).unwrap_or(chrono::Duration::seconds(0)); 1948 + 1949 + // Only log if we're using server time (to avoid spamming logs) or if verbose 1950 + if *self.verbose.lock().unwrap() { 1951 + let source = if server_time.is_some() { "server" } else { "local" }; 1952 + log::debug!( 1953 + "Safety lag cutoff: {} (source: {}, lag: {:?})", 1954 + cutoff.to_rfc3339(), 1955 + source, 1956 + lag 1957 + ); 1958 + } 1959 + Some(cutoff) 1960 + } else { 1961 + None 1962 + }; 1963 + 1940 1964 // Convert to operations 1941 - let ops_pre: Vec<Operation> = plc_ops.into_iter().map(Into::into).collect(); 1965 + let ops_pre_raw: Vec<Operation> = plc_ops.into_iter().map(Into::into).collect(); 1966 + 1967 + // Apply safety lag filtering 1968 + let (ops_pre, filtered_count) = if let Some(cutoff) = cutoff_time { 1969 + let mut kept = Vec::with_capacity(ops_pre_raw.len()); 1970 + let mut filtered = 0; 1971 + for op in ops_pre_raw { 1972 + if let Ok(op_time) = chrono::DateTime::parse_from_rfc3339(&op.created_at) { 1973 + if op_time <= cutoff { 1974 + kept.push(op); 1975 + } else { 1976 + filtered += 1; 1977 + } 1978 + } else { 1979 + // If we can't parse the time, keep it (safe default? or unsafe?) 1980 + // Keeping it is safer for data availability, but risky for consistency. 1981 + // Given the issue is about race conditions, keeping it might be risky. 1982 + // But failing to parse is a bigger issue. Let's keep it and log warning. 1983 + log::warn!("Failed to parse timestamp for op {}, keeping it", op.did); 1984 + kept.push(op); 1985 + } 1986 + } 1987 + (kept, filtered) 1988 + } else { 1989 + (ops_pre_raw, 0) 1990 + }; 1991 + 1992 + if filtered_count > 0 { 1993 + if *self.verbose.lock().unwrap() { 1994 + log::info!( 1995 + " Safety lag: filtered {} operations newer than cutoff", 1996 + filtered_count 1997 + ); 1998 + } 1999 + // If we filtered any operations, we must consider ourselves "caught up" 2000 + // because we can't proceed past the cutoff time safely. 2001 + // We also stop fetching in this cycle. 2002 + caught_up = true; 2003 + } 2004 + 1942 2005 let mut all_cids_pre: Vec<String> = Vec::new(); 1943 2006 if fetch_log { 1944 2007 all_cids_pre = ops_pre ··· 2063 2126 } 2064 2127 2065 2128 // Stop if we got an incomplete batch or made no progress 2066 - if got_incomplete_batch || added == 0 { 2129 + // Also stop if we filtered operations due to safety lag (caught_up is set above) 2130 + if got_incomplete_batch || added == 0 || (filtered_count > 0 && caught_up) { 2067 2131 caught_up = true; 2068 2132 if *self.verbose.lock().unwrap() { 2069 - log::debug!("Caught up to latest PLC data"); 2133 + if filtered_count > 0 { 2134 + log::debug!("Caught up to safety lag cutoff"); 2135 + } else { 2136 + log::debug!("Caught up to latest PLC data"); 2137 + } 2070 2138 } 2071 2139 break; 2072 2140 } ··· 2268 2336 let mut synced = 0; 2269 2337 2270 2338 loop { 2271 - match self.sync_next_bundle(client, None, true, false).await { 2339 + match self.sync_next_bundle(client, None, true, false, None).await { 2272 2340 Ok(SyncResult::BundleCreated { .. }) => { 2273 2341 synced += 1; 2274 2342

+14 -3

src/plc_client.rs

··· 90 90 Duration, 91 91 Duration, 92 92 Option<RawExportResponse>, 93 + Option<chrono::DateTime<chrono::Utc>>, 93 94 )> { 94 95 self.fetch_operations_unified(after, count, shutdown_rx, capture_raw) 95 96 .await ··· 110 111 Duration, 111 112 Duration, 112 113 Option<RawExportResponse>, 114 + Option<chrono::DateTime<chrono::Utc>>, 113 115 )> { 114 116 let mut backoff = Duration::from_secs(1); 115 117 let mut last_err = None; ··· 157 159 }; 158 160 159 161 match result { 160 - Ok((operations, http_duration, capture)) => { 162 + Ok((operations, http_duration, capture, server_time)) => { 161 163 total_http += http_duration; 162 - return Ok((operations, total_wait, total_http, capture)); 164 + return Ok((operations, total_wait, total_http, capture, server_time)); 163 165 } 164 166 Err(e) => { 165 167 last_err = Some(e); ··· 229 231 Vec<PLCOperation>, 230 232 Duration, 231 233 Option<RawExportResponse>, 234 + Option<chrono::DateTime<chrono::Utc>>, 232 235 )> { 233 236 let url = format!("{}/export", self.base_url); 234 237 let request_start_wall = chrono::Utc::now(); ··· 263 266 None 264 267 }; 265 268 269 + // Extract Date header for server time 270 + let server_time = response 271 + .headers() 272 + .get("date") 273 + .and_then(|v| v.to_str().ok()) 274 + .and_then(|s| httpdate::parse_http_date(s).ok()) 275 + .map(|t| chrono::DateTime::<chrono::Utc>::from(t)); 276 + 266 277 let body = response.text().await?; 267 278 let request_duration = request_start.elapsed(); 268 279 let mut operations = Vec::new(); ··· 295 306 None 296 307 }; 297 308 298 - Ok((operations, request_duration, capture)) 309 + Ok((operations, request_duration, capture, server_time)) 299 310 } 300 311 301 312 /// Fetch DID document raw JSON from PLC directory

+1

src/server/startup.rs

··· 480 480 shutdown_rx: Some(shutdown_signal), 481 481 shutdown_tx: Some(sync_runtime.shutdown_sender()), 482 482 fetch_log, 483 + safety_lag: Duration::from_millis(constants::DEFAULT_SAFETY_LAG_MS), 483 484 }; 484 485 485 486 use crate::sync::SyncLoggerImpl;

+10 -1

src/sync.rs

··· 132 132 pub shutdown_rx: Option<tokio::sync::watch::Receiver<bool>>, 133 133 pub shutdown_tx: Option<tokio::sync::watch::Sender<bool>>, 134 134 pub fetch_log: bool, 135 + pub safety_lag: Duration, 135 136 } 136 137 137 138 impl Default for SyncConfig { ··· 145 146 shutdown_rx: None, 146 147 shutdown_tx: None, 147 148 fetch_log: false, 149 + safety_lag: Duration::from_millis(crate::constants::DEFAULT_SAFETY_LAG_MS), 148 150 } 149 151 } 150 152 } ··· 508 510 509 511 match self 510 512 .manager 511 - .sync_next_bundle(&self.client, None, true, self.config.fetch_log) 513 + .sync_next_bundle( 514 + &self.client, 515 + None, 516 + true, 517 + self.config.fetch_log, 518 + Some(self.config.safety_lag), 519 + ) 512 520 .await 513 521 { 514 522 Ok(crate::manager::SyncResult::BundleCreated { ··· 646 654 self.config.shutdown_rx.clone(), 647 655 !is_initial_sync, 648 656 self.config.fetch_log, 657 + Some(self.config.safety_lag), 649 658 ) 650 659 .await; 651 660