diff --git a/CHANGELOG.md b/CHANGELOG.md index bbcc7012f..37dea3f08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * New `jj op abandon` command is added to clean up the operation history. If GC is implemented, Git refs and commit objects can be compacted. +* `jj util gc` now removes unreachable operation and view objects. + * `jj branch rename` will now warn if the renamed branch has a remote branch, since those will have to be manually renamed outside of `jj`. diff --git a/cli/src/commands/util.rs b/cli/src/commands/util.rs index 357ed28ed..74eabc5a2 100644 --- a/cli/src/commands/util.rs +++ b/cli/src/commands/util.rs @@ -13,6 +13,8 @@ // limitations under the License. use std::io::Write; +use std::slice; +use std::time::{Duration, SystemTime}; use clap::Subcommand; use jj_lib::repo::Repo; @@ -114,8 +116,14 @@ fn cmd_util_gc( )); } let workspace_command = command.workspace_helper(ui)?; - let store = workspace_command.repo().store(); - store.gc().map_err(|err| user_error(err.to_string()))?; + // TODO: add command argument to specify the expiration time? + let keep_newer = SystemTime::now() - Duration::from_secs(14 * 86400); + let repo = workspace_command.repo(); + repo.op_store() + .gc(slice::from_ref(repo.op_id()), keep_newer)?; + repo.store() + .gc() + .map_err(|err| user_error(err.to_string()))?; Ok(()) } diff --git a/lib/src/file_util.rs b/lib/src/file_util.rs index 4a45b9346..d2b6c4820 100644 --- a/lib/src/file_util.rs +++ b/lib/src/file_util.rs @@ -104,6 +104,7 @@ pub fn persist_content_addressed_temp_file>( Ok(file) => Ok(file), Err(PersistError { error, file: _ }) => { if let Ok(existing_file) = File::open(new_path) { + // TODO: Update mtime to help GC keep this file Ok(existing_file) } else { Err(error) diff --git a/lib/src/op_store.rs b/lib/src/op_store.rs index 8533a192d..b91411954 100644 --- a/lib/src/op_store.rs +++ b/lib/src/op_store.rs @@ -17,6 +17,7 @@ use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{Debug, Error, Formatter}; use std::iter; +use std::time::SystemTime; use itertools::Itertools as _; use once_cell::sync::Lazy; @@ -424,6 +425,15 @@ pub trait OpStore: Send + Sync + Debug { &self, prefix: &HexPrefix, ) -> OpStoreResult>; + + /// Prunes unreachable operations and views. + /// + /// All operations and views reachable from the `head_ids` won't be + /// removed. In addition to that, objects created after `keep_newer` will be + /// preserved. This mitigates a risk of deleting new heads created + /// concurrently by another process. + // TODO: return stats? + fn gc(&self, head_ids: &[OperationId], keep_newer: SystemTime) -> OpStoreResult<()>; } #[cfg(test)] diff --git a/lib/src/simple_op_store.rs b/lib/src/simple_op_store.rs index ff90e4f7b..63c3279ff 100644 --- a/lib/src/simple_op_store.rs +++ b/lib/src/simple_op_store.rs @@ -14,26 +14,28 @@ #![allow(missing_docs)] -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::Debug; use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; +use std::time::SystemTime; use std::{fs, io}; +use itertools::Itertools as _; use prost::Message; use tempfile::NamedTempFile; use thiserror::Error; use crate::backend::{CommitId, MillisSinceEpoch, Timestamp}; use crate::content_hash::blake2b_hash; -use crate::file_util::{persist_content_addressed_temp_file, IoResultExt as _}; +use crate::file_util::{persist_content_addressed_temp_file, IoResultExt as _, PathError}; use crate::merge::Merge; use crate::object_id::{HexPrefix, ObjectId, PrefixResolution}; use crate::op_store::{ OpStore, OpStoreError, OpStoreResult, Operation, OperationId, OperationMetadata, RefTarget, RemoteRef, RemoteRefState, RemoteView, View, ViewId, WorkspaceId, }; -use crate::{git, op_store}; +use crate::{dag_walk, git, op_store}; // BLAKE2b-512 hash length in bytes const OPERATION_ID_LENGTH: usize = 64; @@ -195,6 +197,89 @@ impl OpStore for SimpleOpStore { .context(&op_dir) .map_err(|err| OpStoreError::Other(err.into())) } + + #[tracing::instrument(skip(self))] + fn gc(&self, head_ids: &[OperationId], keep_newer: SystemTime) -> OpStoreResult<()> { + let to_op_id = |entry: &fs::DirEntry| -> Option { + let name = entry.file_name().into_string().ok()?; + OperationId::try_from_hex(&name).ok() + }; + let to_view_id = |entry: &fs::DirEntry| -> Option { + let name = entry.file_name().into_string().ok()?; + ViewId::try_from_hex(&name).ok() + }; + let remove_file_if_not_new = |entry: &fs::DirEntry| -> Result<(), PathError> { + let path = entry.path(); + // Check timestamp, but there's still TOCTOU problem if an existing + // file is renewed. + let metadata = entry.metadata().context(&path)?; + let mtime = metadata.modified().expect("unsupported platform?"); + if mtime > keep_newer { + tracing::trace!(?path, "not removing"); + Ok(()) + } else { + tracing::trace!(?path, "removing"); + fs::remove_file(&path).context(&path) + } + }; + + // Reachable objects are resolved without considering the keep_newer + // parameter. We could collect ancestors of the "new" operations here, + // but more files can be added anyway after that. + let read_op = |id: &OperationId| self.read_operation(id).map(|data| (id.clone(), data)); + let reachable_ops: HashMap = dag_walk::dfs_ok( + head_ids.iter().map(read_op), + |(id, _)| id.clone(), + |(_, data)| data.parents.iter().map(read_op).collect_vec(), + ) + .try_collect()?; + let reachable_views: HashSet<&ViewId> = + reachable_ops.values().map(|data| &data.view_id).collect(); + tracing::info!( + reachable_op_count = reachable_ops.len(), + reachable_view_count = reachable_views.len(), + "collected reachable objects" + ); + + let prune_ops = || -> Result<(), PathError> { + let op_dir = self.path.join("operations"); + for entry in op_dir.read_dir().context(&op_dir)? { + let entry = entry.context(&op_dir)?; + let Some(id) = to_op_id(&entry) else { + tracing::trace!(?entry, "skipping invalid file name"); + continue; + }; + if reachable_ops.contains_key(&id) { + continue; + } + // If the operation was added after collecting reachable_views, + // its view mtime would also be renewed. So there's no need to + // update the reachable_views set to preserve the view. + remove_file_if_not_new(&entry)?; + } + Ok(()) + }; + prune_ops().map_err(|err| OpStoreError::Other(err.into()))?; + + let prune_views = || -> Result<(), PathError> { + let view_dir = self.path.join("views"); + for entry in view_dir.read_dir().context(&view_dir)? { + let entry = entry.context(&view_dir)?; + let Some(id) = to_view_id(&entry) else { + tracing::trace!(?entry, "skipping invalid file name"); + continue; + }; + if reachable_views.contains(&id) { + continue; + } + remove_file_if_not_new(&entry)?; + } + Ok(()) + }; + prune_views().map_err(|err| OpStoreError::Other(err.into()))?; + + Ok(()) + } } fn io_to_read_error(err: std::io::Error, id: &impl ObjectId) -> OpStoreError { diff --git a/lib/tests/test_operations.rs b/lib/tests/test_operations.rs index 988b93de7..2a2b45636 100644 --- a/lib/tests/test_operations.rs +++ b/lib/tests/test_operations.rs @@ -15,6 +15,7 @@ use std::path::Path; use std::slice; use std::sync::Arc; +use std::time::SystemTime; use assert_matches::assert_matches; use itertools::Itertools as _; @@ -30,6 +31,7 @@ fn list_dir(dir: &Path) -> Vec { std::fs::read_dir(dir) .unwrap() .map(|entry| entry.unwrap().file_name().to_str().unwrap().to_owned()) + .sorted() .collect() } @@ -568,3 +570,81 @@ fn test_resolve_op_parents_children() { )) ); } + +#[test] +fn test_gc() { + let settings = stable_op_id_settings(); + let test_repo = TestRepo::init(); + let repo_0 = test_repo.repo; + let op_store = repo_0.op_store(); + let op_dir = repo_0.repo_path().join("op_store").join("operations"); + let view_dir = repo_0.repo_path().join("op_store").join("views"); + + // Set up operation graph: + // + // F + // E (empty) + // D | + // C | + // |/ + // B + // A + // 0 (initial) + let empty_tx = |repo: &Arc| repo.start_transaction(&settings); + let random_tx = |repo: &Arc| { + let mut tx = repo.start_transaction(&settings); + write_random_commit(tx.mut_repo(), &settings); + tx + }; + let repo_a = random_tx(&repo_0).commit("op A"); + let repo_b = random_tx(&repo_a).commit("op B"); + let repo_c = random_tx(&repo_b).commit("op C"); + let repo_d = random_tx(&repo_c).commit("op D"); + let repo_e = empty_tx(&repo_b).commit("op E"); + let repo_f = random_tx(&repo_e).commit("op F"); + + // Sanity check for the original state + let mut expected_op_entries = list_dir(&op_dir); + let mut expected_view_entries = list_dir(&view_dir); + assert_eq!(expected_op_entries.len(), 7); + assert_eq!(expected_view_entries.len(), 6); + + // No heads, but all kept by file modification time + op_store.gc(&[], SystemTime::UNIX_EPOCH).unwrap(); + assert_eq!(list_dir(&op_dir), expected_op_entries); + assert_eq!(list_dir(&view_dir), expected_view_entries); + + // All reachable from heads + let now = SystemTime::now(); + let head_ids = [repo_d.op_id().clone(), repo_f.op_id().clone()]; + op_store.gc(&head_ids, now).unwrap(); + assert_eq!(list_dir(&op_dir), expected_op_entries); + assert_eq!(list_dir(&view_dir), expected_view_entries); + + // E|F are no longer reachable, but E's view is still reachable + op_store.gc(slice::from_ref(repo_d.op_id()), now).unwrap(); + expected_op_entries + .retain(|name| *name != repo_e.op_id().hex() && *name != repo_f.op_id().hex()); + expected_view_entries.retain(|name| *name != repo_f.operation().view_id().hex()); + assert_eq!(list_dir(&op_dir), expected_op_entries); + assert_eq!(list_dir(&view_dir), expected_view_entries); + + // B|C|D are no longer reachable + op_store.gc(slice::from_ref(repo_a.op_id()), now).unwrap(); + expected_op_entries.retain(|name| { + *name != repo_b.op_id().hex() + && *name != repo_c.op_id().hex() + && *name != repo_d.op_id().hex() + }); + expected_view_entries.retain(|name| { + *name != repo_b.operation().view_id().hex() + && *name != repo_c.operation().view_id().hex() + && *name != repo_d.operation().view_id().hex() + }); + assert_eq!(list_dir(&op_dir), expected_op_entries); + assert_eq!(list_dir(&view_dir), expected_view_entries); + + // Sanity check for the last state + assert_eq!(expected_op_entries.len(), 2); + assert_eq!(expected_view_entries.len(), 2); +}