Skip to content

Commit 4437572

Browse files
committed
Repo contents cache GC
Very simple GC implementation for the repo contents cache. Entry access is logged by "touching" the recorded inputs file. GC tasks then delete old entries that haven't been accessed in `--repo_contents_cache_gc_max_age` time. Work towards #12227. Closes #26080. PiperOrigin-RevId: 761930313 Change-Id: I6c0b92771f57d9949380fb08698385c4a96fe7d4
1 parent e883486 commit 4437572

File tree

10 files changed

+406
-20
lines changed

10 files changed

+406
-20
lines changed

src/main/java/com/google/devtools/build/lib/bazel/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ java_library(
4747
"//src/main/java/com/google/devtools/build/lib/cmdline",
4848
"//src/main/java/com/google/devtools/build/lib/events",
4949
"//src/main/java/com/google/devtools/build/lib/pkgcache",
50+
"//src/main/java/com/google/devtools/build/lib/profiler",
5051
"//src/main/java/com/google/devtools/build/lib/rules:repository/local_repository_rule",
5152
"//src/main/java/com/google/devtools/build/lib/rules:repository/new_local_repository_function",
5253
"//src/main/java/com/google/devtools/build/lib/rules:repository/new_local_repository_rule",

src/main/java/com/google/devtools/build/lib/bazel/BazelRepositoryModule.java

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@
8181
import com.google.devtools.build.lib.cmdline.RepositoryName;
8282
import com.google.devtools.build.lib.events.Event;
8383
import com.google.devtools.build.lib.pkgcache.PackageOptions;
84+
import com.google.devtools.build.lib.profiler.Profiler;
85+
import com.google.devtools.build.lib.profiler.ProfilerTask;
86+
import com.google.devtools.build.lib.profiler.SilentCloseable;
8487
import com.google.devtools.build.lib.rules.repository.LocalRepositoryFunction;
8588
import com.google.devtools.build.lib.rules.repository.LocalRepositoryRule;
8689
import com.google.devtools.build.lib.rules.repository.NewLocalRepositoryFunction;
@@ -382,6 +385,7 @@ public void beforeCommand(CommandEnvironment env) throws AbruptExitException {
382385
// invocation, as is often the case with the repo contents cache.
383386
// TODO: wyv@ - This is a crude check that disables some use cases (such as when the output
384387
// base itself is inside the main repo). Investigate a better check.
388+
repositoryCache.getRepoContentsCache().setPath(null);
385389
throw new AbruptExitException(
386390
detailedExitCode(
387391
"""
@@ -392,6 +396,26 @@ public void beforeCommand(CommandEnvironment env) throws AbruptExitException {
392396
.formatted(repoContentsCachePath, env.getWorkspace()),
393397
Code.BAD_REPO_CONTENTS_CACHE));
394398
}
399+
if (repositoryCache.getRepoContentsCache().isEnabled()) {
400+
try (SilentCloseable c =
401+
Profiler.instance()
402+
.profile(ProfilerTask.REPO_CACHE_GC_WAIT, "waiting to acquire repo cache lock")) {
403+
repositoryCache.getRepoContentsCache().acquireSharedLock();
404+
} catch (IOException e) {
405+
throw new AbruptExitException(
406+
detailedExitCode(
407+
"could not acquire lock on repo contents cache", Code.BAD_REPO_CONTENTS_CACHE),
408+
e);
409+
}
410+
if (!repoOptions.repoContentsCacheGcMaxAge.isZero()) {
411+
env.addIdleTask(
412+
repositoryCache
413+
.getRepoContentsCache()
414+
.createGcIdleTask(
415+
repoOptions.repoContentsCacheGcMaxAge,
416+
repoOptions.repoContentsCacheGcIdleDelay));
417+
}
418+
}
395419

396420
try {
397421
downloadManager.setNetrcCreds(
@@ -648,6 +672,20 @@ private Path toPath(PathFragment path, CommandEnvironment env) {
648672
return env.getBlazeWorkspace().getWorkspace().getRelative(path);
649673
}
650674

675+
@Override
676+
public void afterCommand() throws AbruptExitException {
677+
if (repositoryCache.getRepoContentsCache().isEnabled()) {
678+
try {
679+
repositoryCache.getRepoContentsCache().releaseSharedLock();
680+
} catch (IOException e) {
681+
throw new AbruptExitException(
682+
detailedExitCode(
683+
"could not release lock on repo contents cache", Code.BAD_REPO_CONTENTS_CACHE),
684+
e);
685+
}
686+
}
687+
}
688+
651689
@Override
652690
public ImmutableList<Injected> getPrecomputedValues() {
653691
Instant now = clock.now();

src/main/java/com/google/devtools/build/lib/bazel/repository/RepositoryOptions.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@
1919
import com.google.devtools.build.lib.util.OptionsUtils;
2020
import com.google.devtools.build.lib.vfs.PathFragment;
2121
import com.google.devtools.common.options.Converter;
22+
import com.google.devtools.common.options.Converters.DurationConverter;
2223
import com.google.devtools.common.options.EnumConverter;
2324
import com.google.devtools.common.options.Option;
2425
import com.google.devtools.common.options.OptionDocumentationCategory;
2526
import com.google.devtools.common.options.OptionEffectTag;
2627
import com.google.devtools.common.options.OptionMetadataTag;
2728
import com.google.devtools.common.options.OptionsBase;
2829
import com.google.devtools.common.options.OptionsParsingException;
30+
import java.time.Duration;
2931
import java.util.List;
3032
import net.starlark.java.eval.EvalException;
3133

@@ -64,6 +66,32 @@ public class RepositoryOptions extends OptionsBase {
6466
""")
6567
public PathFragment repoContentsCache;
6668

69+
@Option(
70+
name = "repo_contents_cache_gc_max_age",
71+
defaultValue = "14d",
72+
documentationCategory = OptionDocumentationCategory.BAZEL_CLIENT_OPTIONS,
73+
effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
74+
converter = DurationConverter.class,
75+
help =
76+
"""
77+
Specifies the amount of time an entry in the repo contents cache can stay unused before \
78+
it's garbage collected. If set to zero, garbage collection is disabled.
79+
""")
80+
public Duration repoContentsCacheGcMaxAge;
81+
82+
@Option(
83+
name = "repo_contents_cache_gc_idle_delay",
84+
defaultValue = "5m",
85+
documentationCategory = OptionDocumentationCategory.BAZEL_CLIENT_OPTIONS,
86+
effectTags = {OptionEffectTag.BAZEL_INTERNAL_CONFIGURATION},
87+
converter = DurationConverter.class,
88+
help =
89+
"""
90+
Specifies the amount of time the server must remain idle before garbage collection happens
91+
to the repo contents cache.
92+
""")
93+
public Duration repoContentsCacheGcIdleDelay;
94+
6795
@Option(
6896
name = "registry",
6997
defaultValue = "null",

src/main/java/com/google/devtools/build/lib/bazel/repository/cache/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ java_library(
2121
"RepositoryCache.java",
2222
],
2323
deps = [
24+
"//src/main/java/com/google/devtools/build/lib/server:idle_task",
2425
"//src/main/java/com/google/devtools/build/lib/util:file_system_lock",
2526
"//src/main/java/com/google/devtools/build/lib/vfs",
2627
"//src/main/java/com/google/devtools/build/lib/vfs/bazel",

src/main/java/com/google/devtools/build/lib/bazel/repository/cache/RepoContentsCache.java

Lines changed: 147 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,65 @@
1818

1919
import com.google.common.base.Preconditions;
2020
import com.google.common.collect.ImmutableList;
21+
import com.google.devtools.build.lib.server.IdleTask;
22+
import com.google.devtools.build.lib.server.IdleTaskException;
2123
import com.google.devtools.build.lib.util.FileSystemLock;
2224
import com.google.devtools.build.lib.util.FileSystemLock.LockMode;
25+
import com.google.devtools.build.lib.vfs.Dirent;
2326
import com.google.devtools.build.lib.vfs.FileSystemUtils;
2427
import com.google.devtools.build.lib.vfs.Path;
28+
import com.google.devtools.build.lib.vfs.Symlinks;
2529
import java.io.IOException;
2630
import java.nio.charset.StandardCharsets;
31+
import java.time.Duration;
32+
import java.time.Instant;
2733
import java.util.Comparator;
34+
import java.util.UUID;
2835
import javax.annotation.Nullable;
2936

30-
/** A cache directory that stores the contents of fetched repos across different workspaces. */
31-
public class RepoContentsCache {
37+
/**
38+
* A cache directory that stores the contents of fetched repos across different workspaces.
39+
*
40+
* <p>The repo contents cache is laid out in two layers. The first layer is a lookup by "predeclared
41+
* inputs hash", which is defined as the hash of all predeclared inputs of a repo (such as
42+
* transitive bzl digest, repo attrs, starlark semantics, etc). Each distinct predeclared inputs
43+
* hash is its own entry directory in the first layer.
44+
*
45+
* <p>Inside each entry directory are pairs of directories and files {@code <N, N.recorded_inputs>}
46+
* where {@code N} is an integer. The file {@code N.recorded_inputs} contains the recorded inputs
47+
* and their values of a cached repo, and the directory {@code N} contains the cached repo contents.
48+
* There is also a file named {@code counter} that stores the next available {@code N} for this
49+
* entry directory, and a file named {@code lock} to ensure exclusive access to the {@code counter}
50+
* file.
51+
*
52+
* <p>On a cache hit (that is, the predeclared inputs hash matches, and recorded inputs are
53+
* up-to-date), the recorded inputs file has its mtime updated. Cached repos whose recorded inputs
54+
* file is older than {@code --repo_contents_cache_gc_max_age} are garbage collected.
55+
*/
56+
public final class RepoContentsCache {
3257
public static final String RECORDED_INPUTS_SUFFIX = ".recorded_inputs";
3358

34-
@Nullable private Path path;
59+
/**
60+
* The path to a "lock" file, relative to the root of the repo contents cache. While a shared lock
61+
* is held, no garbage collection should happen. While an exclusive lock is held, no reads should
62+
* happen.
63+
*/
64+
public static final String LOCK_PATH = "gc_lock";
65+
66+
/**
67+
* The path to a trash directory relative to the root of the repo contents cache.
68+
*
69+
* <p>Since deleting entire directories could take a bit of time, we create a trash directory
70+
* where we move the garbage directories to (which should be very fast). Then we can delete this
71+
* trash directory altogether at the end. This makes the GC process safe against being interrupted
72+
* in the middle (any undeleted trash will get deleted by the next GC). Also be sure to name this
73+
* trashDir something that couldn't ever be a predeclared inputs hash (starting with an underscore
74+
* should suffice).
75+
*/
76+
public static final String TRASH_PATH = "_trash";
3577

36-
// TODO: wyv@ - implement garbage collection
78+
@Nullable private Path path;
79+
@Nullable private FileSystemLock sharedLock;
3780

3881
public void setPath(@Nullable Path path) {
3982
this.path = path;
@@ -58,6 +101,15 @@ private static CandidateRepo fromRecordedInputsFile(Path recordedInputsFile) {
58101
return new CandidateRepo(
59102
recordedInputsFile, recordedInputsFile.replaceName(contentsDirBaseName));
60103
}
104+
105+
/** Updates the mtime of the recorded inputs file, to delay GC for this entry. */
106+
public void touch() {
107+
try {
108+
recordedInputsFile.setLastModifiedTime(Path.NOW_SENTINEL_TIME);
109+
} catch (IOException e) {
110+
// swallow the exception. it's not a huge deal.
111+
}
112+
}
61113
}
62114

63115
/** Returns the list of candidate repos for the given predeclared input hash. */
@@ -80,8 +132,19 @@ public ImmutableList<CandidateRepo> getCandidateRepos(String predeclaredInputHas
80132
}
81133
}
82134

83-
/** Moves a freshly fetched repo into the contents cache. */
84-
public void moveToCache(
135+
private Path ensureTrashDir() throws IOException {
136+
Preconditions.checkState(path != null);
137+
Path trashDir = path.getChild(TRASH_PATH);
138+
trashDir.createDirectoryAndParents();
139+
return trashDir;
140+
}
141+
142+
/**
143+
* Moves a freshly fetched repo into the contents cache.
144+
*
145+
* @return the repo dir in the contents cache.
146+
*/
147+
public Path moveToCache(
85148
Path fetchedRepoDir, Path fetchedRepoMarkerFile, String predeclaredInputHash)
86149
throws IOException {
87150
Preconditions.checkState(path != null);
@@ -98,7 +161,7 @@ public void moveToCache(
98161
cacheRepoDir.createDirectoryAndParents();
99162
// Move the fetched marker file to a temp location, so that if following operations fail, both
100163
// the fetched repo and the cache locations are considered out-of-date.
101-
Path temporaryMarker = entryDir.getChild(counter + ".temp_recorded_inputs");
164+
Path temporaryMarker = ensureTrashDir().getChild(UUID.randomUUID().toString());
102165
FileSystemUtils.moveFile(fetchedRepoMarkerFile, temporaryMarker);
103166
// Now perform the move, and afterwards, restore the marker file.
104167
try {
@@ -110,6 +173,7 @@ public void moveToCache(
110173
// Set up a symlink at the original fetched repo dir path.
111174
fetchedRepoDir.deleteTree();
112175
FileSystemUtils.ensureSymbolicLink(fetchedRepoDir, cacheRepoDir);
176+
return cacheRepoDir;
113177
}
114178

115179
private static String getNextCounterInDir(Path entryDir) throws IOException {
@@ -128,4 +192,80 @@ private static String getNextCounterInDir(Path entryDir) throws IOException {
128192
return counter;
129193
}
130194
}
195+
196+
public void acquireSharedLock() throws IOException {
197+
Preconditions.checkState(path != null);
198+
Preconditions.checkState(sharedLock == null, "this process already has the shared lock");
199+
sharedLock = FileSystemLock.get(path.getRelative(LOCK_PATH), LockMode.SHARED);
200+
}
201+
202+
public void releaseSharedLock() throws IOException {
203+
Preconditions.checkState(sharedLock != null);
204+
sharedLock.close();
205+
sharedLock = null;
206+
}
207+
208+
/**
209+
* Creates a garbage collection {@link IdleTask} that deletes cached repos who are last accessed
210+
* more than {@code maxAge} ago, with an idle delay of {@code idleDelay}.
211+
*/
212+
public IdleTask createGcIdleTask(Duration maxAge, Duration idleDelay) {
213+
Preconditions.checkState(path != null);
214+
return new IdleTask() {
215+
@Override
216+
public String displayName() {
217+
return "Repo contents cache garbage collection";
218+
}
219+
220+
@Override
221+
public Duration delay() {
222+
return idleDelay;
223+
}
224+
225+
@Override
226+
public void run() throws InterruptedException, IdleTaskException {
227+
try {
228+
Preconditions.checkState(path != null);
229+
// If we can't grab the lock, abort GC. Someone will come along later.
230+
try (var lock = FileSystemLock.tryGet(path.getRelative(LOCK_PATH), LockMode.EXCLUSIVE)) {
231+
runGc(maxAge);
232+
}
233+
// Empty the trash dir outside the lock. No one is reading from these files, so it should
234+
// be safe. At worst, multiple servers performing GC will try to delete the same files,
235+
// but whatever.
236+
path.getChild(TRASH_PATH).deleteTreesBelow();
237+
} catch (IOException e) {
238+
throw new IdleTaskException(e);
239+
}
240+
}
241+
};
242+
}
243+
244+
private void runGc(Duration maxAge) throws InterruptedException, IOException {
245+
path.setLastModifiedTime(Path.NOW_SENTINEL_TIME);
246+
Instant cutoff = Instant.ofEpochMilli(path.getLastModifiedTime()).minus(maxAge);
247+
Path trashDir = ensureTrashDir();
248+
249+
for (Dirent dirent : path.readdir(Symlinks.NOFOLLOW)) {
250+
if (dirent.getType() != Dirent.Type.DIRECTORY || dirent.getName().equals(TRASH_PATH)) {
251+
continue;
252+
}
253+
for (Path recordedInputsFile : path.getChild(dirent.getName()).getDirectoryEntries()) {
254+
if (!recordedInputsFile.getBaseName().endsWith(RECORDED_INPUTS_SUFFIX)) {
255+
continue;
256+
}
257+
if (Thread.interrupted()) {
258+
throw new InterruptedException();
259+
}
260+
261+
if (Instant.ofEpochMilli(recordedInputsFile.getLastModifiedTime()).isBefore(cutoff)) {
262+
// Sorry buddy, you're out.
263+
recordedInputsFile.delete();
264+
var repoDir = CandidateRepo.fromRecordedInputsFile(recordedInputsFile).contentsDir;
265+
// Use a UUID to avoid clashes.
266+
repoDir.renameTo(trashDir.getChild(UUID.randomUUID().toString()));
267+
}
268+
}
269+
}
270+
}
131271
}

src/main/java/com/google/devtools/build/lib/bazel/repository/starlark/StarlarkRepositoryFunction.java

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ public void reportSkyframeRestart(Environment env, RepositoryName repoName) {
123123
}
124124
}
125125

126+
/**
 * Compute state used by {@code fetch}: a {@link WorkerSkyKeyComputeState} that additionally
 * remembers the result of a fetch that has already completed, so that a later call with the same
 * state can return it right away instead of fetching again.
 */
private static class State extends WorkerSkyKeyComputeState<FetchResult> {
127+
// Null until a fetch has finished; set by fetch() once the work has produced a result.
@Nullable FetchResult result;
128+
}
129+
126130
private record FetchArgs(
127131
Rule rule, Path outputDirectory, BlazeDirectories directories, Environment env, SkyKey key) {
128132
FetchArgs toWorkerArgs(Environment env) {
@@ -141,15 +145,23 @@ public FetchResult fetch(
141145
}
142146
// See below (the `catch CancellationException` clause) for why there's a `while` loop here.
143147
while (true) {
144-
var state = env.getState(WorkerSkyKeyComputeState<FetchResult>::new);
148+
var state = env.getState(State::new);
149+
if (state.result != null) {
150+
// Escape early if we've already finished fetching once. This can happen if
151+
// RepositoryDelegatorFunction triggers a Skyframe restart _after_
152+
// StarlarkRepositoryFunction#fetch is finished.
153+
return state.result;
154+
}
145155
try {
146-
return state.startOrContinueWork(
147-
env,
148-
"starlark-repository-" + rule.getName(),
149-
(workerEnv) -> {
150-
setupRepoRoot(outputDirectory);
151-
return fetchInternal(args.toWorkerArgs(workerEnv));
152-
});
156+
state.result =
157+
state.startOrContinueWork(
158+
env,
159+
"starlark-repository-" + rule.getName(),
160+
(workerEnv) -> {
161+
setupRepoRoot(outputDirectory);
162+
return fetchInternal(args.toWorkerArgs(workerEnv));
163+
});
164+
return state.result;
153165
} catch (ExecutionException e) {
154166
Throwables.throwIfInstanceOf(e.getCause(), RepositoryFunctionException.class);
155167
Throwables.throwIfInstanceOf(e.getCause(), InterruptedException.class);

src/main/java/com/google/devtools/build/lib/profiler/ProfilerTask.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ public enum ProfilerTask {
8888
DYNAMIC_LOCK("Acquiring dynamic execution output lock", Threshold.FIFTY_MILLIS),
8989
REPOSITORY_FETCH("Fetching repository"),
9090
REPOSITORY_VENDOR("Vendoring repository"),
91+
REPO_CACHE_GC_WAIT("blocked on repo contents cache GC", Threshold.TEN_MILLIS),
9192
SPAWN_LOG("logging spawn", Threshold.TEN_MILLIS),
9293

9394
UNKNOWN("Unknown event");

0 commit comments

Comments
 (0)