perf(core): Wire warm-cache heuristic into packager and tests

Companion to the previous commit. Plumb `tokenCountCacheFileExists` into the packager `defaultDeps` so the metrics warm-up sizing can be exercised deterministically from tests, and add a paired test that asserts the 2-warm-up-worker branch is taken when the persistent disk cache exists. Also rename the cold-cache test to make the new gating explicit and refresh its docstring with the warm/cold distinction. https://claude.ai/code/session_01TJqKkJ8n3r6Pa2JdW9Vp2w
2026-05-30 11:18:53 +02:00 · 2026-05-10 15:59:43 +09:00
parent dcdbdd5efc
commit ca44a74f3f
2 changed files with 64 additions and 17 deletions
@@ -13,7 +13,7 @@ import type { ProcessedFile } from './file/fileTypes.js';
 import { getGitDiffs } from './git/gitDiffHandle.js';
 import { getGitLogs } from './git/gitLogHandle.js';
 import { calculateMetrics, createMetricsTaskRunner } from './metrics/calculateMetrics.js';
-import { loadTokenCountCache, saveTokenCountCache } from './metrics/tokenCountCache.js';
+import { loadTokenCountCache, saveTokenCountCache, tokenCountCacheFileExists } from './metrics/tokenCountCache.js';
 import { prefetchCompiledTemplate } from './output/outputGenerate.js';
 import { prefetchSortData, sortOutputFiles } from './output/outputSort.js';
 import { produceOutput } from './packager/produceOutput.js';
@@ -56,6 +56,7 @@ const defaultDeps = {
  prefetchSortData,
  getGitDiffs,
  getGitLogs,
+  tokenCountCacheFileExists,
  // Lazy-load packSkill to defer importing the skill module chain
  // (skillSectionGenerators, skillStyle → Handlebars), which adds ~25ms
  // to module loading. Only used when --skill-generate is active (non-default).
@@ -114,22 +115,41 @@ export const pack = async (
  //
  // Worker count: the metrics phase wall-clock is dominated by gpt-tokenizer
  // throughput, so adding a 3rd warm worker pays off when the metrics phase
-  // is long enough to amortize its ~250ms BPE warm-up. We size on a coarse
-  // proxy for "long metrics phase" — whether the user constrained the file
-  // set via --include / config.include / --stdin (explicitFiles). When
-  // unconstrained (default scan), 3 warm workers measure +2.46% paired mean
-  // wall-clock reduction on the 1046-file workload (n=20, t=3.18,
-  // NODE_DISABLE_COMPILE_CACHE=1). With an explicit scope the file set is
-  // typically a few hundred files, the metrics phase is shorter, and the
-  // 3rd worker's BPE warm-up dominates the parallelism gain (-11.85%
-  // paired regression on the 258-file `--include 'src,tests'` workload at
-  // unconditional EAGER_WARMUP_THREADS=3), so we keep the original
-  // 2-worker sizing.
+  // is long enough to amortize its ~250ms BPE warm-up. We size on two coarse
+  // proxies for "long metrics phase":
+  //
+  // 1. `hasExplicitScope` — whether the user constrained the file set via
+  //    --include / config.include / --stdin (explicitFiles). Explicit scopes
+  //    typically pack a few hundred files; the metrics phase is short and
+  //    the 3rd worker's BPE warm-up dominates the parallelism gain (-11.85%
+  //    paired regression on the 258-file `--include 'src,tests'` workload).
+  //
+  // 2. `tokenCountCacheFileExists()` — whether the persistent disk cache
+  //    exists from a previous run. When present, almost every per-file token
+  //    count is served from cache and `calculateFileMetrics` returns without
+  //    dispatching any worker tasks (see `calculateFileMetrics.ts`); the only
+  //    real worker work is a single wrapper-token call (also cached after the
+  //    first run). On warm-cache repeat runs (the common case), spawning 3
+  //    workers means 3 redundant BPE table parses (~340 ms of CPU each) that
+  //    contend with file collection and security check on a 4-core host.
+  //    Reducing to 2 measured -110.5 ms / -9.65 % paired wall-clock on the
+  //    1047-file repomix self-pack workload (n=30, t=11.97, faster=30/30,
+  //    NODE_DISABLE_COMPILE_CACHE=1).
+  //    Cold-cache (cache file missing) keeps the original 3-worker warmup
+  //    so the actual file tokenizations parallelise across more workers.
+  //    The probe is a coarse heuristic: a cache file written by a previous
+  //    run that used a different `tokenCount.encoding` (e.g. cl100k_base
+  //    rather than the default o200k_base) yields no hits for the current
+  //    run, so the metrics phase will be cold-but-with-2-workers. Output
+  //    correctness is unaffected — only the warmup parallelism is
+  //    suboptimal — and the next run rebuilds the cache for the new
+  //    encoding so subsequent runs hit again.
  //
  // EAGER_WARMUP_THREADS × TASKS_PER_THREAD yields maxThreads=min(cpuCount, N)
  // for N ∈ {2, 3}; single-CPU hosts collapse to maxThreads=1 (no change).
  const hasExplicitScope = (explicitFiles?.length ?? 0) > 0 || config.include.length > 0;
-  const EAGER_WARMUP_THREADS = hasExplicitScope ? 2 : 3;
+  const cacheLikelyWarm = deps.tokenCountCacheFileExists();
+  const EAGER_WARMUP_THREADS = hasExplicitScope || cacheLikelyWarm ? 2 : 3;
  const EAGER_METRICS_NUM_TASKS = EAGER_WARMUP_THREADS * TASKS_PER_THREAD;
  const { taskRunner: metricsTaskRunner, warmupPromise: metricsWarmupPromise } = deps.createMetricsTaskRunner(
    EAGER_METRICS_NUM_TASKS,
@@ -193,6 +193,10 @@ describe('packager', () => {
          getGitLogs: vi.fn().mockResolvedValue(undefined),
          prefetchSortData: vi.fn().mockResolvedValue(undefined),
          sortOutputFiles: vi.fn().mockImplementation((files) => files),
+          // Default: assume the persistent token-count cache is missing so the
+          // metrics warm-up sizes for the cold-cache path. Tests targeting the
+          // warm-cache path override this to return true.
+          tokenCountCacheFileExists: vi.fn().mockReturnValue(false),
        },
      };
    };
@@ -248,12 +252,15 @@ describe('packager', () => {
      expect(cleanup).toHaveBeenCalled();
    });

-    test('uses 3 warm-up workers for the metrics pool when no scope is specified', async () => {
+    test('uses 3 warm-up workers for the metrics pool when no scope is specified and token cache is cold', async () => {
      // The eager metrics warm-up sizes the pool via numOfTasks. When the user did
-      // not constrain the file set (no --include / no --stdin), the metrics phase is
-      // typically long enough to amortize a 3rd worker's BPE warm-up, so we pass
-      // numOfTasks = 3 * TASKS_PER_THREAD = 600 (yielding maxThreads=min(cpuCount, 3)).
+      // not constrain the file set (no --include / no --stdin) AND no persistent
+      // token-count cache file exists from a prior run, the metrics phase has to
+      // tokenize every file and is typically long enough to amortize a 3rd worker's
+      // BPE warm-up, so we pass numOfTasks = 3 * TASKS_PER_THREAD = 600 (yielding
+      // maxThreads=min(cpuCount, 3)).
      const { deps } = baseDeps();
+      deps.tokenCountCacheFileExists = vi.fn().mockReturnValue(false);
      const config = createMockConfig();
      config.include = [];

@@ -264,6 +271,26 @@ describe('packager', () => {
      expect(deps.createSecurityTaskRunner).toHaveBeenCalled();
    });

+    test('uses 2 warm-up workers for the metrics pool when no scope is specified but the token cache is warm', async () => {
+      // When the persistent token-count cache file already exists from a previous
+      // run, almost every per-file token count is served from cache and the metrics
+      // phase performs at most a single wrapper-token tokenization (also cached on
+      // the second run). Spawning a 3rd worker means a redundant ~340 ms BPE table
+      // parse that contends with file collection and security check on the critical
+      // path, so the heuristic falls back to numOfTasks = 2 * TASKS_PER_THREAD = 400.
+      const { deps } = baseDeps();
+      deps.tokenCountCacheFileExists = vi.fn().mockReturnValue(true);
+      const config = createMockConfig();
+      config.include = [];
+
+      await pack(['root'], config, vi.fn(), deps);
+
+      expect(deps.createMetricsTaskRunner).toHaveBeenCalledWith(400, expect.any(String));
+      // The security pool pre-warm is unaffected by the cache state — its gate is
+      // still `hasExplicitScope`, which is false here.
+      expect(deps.createSecurityTaskRunner).toHaveBeenCalled();
+    });
+
    test('uses 2 warm-up workers for the metrics pool when --include narrows scope', async () => {
      // With explicit includes the file set is typically much smaller and the
      // 3rd worker's BPE warm-up dominates the parallelism gain (paired benchmarks