diff --git a/src/core/packager.ts b/src/core/packager.ts index 5c6727f8..0615d7fb 100644 --- a/src/core/packager.ts +++ b/src/core/packager.ts @@ -13,7 +13,7 @@ import type { ProcessedFile } from './file/fileTypes.js'; import { getGitDiffs } from './git/gitDiffHandle.js'; import { getGitLogs } from './git/gitLogHandle.js'; import { calculateMetrics, createMetricsTaskRunner } from './metrics/calculateMetrics.js'; -import { loadTokenCountCache, saveTokenCountCache } from './metrics/tokenCountCache.js'; +import { loadTokenCountCache, saveTokenCountCache, tokenCountCacheFileExists } from './metrics/tokenCountCache.js'; import { prefetchCompiledTemplate } from './output/outputGenerate.js'; import { prefetchSortData, sortOutputFiles } from './output/outputSort.js'; import { produceOutput } from './packager/produceOutput.js'; @@ -56,6 +56,7 @@ const defaultDeps = { prefetchSortData, getGitDiffs, getGitLogs, + tokenCountCacheFileExists, // Lazy-load packSkill to defer importing the skill module chain // (skillSectionGenerators, skillStyle → Handlebars), which adds ~25ms // to module loading. Only used when --skill-generate is active (non-default). @@ -114,22 +115,41 @@ export const pack = async ( // // Worker count: the metrics phase wall-clock is dominated by gpt-tokenizer // throughput, so adding a 3rd warm worker pays off when the metrics phase - // is long enough to amortize its ~250ms BPE warm-up. We size on a coarse - // proxy for "long metrics phase" — whether the user constrained the file - // set via --include / config.include / --stdin (explicitFiles). When - // unconstrained (default scan), 3 warm workers measure +2.46% paired mean - // wall-clock reduction on the 1046-file workload (n=20, t=3.18, - // NODE_DISABLE_COMPILE_CACHE=1). With an explicit scope the file set is - // typically a few hundred files, the metrics phase is shorter, and the - // 3rd worker's BPE warm-up dominates the parallelism gain (-11.85% - // paired regression on the 258-file `--include 'src,tests'` workload at - // unconditional EAGER_WARMUP_THREADS=3), so we keep the original - // 2-worker sizing. + // is long enough to amortize its ~250ms BPE warm-up. We size on two coarse + // proxies for "long metrics phase": + // + // 1. `hasExplicitScope` — whether the user constrained the file set via + // --include / config.include / --stdin (explicitFiles). Explicit scopes + // typically pack a few hundred files; the metrics phase is short and + // the 3rd worker's BPE warm-up dominates the parallelism gain (-11.85% + // paired regression on the 258-file `--include 'src,tests'` workload). + // + // 2. `tokenCountCacheFileExists()` — whether the persistent disk cache + // exists from a previous run. When present, almost every per-file token + // count is served from cache and `calculateFileMetrics` returns without + // dispatching any worker tasks (see `calculateFileMetrics.ts`); the only + // real worker work is a single wrapper-token call (also cached after the + // first run). On warm-cache repeat runs (the common case), spawning 3 + // workers means 3 redundant BPE table parses (~340 ms of CPU each) that + // contend with file collection and security check on a 4-core host. + // Reducing to 2 measured -110.5 ms / -9.65 % paired wall-clock on the + // 1047-file repomix self-pack workload (n=30, t=11.97, faster=30/30, + // NODE_DISABLE_COMPILE_CACHE=1). + // Cold-cache (cache file missing) keeps the original 3-worker warmup + // so the actual file tokenizations parallelise across more workers. + // The probe is a coarse heuristic: a cache file written by a previous + // run that used a different `tokenCount.encoding` (e.g. cl100k_base + // rather than the default o200k_base) yields no hits for the current + // run, so the metrics phase will be cold-but-with-2-workers. Output + // correctness is unaffected — only the warmup parallelism is + // suboptimal — and the next run rebuilds the cache for the new + // encoding so subsequent runs hit again. // // EAGER_WARMUP_THREADS × TASKS_PER_THREAD yields maxThreads=min(cpuCount, N) // for N ∈ {2, 3}; single-CPU hosts collapse to maxThreads=1 (no change). const hasExplicitScope = (explicitFiles?.length ?? 0) > 0 || config.include.length > 0; - const EAGER_WARMUP_THREADS = hasExplicitScope ? 2 : 3; + const cacheLikelyWarm = deps.tokenCountCacheFileExists(); + const EAGER_WARMUP_THREADS = hasExplicitScope || cacheLikelyWarm ? 2 : 3; const EAGER_METRICS_NUM_TASKS = EAGER_WARMUP_THREADS * TASKS_PER_THREAD; const { taskRunner: metricsTaskRunner, warmupPromise: metricsWarmupPromise } = deps.createMetricsTaskRunner( EAGER_METRICS_NUM_TASKS, diff --git a/tests/core/packager.test.ts b/tests/core/packager.test.ts index 3afbd7ef..493df0d4 100644 --- a/tests/core/packager.test.ts +++ b/tests/core/packager.test.ts @@ -193,6 +193,10 @@ describe('packager', () => { getGitLogs: vi.fn().mockResolvedValue(undefined), prefetchSortData: vi.fn().mockResolvedValue(undefined), sortOutputFiles: vi.fn().mockImplementation((files) => files), + // Default: assume the persistent token-count cache is missing so the + // metrics warm-up sizes for the cold-cache path. Tests targeting the + // warm-cache path override this to return true. + tokenCountCacheFileExists: vi.fn().mockReturnValue(false), }, }; }; @@ -248,12 +252,15 @@ describe('packager', () => { expect(cleanup).toHaveBeenCalled(); }); - test('uses 3 warm-up workers for the metrics pool when no scope is specified', async () => { + test('uses 3 warm-up workers for the metrics pool when no scope is specified and token cache is cold', async () => { // The eager metrics warm-up sizes the pool via numOfTasks. When the user did - // not constrain the file set (no --include / no --stdin), the metrics phase is - // typically long enough to amortize a 3rd worker's BPE warm-up, so we pass - // numOfTasks = 3 * TASKS_PER_THREAD = 600 (yielding maxThreads=min(cpuCount, 3)). + // not constrain the file set (no --include / no --stdin) AND no persistent + // token-count cache file exists from a prior run, the metrics phase has to + // tokenize every file and is typically long enough to amortize a 3rd worker's + // BPE warm-up, so we pass numOfTasks = 3 * TASKS_PER_THREAD = 600 (yielding + // maxThreads=min(cpuCount, 3)). const { deps } = baseDeps(); + deps.tokenCountCacheFileExists = vi.fn().mockReturnValue(false); const config = createMockConfig(); config.include = []; @@ -264,6 +271,26 @@ describe('packager', () => { expect(deps.createSecurityTaskRunner).toHaveBeenCalled(); }); + test('uses 2 warm-up workers for the metrics pool when no scope is specified but the token cache is warm', async () => { + // When the persistent token-count cache file already exists from a previous + // run, almost every per-file token count is served from cache and the metrics + // phase performs at most a single wrapper-token tokenization (also cached on + // the second run). Spawning a 3rd worker means a redundant ~340 ms BPE table + // parse that contends with file collection and security check on the critical + // path, so the heuristic falls back to numOfTasks = 2 * TASKS_PER_THREAD = 400. + const { deps } = baseDeps(); + deps.tokenCountCacheFileExists = vi.fn().mockReturnValue(true); + const config = createMockConfig(); + config.include = []; + + await pack(['root'], config, vi.fn(), deps); + + expect(deps.createMetricsTaskRunner).toHaveBeenCalledWith(400, expect.any(String)); + // The security pool pre-warm is unaffected by the cache state — its gate is + // still `hasExplicitScope`, which is false here. + expect(deps.createSecurityTaskRunner).toHaveBeenCalled(); + }); + test('uses 2 warm-up workers for the metrics pool when --include narrows scope', async () => { // With explicit includes the file set is typically much smaller and the // 3rd worker's BPE warm-up dominates the parallelism gain (paired benchmarks