mirror of
https://github.com/yamadashy/repomix.git
synced 2026-05-30 11:18:53 +02:00
c8560d1bc6
Lower `EAGER_WARMUP_THREADS` from 2 to 1 when `tokenCountCacheFileExists()` returns true.

With the persistent token-count disk cache populated by a prior run, `calculateFileMetrics` serves every per-file token count from the in-memory map and dispatches zero worker tasks. The only worker work that survives caching on a warm rerun is a small fixed set of dispatches:

- the wrapper-token tokenization (cache hit after run #2)
- git diff staged/worktree token counts (only when `output.git.includeDiffs` is enabled)
- git log token count (only when `output.git.includeLogs` is enabled)

That worst case is 2-3 short tasks (a few KB each) that fit a single warm worker serially in well under 30 ms. Spawning a second warm worker means a redundant ~340 ms BPE table parse that contends with the file-collection main thread for CPU AND extends the final `pool.destroy()` blocking wait (BPE-loaded workers take ~21 ms to terminate vs ~3 ms when idle).

Cold-cache (no cache file) behavior is preserved: the unscoped path keeps 3 warm workers and the explicit-scope path keeps 2, so the actual file tokenizations still parallelise across the original worker counts.

The probe is a coarse heuristic — a cache file written by a previous run that used a different `tokenCount.encoding` (e.g. cl100k_base instead of the default o200k_base) yields no hits for the current run, so the metrics phase pays one BPE parse sequentially on the critical path before tokenizing files. This is a one-time cost on encoding switches; subsequent runs rebuild the cache for the new encoding and hit again.
Benchmark (paired, n=25, repomix self-pack on 1068 files):

WARM CACHE (cache file present)
  BASELINE  mean=968.9ms  median=976.0ms  sd=40.3ms
  AFTER     mean=883.2ms  median=875.0ms  sd=33.1ms
  DELTA     mean=85.6 ms (8.84%)  median=87.0 ms  sd=42.7  t=10.02 (df=24)  faster=24/25

COLD CACHE (cache file deleted before each run, n=12)
  BASELINE  mean=1606.3ms  median=1588.0ms  sd=58.6ms
  AFTER     mean=1593.2ms  median=1598.5ms  sd=58.6ms
  DELTA     mean=13.2 ms (0.82%)  t=0.62  faster=9/12 — within noise

Stacks on top of the existing warm-cache wins on this branch (token-count disk cache, output-wrapper cache, prefetched template, native ignore-file prescan, etc.); this single change pushes warm-cache wall-clock another ~86 ms below the previous floor.
350 lines
15 KiB
TypeScript
350 lines
15 KiB
TypeScript
import path from 'node:path';
|
||
import { beforeEach, describe, expect, test, vi } from 'vitest';
|
||
import { pack } from '../../src/core/packager.js';
|
||
import { createMockConfig } from '../testing/testUtils.js';
|
||
|
||
// Mock both specifiers of the promise-based fs module. No factory is supplied, so vitest's
// default mock resolution applies to each of them.
vi.mock('node:fs/promises');
vi.mock('fs/promises');

// Replace the TokenCounter module with a stub: the encoding list is exported as plain data, and
// the `TokenCounter` mock hands back an object whose `countTokens` mock returns 10 and whose
// `free` is a bare mock function.
vi.mock('../../src/core/metrics/TokenCounter.js', () => {
  const buildCounterStub = () => ({
    countTokens: vi.fn().mockReturnValue(10),
    free: vi.fn(),
  });

  return {
    TOKEN_ENCODINGS: ['o200k_base', 'cl100k_base', 'p50k_base', 'p50k_edit', 'r50k_base'],
    TokenCounter: vi.fn().mockImplementation(buildCounterStub),
  };
});
|
||
|
||
describe('packager', () => {
|
||
// Start every test from a clean slate by resetting all mocks.
beforeEach(function resetMocksBetweenTests() {
  vi.resetAllMocks();
});
|
||
|
||
test('pack should orchestrate packing files and generating output', async () => {
  // Two-file fixture: one file at the root and one under dir1 (joined with the platform separator).
  const nestedFilePath = path.join('dir1', 'file2.txt');
  const filePaths = ['file1.txt', nestedFilePath];
  const rawFiles = [
    { path: 'file1.txt', content: 'raw content 1' },
    { path: nestedFilePath, content: 'raw content 2' },
  ];
  const processedFiles = [
    { path: 'file1.txt', content: 'processed content 1' },
    { path: nestedFilePath, content: 'processed content 2' },
  ];
  const generatedOutput = 'mock output';

  // Canned results for the individual pipeline stages.
  const safetyResult = {
    safeFilePaths: filePaths,
    safeRawFiles: rawFiles,
    suspiciousFilesResults: [],
    suspiciousGitDiffResults: [],
    suspiciousGitLogResults: [],
  };
  const metricsResult = {
    totalFiles: 2,
    totalCharacters: 11,
    totalTokens: 10,
    fileCharCounts: {
      'file1.txt': 19,
      [nestedFilePath]: 19,
    },
    fileTokenCounts: {
      'file1.txt': 10,
      [nestedFilePath]: 10,
    },
    gitDiffTokenCount: 0,
    gitLogTokenCount: 0,
  };

  // Both task-runner factories return the `{ taskRunner, warmupPromise }` shape.
  const metricsRunner = {
    taskRunner: {
      run: vi.fn().mockResolvedValue(0),
      cleanup: vi.fn().mockResolvedValue(undefined),
    },
    warmupPromise: Promise.resolve(),
  };
  const securityRunner = {
    taskRunner: {
      run: vi.fn().mockResolvedValue([]),
      cleanup: vi.fn().mockResolvedValue(undefined),
    },
    warmupPromise: Promise.resolve(),
  };

  const deps = {
    searchFiles: vi.fn().mockResolvedValue({ filePaths, emptyDirPaths: [] }),
    sortPaths: vi.fn().mockImplementation((paths) => paths),
    collectFiles: vi.fn().mockResolvedValue({ rawFiles, skippedFiles: [] }),
    processFiles: vi.fn().mockReturnValue(processedFiles),
    validateFileSafety: vi.fn().mockResolvedValue(safetyResult),
    produceOutput: vi.fn().mockResolvedValue({ outputForMetrics: generatedOutput }),
    createMetricsTaskRunner: vi.fn().mockReturnValue(metricsRunner),
    createSecurityTaskRunner: vi.fn().mockReturnValue(securityRunner),
    calculateMetrics: vi.fn().mockResolvedValue(metricsResult),
  };

  const config = createMockConfig();
  const onProgress = vi.fn();

  const result = await pack(['root'], config, onProgress, deps);

  // Discovery and collection receive the root directory, the config and the progress callback.
  expect(deps.searchFiles).toHaveBeenCalledWith('root', config, undefined);
  expect(deps.collectFiles).toHaveBeenCalledWith(filePaths, 'root', config, onProgress);

  // Every downstream stage ran at least once.
  for (const stage of [deps.validateFileSafety, deps.processFiles, deps.produceOutput, deps.calculateMetrics]) {
    expect(stage).toHaveBeenCalled();
  }

  // ...and each one was handed the arguments produced by the stages before it.
  expect(deps.validateFileSafety).toHaveBeenCalledWith(
    rawFiles,
    onProgress,
    config,
    undefined,
    undefined,
    expect.objectContaining({ taskRunner: expect.any(Object) }),
  );
  expect(deps.processFiles).toHaveBeenCalledWith(rawFiles, config, onProgress);
  expect(deps.produceOutput).toHaveBeenCalledWith(
    ['root'],
    config,
    processedFiles,
    filePaths,
    undefined,
    undefined,
    onProgress,
    [{ rootLabel: 'root', files: filePaths }],
    undefined,
  );
  expect(deps.calculateMetrics).toHaveBeenCalledWith(
    processedFiles,
    expect.anything(),
    onProgress,
    config,
    undefined,
    undefined,
    expect.objectContaining({ taskRunner: expect.anything() }),
  );

  // The second calculateMetrics argument is a promise; it must resolve to the generated output.
  const [, outputForMetrics] = deps.calculateMetrics.mock.calls[0];
  await expect(outputForMetrics).resolves.toBe(generatedOutput);

  // pack() surfaces the metrics figures unchanged and reports no skipped files.
  expect(result.totalFiles).toBe(2);
  expect(result.totalCharacters).toBe(11);
  expect(result.totalTokens).toBe(10);
  expect(result.fileCharCounts).toEqual({
    'file1.txt': 19,
    [nestedFilePath]: 19,
  });
  expect(result.fileTokenCounts).toEqual({
    'file1.txt': 10,
    [nestedFilePath]: 10,
  });
  expect(result.skippedFiles).toEqual([]);
});
|
||
|
||
describe('parallel error handling', () => {
|
||
// pack() overlaps several stages (security check alongside file processing, output generation
// alongside metrics). Adding another parallel branch can easily break error propagation or
// worker cleanup, so the tests in this group pin that behaviour down.
const singleFilePaths = ['file1.txt'];
const singleRawFiles = [{ path: 'file1.txt', content: 'raw' }];
const singleProcessedFiles = [{ path: 'file1.txt', content: 'processed' }];

/**
 * Builds a fresh, fully mocked dependency set for a single test.
 *
 * @returns `{ cleanup, deps }`: `deps` is the dependency object to pass to `pack()`, and
 *   `cleanup` is the spy installed as the metrics task runner's `cleanup`, exposed so tests
 *   can assert that the metrics pool was torn down.
 */
const baseDeps = () => {
  const cleanup = vi.fn().mockResolvedValue(undefined);

  const safetyResult = {
    safeFilePaths: singleFilePaths,
    safeRawFiles: singleRawFiles,
    suspiciousFilesResults: [],
    suspiciousGitDiffResults: [],
    suspiciousGitLogResults: [],
  };
  const metricsRunner = {
    taskRunner: { run: vi.fn().mockResolvedValue(0), cleanup },
    warmupPromise: Promise.resolve(),
  };
  const securityRunner = {
    taskRunner: { run: vi.fn().mockResolvedValue([]), cleanup: vi.fn().mockResolvedValue(undefined) },
    warmupPromise: Promise.resolve(),
  };

  const deps = {
    searchFiles: vi.fn().mockResolvedValue({ filePaths: singleFilePaths, emptyDirPaths: [] }),
    sortPaths: vi.fn().mockImplementation((p) => p),
    collectFiles: vi.fn().mockResolvedValue({ rawFiles: singleRawFiles, skippedFiles: [] }),
    processFiles: vi.fn().mockResolvedValue(singleProcessedFiles),
    validateFileSafety: vi.fn().mockResolvedValue(safetyResult),
    produceOutput: vi.fn().mockResolvedValue({ outputForMetrics: 'output' }),
    // Like the real calculateMetrics, this stub awaits the outputForMetrics promise, so a
    // produceOutput rejection surfaces through it rather than turning into an unhandled rejection.
    calculateMetrics: vi.fn().mockImplementation(async (_files, outputPromise) => {
      await outputPromise;
      return {
        totalFiles: 1,
        totalCharacters: 9,
        totalTokens: 1,
        fileCharCounts: { 'file1.txt': 9 },
        fileTokenCounts: { 'file1.txt': 1 },
        gitDiffTokenCount: 0,
        gitLogTokenCount: 0,
      };
    }),
    createMetricsTaskRunner: vi.fn().mockReturnValue(metricsRunner),
    createSecurityTaskRunner: vi.fn().mockReturnValue(securityRunner),
    getGitDiffs: vi.fn().mockResolvedValue(undefined),
    getGitLogs: vi.fn().mockResolvedValue(undefined),
    prefetchSortData: vi.fn().mockResolvedValue(undefined),
    sortOutputFiles: vi.fn().mockImplementation((files) => files),
    // By default the persistent token-count cache is reported as absent, so the metrics
    // warm-up is sized for the cold-cache path. Warm-cache tests replace this with a stub
    // that returns true.
    tokenCountCacheFileExists: vi.fn().mockReturnValue(false),
  };

  return { cleanup, deps };
};
|
||
|
||
test('cleans up the metrics worker pool when searchFiles rejects', async () => {
  // The metrics pool is constructed ahead of searchFiles so that its BPE warm-up overlaps the
  // glob scan. If searchFiles then rejects, the enclosing try/finally still has to run the
  // pool cleanup.
  const { cleanup: metricsCleanup, deps: failingDeps } = baseDeps();
  failingDeps.searchFiles = vi.fn().mockRejectedValue(new Error('search failed'));

  const packPromise = pack(['root'], createMockConfig(), vi.fn(), failingDeps);

  await expect(packPromise).rejects.toThrow('search failed');
  expect(metricsCleanup).toHaveBeenCalled();
});
|
||
|
||
test('cleans up the metrics worker pool when validateFileSafety rejects', async () => {
  // A failing security check must reject pack() and still tear down the metrics pool.
  const { cleanup: metricsCleanup, deps: failingDeps } = baseDeps();
  failingDeps.validateFileSafety = vi.fn().mockRejectedValue(new Error('security check failed'));

  const packPromise = pack(['root'], createMockConfig(), vi.fn(), failingDeps);

  await expect(packPromise).rejects.toThrow('security check failed');
  expect(metricsCleanup).toHaveBeenCalled();
});
|
||
|
||
test('cleans up the metrics worker pool when produceOutput rejects', async () => {
  // A failing output stage must reject pack() and still tear down the metrics pool.
  const { cleanup: metricsCleanup, deps: failingDeps } = baseDeps();
  failingDeps.produceOutput = vi.fn().mockRejectedValue(new Error('output failed'));

  const packPromise = pack(['root'], createMockConfig(), vi.fn(), failingDeps);

  await expect(packPromise).rejects.toThrow('output failed');
  expect(metricsCleanup).toHaveBeenCalled();
});
|
||
|
||
test('cleans up the metrics worker pool when calculateMetrics rejects', async () => {
  // A failing metrics stage must reject pack() and still tear down the metrics pool.
  const { cleanup: metricsCleanup, deps: failingDeps } = baseDeps();
  failingDeps.calculateMetrics = vi.fn().mockRejectedValue(new Error('metrics failed'));

  const packPromise = pack(['root'], createMockConfig(), vi.fn(), failingDeps);

  await expect(packPromise).rejects.toThrow('metrics failed');
  expect(metricsCleanup).toHaveBeenCalled();
});
|
||
|
||
test('a prefetchSortData failure does not block the pipeline', async () => {
  const { cleanup: metricsCleanup, deps: depsWithFailingPrefetch } = baseDeps();
  depsWithFailingPrefetch.prefetchSortData = vi.fn().mockRejectedValue(new Error('git failed'));

  const packResult = await pack(['root'], createMockConfig(), vi.fn(), depsWithFailingPrefetch);

  // The prefetch rejection is tolerated: pack() resolves, the sort step still runs, and the
  // metrics pool is cleaned up as usual.
  expect(packResult.totalFiles).toBe(1);
  expect(depsWithFailingPrefetch.sortOutputFiles).toHaveBeenCalled();
  expect(metricsCleanup).toHaveBeenCalled();
});
|
||
|
||
test('uses 3 warm-up workers for the metrics pool when no scope is specified and token cache is cold', async () => {
  // numOfTasks is what sizes the eagerly warmed metrics pool. With an unconstrained file set
  // (no --include, no --stdin) and no persistent token-count cache file left by an earlier
  // run, every file has to be tokenized. That phase is typically long enough to pay back a
  // third worker's BPE warm-up, so pack() asks for 3 * TASKS_PER_THREAD = 600 tasks, which
  // yields maxThreads = min(cpuCount, 3).
  const expectedNumOfTasks = 600;
  const { deps: coldCacheDeps } = baseDeps();
  coldCacheDeps.tokenCountCacheFileExists = vi.fn().mockReturnValue(false);
  const unscopedConfig = Object.assign(createMockConfig(), { include: [] });

  await pack(['root'], unscopedConfig, vi.fn(), coldCacheDeps);

  expect(coldCacheDeps.createMetricsTaskRunner).toHaveBeenCalledWith(expectedNumOfTasks, expect.any(String));
  // On the unscoped path the same gate also turns on the security pool pre-warm.
  expect(coldCacheDeps.createSecurityTaskRunner).toHaveBeenCalled();
});
|
||
|
||
test('uses 1 warm-up worker for the metrics pool when no scope is specified but the token cache is warm', async () => {
  // If a previous run left the persistent token-count cache file behind, nearly all per-file
  // token counts come from the cache. The only dispatches that survive caching are a small
  // fixed set: the wrapper-token tokenization (a cache hit after run #2) and, when opted in,
  // the git diff staged/worktree and git log token counts. In the worst case that is 2–3
  // short tasks, which one warm worker handles serially in well under 30 ms.
  //
  // Each extra warm worker would parse the ~2.2 MB o200k_base BPE table (~340 ms of pure
  // CPU), competing with the file-collection main thread and lengthening the final
  // `pool.destroy()` (BPE-loaded workers take ~21 ms to terminate vs ~3 ms when idle).
  // The heuristic therefore drops to numOfTasks = 1 * TASKS_PER_THREAD = 200, which yields
  // maxThreads = min(cpuCount, 1) = 1 and a single warmup task.
  const expectedNumOfTasks = 200;
  const { deps: warmCacheDeps } = baseDeps();
  warmCacheDeps.tokenCountCacheFileExists = vi.fn().mockReturnValue(true);
  const unscopedConfig = Object.assign(createMockConfig(), { include: [] });

  await pack(['root'], unscopedConfig, vi.fn(), warmCacheDeps);

  expect(warmCacheDeps.createMetricsTaskRunner).toHaveBeenCalledWith(expectedNumOfTasks, expect.any(String));
  // Cache state does not influence the security pool pre-warm: it is still gated on
  // `hasExplicitScope`, which is false in this scenario.
  expect(warmCacheDeps.createSecurityTaskRunner).toHaveBeenCalled();
});
|
||
|
||
test('uses 2 warm-up workers for the metrics pool when --include narrows scope', async () => {
  // Explicit includes usually mean a much smaller file set, where a third worker's BPE
  // warm-up costs more than the extra parallelism gains (paired benchmarks regressed by ~12%
  // on the 258-file --include 'src,tests' workload). The heuristic therefore settles on 2
  // warm workers. The same `hasExplicitScope` gate also switches off the security pool
  // pre-warm here, because its up-front cost outweighs the saved cold-start on small or
  // scoped runs.
  const expectedNumOfTasks = 400;
  const { deps: scopedDeps } = baseDeps();
  const scopedConfig = Object.assign(createMockConfig(), { include: ['src'] });

  await pack(['root'], scopedConfig, vi.fn(), scopedDeps);

  expect(scopedDeps.createMetricsTaskRunner).toHaveBeenCalledWith(expectedNumOfTasks, expect.any(String));
  expect(scopedDeps.createSecurityTaskRunner).not.toHaveBeenCalled();
});
|
||
|
||
test('uses 2 warm-up workers for the metrics pool when explicitFiles is provided (--stdin)', async () => {
  const expectedNumOfTasks = 400;
  const explicitFiles = ['file1.txt', 'file2.txt'];
  const { deps: stdinDeps } = baseDeps();
  const configWithoutIncludes = Object.assign(createMockConfig(), { include: [] });

  await pack(['root'], configWithoutIncludes, vi.fn(), stdinDeps, explicitFiles);

  expect(stdinDeps.createMetricsTaskRunner).toHaveBeenCalledWith(expectedNumOfTasks, expect.any(String));
  // The security pool pre-warm sits behind the same `hasExplicitScope` gate. When the file
  // set arrives through --stdin (explicitFiles), the pre-warm is skipped, because building
  // the pool costs more than the cold-start it would save on small runs.
  expect(stdinDeps.createSecurityTaskRunner).not.toHaveBeenCalled();
});
|
||
|
||
test('cleans up the metrics worker pool even when the warmup promise rejects', async () => {
  const { cleanup: metricsCleanup, deps } = baseDeps();

  // Create the rejected warm-up promise first and attach a no-op handler straight away, so
  // the rejection counts as observed before pack() reaches `await metricsWarmupPromise`.
  // Without this, vitest's unhandled-rejection detector can flag it eagerly.
  // NOTE(review): per the original author's note, packager.ts attaches an equivalent
  // `.catch(() => {})` in production; confirm against the current packager source.
  const rejectedWarmup = Promise.reject(new Error('warmup failed'));
  rejectedWarmup.catch(() => {});

  deps.createMetricsTaskRunner = vi.fn().mockReturnValue({
    taskRunner: { run: vi.fn().mockResolvedValue(0), cleanup: metricsCleanup },
    warmupPromise: rejectedWarmup,
  });

  const packPromise = pack(['root'], createMockConfig(), vi.fn(), deps);

  await expect(packPromise).rejects.toThrow('warmup failed');
  expect(metricsCleanup).toHaveBeenCalled();
});
|
||
});
|
||
});
|