mirror of
https://github.com/yamadashy/repomix.git
synced 2026-06-11 15:37:16 +02:00
68a47b9149
When `tokenCountTree` is enabled, `calculateSelectiveFileMetrics` already
tokenizes every file individually on the primary worker pool. The original
`calculateOutputMetrics` then re-tokenized the full output a second time, split
into 200 KB chunks, to compute `totalTokens`. On large repos with the tree
display enabled, this second pass was the single longest task in the
`calculateMetrics` `Promise.all`, consuming roughly 1 second of worker time
that duplicated work already done for the per-file counts.
This change introduces a fast path for the common case (xml / markdown / plain
output, non-parsable, single-part): walk the generated output with
`indexOf(file.content, cursor)` once per file to splice file contents out of
the output, tokenize only the remaining "wrapper" (template boilerplate +
directory tree + git diff/log + per-file headers), and compute
`totalTokens = Σ per-file tokens + wrapper tokens`.
The accuracy delta versus the old 200 KB-chunk approach is bounded by BPE
merges across file↔wrapper boundaries; on the repomix repository itself the
measured error was 309 / 1,284,067 tokens ≈ 0.024 %, comparable to the chunk
boundary error the existing approach already accepts.
## Implementation
- `src/core/metrics/calculateMetrics.ts`
- Add `extractOutputWrapper(output, processedFilesInOutputOrder)` which
walks the output with a single forward cursor. Returns `null` and
triggers a fallback to `calculateOutputMetrics` if any file content is
not found (e.g., template escaped it, output was split, order mismatch).
- Add `canUseFastOutputTokenPath(config)` gate: only enabled when
`tokenCountTree` is truthy, `splitOutput` is undefined, `parsableStyle`
is false, and the style is `xml` / `markdown` / `plain`. JSON output
and parsable XML go through `JSON.stringify` / `fast-xml-builder` which
escape file contents, so `indexOf(content)` would miss them.
- In `calculateMetrics`, when the fast path is available and wrapper
extraction succeeds, replace `outputMetricsPromise` with a promise that
awaits the already-running `selectiveFileMetricsPromise`, sums the
per-file token counts, and dispatches a single `runTokenCount` on the
extracted wrapper string. The rest of the `Promise.all` is unchanged.
- `src/core/packager.ts`
- Call `sortOutputFiles(filteredProcessedFiles, config)` once in `pack`
immediately after suspicious-file filtering and use its result as
`processedFiles` downstream (for `produceOutput`, `calculateMetrics`,
and the final result object). `generateOutput` internally calls
`sortOutputFiles` as well, which is stable and memoized via
`fileChangeCountsCache`, so the two now share the single git-log
subprocess result and consumers see files in the exact order they
appear in the output. This is a precondition for the fast path's
forward-walk extraction.
- Expose `sortOutputFiles` on `defaultDeps` so existing packager unit
tests can inject their own implementation.
- `tests/core/packager/diffsFunctionality.test.ts`
- Extend the `gitRepositoryHandle.js` `vi.mock` to also stub
`isGitInstalled` and `getFileChangeCount`, since `sortOutputFiles`
resolves its default dependencies from that module at module load time.
All 1102 existing tests pass unchanged; lint is clean.
## Benchmark
Interleaved 30-run benchmark against the repomix repo itself (1018 files,
~4 MB xml output, `tokenCountTree: 50000`, `sortByChanges: true`, `includeDiffs`
and `includeLogs` enabled via the repo's own `repomix.config.json`):
base median: 2735.2 ms [2389 - 3528] IQR=367 ms
opt median: 2373.6 ms [2125 - 2653] IQR=293 ms
delta: -361.6 ms (-13.22%)
Verbose trace before/after (single run, representative):
before:
Selective metrics calculation completed in 639 ms
Output token count completed in 1046 ms
Calculate Metrics wall: 1296 ms
after:
Selective metrics calculation completed in 579 ms
Fast-path output tokens: files=1017293, wrapper=33678 (126996 chars)
Calculate Metrics wall: ~580 ms
The savings are concentrated in the `calculateMetrics` phase, which was the
dominant critical path in the final `Promise.all` for tokenCountTree runs on
large repos.
162 lines
5.4 KiB
TypeScript
162 lines
5.4 KiB
TypeScript
import { beforeEach, describe, expect, test, vi } from 'vitest';
|
|
import type { RepomixConfigMerged } from '../../../src/config/configSchema.js';
|
|
import type { ProcessedFile } from '../../../src/core/file/fileTypes.js';
|
|
import * as gitDiffModule from '../../../src/core/git/gitDiffHandle.js';
|
|
import * as gitRepositoryModule from '../../../src/core/git/gitRepositoryHandle.js';
|
|
import { pack } from '../../../src/core/packager.js';
|
|
import { createMockConfig } from '../../testing/testUtils.js';
|
|
|
|
// Module-level mocks. Each factory returns a fresh object of vi.fn() spies that
// stands in for the real module's exports.

// Git diff helpers: bare spies, configured per test run in beforeEach.
vi.mock('../../../src/core/git/gitDiffHandle.js', () => {
  return {
    getWorkTreeDiff: vi.fn(),
    getStagedDiff: vi.fn(),
    getGitDiffs: vi.fn(),
  };
});

// Git repository helpers. isGitInstalled and getFileChangeCount start out with
// resolved values (false and an empty object respectively); isGitRepository is
// left unconfigured here.
vi.mock('../../../src/core/git/gitRepositoryHandle.js', () => {
  return {
    isGitRepository: vi.fn(),
    isGitInstalled: vi.fn().mockResolvedValue(false),
    getFileChangeCount: vi.fn().mockResolvedValue({}),
  };
});
|
|
|
|
/**
 * Exercises `pack` with every pipeline stage injected as a mock, checking how
 * it behaves with git diffs disabled (no work-tree diff is fetched) and
 * enabled (the diff token count reported by calculateMetrics is surfaced on
 * the result).
 */
describe('Git Diffs Functionality', () => {
  let mockConfig: RepomixConfigMerged;
  const mockRootDir = '/test/repo';
  const sampleDiff = `diff --git a/file1.js b/file1.js
index 123..456 100644
--- a/file1.js
+++ b/file1.js
@@ -1,5 +1,5 @@
-old line
+new line
`;

  // Builds the createMetricsTaskRunner stand-in used by both tests: a task
  // runner whose run() resolves to 0 and whose cleanup() resolves to undefined,
  // plus an already-resolved warmup promise.
  const buildMetricsTaskRunnerMock = () =>
    vi.fn().mockReturnValue({
      taskRunner: {
        run: vi.fn().mockResolvedValue(0),
        cleanup: vi.fn().mockResolvedValue(undefined),
      },
      warmupPromise: Promise.resolve(),
    });

  beforeEach(() => {
    // Clears call history AND implementations of every mock, including the
    // resolved values configured inside the vi.mock factories for this file.
    vi.resetAllMocks();

    // Sample minimal config using createMockConfig utility
    mockConfig = createMockConfig({
      cwd: mockRootDir,
      output: {
        filePath: 'repomix-output.txt',
        style: 'plain',
        git: {
          includeDiffs: false,
        },
      },
    });

    // Set up our mocks
    vi.mocked(gitRepositoryModule.isGitRepository).mockResolvedValue(true);
    // Re-apply the stubs that resetAllMocks just wiped; without this both
    // helpers would return undefined instead of the values set in the factory.
    vi.mocked(gitRepositoryModule.isGitInstalled).mockResolvedValue(false);
    vi.mocked(gitRepositoryModule.getFileChangeCount).mockResolvedValue({});
    vi.mocked(gitDiffModule.getWorkTreeDiff).mockResolvedValue(sampleDiff);
    vi.mocked(gitDiffModule.getStagedDiff).mockResolvedValue('');
  });

  test('should not fetch diffs when includeDiffs is disabled', async () => {
    // Mock the dependencies for pack (empty repository: no files at any stage)
    const mockSearchFiles = vi.fn().mockResolvedValue({ filePaths: [] });
    const mockCollectFiles = vi.fn().mockResolvedValue({ rawFiles: [], skippedFiles: [] });
    const mockProcessFiles = vi.fn().mockResolvedValue([]);
    const mockValidateFileSafety = vi.fn().mockResolvedValue({
      safeFilePaths: [],
      safeRawFiles: [],
      suspiciousFilesResults: [],
    });
    const mockProduceOutput = vi.fn().mockResolvedValue({
      outputForMetrics: 'mocked output',
    });
    const mockCalculateMetrics = vi.fn().mockResolvedValue({
      totalFiles: 0,
      totalCharacters: 0,
      totalTokens: 0,
      fileCharCounts: {},
      fileTokenCounts: {},
    });
    const mockSortPaths = vi.fn().mockImplementation((paths) => paths);
    const mockCreateMetricsTaskRunner = buildMetricsTaskRunnerMock();

    // Config with diffs disabled
    if (mockConfig.output.git) {
      mockConfig.output.git.includeDiffs = false;
    }

    await pack([mockRootDir], mockConfig, vi.fn(), {
      searchFiles: mockSearchFiles,
      collectFiles: mockCollectFiles,
      processFiles: mockProcessFiles,
      validateFileSafety: mockValidateFileSafety,
      produceOutput: mockProduceOutput,
      calculateMetrics: mockCalculateMetrics,
      createMetricsTaskRunner: mockCreateMetricsTaskRunner,
      sortPaths: mockSortPaths,
    });

    // Should not call getWorkTreeDiff
    expect(gitDiffModule.getWorkTreeDiff).not.toHaveBeenCalled();
  });

  test('should calculate diff token count correctly', async () => {
    // Create a processed files array with a sample file
    const processedFiles: ProcessedFile[] = [
      {
        path: 'test.js',
        content: 'console.log("test");',
      },
    ];

    // Mock dependencies
    const mockSearchFiles = vi.fn().mockResolvedValue({ filePaths: ['test.js'] });
    const mockCollectFiles = vi.fn().mockResolvedValue({ rawFiles: processedFiles, skippedFiles: [] });
    const mockProcessFiles = vi.fn().mockResolvedValue(processedFiles);
    const mockValidateFileSafety = vi.fn().mockResolvedValue({
      safeFilePaths: ['test.js'],
      safeRawFiles: processedFiles,
      suspiciousFilesResults: [],
    });
    const mockProduceOutput = vi.fn().mockResolvedValue({
      outputForMetrics: 'Generated output with diffs included',
    });
    const mockCalculateMetrics = vi.fn().mockResolvedValue({
      totalFiles: 1,
      totalCharacters: 30,
      totalTokens: 10,
      fileCharCounts: { 'test.js': 10 },
      fileTokenCounts: { 'test.js': 5 },
      gitDiffTokenCount: 15, // Mock diff token count
    });
    const mockSortPaths = vi.fn().mockImplementation((paths) => paths);
    const mockCreateMetricsTaskRunner = buildMetricsTaskRunnerMock();

    // Config with diffs enabled
    if (mockConfig.output.git) {
      mockConfig.output.git.includeDiffs = true;
    }

    const result = await pack([mockRootDir], mockConfig, vi.fn(), {
      searchFiles: mockSearchFiles,
      collectFiles: mockCollectFiles,
      processFiles: mockProcessFiles,
      validateFileSafety: mockValidateFileSafety,
      produceOutput: mockProduceOutput,
      calculateMetrics: mockCalculateMetrics,
      createMetricsTaskRunner: mockCreateMetricsTaskRunner,
      sortPaths: mockSortPaths,
    });

    // Check gitDiffTokenCount in the result
    expect(result.gitDiffTokenCount).toBe(15);
  });
});
|