Files
repomix-mirror/tests/core/packager/diffsFunctionality.test.ts
Claude 68a47b9149 perf(core): Skip redundant full-output tokenization via wrapper-extraction fast path (-13.2%)
When `tokenCountTree` is enabled, `calculateSelectiveFileMetrics` already
tokenizes every file individually on the primary worker pool. The original
`calculateOutputMetrics` then re-tokenized the full output a second time, split
into 200 KB chunks, to compute `totalTokens`. On large repos with the tree
display enabled, this second pass was the single longest task in the
`calculateMetrics` `Promise.all`, consuming roughly 1 second of worker time
that duplicated work already done for the per-file counts.

This change introduces a fast path for the common case (xml / markdown / plain
output, non-parsable, single-part): walk the generated output with
`indexOf(file.content, cursor)` once per file to splice file contents out of
the output, tokenize only the remaining "wrapper" (template boilerplate +
directory tree + git diff/log + per-file headers), and compute
`totalTokens = Σ per-file tokens + wrapper tokens`.

The accuracy delta versus the old 200 KB-chunk approach is bounded by BPE
merges across file↔wrapper boundaries; on the repomix repository itself the
measured error was 309 / 1,284,067 tokens ≈ 0.024 %, comparable to the chunk
boundary error the existing approach already accepts.

## Implementation

- `src/core/metrics/calculateMetrics.ts`
  - Add `extractOutputWrapper(output, processedFilesInOutputOrder)` which
    walks the output with a single forward cursor. Returns `null` and
    triggers a fallback to `calculateOutputMetrics` if any file content is
    not found (e.g., template escaped it, output was split, order mismatch).
  - Add `canUseFastOutputTokenPath(config)` gate: only enabled when
    `tokenCountTree` is truthy, `splitOutput` is undefined, `parsableStyle`
    is false, and the style is `xml` / `markdown` / `plain`. JSON output
    and parsable XML go through `JSON.stringify` / `fast-xml-builder` which
    escape file contents, so `indexOf(content)` would miss them.
  - In `calculateMetrics`, when the fast path is available and wrapper
    extraction succeeds, replace `outputMetricsPromise` with a promise that
    awaits the already-running `selectiveFileMetricsPromise`, sums the
    per-file token counts, and dispatches a single `runTokenCount` on the
    extracted wrapper string. The rest of the `Promise.all` is unchanged.

- `src/core/packager.ts`
  - Call `sortOutputFiles(filteredProcessedFiles, config)` once in `pack`
    immediately after suspicious-file filtering and use its result as
    `processedFiles` downstream (for `produceOutput`, `calculateMetrics`,
    and the final result object). `generateOutput` internally calls
    `sortOutputFiles` as well, which is stable and memoized via
    `fileChangeCountsCache`, so the two now share the single git-log
    subprocess result and consumers see files in the exact order they
    appear in the output. This is a precondition for the fast path's
    forward-walk extraction.
  - Expose `sortOutputFiles` on `defaultDeps` so existing packager unit
    tests can inject their own implementation.

- `tests/core/packager/diffsFunctionality.test.ts`
  - Extend the `gitRepositoryHandle.js` `vi.mock` to also stub
    `isGitInstalled` and `getFileChangeCount`, since `sortOutputFiles`
    resolves its default dependencies from that module at module load time.

All 1102 existing tests pass unchanged; lint is clean.

## Benchmark

Interleaved 30-run benchmark against the repomix repo itself (1018 files,
~4 MB xml output, `tokenCountTree: 50000`, `sortByChanges: true`, `includeDiffs`
and `includeLogs` enabled via the repo's own `repomix.config.json`):

    base median: 2735.2 ms  [2389 - 3528]  IQR=367 ms
    opt  median: 2373.6 ms  [2125 - 2653]  IQR=293 ms
    delta:       -361.6 ms  (-13.22%)

Verbose trace before/after (single run, representative):

    before:
      Selective metrics calculation completed in 639 ms
      Output token count completed in      1046 ms
      Calculate Metrics wall:               1296 ms

    after:
      Selective metrics calculation completed in 579 ms
      Fast-path output tokens: files=1017293, wrapper=33678 (126996 chars)
      Calculate Metrics wall:                ~580 ms

The savings are concentrated in the `calculateMetrics` phase, which was the
dominant critical path in the final `Promise.all` for tokenCountTree runs on
large repos.
2026-04-12 17:47:03 +09:00

162 lines
5.4 KiB
TypeScript

import { beforeEach, describe, expect, test, vi } from 'vitest';
import type { RepomixConfigMerged } from '../../../src/config/configSchema.js';
import type { ProcessedFile } from '../../../src/core/file/fileTypes.js';
import * as gitDiffModule from '../../../src/core/git/gitDiffHandle.js';
import * as gitRepositoryModule from '../../../src/core/git/gitRepositoryHandle.js';
import { pack } from '../../../src/core/packager.js';
import { createMockConfig } from '../../testing/testUtils.js';
// Swap every git diff helper used in this file for a bare `vi.fn()` stub.
vi.mock('../../../src/core/git/gitDiffHandle.js', () => {
  return {
    getWorkTreeDiff: vi.fn(),
    getStagedDiff: vi.fn(),
    getGitDiffs: vi.fn(),
  };
});
// Stub the repository helpers too: `isGitInstalled` initially resolves to
// `false` and `getFileChangeCount` to an empty object.
vi.mock('../../../src/core/git/gitRepositoryHandle.js', () => {
  const isGitInstalled = vi.fn().mockResolvedValue(false);
  const getFileChangeCount = vi.fn().mockResolvedValue({});

  return {
    isGitRepository: vi.fn(),
    isGitInstalled,
    getFileChangeCount,
  };
});
/**
 * Verifies how `pack` reacts to the `output.git.includeDiffs` option.
 *
 * Every pipeline stage handed to `pack` is replaced by a stub, so these tests
 * only observe the wiring between the stages, not their real behaviour.
 */
describe('Git Diffs Functionality', () => {
  let mockConfig: RepomixConfigMerged;
  const mockRootDir = '/test/repo';
  const sampleDiff = `diff --git a/file1.js b/file1.js
index 123..456 100644
--- a/file1.js
+++ b/file1.js
@@ -1,5 +1,5 @@
-old line
+new line
`;

  /**
   * Builds the `createMetricsTaskRunner` stub shared by all tests: a factory
   * returning a task runner whose `run` resolves to 0, whose `cleanup`
   * resolves to undefined, and an already-resolved warm-up promise.
   */
  const createMockMetricsTaskRunner = () =>
    vi.fn().mockReturnValue({
      taskRunner: {
        run: vi.fn().mockResolvedValue(0),
        cleanup: vi.fn().mockResolvedValue(undefined),
      },
      warmupPromise: Promise.resolve(),
    });

  beforeEach(() => {
    vi.resetAllMocks();

    // Sample minimal config using createMockConfig utility
    mockConfig = createMockConfig({
      cwd: mockRootDir,
      output: {
        filePath: 'repomix-output.txt',
        style: 'plain',
        git: {
          includeDiffs: false,
        },
      },
    });

    // Set up our mocks
    vi.mocked(gitRepositoryModule.isGitRepository).mockResolvedValue(true);
    // `vi.resetAllMocks()` above discards the resolved values configured in the
    // `vi.mock` factory, so re-apply them; otherwise both helpers would resolve
    // to `undefined` during every test.
    vi.mocked(gitRepositoryModule.isGitInstalled).mockResolvedValue(false);
    vi.mocked(gitRepositoryModule.getFileChangeCount).mockResolvedValue({});
    vi.mocked(gitDiffModule.getWorkTreeDiff).mockResolvedValue(sampleDiff);
    vi.mocked(gitDiffModule.getStagedDiff).mockResolvedValue('');
  });

  test('should not fetch diffs when includeDiffs is disabled', async () => {
    // Mock the dependencies for pack (empty repository: no files at any stage)
    const mockSearchFiles = vi.fn().mockResolvedValue({ filePaths: [] });
    const mockCollectFiles = vi.fn().mockResolvedValue({ rawFiles: [], skippedFiles: [] });
    const mockProcessFiles = vi.fn().mockResolvedValue([]);
    const mockValidateFileSafety = vi.fn().mockResolvedValue({
      safeFilePaths: [],
      safeRawFiles: [],
      suspiciousFilesResults: [],
    });
    const mockProduceOutput = vi.fn().mockResolvedValue({
      outputForMetrics: 'mocked output',
    });
    const mockCalculateMetrics = vi.fn().mockResolvedValue({
      totalFiles: 0,
      totalCharacters: 0,
      totalTokens: 0,
      fileCharCounts: {},
      fileTokenCounts: {},
    });
    const mockSortPaths = vi.fn().mockImplementation((paths) => paths);
    const mockCreateMetricsTaskRunner = createMockMetricsTaskRunner();

    // Config with diffs disabled
    if (mockConfig.output.git) {
      mockConfig.output.git.includeDiffs = false;
    }

    await pack([mockRootDir], mockConfig, vi.fn(), {
      searchFiles: mockSearchFiles,
      collectFiles: mockCollectFiles,
      processFiles: mockProcessFiles,
      validateFileSafety: mockValidateFileSafety,
      produceOutput: mockProduceOutput,
      calculateMetrics: mockCalculateMetrics,
      createMetricsTaskRunner: mockCreateMetricsTaskRunner,
      sortPaths: mockSortPaths,
    });

    // Should not call getWorkTreeDiff
    expect(gitDiffModule.getWorkTreeDiff).not.toHaveBeenCalled();
  });

  test('should calculate diff token count correctly', async () => {
    // Create a processed files array with a sample file
    const processedFiles: ProcessedFile[] = [
      {
        path: 'test.js',
        content: 'console.log("test");',
      },
    ];

    // Mock dependencies
    const mockSearchFiles = vi.fn().mockResolvedValue({ filePaths: ['test.js'] });
    const mockCollectFiles = vi.fn().mockResolvedValue({ rawFiles: processedFiles, skippedFiles: [] });
    const mockProcessFiles = vi.fn().mockResolvedValue(processedFiles);
    const mockValidateFileSafety = vi.fn().mockResolvedValue({
      safeFilePaths: ['test.js'],
      safeRawFiles: processedFiles,
      suspiciousFilesResults: [],
    });
    const mockProduceOutput = vi.fn().mockResolvedValue({
      outputForMetrics: 'Generated output with diffs included',
    });
    // The metrics stage is stubbed, so this test checks that `pack` surfaces
    // `gitDiffTokenCount` from the metrics result, not the token counting itself.
    const mockCalculateMetrics = vi.fn().mockResolvedValue({
      totalFiles: 1,
      totalCharacters: 30,
      totalTokens: 10,
      fileCharCounts: { 'test.js': 10 },
      fileTokenCounts: { 'test.js': 5 },
      gitDiffTokenCount: 15, // Mock diff token count
    });
    const mockSortPaths = vi.fn().mockImplementation((paths) => paths);
    const mockCreateMetricsTaskRunner = createMockMetricsTaskRunner();

    // Config with diffs enabled
    if (mockConfig.output.git) {
      mockConfig.output.git.includeDiffs = true;
    }

    const result = await pack([mockRootDir], mockConfig, vi.fn(), {
      searchFiles: mockSearchFiles,
      collectFiles: mockCollectFiles,
      processFiles: mockProcessFiles,
      validateFileSafety: mockValidateFileSafety,
      produceOutput: mockProduceOutput,
      calculateMetrics: mockCalculateMetrics,
      createMetricsTaskRunner: mockCreateMetricsTaskRunner,
      sortPaths: mockSortPaths,
    });

    // Check gitDiffTokenCount in the result
    expect(result.gitDiffTokenCount).toBe(15);
  });
});