mirror of
https://github.com/yamadashy/repomix.git
synced 2026-05-30 11:18:53 +02:00
test(core): Add multi-root duplicate-path spec to guard pack output integrity
intent(regression-net): PR #1562 (token-count cache) initially merged FileMetrics by file path via a Map, which silently collapsed entries when the same relative path appeared under multiple roots (packages/a/README.md vs packages/b/README.md both become README.md). Fixed by index-based assembly; this spec prevents any future optimization from re-introducing the collapse through a different layer (Set dedup, sortedFile uniqueing, output rendering, etc.) decision(content-sentinel): assert on unique content markers rather than `totalFiles` count — both roots' actual bytes reaching the pack output is the user-observable contract; counts couple the test to internal accounting constraint(existing-quirk): `sortedFilePathsByDir` in packager.ts inflates `totalFiles` by passing duplicate filename entries through each root's filter (string-equality on globally-flattened sorted paths). This is pre-existing; spec is intentionally count-agnostic to avoid coupling to that unrelated behavior. Added a bounded-duplication assertion (`<=2`) as a catch-all so a regression that explodes the duplication further still fails Covered: - xml style: both roots' READMEs and their src-side markers reach the output - plain style: same contract holds (style-agnostic rendering path) - duplication does not exceed the pre-existing 2x ceiling Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,178 @@
|
||||
import fs from 'node:fs/promises';
|
||||
import os from 'node:os';
|
||||
import path from 'node:path';
|
||||
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
||||
import { mergeConfigs } from '../../../src/config/configLoad.js';
|
||||
import type { RepomixConfigFile, RepomixConfigMerged, RepomixOutputStyle } from '../../../src/config/configSchema.js';
|
||||
import { collectFiles } from '../../../src/core/file/fileCollect.js';
|
||||
import { readRawFile } from '../../../src/core/file/fileRead.js';
|
||||
import { searchFiles } from '../../../src/core/file/fileSearch.js';
|
||||
import type { ProcessedFile } from '../../../src/core/file/fileTypes.js';
|
||||
import fileProcessWorker from '../../../src/core/file/workers/fileProcessWorker.js';
|
||||
import { produceOutput } from '../../../src/core/packager/produceOutput.js';
|
||||
import { pack } from '../../../src/core/packager.js';
|
||||
import { filterOutUntrustedFiles } from '../../../src/core/security/filterOutUntrustedFiles.js';
|
||||
import { validateFileSafety } from '../../../src/core/security/validateFileSafety.js';
|
||||
|
||||
// Behavior-level regression tests for multi-root packing.
|
||||
//
|
||||
// PR #1562 (token-count cache) initially merged FileMetrics results by file
|
||||
// path via a Map, which silently collapsed entries when the same relative path
|
||||
// appeared under multiple roots (e.g. `packages/a/README.md` and
|
||||
// `packages/b/README.md` both relativize to `README.md`). That bug was fixed
|
||||
// at the metrics layer with index-based assembly. The metrics layer is mocked
|
||||
// out below, so this spec does NOT re-catch the metrics-layer flavor — that
|
||||
// is owned by calculateFileMetrics.test.ts.
|
||||
//
|
||||
// What this spec catches instead is a content-side collapse in any layer that
|
||||
// operates on the *flattened* (multi-root) file list — `processFiles`,
|
||||
// `produceOutput`, or any intermediate assembly between them. A Map<path, ...>
|
||||
// dedup at that level would drop one root's bytes from the file we hand to
|
||||
// the LLM. The unique content sentinels per root make that visible.
|
||||
//
|
||||
// Out of scope:
|
||||
// - `collectFiles` itself is called once per root, so a per-call internal
|
||||
// dedup-by-path would not collapse cross-root copies and would not be
|
||||
// detected here.
|
||||
// - A collapse in the file-tree generator alone would still let both root
|
||||
// bodies reach the files section — see tests around
|
||||
// `generateTreeStringWithRoots` for tree-only guarantees.
|
||||
|
||||
const buildMergedConfig = (
|
||||
cwd: string,
|
||||
outputFilePath: string,
|
||||
style: RepomixOutputStyle,
|
||||
overrides: Partial<RepomixConfigFile> = {},
|
||||
): RepomixConfigMerged =>
|
||||
mergeConfigs(cwd, overrides, {
|
||||
output: {
|
||||
filePath: outputFilePath,
|
||||
style,
|
||||
git: { sortByChanges: false, includeDiffs: false, includeLogs: false },
|
||||
},
|
||||
});
|
||||
|
||||
const runPack = (rootDirs: string[], config: RepomixConfigMerged) =>
|
||||
pack(rootDirs, config, () => {}, {
|
||||
searchFiles,
|
||||
sortPaths: (filePaths) => filePaths,
|
||||
collectFiles: (filePaths, root, cfg, progressCallback) =>
|
||||
collectFiles(filePaths, root, cfg, progressCallback, { readRawFile }),
|
||||
processFiles: async (rawFiles, cfg) => {
|
||||
const processedFiles: ProcessedFile[] = [];
|
||||
for (const rawFile of rawFiles) {
|
||||
processedFiles.push(await fileProcessWorker({ rawFile, config: cfg }));
|
||||
}
|
||||
return processedFiles;
|
||||
},
|
||||
// No secrets in these fixtures — short-circuit the security scan to keep
|
||||
// the test fast and focused on the multi-root assembly path.
|
||||
validateFileSafety: (rawFiles, progressCallback, cfg, gitDiff, gitLog) =>
|
||||
validateFileSafety(rawFiles, progressCallback, cfg, gitDiff, gitLog, {
|
||||
runSecurityCheck: async () => [],
|
||||
filterOutUntrustedFiles,
|
||||
}),
|
||||
produceOutput,
|
||||
createMetricsTaskRunner: () => ({
|
||||
taskRunner: { run: async () => 0, cleanup: async () => {} },
|
||||
warmupPromise: Promise.resolve(),
|
||||
}),
|
||||
calculateMetrics: async (processedFiles) => ({
|
||||
totalFiles: processedFiles.length,
|
||||
totalCharacters: processedFiles.reduce((acc, f) => acc + f.content.length, 0),
|
||||
totalTokens: 0,
|
||||
gitDiffTokenCount: 0,
|
||||
gitLogTokenCount: 0,
|
||||
fileCharCounts: Object.fromEntries(processedFiles.map((f) => [f.path, f.content.length])),
|
||||
fileTokenCounts: Object.fromEntries(processedFiles.map((f) => [f.path, 0])),
|
||||
suspiciousFilesResults: [],
|
||||
suspiciousGitDiffResults: [],
|
||||
}),
|
||||
});
|
||||
|
||||
const countOccurrences = (haystack: string, needle: string): number => {
|
||||
let count = 0;
|
||||
let index = haystack.indexOf(needle);
|
||||
while (index !== -1) {
|
||||
count++;
|
||||
index = haystack.indexOf(needle, index + needle.length);
|
||||
}
|
||||
return count;
|
||||
};
|
||||
|
||||
describe('multi-root pack spec', () => {
|
||||
let rootA: string;
|
||||
let rootB: string;
|
||||
let outputDir: string;
|
||||
let outputPath: string;
|
||||
|
||||
beforeEach(async () => {
|
||||
rootA = await fs.mkdtemp(path.join(os.tmpdir(), 'repomix-multiroot-a-'));
|
||||
rootB = await fs.mkdtemp(path.join(os.tmpdir(), 'repomix-multiroot-b-'));
|
||||
outputDir = await fs.mkdtemp(path.join(os.tmpdir(), 'repomix-multiroot-out-'));
|
||||
outputPath = path.join(outputDir, 'output.xml');
|
||||
|
||||
// Same relative path in each root, but unique content so a collapsing
|
||||
// implementation produces an obviously-wrong output.
|
||||
await fs.writeFile(path.join(rootA, 'README.md'), '# Root A\nMARKER_FROM_ROOT_A\n');
|
||||
await fs.mkdir(path.join(rootA, 'src'));
|
||||
await fs.writeFile(path.join(rootA, 'src', 'index.ts'), 'export const fromA = "ROOT_A_SRC_MARKER";\n');
|
||||
|
||||
await fs.writeFile(path.join(rootB, 'README.md'), '# Root B (totally different)\nMARKER_FROM_ROOT_B\n');
|
||||
await fs.mkdir(path.join(rootB, 'lib'));
|
||||
await fs.writeFile(path.join(rootB, 'lib', 'util.ts'), 'export const fromB = "ROOT_B_LIB_MARKER";\n');
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await fs.rm(rootA, { recursive: true, force: true });
|
||||
await fs.rm(rootB, { recursive: true, force: true });
|
||||
await fs.rm(outputDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
// Note on `totalFiles`: a separate pre-existing quirk in `sortedFilePathsByDir`
|
||||
// means the same relative path appears in each root's collect input (e.g.
|
||||
// `README.md` is filtered from the global sorted list per-root, so duplicates
|
||||
// pass twice). That inflates `result.totalFiles` beyond 4. This spec is
|
||||
// intentionally narrow: it verifies the *unique content* contract — both
|
||||
// roots' real bytes reach the packed output — and does not assert on
|
||||
// counts, which would couple it to that unrelated quirk.
|
||||
|
||||
it('preserves both unique contents of a duplicate relative path when packing multiple roots (xml)', async () => {
|
||||
const config = buildMergedConfig(outputDir, outputPath, 'xml');
|
||||
await runPack([rootA, rootB], config);
|
||||
|
||||
const output = await fs.readFile(outputPath, 'utf-8');
|
||||
// The contract: each root's bytes survive into the pack. A
|
||||
// Map<path, FileMetrics> collapse — the one PR #1562 fixed in the metrics
|
||||
// layer — would drop one of these markers entirely.
|
||||
expect(output).toContain('MARKER_FROM_ROOT_A');
|
||||
expect(output).toContain('MARKER_FROM_ROOT_B');
|
||||
expect(output).toContain('ROOT_A_SRC_MARKER');
|
||||
expect(output).toContain('ROOT_B_LIB_MARKER');
|
||||
});
|
||||
|
||||
it('preserves both unique contents in plain style too', async () => {
|
||||
// Plain style takes a different rendering path; the assertion is
|
||||
// style-agnostic because the collapse, if reintroduced, would surface
|
||||
// identically in both.
|
||||
const config = buildMergedConfig(outputDir, outputPath, 'plain');
|
||||
await runPack([rootA, rootB], config);
|
||||
|
||||
const output = await fs.readFile(outputPath, 'utf-8');
|
||||
expect(output).toContain('MARKER_FROM_ROOT_A');
|
||||
expect(output).toContain('MARKER_FROM_ROOT_B');
|
||||
});
|
||||
|
||||
// Sanity check on the deduplication direction: each unique sentinel must
|
||||
// appear at most twice (the pre-existing `sortedFilePathsByDir` quirk causes
|
||||
// each root's bytes to be packed twice). If an over-eager perf change starts
|
||||
// multiplying entries further, this catches it.
|
||||
it('does not balloon a duplicated path beyond the existing 2x quirk', async () => {
|
||||
const config = buildMergedConfig(outputDir, outputPath, 'xml');
|
||||
await runPack([rootA, rootB], config);
|
||||
|
||||
const output = await fs.readFile(outputPath, 'utf-8');
|
||||
expect(countOccurrences(output, 'MARKER_FROM_ROOT_A')).toBeLessThanOrEqual(2);
|
||||
expect(countOccurrences(output, 'MARKER_FROM_ROOT_B')).toBeLessThanOrEqual(2);
|
||||
});
|
||||
});
|
||||
@@ -110,7 +110,7 @@ describe('security scan spec', () => {
|
||||
tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), 'repomix-security-spec-'));
|
||||
outputPath = path.join(tmpDir, 'output.xml');
|
||||
// Fixture: one clean file + one file carrying a known secretlint trigger.
|
||||
await fs.writeFile(path.join(tmpDir, 'clean.ts'), 'export const greet = (n: string) => `hi ${n}`;\n');
|
||||
await fs.writeFile(path.join(tmpDir, 'clean.ts'), 'export const greet = (n: string) => "hi " + n;\n');
|
||||
await fs.writeFile(path.join(tmpDir, 'leaky.env'), `${FAKE_AWS_SECRET}\n`);
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user