perf(core): Scan only long lines for standalone base64 runs

`hasLongBase64Run` (the cheap precondition gating the standalone-base64
truncation regex) used a newline pre-filter to bail out when every line is
shorter than the 256-char threshold, but once it found a single long line it
fell back to a `charCodeAt` scan over the *entire* file — every short line
included. A base64 run of 256 chars can never span a newline (`\n` is not a
base64 character and resets the run counter), so it must lie wholly within one
line that is itself at least 256 chars long. Every shorter line is provably
incapable of holding a qualifying run.

Replace the two-pass "find a long line, then scan everything" structure with a
single line-scoped pass: walk the content one line at a time via the native
`indexOf('\n')` and run the per-character scan only on lines that clear the
threshold. On the repomix repo ~220 of ~1,070 files contain a long line, but
only ~365 KB of their ~6 MB total content actually sits on those long lines, so
the old code character-scanned ~16x more bytes than necessary for them.

Behavior is unchanged: the function returns the same boolean for all inputs
(the old full scan already reset the run at every newline, so it only ever
found within-line runs too — it just wasted time visiting the short lines).

Verification:
- Differential fuzz of old vs new logic over 2,000,000 randomized inputs
  (base64 chars, newlines, padding, injected long runs): 0 mismatches.
- Old vs new over all 1,410 repo text files: 0 mismatches.
- Full output byte-identical (`cmp`) on xml/markdown/json when packing a fixed
  external git repo with both the base and patched builds.
- 1,453 tests pass; lint + tsgo clean. Added a regression test for a base64 run
  on a long line that follows another long non-base64 line.

Benchmark (warm cache, real packed set):
- Isolated `processFiles` phase over the 1,116 packed files, interleaved
  base-vs-patched (3 rounds): 37.3 ms -> 27.2 ms, -10.1 ms (-27%), stable.
- `hasLongBase64Run` micro over the 1,410 repo files (interleaved): 33.5 ms ->
  22.5 ms (1.49x).
- `processFiles` runs on the main thread as the long pole of its concurrent
  phase (the security check runs in worker threads), so the ~10 ms lands on the
  critical path — ~2% of the ~0.5 s warm CLI run, and far larger on repos whose
  big files mix a minified/base64 payload line with many short lines, where the
  old code scanned the whole multi-MB file instead of just its long lines.
This commit is contained in:
Claude
2026-06-10 03:39:25 +00:00
parent f748a8b2eb
commit b9d38ca6bd
2 changed files with 48 additions and 33 deletions
+28 -27
View File
@@ -24,35 +24,36 @@ const standaloneBase64Pattern = new RegExp(`([A-Za-z0-9+/]{${MIN_BASE64_LENGTH_S
const hasLongBase64Run = (content: string): boolean => {
const len = content.length;
if (len < MIN_BASE64_LENGTH_STANDALONE) return false;
// Newline pre-filter: `\n` is not a base64 character, so it always resets the
// run below. A run of `MIN_BASE64_LENGTH_STANDALONE` therefore has to fit
// inside a single line. If every line is shorter than that threshold no run is
// possible, and we can bail out before the per-character scan. `indexOf` is a
// native (memchr-style) scan, far cheaper than the charCodeAt loop, and the
// vast majority of source files have no such long line, so this skips the hot
// loop entirely for them.
// `\n` is not a base64 character, so it always resets the run counter below. A
// run of `MIN_BASE64_LENGTH_STANDALONE` therefore has to fit inside a single
// line, and only lines at least that long can possibly contain one. Walk the
// content one line at a time (via the native, memchr-style `indexOf('\n')`)
// and run the per-character scan *only* on lines that clear the threshold.
// Shorter lines provably cannot hold a qualifying run, so they are skipped
// without a charCodeAt pass — and the vast majority of source files have no
// long line at all, so the expensive loop is usually skipped entirely. The
// old code, once it found a single long line, fell back to scanning the whole
// file (every short line included); this restricts the scan to the long lines,
// which is equivalent because the run never spans a newline.
let lineStart = 0;
let newlineIndex = content.indexOf('\n');
while (newlineIndex !== -1) {
if (newlineIndex - lineStart >= MIN_BASE64_LENGTH_STANDALONE) break;
lineStart = newlineIndex + 1;
newlineIndex = content.indexOf('\n', lineStart);
}
// The final segment (after the last newline, or the whole content when there
// is none) also needs the length check before we can rule out a long run.
if (newlineIndex === -1 && len - lineStart < MIN_BASE64_LENGTH_STANDALONE) {
return false;
}
let run = 0;
for (let i = 0; i < len; i++) {
const c = content.charCodeAt(i);
// [A-Z]:65-90, [a-z]:97-122, [0-9]:48-57, '+':43, '/':47
if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || c === 43 || c === 47) {
run++;
if (run >= MIN_BASE64_LENGTH_STANDALONE) return true;
} else {
run = 0;
while (true) {
const newlineIndex = content.indexOf('\n', lineStart);
const lineEnd = newlineIndex === -1 ? len : newlineIndex;
if (lineEnd - lineStart >= MIN_BASE64_LENGTH_STANDALONE) {
let run = 0;
for (let i = lineStart; i < lineEnd; i++) {
const c = content.charCodeAt(i);
// [A-Z]:65-90, [a-z]:97-122, [0-9]:48-57, '+':43, '/':47
if ((c >= 65 && c <= 90) || (c >= 97 && c <= 122) || (c >= 48 && c <= 57) || c === 43 || c === 47) {
run++;
if (run >= MIN_BASE64_LENGTH_STANDALONE) return true;
} else {
run = 0;
}
}
}
if (newlineIndex === -1) break;
lineStart = newlineIndex + 1;
}
return false;
};
+20 -6
View File
@@ -130,7 +130,8 @@ describe('truncateBase64Content', () => {
it('should not truncate a base64-like run split across a newline', () => {
// A 320-char base64 body interrupted by a newline: neither line segment
// reaches 256, and `\n` resets the run, so nothing should be truncated.
// Guards the newline pre-filter in `hasLongBase64Run`.
// Guards the line-scoped scan in `hasLongBase64Run`, which only inspects
// lines that individually reach the threshold.
const half = longBase64.slice(0, 160);
const input = `const data = "${half}\n${half}";`;
const result = truncateBase64Content(input);
@@ -138,8 +139,8 @@ describe('truncateBase64Content', () => {
});
it('should truncate a long base64 run that follows many short lines', () => {
// Many short lines (each < 256) precede the real run, so the newline
// pre-filter must fall through to the full scan and still truncate.
// Many short lines (each < 256) precede the real run; the line-scoped scan
// must skip them and still reach the long final line to truncate.
const shortLines = 'const a = 1;\n'.repeat(50);
const input = `${shortLines}const data = "${longBase64}";`;
const result = truncateBase64Content(input);
@@ -147,9 +148,22 @@ describe('truncateBase64Content', () => {
expect(result.startsWith(shortLines)).toBe(true);
});
it('should truncate a base64 run on a long line that follows another long non-base64 line', () => {
// The scan in `hasLongBase64Run` is line-scoped: it only character-scans
// lines that reach the length threshold. An earlier long line that is NOT a
// base64 run (here a 300-char run of '-', which resets the counter every
// char) must not cause the scanner to stop — the real run on the following
// long line still has to be detected and truncated.
const longDashes = '-'.repeat(300);
const input = `${longDashes}\nconst data = "${longBase64}";`;
const result = truncateBase64Content(input);
expect(result).toContain(longDashes);
expect(result).toContain('DTJXfKHG6xA1Wn+kye4TOF2Cp8zxFjtg...');
});
it('should truncate a base64 run on a CRLF-terminated line', () => {
// The `\r` before `\n` is also non-base64; the long line must still be
// detected by the pre-filter and truncated by the full scan.
// detected by the line-scoped scan and truncated.
const input = `const data = "${longBase64}";\r\nconst next = 2;\r\n`;
const result = truncateBase64Content(input);
expect(result).toContain('DTJXfKHG6xA1Wn+kye4TOF2Cp8zxFjtg...');
@@ -157,8 +171,8 @@ describe('truncateBase64Content', () => {
});
it('should truncate a long base64 run with no newline at all', () => {
// Single-line content (no `\n`): the pre-filter treats the whole string as
// one segment and must fall through to the full scan.
// Single-line content (no `\n`): the line-scoped scan treats the whole
// string as one segment and scans it directly.
const input = `prefix-${longBase64}-suffix`;
const result = truncateBase64Content(input);
expect(result).toContain('DTJXfKHG6xA1Wn+kye4TOF2Cp8zxFjtg...');