From ccdd3027ab63eb0e4801391b879437c5a1d35b0f Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Mon, 4 May 2026 15:20:16 +0200 Subject: [PATCH] perf(read): stream lines when offset/limit provided to avoid loading entire file When offset or limit are specified, use Node.js readline streaming instead of loading the entire file into memory. This fixes the truncation issue for large files (>50KB) where the read tool would return truncated content even when requesting a small slice. - Add readLinesStreamed() for memory-efficient line reading - Add countLines() for total line count without full read - Use streaming path when offset !== undefined || limit !== undefined - Keep existing full-file read path when no offset/limit specified - Add tests for streaming behavior with large files Fixes the long-standing issue where reading large files like src/headless.ts (~50KB) with offset/limit would still hit truncation limits. --- .../pi-coding-agent/src/core/tools/read.ts | 230 +++++++++++++----- src/tests/read-tool-offset-clamp.test.ts | 67 +++++ 2 files changed, 237 insertions(+), 60 deletions(-) diff --git a/packages/pi-coding-agent/src/core/tools/read.ts b/packages/pi-coding-agent/src/core/tools/read.ts index 2f6907dba..bc171bee3 100644 --- a/packages/pi-coding-agent/src/core/tools/read.ts +++ b/packages/pi-coding-agent/src/core/tools/read.ts @@ -1,8 +1,10 @@ import type { AgentTool } from "@singularity-forge/pi-agent-core"; import type { ImageContent, TextContent } from "@singularity-forge/pi-ai"; import { type Static, Type } from "@sinclair/typebox"; +import { createReadStream } from "fs"; import { constants } from "fs"; import { access as fsAccess, readFile as fsReadFile } from "fs/promises"; +import { createInterface } from "readline"; import { formatDimensionNote, resizeImage } from "../../utils/image-resize.js"; import { detectSupportedImageMimeTypeFromFile } from "../../utils/mime.js"; import { resolveReadPath } from "./path-utils.js"; @@ -46,6 +48,99 @@ export 
interface ReadToolOptions { operations?: ReadOperations; } +/** + * Read specific lines from a file using streaming (memory-efficient). + * Only reads the requested lines without loading the entire file. + * + * @param absolutePath - Path to the file + * @param startLine - 1-indexed line to start from + * @param lineCount - Maximum number of lines to read + * @param signal - Optional abort signal + * @returns Object with lines array and total line count (if reached) + */ +async function readLinesStreamed( + absolutePath: string, + startLine: number, + lineCount: number, + signal?: AbortSignal, +): Promise<{ lines: string[]; totalLines: number }> { + const stream = createReadStream(absolutePath, { encoding: "utf-8" }); + const rl = createInterface({ input: stream }); + + const lines: string[] = []; + let currentLine = 0; + let totalLines = 0; + + try { + for await (const line of rl) { + if (signal?.aborted) { + throw new Error("Operation aborted"); + } + + currentLine++; + + // Skip lines before startLine + if (currentLine < startLine) { + continue; + } + + // Collect lines within the requested range + if (currentLine >= startLine && lines.length < lineCount) { + lines.push(line); + } + + // Stop if we've collected enough lines + if (lines.length >= lineCount) { + // We need to count remaining lines for the "total lines" info + // But we can stop reading and just count + totalLines = currentLine; + break; + } + } + + // If we didn't break early, currentLine is the total + if (totalLines === 0) { + totalLines = currentLine; + } + } finally { + stream.destroy(); + rl.close(); + } + + return { lines, totalLines }; +} + +/** + * Count total lines in a file efficiently. 
+ * + * @param absolutePath - Path to the file + * @param signal - Optional abort signal + * @returns Total number of lines + */ +async function countLines( + absolutePath: string, + signal?: AbortSignal, +): Promise<number> { + const stream = createReadStream(absolutePath, { encoding: "utf-8" }); + const rl = createInterface({ input: stream }); + + let count = 0; + + try { + for await (const _line of rl) { + if (signal?.aborted) { + throw new Error("Operation aborted"); + } + count++; + } + } finally { + stream.destroy(); + rl.close(); + } + + return count; +} + export function createReadTool(cwd: string, options?: ReadToolOptions): AgentTool { const autoResizeImages = options?.autoResizeImages ?? true; const ops = options?.operations ?? defaultReadOperations; @@ -127,74 +222,89 @@ export function createReadTool(cwd: string, options?: ReadToolOptions): AgentToo } } else { // Read as text - const buffer = await ops.readFile(absolutePath); - const textContent = buffer.toString("utf-8"); - const allLines = textContent.split("\n"); - const totalFileLines = allLines.length; - - // Apply offset if specified (1-indexed to 0-indexed) - let startLine = offset ? Math.max(0, offset - 1) : 0; - - // Clamp offset to file bounds instead of throwing (#3007). - // When an agent requests offset:30 on a 13-line file, return - // the last line with a notice rather than an error that - // propagates as invalid JSON downstream.
- let offsetClamped = false; - if (startLine >= allLines.length) { - startLine = Math.max(0, allLines.length - 1); - offsetClamped = true; - } - const startLineDisplay = startLine + 1; // For display (1-indexed) - - // If limit is specified by user, use it; otherwise we'll let truncateHead decide - let selectedContent: string; - let userLimitedLines: number | undefined; - if (limit !== undefined) { - const endLine = Math.min(startLine + limit, allLines.length); - selectedContent = allLines.slice(startLine, endLine).join("\n"); - userLimitedLines = endLine - startLine; - } else { - selectedContent = allLines.slice(startLine).join("\n"); - } - - // Apply truncation (respects both line and byte limits) - const truncation = truncateHead(selectedContent); - let outputText: string; + let details: ReadToolDetails | undefined; - if (truncation.firstLineExceedsLimit) { - // First line at offset exceeds 30KB - tell model to use bash - const firstLineSize = formatSize(Buffer.byteLength(allLines[startLine], "utf-8")); - outputText = `[Line ${startLineDisplay} is ${firstLineSize}, exceeds ${formatSize(DEFAULT_MAX_BYTES)} limit. Use bash: sed -n '${startLineDisplay}p' ${path} | head -c ${DEFAULT_MAX_BYTES}]`; - details = { truncation }; - } else if (truncation.truncated) { - // Truncation occurred - build actionable notice - const endLineDisplay = startLineDisplay + truncation.outputLines - 1; - const nextOffset = endLineDisplay + 1; + // Use streaming when offset or limit are specified (memory-efficient for large files) + if (offset !== undefined || limit !== undefined) { + const startLine = offset ? Math.max(1, offset) : 1; + const lineCount = limit ?? DEFAULT_MAX_LINES; - outputText = truncation.content; + // First, count total lines for the notice + const totalFileLines = await countLines(absolutePath, signal); - if (truncation.truncatedBy === "lines") { - outputText += `\n\n[Showing lines ${startLineDisplay}-${endLineDisplay} of ${totalFileLines}. 
Use offset=${nextOffset} to continue.]`; - } else { - outputText += `\n\n[Showing lines ${startLineDisplay}-${endLineDisplay} of ${totalFileLines} (${formatSize(DEFAULT_MAX_BYTES)} limit). Use offset=${nextOffset} to continue.]`; + // Clamp offset to file bounds (#3007) + let effectiveStartLine = startLine; + let offsetClamped = false; + if (effectiveStartLine > totalFileLines) { + effectiveStartLine = Math.max(1, totalFileLines); + offsetClamped = true; } - details = { truncation }; - } else if (userLimitedLines !== undefined && startLine + userLimitedLines < allLines.length) { - // User specified limit, there's more content, but no truncation - const remaining = allLines.length - (startLine + userLimitedLines); - const nextOffset = startLine + userLimitedLines + 1; - outputText = truncation.content; - outputText += `\n\n[${remaining} more lines in file. Use offset=${nextOffset} to continue.]`; + // Stream only the requested lines + const { lines } = await readLinesStreamed(absolutePath, effectiveStartLine, lineCount, signal); + const selectedContent = lines.join("\n"); + + // Apply truncation to the selected content + const truncation = truncateHead(selectedContent); + + if (truncation.firstLineExceedsLimit) { + const firstLineSize = formatSize(Buffer.byteLength(lines[0] ?? "", "utf-8")); + outputText = `[Line ${effectiveStartLine} is ${firstLineSize}, exceeds ${formatSize(DEFAULT_MAX_BYTES)} limit. Use bash: sed -n '${effectiveStartLine}p' ${path} | head -c ${DEFAULT_MAX_BYTES}]`; + details = { truncation }; + } else if (truncation.truncated) { + const endLineDisplay = effectiveStartLine + truncation.outputLines - 1; + const nextOffset = endLineDisplay + 1; + + outputText = truncation.content; + + if (truncation.truncatedBy === "lines") { + outputText += `\n\n[Showing lines ${effectiveStartLine}-${endLineDisplay} of ${totalFileLines}. 
Use offset=${nextOffset} to continue.]`; + } else { + outputText += `\n\n[Showing lines ${effectiveStartLine}-${endLineDisplay} of ${totalFileLines} (${formatSize(DEFAULT_MAX_BYTES)} limit). Use offset=${nextOffset} to continue.]`; + } + details = { truncation }; + } else if (lines.length >= lineCount && effectiveStartLine + lines.length - 1 < totalFileLines) { + // User limit reached, more content available + const nextOffset = effectiveStartLine + lines.length; + const remaining = totalFileLines - (effectiveStartLine + lines.length - 1); + + outputText = truncation.content; + outputText += `\n\n[${remaining} more lines in file. Use offset=${nextOffset} to continue.]`; + } else { + outputText = truncation.content; + } + + if (offsetClamped) { + outputText = `[Offset ${offset} beyond end of file (${totalFileLines} lines). Clamped to line ${effectiveStartLine}.]\n\n${outputText}`; + } } else { - // No truncation, no user limit exceeded - outputText = truncation.content; - } + // No offset/limit - read entire file (existing behavior) + const buffer = await ops.readFile(absolutePath); + const textContent = buffer.toString("utf-8"); + const allLines = textContent.split("\n"); + const totalFileLines = allLines.length; - // Prepend clamp notice so the agent knows offset was adjusted - if (offsetClamped) { - outputText = `[Offset ${offset} beyond end of file (${totalFileLines} lines). Clamped to line ${startLineDisplay}.]\n\n${outputText}`; + const truncation = truncateHead(textContent); + + if (truncation.firstLineExceedsLimit) { + outputText = `[File exceeds ${formatSize(DEFAULT_MAX_BYTES)} limit. Use bash: head -c ${DEFAULT_MAX_BYTES} ${path}]`; + details = { truncation }; + } else if (truncation.truncated) { + const endLineDisplay = truncation.outputLines; + const nextOffset = endLineDisplay + 1; + + outputText = truncation.content; + + if (truncation.truncatedBy === "lines") { + outputText += `\n\n[Showing lines 1-${endLineDisplay} of ${totalFileLines}. 
Use offset=${nextOffset} to continue.]`; + } else { + outputText += `\n\n[Showing lines 1-${endLineDisplay} of ${totalFileLines} (${formatSize(DEFAULT_MAX_BYTES)} limit). Use offset=${nextOffset} to continue.]`; + } + details = { truncation }; + } else { + outputText = truncation.content; + } } content = [{ type: "text", text: outputText }]; diff --git a/src/tests/read-tool-offset-clamp.test.ts b/src/tests/read-tool-offset-clamp.test.ts index 4c07c15ab..5746211ed 100644 --- a/src/tests/read-tool-offset-clamp.test.ts +++ b/src/tests/read-tool-offset-clamp.test.ts @@ -118,3 +118,70 @@ test("read tool: clamped offset includes notice about adjustment", async (t) => `should indicate offset was clamped, got: ${text.slice(0, 200)}`, ); }); + +test("read tool: streaming with offset and limit on large file reads only requested lines", async (t) => { + const { dir, cleanup } = makeTmpDir(); + afterEach(cleanup); + + // Create a 1000-line file + writeLines(dir, "large-file.txt", 1000); + + const readTool = createReadTool(dir); + const result = await readTool.execute("test-call", { + path: "large-file.txt", + offset: 100, + limit: 10, + }); + + const text = (result.content[0] as any).text as string; + + // Should include lines 100-109 + assert.ok(text.includes("Line 100:"), "should include line 100"); + assert.ok(text.includes("Line 109:"), "should include line 109"); + + // Should NOT include line 1 or line 200 + assert.ok(!text.includes("Line 1:"), "should NOT include line 1"); + assert.ok(!text.includes("Line 200:"), "should NOT include line 200"); + + // Should include continuation notice + assert.ok( + text.includes("more lines") || text.includes("offset="), + "should include continuation notice", + ); +}); + +test("read tool: streaming with limit only reads first N lines", async (t) => { + const { dir, cleanup } = makeTmpDir(); + afterEach(cleanup); + writeLines(dir, "medium-file.txt", 500); + + const readTool = createReadTool(dir); + const result = await 
readTool.execute("test-call", { + path: "medium-file.txt", + limit: 5, + }); + + const text = (result.content[0] as any).text as string; + + assert.ok(text.includes("Line 1:"), "should include line 1"); + assert.ok(text.includes("Line 5:"), "should include line 5"); + assert.ok(!text.includes("Line 6:"), "should NOT include line 6"); +}); + +test("read tool: streaming with offset only reads from offset to end", async (t) => { + const { dir, cleanup } = makeTmpDir(); + afterEach(cleanup); + writeLines(dir, "offset-only.txt", 100); + + const readTool = createReadTool(dir); + const result = await readTool.execute("test-call", { + path: "offset-only.txt", + offset: 95, + }); + + const text = (result.content[0] as any).text as string; + + assert.ok(text.includes("Line 95:"), "should include line 95"); + assert.ok(text.includes("Line 100:"), "should include line 100"); + assert.ok(!text.includes("Line 94:"), "should NOT include line 94"); +});