diff --git a/apps/sim/app/api/knowledge/route.ts b/apps/sim/app/api/knowledge/route.ts index 31951276176..20499ce8fce 100644 --- a/apps/sim/app/api/knowledge/route.ts +++ b/apps/sim/app/api/knowledge/route.ts @@ -15,14 +15,6 @@ import { captureServerEvent } from '@/lib/posthog/server' const logger = createLogger('KnowledgeBaseAPI') -/** - * Schema for creating a knowledge base - * - * Chunking config units: - * - maxSize: tokens (1 token ≈ 4 characters) - * - minSize: characters - * - overlap: tokens (1 token ≈ 4 characters) - */ const CreateKnowledgeBaseSchema = z.object({ name: z.string().min(1, 'Name is required'), description: z.string().optional(), @@ -31,12 +23,20 @@ const CreateKnowledgeBaseSchema = z.object({ embeddingDimension: z.literal(1536).default(1536), chunkingConfig: z .object({ - /** Maximum chunk size in tokens (1 token ≈ 4 characters) */ maxSize: z.number().min(100).max(4000).default(1024), - /** Minimum chunk size in characters */ minSize: z.number().min(1).max(2000).default(100), - /** Overlap between chunks in tokens (1 token ≈ 4 characters) */ overlap: z.number().min(0).max(500).default(200), + strategy: z + .enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']) + .default('auto') + .optional(), + strategyOptions: z + .object({ + pattern: z.string().max(500).optional(), + separators: z.array(z.string()).optional(), + recipe: z.enum(['plain', 'markdown', 'code']).optional(), + }) + .optional(), }) .default({ maxSize: 1024, @@ -45,13 +45,31 @@ const CreateKnowledgeBaseSchema = z.object({ }) .refine( (data) => { - // Convert maxSize from tokens to characters for comparison (1 token ≈ 4 chars) const maxSizeInChars = data.maxSize * 4 return data.minSize < maxSizeInChars }, { message: 'Min chunk size (characters) must be less than max chunk size (tokens × 4)', } + ) + .refine( + (data) => { + return data.overlap < data.maxSize + }, + { + message: 'Overlap must be less than max chunk size', + } + ) + .refine( + (data) => { + if 
(data.strategy === 'regex' && !data.strategyOptions?.pattern) { + return false + } + return true + }, + { + message: 'Regex pattern is required when using the regex chunking strategy', + } ), }) diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx index 5ddb7eb6a20..a731e38e0da 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/add-documents-modal/add-documents-modal.tsx @@ -263,7 +263,8 @@ export function AddDocumentsModal({ {isDragging ? 'Drop files here' : 'Drop files here or click to browse'} - PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB each) + PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSONL (max 100MB + each) diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx index a4e1e44ebc9..e6884cc332d 100644 --- a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx +++ b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-base-modal/create-base-modal.tsx @@ -9,6 +9,8 @@ import { useForm } from 'react-hook-form' import { z } from 'zod' import { Button, + Combobox, + type ComboboxOption, Input, Label, Modal, @@ -18,6 +20,7 @@ import { ModalHeader, Textarea, } from '@/components/emcn' +import type { StrategyOptions } from '@/lib/chunkers/types' import { cn } from '@/lib/core/utils/cn' import { formatFileSize, validateKnowledgeBaseFile } from '@/lib/uploads/utils/file-utils' import { ACCEPT_ATTRIBUTE } from '@/lib/uploads/utils/validation' @@ -35,6 +38,20 @@ interface CreateBaseModalProps { onOpenChange: (open: 
boolean) => void } +const STRATEGY_OPTIONS = [ + { value: 'auto', label: 'Auto (detect from content)' }, + { value: 'text', label: 'Text (word boundary splitting)' }, + { value: 'recursive', label: 'Recursive (configurable separators)' }, + { value: 'sentence', label: 'Sentence' }, + { value: 'token', label: 'Token (fixed-size)' }, + { value: 'regex', label: 'Regex (custom pattern)' }, +] as const + +const STRATEGY_COMBOBOX_OPTIONS: ComboboxOption[] = STRATEGY_OPTIONS.map((o) => ({ + label: o.label, + value: o.value, +})) + const FormSchema = z .object({ name: z @@ -43,25 +60,24 @@ const FormSchema = z .max(100, 'Name must be less than 100 characters') .refine((value) => value.trim().length > 0, 'Name cannot be empty'), description: z.string().max(500, 'Description must be less than 500 characters').optional(), - /** Minimum chunk size in characters */ minChunkSize: z .number() .min(1, 'Min chunk size must be at least 1 character') .max(2000, 'Min chunk size must be less than 2000 characters'), - /** Maximum chunk size in tokens (1 token ≈ 4 characters) */ maxChunkSize: z .number() .min(100, 'Max chunk size must be at least 100 tokens') .max(4000, 'Max chunk size must be less than 4000 tokens'), - /** Overlap between chunks in tokens */ overlapSize: z .number() .min(0, 'Overlap must be non-negative') .max(500, 'Overlap must be less than 500 tokens'), + strategy: z.enum(['auto', 'text', 'regex', 'recursive', 'sentence', 'token']).default('auto'), + regexPattern: z.string().optional(), + customSeparators: z.string().optional(), }) .refine( (data) => { - // Convert maxChunkSize from tokens to characters for comparison (1 token ≈ 4 chars) const maxChunkSizeInChars = data.maxChunkSize * 4 return data.minChunkSize < maxChunkSizeInChars }, @@ -70,6 +86,27 @@ const FormSchema = z path: ['minChunkSize'], } ) + .refine( + (data) => { + return data.overlapSize < data.maxChunkSize + }, + { + message: 'Overlap must be less than max chunk size', + path: ['overlapSize'], + } + ) 
+ .refine( + (data) => { + if (data.strategy === 'regex' && !data.regexPattern?.trim()) { + return false + } + return true + }, + { + message: 'Regex pattern is required when using the regex strategy', + path: ['regexPattern'], + } + ) type FormValues = z.infer @@ -124,6 +161,7 @@ export const CreateBaseModal = memo(function CreateBaseModal({ handleSubmit, reset, watch, + setValue, formState: { errors }, } = useForm({ resolver: zodResolver(FormSchema), @@ -133,11 +171,15 @@ export const CreateBaseModal = memo(function CreateBaseModal({ minChunkSize: 100, maxChunkSize: 1024, overlapSize: 200, + strategy: 'auto', + regexPattern: '', + customSeparators: '', }, mode: 'onSubmit', }) const nameValue = watch('name') + const strategyValue = watch('strategy') useEffect(() => { if (open) { @@ -153,6 +195,9 @@ export const CreateBaseModal = memo(function CreateBaseModal({ minChunkSize: 100, maxChunkSize: 1024, overlapSize: 200, + strategy: 'auto', + regexPattern: '', + customSeparators: '', }) } }, [open, reset]) @@ -255,6 +300,17 @@ export const CreateBaseModal = memo(function CreateBaseModal({ setSubmitStatus(null) try { + const strategyOptions: StrategyOptions | undefined = + data.strategy === 'regex' && data.regexPattern + ? { pattern: data.regexPattern } + : data.strategy === 'recursive' && data.customSeparators?.trim() + ? { + separators: data.customSeparators + .split(',') + .map((s) => s.trim().replace(/\\n/g, '\n').replace(/\\t/g, '\t')), + } + : undefined + const newKnowledgeBase = await createKnowledgeBaseMutation.mutateAsync({ name: data.name, description: data.description || undefined, @@ -263,6 +319,8 @@ export const CreateBaseModal = memo(function CreateBaseModal({ maxSize: data.maxChunkSize, minSize: data.minChunkSize, overlap: data.overlapSize, + ...(data.strategy !== 'auto' && { strategy: data.strategy }), + ...(strategyOptions && { strategyOptions }), }, }) @@ -312,7 +370,6 @@ export const CreateBaseModal = memo(function CreateBaseModal({
- {/* Hidden decoy fields to prevent browser autofill */}
+
+ + setValue('strategy', value as FormValues['strategy'])} + dropdownWidth='trigger' + align='start' + /> +

+ Auto detects the best strategy based on file content type. +

+
+ + {strategyValue === 'regex' && ( +
+ + + {errors.regexPattern && ( +

+ {errors.regexPattern.message} +

+ )} +

+ Text will be split at each match of this regex pattern. +

+
+ )} + + {strategyValue === 'recursive' && ( +
+ + +

+ Comma-separated list of delimiters in priority order. Leave empty for default + separators. +

+
+ )} +
diff --git a/apps/sim/hooks/queries/kb/knowledge.ts b/apps/sim/hooks/queries/kb/knowledge.ts index 455d762ecab..e1d3343a57d 100644 --- a/apps/sim/hooks/queries/kb/knowledge.ts +++ b/apps/sim/hooks/queries/kb/knowledge.ts @@ -1,6 +1,7 @@ import { createLogger } from '@sim/logger' import { keepPreviousData, useMutation, useQuery, useQueryClient } from '@tanstack/react-query' import { toast } from '@/components/emcn' +import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types' import type { ChunkData, ChunksPagination, @@ -338,10 +339,7 @@ export interface DocumentChunkSearchParams { search: string } -/** - * Fetches all chunks matching a search query by paginating through results. - * This is used for search functionality where we need all matching chunks. - */ +/** Paginates through all matching chunks rather than returning a single page. */ export async function fetchAllDocumentChunks( { knowledgeBaseId, documentId, search }: DocumentChunkSearchParams, signal?: AbortSignal @@ -376,10 +374,6 @@ export const serializeSearchParams = (params: DocumentChunkSearchParams) => search: params.search, }) -/** - * Hook to search for chunks in a document. - * Fetches all matching chunks and returns them for client-side pagination. 
- */ export function useDocumentChunkSearchQuery( params: DocumentChunkSearchParams, options?: { @@ -707,6 +701,8 @@ export interface CreateKnowledgeBaseParams { maxSize: number minSize: number overlap: number + strategy?: ChunkingStrategy + strategyOptions?: StrategyOptions } } diff --git a/apps/sim/lib/chunkers/docs-chunker.ts b/apps/sim/lib/chunkers/docs-chunker.ts index 8ec203b6501..ddfecc3ab19 100644 --- a/apps/sim/lib/chunkers/docs-chunker.ts +++ b/apps/sim/lib/chunkers/docs-chunker.ts @@ -3,12 +3,12 @@ import path from 'path' import { createLogger } from '@sim/logger' import { TextChunker } from '@/lib/chunkers/text-chunker' import type { DocChunk, DocsChunkerOptions } from '@/lib/chunkers/types' +import { estimateTokens } from '@/lib/chunkers/utils' import { generateEmbeddings } from '@/lib/knowledge/embeddings' interface HeaderInfo { level: number text: string - slug?: string anchor?: string position?: number } @@ -21,25 +21,21 @@ interface Frontmatter { const logger = createLogger('DocsChunker') -/** - * Docs-specific chunker that processes .mdx files and tracks header context - */ export class DocsChunker { private readonly textChunker: TextChunker private readonly baseUrl: string + private readonly chunkSize: number constructor(options: DocsChunkerOptions = {}) { + this.chunkSize = options.chunkSize ?? 300 this.textChunker = new TextChunker({ - chunkSize: options.chunkSize ?? 300, // Max 300 tokens per chunk + chunkSize: this.chunkSize, minCharactersPerChunk: options.minCharactersPerChunk ?? 1, chunkOverlap: options.chunkOverlap ?? 50, }) this.baseUrl = options.baseUrl ?? 
'https://docs.sim.ai' } - /** - * Process all .mdx files in the docs directory - */ async chunkAllDocs(docsPath: string): Promise { const allChunks: DocChunk[] = [] @@ -65,20 +61,17 @@ export class DocsChunker { } } - /** - * Process a single .mdx file - */ async chunkMdxFile(filePath: string, basePath: string): Promise { const content = await fs.readFile(filePath, 'utf-8') const relativePath = path.relative(basePath, filePath) const { data: frontmatter, content: markdownContent } = this.parseFrontmatter(content) - const headers = this.extractHeaders(markdownContent) - const documentUrl = this.generateDocumentUrl(relativePath) - const textChunks = await this.splitContent(markdownContent) + const { chunks: textChunks, cleanedContent } = await this.splitContent(markdownContent) + + const headers = this.extractHeaders(cleanedContent) logger.info(`Generating embeddings for ${textChunks.length} chunks in ${relativePath}`) const embeddings: number[][] = @@ -97,7 +90,7 @@ export class DocsChunker { const chunk: DocChunk = { text: chunkText, - tokenCount: Math.ceil(chunkText.length / 4), // Simple token estimation + tokenCount: estimateTokens(chunkText), sourceDocument: relativePath, headerLink: relevantHeader ? 
`${documentUrl}#${relevantHeader.anchor}` : documentUrl, headerText: relevantHeader?.text || frontmatter.title || 'Document Root', @@ -118,9 +111,6 @@ export class DocsChunker { return chunks } - /** - * Find all .mdx files recursively - */ private async findMdxFiles(dirPath: string): Promise { const files: string[] = [] @@ -140,9 +130,6 @@ export class DocsChunker { return files } - /** - * Extract headers and their positions from markdown content - */ private extractHeaders(content: string): HeaderInfo[] { const headers: HeaderInfo[] = [] const headerRegex = /^(#{1,6})\s+(.+)$/gm @@ -164,42 +151,28 @@ export class DocsChunker { return headers } - /** - * Generate URL-safe anchor from header text - */ private generateAnchor(headerText: string): string { return headerText .toLowerCase() - .replace(/[^\w\s-]/g, '') // Remove special characters except hyphens - .replace(/\s+/g, '-') // Replace spaces with hyphens - .replace(/-+/g, '-') // Replace multiple hyphens with single - .replace(/^-|-$/g, '') // Remove leading/trailing hyphens + .replace(/[^\w\s-]/g, '') + .replace(/\s+/g, '-') + .replace(/-+/g, '-') + .replace(/^-|-$/g, '') } - /** - * Generate document URL from relative path - * Handles index.mdx files specially - they are served at the parent directory path - */ + /** index.mdx files are served at the parent directory path */ private generateDocumentUrl(relativePath: string): string { - // Convert file path to URL path - // e.g., "tools/knowledge.mdx" -> "/tools/knowledge" - // e.g., "triggers/index.mdx" -> "/triggers" (NOT "/triggers/index") - let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/') // Handle Windows paths + let urlPath = relativePath.replace(/\.mdx$/, '').replace(/\\/g, '/') - // In fumadocs, index.mdx files are served at the parent directory path - // e.g., "triggers/index" -> "triggers" if (urlPath.endsWith('/index')) { - urlPath = urlPath.slice(0, -6) // Remove "/index" + urlPath = urlPath.slice(0, -6) } else if (urlPath 
=== 'index') { - urlPath = '' // Root index.mdx + urlPath = '' } return `${this.baseUrl}/${urlPath}` } - /** - * Find the most relevant header for a given position - */ private findRelevantHeader(headers: HeaderInfo[], position: number): HeaderInfo | null { if (headers.length === 0) return null @@ -216,10 +189,10 @@ export class DocsChunker { return relevantHeader } - /** - * Split content into chunks using the existing TextChunker with table awareness - */ - private async splitContent(content: string): Promise { + /** Returns both chunks and cleaned content so header extraction uses aligned positions. */ + private async splitContent( + content: string + ): Promise<{ chunks: string[]; cleanedContent: string }> { const cleanedContent = this.cleanContent(content) const tableBoundaries = this.detectTableBoundaries(cleanedContent) @@ -234,30 +207,23 @@ export class DocsChunker { const finalChunks = this.enforceSizeLimit(processedChunks) - return finalChunks + return { chunks: finalChunks, cleanedContent } } - /** - * Clean content by removing MDX-specific elements and excessive whitespace - */ private cleanContent(content: string): string { - return ( - content - // Remove import statements - .replace(/^import\s+.*$/gm, '') - // Remove JSX components and React-style comments - .replace(/<[^>]+>/g, ' ') - .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ') - // Remove excessive whitespace - .replace(/\n{3,}/g, '\n\n') - .replace(/[ \t]{2,}/g, ' ') - .trim() - ) + return content + .replace(/\r\n/g, '\n') + .replace(/\r/g, '\n') + .replace(/^import\s+.*$/gm, '') + .replace(/^export\s+.*$/gm, '') + .replace(/<\/?[a-zA-Z][^>]*>/g, ' ') + .replace(/\{\/\*[\s\S]*?\*\/\}/g, ' ') + .replace(/\{[^{}]*\}/g, ' ') + .replace(/\n{3,}/g, '\n\n') + .replace(/[ \t]{2,}/g, ' ') + .trim() } - /** - * Parse frontmatter from MDX content - */ private parseFrontmatter(content: string): { data: Frontmatter; content: string } { const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/ const 
match = content.match(frontmatterRegex) @@ -285,26 +251,25 @@ export class DocsChunker { return { data, content: markdownContent } } - /** - * Estimate token count (rough approximation) - */ - private estimateTokens(text: string): number { - return Math.ceil(text.length / 4) - } - - /** - * Detect table boundaries in markdown content to avoid splitting them - */ + /** Detects table boundaries to avoid splitting tables across chunks. */ private detectTableBoundaries(content: string): { start: number; end: number }[] { const tables: { start: number; end: number }[] = [] const lines = content.split('\n') let inTable = false + let inCodeBlock = false let tableStart = -1 for (let i = 0; i < lines.length; i++) { const line = lines[i].trim() + if (line.startsWith('```')) { + inCodeBlock = !inCodeBlock + continue + } + + if (inCodeBlock) continue + if (line.includes('|') && line.split('|').length >= 3 && !inTable) { const nextLine = lines[i + 1]?.trim() if (nextLine?.includes('|') && nextLine.includes('-')) { @@ -314,7 +279,7 @@ export class DocsChunker { } else if (inTable && (!line.includes('|') || line === '' || line.startsWith('#'))) { tables.push({ start: this.getCharacterPosition(lines, tableStart), - end: this.getCharacterPosition(lines, i - 1) + lines[i - 1]?.length || 0, + end: this.getCharacterPosition(lines, i - 1) + (lines[i - 1]?.length ?? 
0), }) inTable = false } @@ -330,16 +295,10 @@ export class DocsChunker { return tables } - /** - * Get character position from line number - */ private getCharacterPosition(lines: string[], lineIndex: number): number { return lines.slice(0, lineIndex).reduce((acc, line) => acc + line.length + 1, 0) } - /** - * Merge chunks that would split tables - */ private mergeTableChunks( chunks: string[], tableBoundaries: { start: number; end: number }[], @@ -354,6 +313,10 @@ export class DocsChunker { for (const chunk of chunks) { const chunkStart = originalContent.indexOf(chunk, currentPosition) + if (chunkStart === -1) { + mergedChunks.push(chunk) + continue + } const chunkEnd = chunkStart + chunk.length const intersectsTable = tableBoundaries.some( @@ -373,10 +336,10 @@ export class DocsChunker { const minStart = Math.min(chunkStart, ...affectedTables.map((t) => t.start)) const maxEnd = Math.max(chunkEnd, ...affectedTables.map((t) => t.end)) - const completeChunk = originalContent.slice(minStart, maxEnd) + const completeChunk = originalContent.slice(minStart, maxEnd).trim() - if (!mergedChunks.some((existing) => existing.includes(completeChunk.trim()))) { - mergedChunks.push(completeChunk.trim()) + if (completeChunk && !mergedChunks.some((existing) => existing === completeChunk)) { + mergedChunks.push(completeChunk) } } else { mergedChunks.push(chunk) @@ -388,16 +351,13 @@ export class DocsChunker { return mergedChunks.filter((chunk) => chunk.length > 50) } - /** - * Enforce 300 token size limit on chunks - */ private enforceSizeLimit(chunks: string[]): string[] { const finalChunks: string[] = [] for (const chunk of chunks) { - const tokens = this.estimateTokens(chunk) + const tokens = estimateTokens(chunk) - if (tokens <= 300) { + if (tokens <= this.chunkSize) { finalChunks.push(chunk) } else { const lines = chunk.split('\n') @@ -406,7 +366,7 @@ export class DocsChunker { for (const line of lines) { const testChunk = currentChunk ? 
`${currentChunk}\n${line}` : line - if (this.estimateTokens(testChunk) <= 300) { + if (estimateTokens(testChunk) <= this.chunkSize) { currentChunk = testChunk } else { if (currentChunk.trim()) { diff --git a/apps/sim/lib/chunkers/index.ts b/apps/sim/lib/chunkers/index.ts index 403e75a20d1..2e4595b5ea0 100644 --- a/apps/sim/lib/chunkers/index.ts +++ b/apps/sim/lib/chunkers/index.ts @@ -1,5 +1,9 @@ export { DocsChunker } from './docs-chunker' export { JsonYamlChunker } from './json-yaml-chunker' +export { RecursiveChunker } from './recursive-chunker' +export { RegexChunker } from './regex-chunker' +export { SentenceChunker } from './sentence-chunker' export { StructuredDataChunker } from './structured-data-chunker' export { TextChunker } from './text-chunker' +export { TokenChunker } from './token-chunker' export * from './types' diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.test.ts b/apps/sim/lib/chunkers/json-yaml-chunker.test.ts index 0568c8eff93..251b50daeaa 100644 --- a/apps/sim/lib/chunkers/json-yaml-chunker.test.ts +++ b/apps/sim/lib/chunkers/json-yaml-chunker.test.ts @@ -30,14 +30,11 @@ describe('JsonYamlChunker', () => { expect(JsonYamlChunker.isStructuredData('key: value\nother: data')).toBe(true) }) - it('should return true for YAML-like plain text', () => { - // Note: js-yaml is permissive and parses plain text as valid YAML (scalar value) - // This is expected behavior of the YAML parser - expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(true) + it('should return false for plain text parsed as YAML scalar', () => { + expect(JsonYamlChunker.isStructuredData('Hello, this is plain text.')).toBe(false) }) it('should return false for invalid JSON/YAML with unbalanced braces', () => { - // Only truly malformed content that fails YAML parsing returns false expect(JsonYamlChunker.isStructuredData('{invalid: json: content: {{')).toBe(false) }) @@ -61,7 +58,6 @@ describe('JsonYamlChunker', () => { const json = '{}' const chunks = 
await chunker.chunk(json) - // Empty object is valid JSON, should return at least metadata expect(chunks.length).toBeGreaterThanOrEqual(0) }) @@ -204,7 +200,6 @@ server: const json = '[]' const chunks = await chunker.chunk(json) - // Empty array should not produce chunks with meaningful content expect(chunks.length).toBeGreaterThanOrEqual(0) }) @@ -272,7 +267,6 @@ server: it.concurrent('should fall back to text chunking for invalid JSON', async () => { const chunker = new JsonYamlChunker({ chunkSize: 100, minCharactersPerChunk: 10 }) - // Create content that fails YAML parsing and is long enough to produce chunks const invalidJson = `{this is not valid json: content: {{${' more content here '.repeat(10)}` const chunks = await chunker.chunk(invalidJson) @@ -377,9 +371,7 @@ server: const json = JSON.stringify({ a: 1, b: 2, c: 3 }) const chunks = await chunker.chunk(json) - // Should produce chunks that are valid expect(chunks.length).toBeGreaterThan(0) - // The entire small object fits in one chunk expect(chunks[0].text.length).toBeGreaterThan(0) }) }) diff --git a/apps/sim/lib/chunkers/json-yaml-chunker.ts b/apps/sim/lib/chunkers/json-yaml-chunker.ts index 458f8d3e8cb..d18cd0859f9 100644 --- a/apps/sim/lib/chunkers/json-yaml-chunker.ts +++ b/apps/sim/lib/chunkers/json-yaml-chunker.ts @@ -1,8 +1,7 @@ import { createLogger } from '@sim/logger' import * as yaml from 'js-yaml' import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types' -import { getAccurateTokenCount } from '@/lib/tokenization' -import { estimateTokenCount } from '@/lib/tokenization/estimators' +import { estimateTokens } from '@/lib/chunkers/utils' const logger = createLogger('JsonYamlChunker') @@ -11,57 +10,31 @@ type JsonValue = JsonPrimitive | JsonObject | JsonArray type JsonObject = { [key: string]: JsonValue } type JsonArray = JsonValue[] -function getTokenCount(text: string): number { - try { - return getAccurateTokenCount(text, 'text-embedding-3-small') - } catch (error) { - 
logger.warn('Tiktoken failed, falling back to estimation') - const estimate = estimateTokenCount(text) - return estimate.count - } -} - -/** - * Configuration for JSON/YAML chunking - * Reduced limits to ensure we stay well under OpenAI's 8,191 token limit per embedding request - */ -const JSON_YAML_CHUNKING_CONFIG = { - TARGET_CHUNK_SIZE: 1024, // Target tokens per chunk - MIN_CHARACTERS_PER_CHUNK: 100, // Minimum characters per chunk to filter tiny fragments - MAX_CHUNK_SIZE: 1500, // Maximum tokens per chunk - MAX_DEPTH_FOR_SPLITTING: 5, // Maximum depth to traverse for splitting -} +const MAX_DEPTH = 5 export class JsonYamlChunker { - private chunkSize: number // in tokens - private minCharactersPerChunk: number // in characters + private chunkSize: number + private minCharactersPerChunk: number constructor(options: ChunkerOptions = {}) { - this.chunkSize = options.chunkSize ?? JSON_YAML_CHUNKING_CONFIG.TARGET_CHUNK_SIZE - this.minCharactersPerChunk = - options.minCharactersPerChunk ?? JSON_YAML_CHUNKING_CONFIG.MIN_CHARACTERS_PER_CHUNK + this.chunkSize = options.chunkSize ?? 1024 + this.minCharactersPerChunk = options.minCharactersPerChunk ?? 
100 } - /** - * Check if content is structured JSON/YAML data - */ static isStructuredData(content: string): boolean { try { - JSON.parse(content) - return true + const parsed = JSON.parse(content) + return typeof parsed === 'object' && parsed !== null } catch { try { - yaml.load(content) - return true + const parsed = yaml.load(content) + return typeof parsed === 'object' && parsed !== null } catch { return false } } } - /** - * Chunk JSON/YAML content intelligently based on structure - */ async chunk(content: string): Promise { try { let data: JsonValue @@ -70,16 +43,10 @@ export class JsonYamlChunker { } catch { data = yaml.load(content) as JsonValue } - const chunks = this.chunkStructuredData(data) + const chunks = this.chunkStructuredData(data, [], 0) - const tokenCounts = chunks.map((c) => c.tokenCount) - const totalTokens = tokenCounts.reduce((a, b) => a + b, 0) - const maxTokens = Math.max(...tokenCounts) - const avgTokens = Math.round(totalTokens / chunks.length) - - logger.info( - `JSON chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens (avg: ${avgTokens}, max: ${maxTokens})` - ) + const totalTokens = chunks.reduce((sum, c) => sum + c.tokenCount, 0) + logger.info(`JSON chunking complete: ${chunks.length} chunks, ${totalTokens} total tokens`) return chunks } catch (error) { @@ -88,42 +55,38 @@ export class JsonYamlChunker { } } - /** - * Chunk structured data based on its structure - */ - private chunkStructuredData(data: JsonValue, path: string[] = []): Chunk[] { - const chunks: Chunk[] = [] - + private chunkStructuredData(data: JsonValue, path: string[], depth: number): Chunk[] { if (Array.isArray(data)) { - return this.chunkArray(data, path) + return this.chunkArray(data, path, depth) } if (typeof data === 'object' && data !== null) { - return this.chunkObject(data as JsonObject, path) + return this.chunkObject(data as JsonObject, path, depth) } const content = JSON.stringify(data, null, 2) - const tokenCount = 
getTokenCount(content) + const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : '' + const contentTokens = estimateTokens(content) - // Filter tiny fragments using character count - if (content.length >= this.minCharactersPerChunk) { - chunks.push({ - text: content, - tokenCount, - metadata: { - startIndex: 0, - endIndex: content.length, - }, - }) + if (contentTokens > this.chunkSize) { + return this.chunkAsText(contextHeader + content) } - return chunks + if (content.length < this.minCharactersPerChunk) { + return [] + } + + const text = contextHeader + content + return [ + { + text, + tokenCount: estimateTokens(text), + metadata: { startIndex: 0, endIndex: text.length }, + }, + ] } - /** - * Chunk an array intelligently - */ - private chunkArray(arr: JsonArray, path: string[]): Chunk[] { + private chunkArray(arr: JsonArray, path: string[], depth: number): Chunk[] { const chunks: Chunk[] = [] let currentBatch: JsonValue[] = [] let currentTokens = 0 @@ -133,46 +96,30 @@ export class JsonYamlChunker { for (let i = 0; i < arr.length; i++) { const item = arr[i] const itemStr = JSON.stringify(item, null, 2) - const itemTokens = getTokenCount(itemStr) + const itemTokens = estimateTokens(itemStr) if (itemTokens > this.chunkSize) { if (currentBatch.length > 0) { - const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2) - chunks.push({ - text: batchContent, - tokenCount: getTokenCount(batchContent), - metadata: { - startIndex: i - currentBatch.length, - endIndex: i - 1, - }, - }) + chunks.push( + this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1) + ) currentBatch = [] currentTokens = 0 } - if (typeof item === 'object' && item !== null) { - const subChunks = this.chunkStructuredData(item, [...path, `[${i}]`]) - chunks.push(...subChunks) + if (depth < MAX_DEPTH && typeof item === 'object' && item !== null) { + chunks.push(...this.chunkStructuredData(item, [...path, `[${i}]`], depth + 1)) } else { chunks.push({ 
text: contextHeader + itemStr, tokenCount: itemTokens, - metadata: { - startIndex: i, - endIndex: i, - }, + metadata: { startIndex: i, endIndex: i }, }) } } else if (currentTokens + itemTokens > this.chunkSize && currentBatch.length > 0) { - const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2) - chunks.push({ - text: batchContent, - tokenCount: getTokenCount(batchContent), - metadata: { - startIndex: i - currentBatch.length, - endIndex: i - 1, - }, - }) + chunks.push( + this.buildBatchChunk(contextHeader, currentBatch, i - currentBatch.length, i - 1) + ) currentBatch = [item] currentTokens = itemTokens } else { @@ -182,121 +129,112 @@ export class JsonYamlChunker { } if (currentBatch.length > 0) { - const batchContent = contextHeader + JSON.stringify(currentBatch, null, 2) - chunks.push({ - text: batchContent, - tokenCount: getTokenCount(batchContent), - metadata: { - startIndex: arr.length - currentBatch.length, - endIndex: arr.length - 1, - }, - }) + chunks.push( + this.buildBatchChunk( + contextHeader, + currentBatch, + arr.length - currentBatch.length, + arr.length - 1 + ) + ) } return chunks } - /** - * Chunk an object intelligently - */ - private chunkObject(obj: JsonObject, path: string[]): Chunk[] { + private chunkObject(obj: JsonObject, path: string[], depth: number): Chunk[] { const chunks: Chunk[] = [] const entries = Object.entries(obj) const fullContent = JSON.stringify(obj, null, 2) - const fullTokens = getTokenCount(fullContent) + const fullTokens = estimateTokens(fullContent) if (fullTokens <= this.chunkSize) { - chunks.push({ - text: fullContent, - tokenCount: fullTokens, - metadata: { - startIndex: 0, - endIndex: fullContent.length, + const contextHeader = path.length > 0 ? 
`// ${path.join('.')}\n` : '' + const text = contextHeader + fullContent + return [ + { + text, + tokenCount: estimateTokens(text), + metadata: { startIndex: 0, endIndex: text.length }, }, - }) - return chunks + ] } + const contextHeader = path.length > 0 ? `// ${path.join('.')}\n` : '' let currentObj: JsonObject = {} let currentTokens = 0 - let currentKeys: string[] = [] for (const [key, value] of entries) { const valueStr = JSON.stringify({ [key]: value }, null, 2) - const valueTokens = getTokenCount(valueStr) + const valueTokens = estimateTokens(valueStr) if (valueTokens > this.chunkSize) { if (Object.keys(currentObj).length > 0) { - const objContent = JSON.stringify(currentObj, null, 2) + const objContent = contextHeader + JSON.stringify(currentObj, null, 2) chunks.push({ text: objContent, - tokenCount: getTokenCount(objContent), - metadata: { - startIndex: 0, - endIndex: objContent.length, - }, + tokenCount: estimateTokens(objContent), + metadata: { startIndex: 0, endIndex: objContent.length }, }) currentObj = {} currentTokens = 0 - currentKeys = [] } - if (typeof value === 'object' && value !== null) { - const subChunks = this.chunkStructuredData(value, [...path, key]) - chunks.push(...subChunks) + if (depth < MAX_DEPTH && typeof value === 'object' && value !== null) { + chunks.push(...this.chunkStructuredData(value, [...path, key], depth + 1)) } else { chunks.push({ - text: valueStr, + text: contextHeader + valueStr, tokenCount: valueTokens, - metadata: { - startIndex: 0, - endIndex: valueStr.length, - }, + metadata: { startIndex: 0, endIndex: valueStr.length }, }) } } else if ( currentTokens + valueTokens > this.chunkSize && Object.keys(currentObj).length > 0 ) { - const objContent = JSON.stringify(currentObj, null, 2) + const objContent = contextHeader + JSON.stringify(currentObj, null, 2) chunks.push({ text: objContent, - tokenCount: getTokenCount(objContent), - metadata: { - startIndex: 0, - endIndex: objContent.length, - }, + tokenCount: 
estimateTokens(objContent), + metadata: { startIndex: 0, endIndex: objContent.length }, }) currentObj = { [key]: value } currentTokens = valueTokens - currentKeys = [key] } else { currentObj[key] = value currentTokens += valueTokens - currentKeys.push(key) } } if (Object.keys(currentObj).length > 0) { - const objContent = JSON.stringify(currentObj, null, 2) + const objContent = contextHeader + JSON.stringify(currentObj, null, 2) chunks.push({ text: objContent, - tokenCount: getTokenCount(objContent), - metadata: { - startIndex: 0, - endIndex: objContent.length, - }, + tokenCount: estimateTokens(objContent), + metadata: { startIndex: 0, endIndex: objContent.length }, }) } return chunks } - /** - * Fall back to text chunking if JSON parsing fails - */ - private async chunkAsText(content: string): Promise { + private buildBatchChunk( + contextHeader: string, + batch: JsonValue[], + startIdx: number, + endIdx: number + ): Chunk { + const batchContent = contextHeader + JSON.stringify(batch, null, 2) + return { + text: batchContent, + tokenCount: estimateTokens(batchContent), + metadata: { startIndex: startIdx, endIndex: endIdx }, + } + } + + private chunkAsText(content: string): Chunk[] { const chunks: Chunk[] = [] const lines = content.split('\n') let currentChunk = '' @@ -304,16 +242,13 @@ export class JsonYamlChunker { let startIndex = 0 for (const line of lines) { - const lineTokens = getTokenCount(line) + const lineTokens = estimateTokens(line) if (currentTokens + lineTokens > this.chunkSize && currentChunk) { chunks.push({ text: currentChunk, tokenCount: currentTokens, - metadata: { - startIndex, - endIndex: startIndex + currentChunk.length, - }, + metadata: { startIndex, endIndex: startIndex + currentChunk.length }, }) startIndex += currentChunk.length + 1 @@ -325,24 +260,17 @@ export class JsonYamlChunker { } } - // Filter tiny fragments using character count if (currentChunk && currentChunk.length >= this.minCharactersPerChunk) { chunks.push({ text: 
currentChunk, tokenCount: currentTokens, - metadata: { - startIndex, - endIndex: startIndex + currentChunk.length, - }, + metadata: { startIndex, endIndex: startIndex + currentChunk.length }, }) } return chunks } - /** - * Static method for chunking JSON/YAML data with default options - */ static async chunkJsonYaml(content: string, options: ChunkerOptions = {}): Promise<Chunk[]> { const chunker = new JsonYamlChunker(options) return chunker.chunk(content) diff --git a/apps/sim/lib/chunkers/recursive-chunker.test.ts b/apps/sim/lib/chunkers/recursive-chunker.test.ts new file mode 100644 index 00000000000..846267034cf --- /dev/null +++ b/apps/sim/lib/chunkers/recursive-chunker.test.ts @@ -0,0 +1,275 @@ +/** + * @vitest-environment node + */ + +import { loggerMock } from '@sim/testing' +import { describe, expect, it, vi } from 'vitest' +import { RecursiveChunker } from './recursive-chunker' + +vi.mock('@sim/logger', () => loggerMock) + +describe('RecursiveChunker', () => { + describe('empty and whitespace input', () => { + it.concurrent('should return empty array for empty string', async () => { + const chunker = new RecursiveChunker({ chunkSize: 100 }) + const chunks = await chunker.chunk('') + expect(chunks).toEqual([]) + }) + + it.concurrent('should return empty array for whitespace-only input', async () => { + const chunker = new RecursiveChunker({ chunkSize: 100 }) + const chunks = await chunker.chunk(' \n\n\t ') + expect(chunks).toEqual([]) + }) + }) + + describe('small content', () => { + it.concurrent('should return single chunk when content fits in one chunk', async () => { + const chunker = new RecursiveChunker({ chunkSize: 100 }) + const text = 'This is a short text.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + }) + }) + + describe('paragraph splitting', () => { + it.concurrent('should split at paragraph boundaries first', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20 }) + const text = + 'First paragraph with enough content to matter.\n\nSecond paragraph with enough content to matter.\n\nThird paragraph with enough content here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('line splitting fallback', () => { + it.concurrent('should split at newlines when paragraphs are too large', async () => { + const chunker = new RecursiveChunker({ chunkSize: 15 }) + const text = + 'Line one with content here.\nLine two with content here.\nLine three with content here.\nLine four with content here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('sentence splitting fallback', () => { + it.concurrent('should split at sentence boundaries when lines are too large', async () => { + const chunker = new RecursiveChunker({ chunkSize: 10 }) + const text = + 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('word splitting fallback', () => { + it.concurrent('should split at spaces when sentences are too large', async () => { + const chunker = new RecursiveChunker({ chunkSize: 5 }) + const text = 'word1 word2 word3 word4 word5 word6 word7 word8 word9 word10' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('keep_separator behavior', () => { + it.concurrent('should prepend separator to subsequent chunks', async () => { + const chunker = new RecursiveChunker({ chunkSize: 15 }) + const text = + 'First paragraph content here.\n\nSecond paragraph content here.\n\nThird paragraph content here.' + const chunks = await chunker.chunk(text) + + if (chunks.length > 1) { + expect(chunks[1].text.startsWith('\n\n') || chunks[1].text.length > 0).toBe(true) + } + }) + }) + + describe('custom separators', () => { + it.concurrent('should use custom separators instead of default recipe', async () => { + const chunker = new RecursiveChunker({ + chunkSize: 15, + separators: ['---', '\n'], + }) + const text = + 'Section one content here with words.---Section two content here with words.---Section three content here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('recipe: plain', () => { + it.concurrent('should use plain recipe by default', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20 }) + const text = + 'First paragraph with enough words to exceed the chunk size limit.\n\nSecond paragraph with enough words to exceed the chunk size limit.\n\nThird paragraph with enough words to exceed the chunk size limit.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('recipe: markdown', () => { + it.concurrent('should split at heading boundaries for markdown content', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' }) + const text = + '\n# Title\n\nParagraph content under the title goes here.\n\n## Subtitle\n\nMore text content under the subtitle goes here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + + it.concurrent('should handle markdown horizontal rules', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'markdown' }) + const text = + 'Section one content here.\n---\nSection two content here.\n---\nSection three content here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(0) + }) + }) + + describe('recipe: code', () => { + it.concurrent('should split on function and class boundaries', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20, recipe: 'code' }) + const text = [ + 'const x = 1;', + 'function hello() {', + ' return "hello";', + '}', + 'function world() {', + ' return "world";', + '}', + 'class MyClass {', + ' constructor() {}', + ' method() { return true; }', + '}', + ].join('\n') + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('chunk size respected', () => { + it.concurrent('should not exceed chunk size in tokens', async () => { + const chunkSize = 30 + const chunker = new RecursiveChunker({ chunkSize }) + const text = 'This is a test sentence with content. 
'.repeat(30) + const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize + 5) + } + }) + }) + + describe('overlap', () => { + it.concurrent('should share text between consecutive chunks when overlap is set', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 }) + const text = + 'First paragraph with some content here.\n\nSecond paragraph with different content here.\n\nThird paragraph with more content here.' + const chunks = await chunker.chunk(text) + + if (chunks.length > 1) { + expect(chunks[1].text.length).toBeGreaterThan(0) + } + }) + + it.concurrent('should not add overlap when overlap is 0', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 0 }) + const text = + 'First sentence content here. Second sentence content here. Third sentence content here.' + const chunks = await chunker.chunk(text) + + if (chunks.length > 1) { + const firstChunkEnd = chunks[0].text.slice(-10) + expect(chunks[1].text.startsWith(firstChunkEnd)).toBe(false) + } + }) + }) + + describe('chunk metadata', () => { + it.concurrent('should include text, tokenCount, and metadata fields', async () => { + const chunker = new RecursiveChunker({ chunkSize: 100 }) + const text = 'This is test content for metadata.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4)) + expect(chunks[0].metadata.startIndex).toBeDefined() + expect(chunks[0].metadata.endIndex).toBeDefined() + }) + + it.concurrent('should have startIndex of 0 for the first chunk', async () => { + const chunker = new RecursiveChunker({ chunkSize: 100 }) + const text = 'Some content here.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks[0].metadata.startIndex).toBe(0) + }) + + it.concurrent('should have non-negative indices for all chunks', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20, chunkOverlap: 5 }) + const text = 'First part. Second part. Third part. Fourth part. Fifth part.' + const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0) + expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex) + } + }) + + it.concurrent('should have endIndex greater than startIndex for non-empty chunks', async () => { + const chunker = new RecursiveChunker({ chunkSize: 20 }) + const text = 'Multiple sentences here. Another one here. And another. And more content.' + const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.metadata.endIndex).toBeGreaterThan(chunk.metadata.startIndex) + } + }) + }) + + describe('edge cases', () => { + it.concurrent('should handle very long text', async () => { + const chunker = new RecursiveChunker({ chunkSize: 100 }) + const text = 'This is a sentence. '.repeat(1000) + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + + it.concurrent('should handle text with no natural separators', async () => { + const chunker = new RecursiveChunker({ chunkSize: 5 }) + const text = 'abcdefghijklmnopqrstuvwxyz'.repeat(5) + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + + it.concurrent('should handle unicode text', async () => { + const chunker = new RecursiveChunker({ chunkSize: 100 }) + const text = '这是中文测试。日本語テスト。한국어 테스트.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(0) + expect(chunks[0].text).toContain('中文') + }) + + it.concurrent('should use default chunkSize of 1024 tokens', async () => { + const chunker = new RecursiveChunker({}) + const text = 'Word '.repeat(400) + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + }) + }) +}) diff --git a/apps/sim/lib/chunkers/recursive-chunker.ts b/apps/sim/lib/chunkers/recursive-chunker.ts new file mode 100644 index 00000000000..0dba2240987 --- /dev/null +++ b/apps/sim/lib/chunkers/recursive-chunker.ts @@ -0,0 +1,145 @@ +import { createLogger } from '@sim/logger' +import type { Chunk, RecursiveChunkerOptions } from '@/lib/chunkers/types' +import { + addOverlap, + buildChunks, + cleanText, + estimateTokens, + resolveChunkerOptions, + splitAtWordBoundaries, + tokensToChars, +} from '@/lib/chunkers/utils' + +const logger = createLogger('RecursiveChunker') + +const RECIPES = { + plain: ['\n\n', '\n', '. ', ' ', ''], + markdown: [ + '\n---\n', + '\n***\n', + '\n___\n', + '\n# ', + '\n## ', + '\n### ', + '\n#### ', + '\n##### ', + '\n###### ', + '\n```\n', + '\n> ', + '\n\n', + '\n', + '. ', + ' ', + '', + ], + code: [ + '\nfunction ', + '\nclass ', + '\nexport ', + '\nconst ', + '\nlet ', + '\nvar ', + '\nif ', + '\nfor ', + '\nwhile ', + '\nswitch ', + '\nreturn ', + '\n\n', + '\n', + '; ', + ' ', + '', + ], +} as const + +export class RecursiveChunker { + private readonly chunkSize: number + private readonly chunkOverlap: number + private readonly separators: string[] + + constructor(options: RecursiveChunkerOptions = {}) { + const resolved = resolveChunkerOptions(options) + this.chunkSize = resolved.chunkSize + this.chunkOverlap = resolved.chunkOverlap + + if (options.separators && options.separators.length > 0) { + this.separators = options.separators + } else { + const recipe = options.recipe ?? 
'plain' + this.separators = [...RECIPES[recipe]] + } + } + + private splitRecursively(text: string, separatorIndex = 0): string[] { + const tokenCount = estimateTokens(text) + + if (tokenCount <= this.chunkSize) { + return text.trim() ? [text] : [] + } + + if (separatorIndex >= this.separators.length) { + const chunkSizeChars = tokensToChars(this.chunkSize) + return splitAtWordBoundaries(text, chunkSizeChars) + } + + const separator = this.separators[separatorIndex] + + if (separator === '') { + return this.splitRecursively(text, this.separators.length) + } + + const parts = text.split(separator).filter((part) => part.trim()) + + if (parts.length <= 1) { + return this.splitRecursively(text, separatorIndex + 1) + } + + const chunks: string[] = [] + let currentChunk = '' + + for (const part of parts) { + const testChunk = currentChunk + (currentChunk ? separator : '') + part + + if (estimateTokens(testChunk) <= this.chunkSize) { + currentChunk = testChunk + } else { + if (currentChunk.trim()) { + chunks.push(currentChunk.trim()) + } + + if (estimateTokens(part) > this.chunkSize) { + const subChunks = this.splitRecursively(part, separatorIndex + 1) + for (const subChunk of subChunks) { + chunks.push(subChunk) + } + currentChunk = '' + } else { + currentChunk = part + } + } + } + + if (currentChunk.trim()) { + chunks.push(currentChunk.trim()) + } + + return chunks + } + + async chunk(content: string): Promise<Chunk[]> { + if (!content?.trim()) { + return [] + } + + const cleaned = cleanText(content) + let chunks = this.splitRecursively(cleaned) + + if (this.chunkOverlap > 0) { + const overlapChars = tokensToChars(this.chunkOverlap) + chunks = addOverlap(chunks, overlapChars) + } + + logger.info(`Chunked into ${chunks.length} recursive chunks`) + return buildChunks(chunks, this.chunkOverlap) + } +} diff --git a/apps/sim/lib/chunkers/regex-chunker.test.ts b/apps/sim/lib/chunkers/regex-chunker.test.ts new file mode 100644 index 00000000000..5b64cf3f495 --- /dev/null +++ 
b/apps/sim/lib/chunkers/regex-chunker.test.ts @@ -0,0 +1,189 @@ +/** + * @vitest-environment node + */ + +import { loggerMock } from '@sim/testing' +import { describe, expect, it, vi } from 'vitest' +import { RegexChunker } from './regex-chunker' + +vi.mock('@sim/logger', () => loggerMock) + +describe('RegexChunker', () => { + describe('empty and whitespace input', () => { + it.concurrent('should return empty array for empty string', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n' }) + const chunks = await chunker.chunk('') + expect(chunks).toEqual([]) + }) + + it.concurrent('should return empty array for whitespace-only input', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n' }) + const chunks = await chunker.chunk(' \n\n ') + expect(chunks).toEqual([]) + }) + }) + + describe('small content', () => { + it.concurrent('should return single chunk when content fits in chunkSize', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 }) + const text = 'This is a short text.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + }) + }) + + describe('basic regex splitting', () => { + it.concurrent('should split on double newlines with pattern \\n\\n', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 20 }) + const text = + 'First paragraph content here.\n\nSecond paragraph content here.\n\nThird paragraph content here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('custom pattern splitting', () => { + it.concurrent('should split text at --- delimiters', async () => { + const chunker = new RegexChunker({ pattern: '---', chunkSize: 20 }) + const text = + 'Section one has enough content to fill a chunk on its own here.---Section two also has enough content to fill another chunk here.---Section three needs content too for splitting.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + }) + + describe('segment merging', () => { + it.concurrent('should merge small adjacent segments up to chunkSize', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 }) + const text = 'Short.\n\nAlso short.\n\nTiny.\n\nSmall too.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toContain('Short.') + expect(chunks[0].text).toContain('Also short.') + }) + }) + + describe('oversized segment fallback', () => { + it.concurrent( + 'should sub-chunk segments larger than chunkSize via word boundaries', + async () => { + const chunker = new RegexChunker({ pattern: '---', chunkSize: 10 }) + const longSegment = + 'This is a very long segment with many words that exceeds the chunk size limit significantly. ' + const text = `${longSegment}---${longSegment}` + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(2) + } + ) + }) + + describe('no-match fallback', () => { + it.concurrent( + 'should fall back to word-boundary splitting when regex matches nothing', + async () => { + const chunker = new RegexChunker({ pattern: '###SPLIT###', chunkSize: 10 }) + const text = 'This is a text with no matching delimiter anywhere in the content at all.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + } + ) + }) + + describe('chunk size respected', () => { + it.concurrent('should not exceed chunkSize tokens approximately', async () => { + const chunkSize = 30 + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize }) + const text = + 'Paragraph one with some words. '.repeat(5) + + '\n\n' + + 'Paragraph two with more words. '.repeat(5) + + '\n\n' + + 'Paragraph three continues here. 
'.repeat(5) + const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize + 10) + } + }) + }) + + describe('overlap', () => { + it.concurrent('should share content between chunks when chunkOverlap > 0', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 20, chunkOverlap: 5 }) + const text = + 'First paragraph with enough content.\n\nSecond paragraph with more content.\n\nThird paragraph with even more.' + const chunks = await chunker.chunk(text) + + if (chunks.length > 1) { + const firstChunkEnd = chunks[0].text.slice(-10) + const secondChunkStart = chunks[1].text.slice(0, 20) + expect(secondChunkStart.length).toBeGreaterThan(0) + expect(chunks[1].text.length).toBeGreaterThan(0) + } + }) + }) + + describe('chunk metadata', () => { + it.concurrent('should include text, tokenCount, and metadata with indices', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 100 }) + const text = 'Hello world test content.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4)) + expect(chunks[0].metadata.startIndex).toBeDefined() + expect(chunks[0].metadata.endIndex).toBeDefined() + expect(chunks[0].metadata.startIndex).toBe(0) + }) + + it.concurrent('should have non-negative indices across multiple chunks', async () => { + const chunker = new RegexChunker({ pattern: '\\n\\n', chunkSize: 20, chunkOverlap: 0 }) + const text = 'First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here.' 
+ const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0) + expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex) + } + }) + }) + + describe('invalid regex', () => { + it.concurrent('should throw error for invalid regex pattern', async () => { + expect(() => new RegexChunker({ pattern: '[invalid' })).toThrow() + }) + }) + + describe('empty pattern', () => { + it.concurrent('should throw error for empty pattern', async () => { + expect(() => new RegexChunker({ pattern: '' })).toThrow('Regex pattern is required') + }) + }) + + describe('pattern too long', () => { + it.concurrent('should throw error for pattern exceeding 500 characters', async () => { + const longPattern = 'a'.repeat(501) + expect(() => new RegexChunker({ pattern: longPattern })).toThrow( + 'Regex pattern exceeds maximum length of 500 characters' + ) + }) + }) + + describe('ReDoS protection', () => { + it.concurrent('should accept safe pattern \\n+', async () => { + expect(() => new RegexChunker({ pattern: '\\n+' })).not.toThrow() + }) + + it.concurrent('should accept safe pattern [,;]', async () => { + expect(() => new RegexChunker({ pattern: '[,;]' })).not.toThrow() + }) + }) +}) diff --git a/apps/sim/lib/chunkers/regex-chunker.ts b/apps/sim/lib/chunkers/regex-chunker.ts new file mode 100644 index 00000000000..58c8cb16b91 --- /dev/null +++ b/apps/sim/lib/chunkers/regex-chunker.ts @@ -0,0 +1,144 @@ +import { createLogger } from '@sim/logger' +import type { Chunk, RegexChunkerOptions } from '@/lib/chunkers/types' +import { + addOverlap, + buildChunks, + cleanText, + estimateTokens, + resolveChunkerOptions, + splitAtWordBoundaries, + tokensToChars, +} from '@/lib/chunkers/utils' + +const logger = createLogger('RegexChunker') + +const MAX_PATTERN_LENGTH = 500 + +export class RegexChunker { + private readonly chunkSize: number + private readonly chunkOverlap: number + private readonly regex: 
RegExp + + constructor(options: RegexChunkerOptions) { + const resolved = resolveChunkerOptions(options) + this.chunkSize = resolved.chunkSize + this.chunkOverlap = resolved.chunkOverlap + this.regex = this.compilePattern(options.pattern) + } + + private compilePattern(pattern: string): RegExp { + if (!pattern) { + throw new Error('Regex pattern is required') + } + + if (pattern.length > MAX_PATTERN_LENGTH) { + throw new Error(`Regex pattern exceeds maximum length of ${MAX_PATTERN_LENGTH} characters`) + } + + try { + const regex = new RegExp(pattern, 'g') + + const testStrings = [ + 'a'.repeat(10000), + ' '.repeat(10000), + 'a '.repeat(5000), + 'aB1 xY2\n'.repeat(1250), + `${'a'.repeat(30)}!`, + `${'a b '.repeat(25)}!`, + ] + for (const testStr of testStrings) { + regex.lastIndex = 0 + const start = Date.now() + regex.test(testStr) + const elapsed = Date.now() - start + if (elapsed > 50) { + throw new Error('Regex pattern appears to have catastrophic backtracking') + } + } + + regex.lastIndex = 0 + return regex + } catch (error) { + if (error instanceof Error && error.message.includes('catastrophic')) { + throw error + } + throw new Error( + `Invalid regex pattern "${pattern}": ${error instanceof Error ? 
error.message : String(error)}` + ) + } + } + + async chunk(content: string): Promise<Chunk[]> { + if (!content?.trim()) { + return [] + } + + const cleaned = cleanText(content) + + if (estimateTokens(cleaned) <= this.chunkSize) { + logger.info('Content fits in single chunk') + return buildChunks([cleaned], 0) + } + + this.regex.lastIndex = 0 + const segments = cleaned.split(this.regex).filter((s) => s.trim().length > 0) + + if (segments.length <= 1) { + logger.warn( + 'Regex pattern did not produce any splits, falling back to word-boundary splitting' + ) + const chunkSizeChars = tokensToChars(this.chunkSize) + let chunks = splitAtWordBoundaries(cleaned, chunkSizeChars) + if (this.chunkOverlap > 0) { + const overlapChars = tokensToChars(this.chunkOverlap) + chunks = addOverlap(chunks, overlapChars) + } + return buildChunks(chunks, this.chunkOverlap) + } + + const merged = this.mergeSegments(segments) + + let chunks = merged + if (this.chunkOverlap > 0) { + const overlapChars = tokensToChars(this.chunkOverlap) + chunks = addOverlap(chunks, overlapChars) + } + + logger.info(`Chunked into ${chunks.length} regex-based chunks`) + return buildChunks(chunks, this.chunkOverlap) + } + + private mergeSegments(segments: string[]): string[] { + const chunks: string[] = [] + let current = '' + + for (const segment of segments) { + const test = current ? 
`${current}\n${segment}` : segment + + if (estimateTokens(test) <= this.chunkSize) { + current = test + } else { + if (current.trim()) { + chunks.push(current.trim()) + } + + if (estimateTokens(segment) > this.chunkSize) { + const chunkSizeChars = tokensToChars(this.chunkSize) + const subChunks = splitAtWordBoundaries(segment, chunkSizeChars) + for (const sub of subChunks) { + chunks.push(sub) + } + current = '' + } else { + current = segment + } + } + } + + if (current.trim()) { + chunks.push(current.trim()) + } + + return chunks + } +} diff --git a/apps/sim/lib/chunkers/sentence-chunker.test.ts b/apps/sim/lib/chunkers/sentence-chunker.test.ts new file mode 100644 index 00000000000..78708de29ad --- /dev/null +++ b/apps/sim/lib/chunkers/sentence-chunker.test.ts @@ -0,0 +1,286 @@ +/** + * @vitest-environment node + */ + +import { loggerMock } from '@sim/testing' +import { describe, expect, it, vi } from 'vitest' +import { SentenceChunker } from './sentence-chunker' + +vi.mock('@sim/logger', () => loggerMock) + +describe('SentenceChunker', () => { + describe('empty and whitespace input', () => { + it.concurrent('should return empty array for empty string', async () => { + const chunker = new SentenceChunker({ chunkSize: 100 }) + const chunks = await chunker.chunk('') + expect(chunks).toEqual([]) + }) + + it.concurrent('should return empty array for whitespace-only input', async () => { + const chunker = new SentenceChunker({ chunkSize: 100 }) + const chunks = await chunker.chunk(' \n\n\t ') + expect(chunks).toEqual([]) + }) + + it.concurrent('should return empty array for null-ish content', async () => { + const chunker = new SentenceChunker({ chunkSize: 100 }) + const chunks = await chunker.chunk(undefined as unknown as string) + expect(chunks).toEqual([]) + }) + }) + + describe('small content (single chunk)', () => { + it.concurrent('should return single chunk when content fits within chunk size', async () => { + const chunker = new SentenceChunker({ chunkSize: 100 
}) + const text = 'This is a short sentence. Another short one.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4)) + }) + }) + + describe('sentence boundary splitting', () => { + it.concurrent('should split text at sentence boundaries', async () => { + const chunker = new SentenceChunker({ chunkSize: 20 }) + const text = + 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + for (let i = 0; i < chunks.length - 1; i++) { + const trimmed = chunks[i].text.trim() + const lastChar = trimmed[trimmed.length - 1] + expect(['.', '!', '?']).toContain(lastChar) + } + }) + }) + + describe('abbreviation handling', () => { + it.concurrent('should not split at common abbreviations', async () => { + const chunker = new SentenceChunker({ chunkSize: 200 }) + const text = 'Mr. Smith went to Washington. He arrived on Jan. 5th.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toContain('Mr. Smith') + expect(chunks[0].text).toContain('Jan. 5th') + }) + + it.concurrent('should not split at Dr., Mrs., Ms., Prof., Jr., Sr., St.', async () => { + const chunker = new SentenceChunker({ chunkSize: 500 }) + const text = + 'Dr. Jones and Mrs. Brown met Prof. Davis at St. Mary hospital. Jr. members joined Sr. staff in Feb. for a review.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + }) + }) + + describe('single capital initial handling', () => { + it.concurrent('should not split at single capital letter initials', async () => { + const chunker = new SentenceChunker({ chunkSize: 200 }) + const text = 'J. K. Rowling wrote books. They are popular.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toContain('J. 
K. Rowling') + }) + }) + + describe('decimal handling', () => { + it.concurrent('should not split at decimal numbers', async () => { + const chunker = new SentenceChunker({ chunkSize: 20 }) + const text = 'The value is 3.14. That is pi.' + const chunks = await chunker.chunk(text) + + const allText = chunks.map((c) => c.text).join(' ') + expect(allText).toContain('3.14') + + const largeChunker = new SentenceChunker({ chunkSize: 200 }) + const largeChunks = await largeChunker.chunk(text) + expect(largeChunks).toHaveLength(1) + }) + }) + + describe('ellipsis handling', () => { + it.concurrent('should not split at ellipsis', async () => { + const chunker = new SentenceChunker({ chunkSize: 200 }) + const text = 'Wait for it... The answer is here. Done.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toContain('Wait for it...') + }) + }) + + describe('exclamation and question marks', () => { + it.concurrent('should split at exclamation and question marks', async () => { + const chunker = new SentenceChunker({ chunkSize: 10 }) + const text = 'What is this? It is great! I agree.' + const chunks = await chunker.chunk(text) + + const allText = chunks.map((c) => c.text).join(' ') + expect(allText).toContain('What is this?') + expect(allText).toContain('It is great!') + expect(allText).toContain('I agree.') + }) + + it.concurrent('should treat ? and ! as sentence boundaries', async () => { + const chunker = new SentenceChunker({ chunkSize: 15 }) + const text = 'What is this thing? It is really great! I strongly agree.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThanOrEqual(1) + const allText = chunks.map((c) => c.text).join(' ') + expect(allText).toContain('?') + expect(allText).toContain('!') + }) + }) + + describe('minSentencesPerChunk', () => { + it.concurrent('should group at least minSentencesPerChunk sentences per chunk', async () => { + const chunker = new SentenceChunker({ chunkSize: 100, minSentencesPerChunk: 2 }) + const text = + 'First sentence. Second sentence. Third sentence. Fourth sentence. Fifth sentence.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(0) + expect(chunks).toHaveLength(1) + }) + + it.concurrent('should enforce min sentences even when token limit is reached', async () => { + const chunker = new SentenceChunker({ chunkSize: 6, minSentencesPerChunk: 2 }) + const text = 'Short one. Another one. Third one here. Fourth one here.' + const chunks = await chunker.chunk(text) + + const firstChunkSentences = chunks[0].text + .split(/(?<=[.!?])\s+/) + .filter((s) => s.trim().length > 0) + expect(firstChunkSentences.length).toBeGreaterThanOrEqual(2) + }) + }) + + describe('oversized sentence fallback', () => { + it.concurrent( + 'should chunk a single very long sentence via word-boundary splitting', + async () => { + const chunker = new SentenceChunker({ chunkSize: 10 }) + const longSentence = `${'word '.repeat(50).trim()}.` + const chunks = await chunker.chunk(longSentence) + + expect(chunks.length).toBeGreaterThan(1) + const allText = chunks.map((c) => c.text).join(' ') + expect(allText).toContain('word') + } + ) + + it.concurrent('should handle oversized sentence mixed with normal sentences', async () => { + const chunker = new SentenceChunker({ chunkSize: 10 }) + const longSentence = `${'word '.repeat(50).trim()}.` + const text = `Short sentence. 
${longSentence} Another short one.` + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(2) + const allText = chunks.map((c) => c.text).join(' ') + expect(allText).toContain('Short sentence.') + expect(allText).toContain('Another short one.') + }) + }) + + describe('sentence-level overlap', () => { + it.concurrent('should include overlap from previous chunk when chunkOverlap > 0', async () => { + const chunker = new SentenceChunker({ chunkSize: 15, chunkOverlap: 10 }) + const text = + 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.' + const chunks = await chunker.chunk(text) + + if (chunks.length > 1) { + expect(chunks[1].text.length).toBeGreaterThan(0) + } + }) + + it.concurrent('should not add overlap when chunkOverlap is 0', async () => { + const chunker = new SentenceChunker({ chunkSize: 15, chunkOverlap: 0 }) + const text = 'First sentence here. Second sentence here. Third sentence here.' + const chunks = await chunker.chunk(text) + + if (chunks.length > 1) { + const chunk1End = chunks[0].text.slice(-20) + expect(chunks[1].text.startsWith(chunk1End)).toBe(false) + } + }) + }) + + describe('chunk metadata', () => { + it.concurrent('should include text, tokenCount, and metadata in each chunk', async () => { + const chunker = new SentenceChunker({ chunkSize: 100 }) + const text = 'This is a test sentence. Another sentence follows.' + const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0]).toHaveProperty('text') + expect(chunks[0]).toHaveProperty('tokenCount') + expect(chunks[0]).toHaveProperty('metadata') + expect(chunks[0].metadata).toHaveProperty('startIndex') + expect(chunks[0].metadata).toHaveProperty('endIndex') + }) + + it.concurrent('should have startIndex of 0 for the first chunk', async () => { + const chunker = new SentenceChunker({ chunkSize: 10 }) + const text = 'First sentence. Second sentence. Third sentence.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks[0].metadata.startIndex).toBe(0) + }) + + it.concurrent('should have non-negative indices for all chunks', async () => { + const chunker = new SentenceChunker({ chunkSize: 10, chunkOverlap: 5 }) + const text = + 'First sentence here. Second sentence here. Third sentence here. Fourth sentence.' + const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0) + expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex) + } + }) + + it.concurrent('should have correct tokenCount based on text length', async () => { + const chunker = new SentenceChunker({ chunkSize: 100 }) + const text = 'Hello world test.' + const chunks = await chunker.chunk(text) + + expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4)) + }) + }) + + describe('respects chunk size', () => { + it.concurrent('should produce chunks within approximate token limit', async () => { + const chunkSize = 20 + const chunker = new SentenceChunker({ chunkSize }) + const text = + 'This is the first sentence. Here is the second one. And the third sentence follows. Then comes the fourth. Finally the fifth sentence.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + for (const chunk of chunks) { + expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize * 2) + } + }) + + it.concurrent('should create more chunks with smaller chunk size', async () => { + const text = + 'Sentence number one. Sentence number two. Sentence number three. Sentence number four. Sentence number five. Sentence number six.' 
+ + const largeChunker = new SentenceChunker({ chunkSize: 200 }) + const smallChunker = new SentenceChunker({ chunkSize: 10 }) + + const largeChunks = await largeChunker.chunk(text) + const smallChunks = await smallChunker.chunk(text) + + expect(smallChunks.length).toBeGreaterThan(largeChunks.length) + }) + }) +}) diff --git a/apps/sim/lib/chunkers/sentence-chunker.ts b/apps/sim/lib/chunkers/sentence-chunker.ts new file mode 100644 index 00000000000..f8b92e6f22c --- /dev/null +++ b/apps/sim/lib/chunkers/sentence-chunker.ts @@ -0,0 +1,141 @@ +import { createLogger } from '@sim/logger' +import type { Chunk, SentenceChunkerOptions } from '@/lib/chunkers/types' +import { + buildChunks, + cleanText, + estimateTokens, + resolveChunkerOptions, + splitAtWordBoundaries, + tokensToChars, +} from '@/lib/chunkers/utils' + +const logger = createLogger('SentenceChunker') + +/** Never splits mid-sentence unless a single sentence exceeds the limit. */ +export class SentenceChunker { + private readonly chunkSize: number + private readonly chunkOverlap: number + private readonly minSentencesPerChunk: number + + constructor(options: SentenceChunkerOptions = {}) { + const resolved = resolveChunkerOptions(options) + this.chunkSize = resolved.chunkSize + this.chunkOverlap = resolved.chunkOverlap + this.minSentencesPerChunk = options.minSentencesPerChunk ?? 1 + } + + /** Splits on sentence boundaries while avoiding abbreviations, decimals, and ellipses. */ + private splitSentences(text: string): string[] { + return text + .split( + /(? 
s.trim().length > 0) + } + + async chunk(content: string): Promise { + if (!content?.trim()) { + return [] + } + + const cleaned = cleanText(content) + const sentences = this.splitSentences(cleaned) + + if (sentences.length === 0) { + return [] + } + + if (estimateTokens(cleaned) <= this.chunkSize) { + logger.info('Content fits in single chunk') + return buildChunks([cleaned], 0) + } + + const chunkSentenceGroups: string[][] = [] + let currentGroup: string[] = [] + let currentTokens = 0 + const chunkSizeChars = tokensToChars(this.chunkSize) + + for (const sentence of sentences) { + const sentenceTokens = estimateTokens(sentence) + + if (sentenceTokens > this.chunkSize) { + if (currentGroup.length > 0) { + chunkSentenceGroups.push(currentGroup) + currentGroup = [] + currentTokens = 0 + } + const parts = splitAtWordBoundaries(sentence, chunkSizeChars) + for (const part of parts) { + chunkSentenceGroups.push([part]) + } + continue + } + + const wouldExceed = currentTokens + sentenceTokens > this.chunkSize + const hasMinSentences = currentGroup.length >= this.minSentencesPerChunk + + if (wouldExceed && hasMinSentences) { + chunkSentenceGroups.push(currentGroup) + currentGroup = [sentence] + currentTokens = sentenceTokens + } else { + currentGroup.push(sentence) + currentTokens += sentenceTokens + } + } + + if (currentGroup.length > 0) { + chunkSentenceGroups.push(currentGroup) + } + + const rawChunks = this.applyOverlapFromGroups(chunkSentenceGroups) + + logger.info(`Chunked into ${rawChunks.length} sentence-based chunks`) + return buildChunks(rawChunks, this.chunkOverlap) + } + + /** Applies overlap at the sentence level using original groups to avoid re-splitting. 
*/ + private applyOverlapFromGroups(groups: string[][]): string[] { + if (this.chunkOverlap <= 0 || groups.length <= 1) { + return groups.map((g) => g.join(' ')) + } + + const overlapChars = tokensToChars(this.chunkOverlap) + const result: string[] = [] + + for (let i = 0; i < groups.length; i++) { + if (i === 0) { + result.push(groups[i].join(' ')) + continue + } + + const prevGroup = groups[i - 1] + const overlapSentences: string[] = [] + let overlapLen = 0 + + for (let j = prevGroup.length - 1; j >= 0; j--) { + if (overlapLen + prevGroup[j].length > overlapChars) break + overlapSentences.unshift(prevGroup[j]) + overlapLen += prevGroup[j].length + } + + const currentText = groups[i].join(' ') + if (overlapSentences.length > 0) { + result.push(`${overlapSentences.join(' ')} ${currentText}`) + } else { + // No complete sentence fits — fall back to character-level overlap + const prevText = prevGroup.join(' ') + const tail = prevText.slice(-overlapChars) + const wordMatch = tail.match(/^\s*\S/) + const cleanTail = wordMatch ? 
tail.slice(tail.indexOf(wordMatch[0].trim())) : tail + if (cleanTail.trim()) { + result.push(`${cleanTail.trim()} ${currentText}`) + } else { + result.push(currentText) + } + } + } + + return result + } +} diff --git a/apps/sim/lib/chunkers/structured-data-chunker.test.ts b/apps/sim/lib/chunkers/structured-data-chunker.test.ts index ad1aef5c70a..3cd6b7ec27a 100644 --- a/apps/sim/lib/chunkers/structured-data-chunker.test.ts +++ b/apps/sim/lib/chunkers/structured-data-chunker.test.ts @@ -11,19 +11,16 @@ vi.mock('@sim/logger', () => loggerMock) describe('StructuredDataChunker', () => { describe('isStructuredData', () => { it('should detect CSV content with many columns', () => { - // Detection requires >2 delimiters per line on average const csv = 'name,age,city,country\nAlice,30,NYC,USA\nBob,25,LA,USA' expect(StructuredDataChunker.isStructuredData(csv)).toBe(true) }) it('should detect TSV content with many columns', () => { - // Detection requires >2 delimiters per line on average const tsv = 'name\tage\tcity\tcountry\nAlice\t30\tNYC\tUSA\nBob\t25\tLA\tUSA' expect(StructuredDataChunker.isStructuredData(tsv)).toBe(true) }) it('should detect pipe-delimited content with many columns', () => { - // Detection requires >2 delimiters per line on average const piped = 'name|age|city|country\nAlice|30|NYC|USA\nBob|25|LA|USA' expect(StructuredDataChunker.isStructuredData(piped)).toBe(true) }) @@ -64,7 +61,6 @@ describe('StructuredDataChunker', () => { it('should handle inconsistent delimiter counts', () => { const inconsistent = 'name,age\nAlice,30,extra\nBob' - // May or may not detect as structured depending on variance threshold const result = StructuredDataChunker.isStructuredData(inconsistent) expect(typeof result).toBe('boolean') }) @@ -100,7 +96,7 @@ Bob,25` const chunks = await StructuredDataChunker.chunkStructuredData(csv) expect(chunks.length).toBeGreaterThan(0) - expect(chunks[0].text).toContain('Rows') + expect(chunks[0].text).toContain('rows of data') }) 
it.concurrent('should include sheet name when provided', async () => { @@ -184,7 +180,6 @@ Alice,30` const csv = 'name,age,city' const chunks = await StructuredDataChunker.chunkStructuredData(csv) - // Only header, no data rows expect(chunks.length).toBeGreaterThanOrEqual(0) }) @@ -271,9 +266,8 @@ Alice,30` const chunks = await StructuredDataChunker.chunkStructuredData(csv, { chunkSize: 500 }) expect(chunks.length).toBeGreaterThan(1) - // Verify total rows are distributed across chunks const totalRowCount = chunks.reduce((sum, chunk) => { - const match = chunk.text.match(/\[Rows (\d+) of data\]/) + const match = chunk.text.match(/\[(\d+) rows of data\]/) return sum + (match ? Number.parseInt(match[1]) : 0) }, 0) expect(totalRowCount).toBeGreaterThan(0) @@ -319,9 +313,7 @@ Alice,30` it.concurrent('should not detect with fewer than 3 delimiters per line', async () => { const sparse = `a,b 1,2` - // Only 1 comma per line, below threshold of >2 const result = StructuredDataChunker.isStructuredData(sparse) - // May or may not pass depending on implementation threshold expect(typeof result).toBe('boolean') }) }) @@ -337,7 +329,6 @@ Alice,30` const chunks = await StructuredDataChunker.chunkStructuredData(csv, { chunkSize: 200 }) expect(chunks.length).toBeGreaterThan(1) - // Each chunk should contain header info for (const chunk of chunks) { expect(chunk.text).toContain('Headers:') } diff --git a/apps/sim/lib/chunkers/structured-data-chunker.ts b/apps/sim/lib/chunkers/structured-data-chunker.ts index 0d962072440..757e8b67fdb 100644 --- a/apps/sim/lib/chunkers/structured-data-chunker.ts +++ b/apps/sim/lib/chunkers/structured-data-chunker.ts @@ -1,37 +1,22 @@ import { createLogger } from '@sim/logger' import type { Chunk, StructuredDataOptions } from '@/lib/chunkers/types' +/** Structured data is denser in tokens (~3 chars/token vs ~4 for prose) */ +function estimateStructuredTokens(text: string): number { + if (!text?.trim()) return 0 + return Math.ceil(text.length / 3) +} 
+ const logger = createLogger('StructuredDataChunker') -/** - * Default configuration for structured data chunking (CSV, XLSX, etc.) - * These are used when user doesn't provide preferences - */ const DEFAULT_CONFIG = { - // Target chunk size in tokens TARGET_CHUNK_SIZE: 1024, - MIN_CHUNK_SIZE: 100, - MAX_CHUNK_SIZE: 4000, - - // For spreadsheets, group rows together - ROWS_PER_CHUNK: 100, - MIN_ROWS_PER_CHUNK: 20, + MIN_ROWS_PER_CHUNK: 5, MAX_ROWS_PER_CHUNK: 500, - - // For better embeddings quality INCLUDE_HEADERS_IN_EACH_CHUNK: true, - MAX_HEADER_SIZE: 200, // tokens -} +} as const -/** - * Smart chunker for structured data (CSV, XLSX) that preserves semantic meaning - * Preserves headers in each chunk for better semantic context - */ export class StructuredDataChunker { - /** - * Chunk structured data intelligently based on rows and semantic boundaries - * Respects user's chunkSize preference when provided - */ static async chunkStructuredData( content: string, options: StructuredDataOptions = {} @@ -43,15 +28,12 @@ export class StructuredDataChunker { return chunks } - // Use user's chunk size or fall back to default const targetChunkSize = options.chunkSize ?? DEFAULT_CONFIG.TARGET_CHUNK_SIZE - // Detect headers (first line or provided) const headerLine = options.headers?.join('\t') || lines[0] const dataStartIndex = options.headers ? 
0 : 1 - // Calculate optimal rows per chunk based on content and user's target size - const estimatedTokensPerRow = StructuredDataChunker.estimateTokensPerRow( + const estimatedTokensPerRow = StructuredDataChunker.estimateStructuredTokensPerRow( lines.slice(dataStartIndex, Math.min(10, lines.length)) ) const optimalRowsPerChunk = StructuredDataChunker.calculateOptimalRowsPerChunk( @@ -65,14 +47,13 @@ export class StructuredDataChunker { let currentChunkRows: string[] = [] let currentTokenEstimate = 0 - const headerTokens = StructuredDataChunker.estimateTokens(headerLine) + const headerTokens = estimateStructuredTokens(headerLine) let chunkStartRow = dataStartIndex for (let i = dataStartIndex; i < lines.length; i++) { const row = lines[i] - const rowTokens = StructuredDataChunker.estimateTokens(row) + const rowTokens = estimateStructuredTokens(row) - // Check if adding this row would exceed our target const projectedTokens = currentTokenEstimate + rowTokens + @@ -84,7 +65,6 @@ export class StructuredDataChunker { currentChunkRows.length >= optimalRowsPerChunk if (shouldCreateChunk && currentChunkRows.length > 0) { - // Create chunk with current rows const chunkContent = StructuredDataChunker.formatChunk( headerLine, currentChunkRows, @@ -92,7 +72,6 @@ export class StructuredDataChunker { ) chunks.push(StructuredDataChunker.createChunk(chunkContent, chunkStartRow, i - 1)) - // Reset for next chunk currentChunkRows = [] currentTokenEstimate = 0 chunkStartRow = i @@ -102,7 +81,6 @@ export class StructuredDataChunker { currentTokenEstimate += rowTokens } - // Add remaining rows as final chunk if (currentChunkRows.length > 0) { const chunkContent = StructuredDataChunker.formatChunk( headerLine, @@ -117,41 +95,28 @@ export class StructuredDataChunker { return chunks } - /** - * Format a chunk with headers and context - */ private static formatChunk(headerLine: string, rows: string[], sheetName?: string): string { let content = '' - // Add sheet name context if available 
if (sheetName) { content += `=== ${sheetName} ===\n\n` } - // Add headers for context if (DEFAULT_CONFIG.INCLUDE_HEADERS_IN_EACH_CHUNK) { content += `Headers: ${headerLine}\n` content += `${'-'.repeat(Math.min(80, headerLine.length))}\n` } - // Add data rows content += rows.join('\n') - - // Add row count for context - content += `\n\n[Rows ${rows.length} of data]` + content += `\n\n[${rows.length} rows of data]` return content } - /** - * Create a chunk object with actual row indices - */ private static createChunk(content: string, startRow: number, endRow: number): Chunk { - const tokenCount = StructuredDataChunker.estimateTokens(content) - return { text: content, - tokenCount, + tokenCount: estimateStructuredTokens(content), metadata: { startIndex: startRow, endIndex: endRow, @@ -159,30 +124,13 @@ export class StructuredDataChunker { } } - /** - * Estimate tokens in text (rough approximation) - * For structured data with numbers, uses 1 token per 3 characters - */ - private static estimateTokens(text: string): number { - return Math.ceil(text.length / 3) - } - - /** - * Estimate average tokens per row from sample - */ - private static estimateTokensPerRow(sampleRows: string[]): number { - if (sampleRows.length === 0) return 50 // default estimate + private static estimateStructuredTokensPerRow(sampleRows: string[]): number { + if (sampleRows.length === 0) return 50 - const totalTokens = sampleRows.reduce( - (sum, row) => sum + StructuredDataChunker.estimateTokens(row), - 0 - ) + const totalTokens = sampleRows.reduce((sum, row) => sum + estimateStructuredTokens(row), 0) return Math.ceil(totalTokens / sampleRows.length) } - /** - * Calculate optimal rows per chunk based on token estimates and target size - */ private static calculateOptimalRowsPerChunk( tokensPerRow: number, targetChunkSize: number @@ -195,11 +143,7 @@ export class StructuredDataChunker { ) } - /** - * Check if content appears to be structured data - */ static isStructuredData(content: string, 
mimeType?: string): boolean { - // Check mime type first if (mimeType) { const structuredMimeTypes = [ 'text/csv', @@ -212,20 +156,17 @@ export class StructuredDataChunker { } } - // Check content structure - const lines = content.split('\n').slice(0, 10) // Check first 10 lines + const lines = content.split('\n').slice(0, 10) if (lines.length < 2) return false - // Check for consistent delimiters (comma, tab, pipe) const delimiters = [',', '\t', '|'] for (const delimiter of delimiters) { - const counts = lines.map( - (line) => (line.match(new RegExp(`\\${delimiter}`, 'g')) || []).length - ) + const escaped = delimiter.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') + const counts = lines.map((line) => (line.match(new RegExp(escaped, 'g')) || []).length) const avgCount = counts.reduce((a, b) => a + b, 0) / counts.length - // If most lines have similar delimiter counts, it's likely structured - if (avgCount > 2 && counts.every((c) => Math.abs(c - avgCount) <= 2)) { + const tolerance = Math.max(1, Math.ceil(avgCount * 0.2)) + if (avgCount > 2 && counts.every((c) => Math.abs(c - avgCount) <= tolerance)) { return true } } diff --git a/apps/sim/lib/chunkers/text-chunker.test.ts b/apps/sim/lib/chunkers/text-chunker.test.ts index 3b8b8455691..f7c2458d4b5 100644 --- a/apps/sim/lib/chunkers/text-chunker.test.ts +++ b/apps/sim/lib/chunkers/text-chunker.test.ts @@ -30,7 +30,7 @@ describe('TextChunker', () => { it.concurrent('should include token count in chunk metadata', async () => { const chunker = new TextChunker({ chunkSize: 100 }) - const text = 'Hello world' // ~3 tokens (11 chars / 4) + const text = 'Hello world' const chunks = await chunker.chunk(text) expect(chunks[0].tokenCount).toBe(3) @@ -201,7 +201,6 @@ describe('TextChunker', () => { it.concurrent('should use default minCharactersPerChunk of 100', async () => { const chunker = new TextChunker({ chunkSize: 10 }) - // Text with 150+ characters to ensure chunks pass the 100 character minimum const text = 'This is a longer 
sentence with more content. '.repeat(5) const chunks = await chunker.chunk(text) @@ -266,7 +265,6 @@ describe('TextChunker', () => { describe('boundary conditions', () => { it.concurrent('should handle text exactly at chunk size boundary', async () => { const chunker = new TextChunker({ chunkSize: 10 }) - // 40 characters = 10 tokens exactly const text = 'A'.repeat(40) const chunks = await chunker.chunk(text) @@ -276,7 +274,6 @@ describe('TextChunker', () => { it.concurrent('should handle text one token over chunk size', async () => { const chunker = new TextChunker({ chunkSize: 10 }) - // 44 characters = 11 tokens, just over limit const text = 'A'.repeat(44) const chunks = await chunker.chunk(text) @@ -300,7 +297,6 @@ describe('TextChunker', () => { }) it.concurrent('should clamp overlap to max 50% of chunk size', async () => { - // Overlap of 60 should be clamped to 10 (50% of chunkSize 20) const chunker = new TextChunker({ chunkSize: 20, chunkOverlap: 60 }) const text = 'First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here.' 
const chunks = await chunker.chunk(text) @@ -359,7 +355,6 @@ describe('TextChunker', () => { it.concurrent('should handle combining diacritics', async () => { const chunker = new TextChunker({ chunkSize: 100 }) - // e + combining acute accent const text = 'cafe\u0301 resume\u0301 naive\u0308' const chunks = await chunker.chunk(text) @@ -368,7 +363,6 @@ describe('TextChunker', () => { it.concurrent('should handle zero-width characters', async () => { const chunker = new TextChunker({ chunkSize: 100 }) - // Zero-width space, zero-width non-joiner, zero-width joiner const text = 'Hello\u200B\u200C\u200DWorld' const chunks = await chunker.chunk(text) @@ -391,14 +385,12 @@ describe('TextChunker', () => { const chunks = await chunker.chunk(text) expect(chunks.length).toBeGreaterThan(1) - // Verify all content is preserved const totalChars = chunks.reduce((sum, c) => sum + c.text.length, 0) expect(totalChars).toBeGreaterThan(0) }) it.concurrent('should handle 1MB of text', async () => { const chunker = new TextChunker({ chunkSize: 500 }) - // 1MB of text const text = 'Lorem ipsum dolor sit amet. 
'.repeat(40000) const chunks = await chunker.chunk(text) @@ -407,7 +399,6 @@ describe('TextChunker', () => { it.concurrent('should handle very long single line', async () => { const chunker = new TextChunker({ chunkSize: 50 }) - // Single line with no natural break points const text = 'Word'.repeat(10000) const chunks = await chunker.chunk(text) diff --git a/apps/sim/lib/chunkers/text-chunker.ts b/apps/sim/lib/chunkers/text-chunker.ts index 7dbbde0cf97..eb993b609aa 100644 --- a/apps/sim/lib/chunkers/text-chunker.ts +++ b/apps/sim/lib/chunkers/text-chunker.ts @@ -1,99 +1,61 @@ import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types' +import { + addOverlap, + buildChunks, + cleanText, + estimateTokens, + resolveChunkerOptions, + splitAtWordBoundaries, + tokensToChars, +} from '@/lib/chunkers/utils' -/** - * Lightweight text chunker optimized for RAG applications - * Uses hierarchical splitting with simple character-based token estimation - * - * Parameters: - * - chunkSize: Maximum chunk size in TOKENS (default: 1024) - * - chunkOverlap: Overlap between chunks in TOKENS (default: 0) - * - minCharactersPerChunk: Minimum characters to keep a chunk (default: 100) - */ export class TextChunker { - private readonly chunkSize: number // Max chunk size in tokens - private readonly chunkOverlap: number // Overlap in tokens - private readonly minCharactersPerChunk: number // Min characters per chunk + private readonly chunkSize: number + private readonly chunkOverlap: number - // Hierarchical separators ordered from largest to smallest semantic units private readonly separators = [ - '\n\n\n', // Document sections - '\n---\n', // Markdown horizontal rules - '\n***\n', // Markdown horizontal rules (alternative) - '\n___\n', // Markdown horizontal rules (alternative) - '\n# ', // Markdown H1 headings - '\n## ', // Markdown H2 headings - '\n### ', // Markdown H3 headings - '\n#### ', // Markdown H4 headings - '\n##### ', // Markdown H5 headings - '\n###### ', // 
Markdown H6 headings - '\n\n', // Paragraphs - '\n', // Lines - '. ', // Sentences - '! ', // Exclamations - '? ', // Questions - '; ', // Semicolons - ', ', // Commas - ' ', // Words + '\n---\n', + '\n***\n', + '\n___\n', + '\n# ', + '\n## ', + '\n### ', + '\n#### ', + '\n##### ', + '\n###### ', + '\n\n', + '\n', + '. ', + '! ', + '? ', + '; ', + ', ', + ' ', ] constructor(options: ChunkerOptions = {}) { - this.chunkSize = options.chunkSize ?? 1024 - // Clamp overlap to prevent exceeding chunk size (max 50% of chunk size) - const maxOverlap = Math.floor(this.chunkSize * 0.5) - this.chunkOverlap = Math.min(options.chunkOverlap ?? 0, maxOverlap) - this.minCharactersPerChunk = options.minCharactersPerChunk ?? 100 + const resolved = resolveChunkerOptions(options) + this.chunkSize = resolved.chunkSize + this.chunkOverlap = resolved.chunkOverlap } - /** - * Simple token estimation using character count - * 1 token ≈ 4 characters for English text - */ - private estimateTokens(text: string): number { - if (!text?.trim()) return 0 - return Math.ceil(text.length / 4) - } - - /** - * Convert tokens to approximate character count - */ - private tokensToChars(tokens: number): number { - return tokens * 4 - } - - /** - * Split text recursively using hierarchical separators - */ - private async splitRecursively(text: string, separatorIndex = 0): Promise { - const tokenCount = this.estimateTokens(text) + private splitRecursively(text: string, separatorIndex = 0): string[] { + const tokenCount = estimateTokens(text) - // If chunk is small enough (within max token limit), return it - // Keep chunks even if below minCharactersPerChunk to avoid data loss if (tokenCount <= this.chunkSize) { - // Only filter out empty/whitespace-only text, not small chunks return text.trim() ? 
[text] : [] } - // If we've run out of separators, force split by character count if (separatorIndex >= this.separators.length) { - const chunks: string[] = [] - const targetLength = Math.ceil((text.length * this.chunkSize) / tokenCount) - - for (let i = 0; i < text.length; i += targetLength) { - const chunk = text.slice(i, i + targetLength).trim() - // Keep all non-empty chunks to avoid data loss - if (chunk) { - chunks.push(chunk) - } - } - return chunks + const chunkSizeChars = tokensToChars(this.chunkSize) + return splitAtWordBoundaries(text, chunkSizeChars) } const separator = this.separators[separatorIndex] const parts = text.split(separator).filter((part) => part.trim()) - // If no split occurred, try next separator if (parts.length <= 1) { - return await this.splitRecursively(text, separatorIndex + 1) + return this.splitRecursively(text, separatorIndex + 1) } const chunks: string[] = [] @@ -102,17 +64,15 @@ export class TextChunker { for (const part of parts) { const testChunk = currentChunk + (currentChunk ? 
separator : '') + part - if (this.estimateTokens(testChunk) <= this.chunkSize) { + if (estimateTokens(testChunk) <= this.chunkSize) { currentChunk = testChunk } else { - // Save current chunk - keep even if below minCharactersPerChunk to avoid data loss if (currentChunk.trim()) { chunks.push(currentChunk.trim()) } - // If part itself is too large, split it further - if (this.estimateTokens(part) > this.chunkSize) { - const subChunks = await this.splitRecursively(part, separatorIndex + 1) + if (estimateTokens(part) > this.chunkSize) { + const subChunks = this.splitRecursively(part, separatorIndex + 1) for (const subChunk of subChunks) { chunks.push(subChunk) } @@ -123,7 +83,6 @@ export class TextChunker { } } - // Add final chunk if it exists - keep even if below minCharactersPerChunk to avoid data loss if (currentChunk.trim()) { chunks.push(currentChunk.trim()) } @@ -131,111 +90,19 @@ export class TextChunker { return chunks } - /** - * Add overlap between chunks (overlap is in tokens, converted to characters) - */ - private addOverlap(chunks: string[]): string[] { - if (this.chunkOverlap <= 0 || chunks.length <= 1) { - return chunks - } - - const overlappedChunks: string[] = [] - // Convert token overlap to character overlap - const overlapChars = this.tokensToChars(this.chunkOverlap) - - for (let i = 0; i < chunks.length; i++) { - let chunk = chunks[i] - - // Add overlap from previous chunk (converted from tokens to characters) - if (i > 0) { - const prevChunk = chunks[i - 1] - // Take the last N characters from previous chunk (based on token overlap) - const overlapLength = Math.min(overlapChars, prevChunk.length) - const overlapText = prevChunk.slice(-overlapLength) - - // Try to start overlap at a word boundary for cleaner text - const wordBoundaryMatch = overlapText.match(/^\s*\S/) - const cleanOverlap = wordBoundaryMatch - ? 
overlapText.slice(overlapText.indexOf(wordBoundaryMatch[0].trim())) - : overlapText - - if (cleanOverlap.trim()) { - chunk = `${cleanOverlap.trim()} ${chunk}` - } - } - - overlappedChunks.push(chunk) - } - - return overlappedChunks - } - - /** - * Clean and normalize text - */ - private cleanText(text: string): string { - return text - .replace(/\r\n/g, '\n') // Normalize Windows line endings - .replace(/\r/g, '\n') // Normalize old Mac line endings - .replace(/\n{3,}/g, '\n\n') // Limit consecutive newlines - .replace(/\t/g, ' ') // Convert tabs to spaces - .replace(/ {2,}/g, ' ') // Collapse multiple spaces - .trim() - } - - /** - * Main chunking method - */ async chunk(text: string): Promise { if (!text?.trim()) { return [] } - // Clean the text - const cleanedText = this.cleanText(text) - - // Split into chunks - let chunks = await this.splitRecursively(cleanedText) - - // Add overlap if configured - chunks = this.addOverlap(chunks) - - // Convert to Chunk objects with metadata - let previousEndIndex = 0 - const chunkPromises = chunks.map(async (chunkText, index) => { - let startIndex: number - let actualContentLength: number + const cleaned = cleanText(text) + let chunks = this.splitRecursively(cleaned) - if (index === 0 || this.chunkOverlap <= 0) { - // First chunk or no overlap - start from previous end - startIndex = previousEndIndex - actualContentLength = chunkText.length - } else { - // Calculate overlap length in characters (converted from tokens) - const prevChunk = chunks[index - 1] - const overlapChars = this.tokensToChars(this.chunkOverlap) - const overlapLength = Math.min(overlapChars, prevChunk.length, chunkText.length) - - startIndex = previousEndIndex - overlapLength - actualContentLength = chunkText.length - overlapLength - } - - const safeStart = Math.max(0, startIndex) - const endIndexSafe = safeStart + Math.max(0, actualContentLength) - - const chunk: Chunk = { - text: chunkText, - tokenCount: this.estimateTokens(chunkText), - metadata: { - 
startIndex: safeStart, - endIndex: endIndexSafe, - }, - } - - previousEndIndex = endIndexSafe - return chunk - }) + if (this.chunkOverlap > 0) { + const overlapChars = tokensToChars(this.chunkOverlap) + chunks = addOverlap(chunks, overlapChars) + } - return await Promise.all(chunkPromises) + return buildChunks(chunks, this.chunkOverlap) } } diff --git a/apps/sim/lib/chunkers/token-chunker.test.ts b/apps/sim/lib/chunkers/token-chunker.test.ts new file mode 100644 index 00000000000..420224c4d6e --- /dev/null +++ b/apps/sim/lib/chunkers/token-chunker.test.ts @@ -0,0 +1,239 @@ +/** + * @vitest-environment node + */ + +import { loggerMock } from '@sim/testing' +import { describe, expect, it, vi } from 'vitest' +import { TokenChunker } from './token-chunker' + +vi.mock('@sim/logger', () => loggerMock) + +describe('TokenChunker', () => { + describe('empty and whitespace input', () => { + it.concurrent('should return empty array for empty string', async () => { + const chunker = new TokenChunker({ chunkSize: 100 }) + const chunks = await chunker.chunk('') + expect(chunks).toEqual([]) + }) + + it.concurrent('should return empty array for whitespace-only input', async () => { + const chunker = new TokenChunker({ chunkSize: 100 }) + const chunks = await chunker.chunk(' \n\n\t ') + expect(chunks).toEqual([]) + }) + }) + + describe('small content', () => { + it.concurrent('should return single chunk when content fits within chunkSize', async () => { + const chunker = new TokenChunker({ chunkSize: 100 }) + const text = 'This is a short text.' 
+ const chunks = await chunker.chunk(text) + + expect(chunks).toHaveLength(1) + expect(chunks[0].text).toBe(text) + }) + }) + + describe('token count accuracy', () => { + it.concurrent('should compute tokenCount as Math.ceil(text.length / 4)', async () => { + const chunker = new TokenChunker({ chunkSize: 100 }) + const text = 'Hello world' + const chunks = await chunker.chunk(text) + + expect(chunks[0].tokenCount).toBe(Math.ceil(text.length / 4)) + }) + + it.concurrent('should compute tokenCount correctly for longer text', async () => { + const chunker = new TokenChunker({ chunkSize: 100 }) + const text = 'The quick brown fox jumps over the lazy dog.' + const chunks = await chunker.chunk(text) + + expect(chunks[0].tokenCount).toBe(11) + }) + }) + + describe('chunk metadata', () => { + it.concurrent( + 'should include text, tokenCount, and metadata with startIndex and endIndex', + async () => { + const chunker = new TokenChunker({ chunkSize: 100 }) + const text = 'Some test content here.' + const chunks = await chunker.chunk(text) + + expect(chunks[0]).toHaveProperty('text') + expect(chunks[0]).toHaveProperty('tokenCount') + expect(chunks[0].metadata).toHaveProperty('startIndex') + expect(chunks[0].metadata).toHaveProperty('endIndex') + expect(chunks[0].metadata.startIndex).toBe(0) + expect(chunks[0].metadata.endIndex).toBeGreaterThan(0) + } + ) + + it.concurrent('should have non-negative indices across all chunks', async () => { + const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 0 }) + const text = 'First part of the text. Second part of the text. Third part of the text.' 
+ const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.metadata.startIndex).toBeGreaterThanOrEqual(0) + expect(chunk.metadata.endIndex).toBeGreaterThanOrEqual(chunk.metadata.startIndex) + } + }) + }) + + describe('respects chunk size', () => { + it.concurrent('should not produce chunks exceeding chunkSize tokens', async () => { + const chunkSize = 50 + const chunker = new TokenChunker({ chunkSize }) + const text = 'This is a test sentence with several words. '.repeat(30) + const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + expect(chunk.tokenCount).toBeLessThanOrEqual(chunkSize) + } + }) + }) + + describe('splitting behavior', () => { + it.concurrent('should produce multiple chunks for long text', async () => { + const chunker = new TokenChunker({ chunkSize: 50 }) + const text = 'This is a test sentence. '.repeat(30) + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + }) + + it.concurrent('should create more chunks with smaller chunkSize', async () => { + const text = 'This is a test sentence with content. '.repeat(20) + + const largeChunker = new TokenChunker({ chunkSize: 200 }) + const smallChunker = new TokenChunker({ chunkSize: 50 }) + + const largeChunks = await largeChunker.chunk(text) + const smallChunks = await smallChunker.chunk(text) + + expect(smallChunks.length).toBeGreaterThan(largeChunks.length) + }) + }) + + describe('sliding window overlap', () => { + it.concurrent('should produce more chunks with overlap than without', async () => { + const text = + 'Alpha bravo charlie delta echo foxtrot golf hotel india juliet kilo lima mike november oscar papa quebec romeo sierra tango uniform victor whiskey xray yankee zulu. 
'.repeat( + 5 + ) + const withOverlap = new TokenChunker({ chunkSize: 30, chunkOverlap: 10 }) + const withoutOverlap = new TokenChunker({ chunkSize: 30, chunkOverlap: 0 }) + + const overlapChunks = await withOverlap.chunk(text) + const noOverlapChunks = await withoutOverlap.chunk(text) + + expect(overlapChunks.length).toBeGreaterThan(noOverlapChunks.length) + }) + + it.concurrent('should not share text between chunks when chunkOverlap is 0', async () => { + const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 0 }) + const text = + 'First sentence here. Second sentence here. Third sentence here. Fourth sentence here.' + const chunks = await chunker.chunk(text) + + if (chunks.length > 1) { + const firstChunkEnd = chunks[0].text.slice(-10) + expect(chunks[1].text.startsWith(firstChunkEnd)).toBe(false) + } + }) + }) + + describe('overlap clamped to 50%', () => { + it.concurrent('should still work when overlap is set >= chunkSize', async () => { + const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 100 }) + const text = + 'First paragraph content here. Second paragraph content here. Third paragraph here.' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(0) + }) + + it.concurrent('should clamp overlap to 50% of chunkSize', async () => { + const chunkerClamped = new TokenChunker({ chunkSize: 20, chunkOverlap: 100 }) + const chunkerHalf = new TokenChunker({ chunkSize: 20, chunkOverlap: 10 }) + const text = + 'Word one two three four five six seven eight nine ten eleven twelve thirteen fourteen fifteen sixteen seventeen eighteen nineteen twenty. 
'.repeat( + 5 + ) + + const clampedChunks = await chunkerClamped.chunk(text) + const halfChunks = await chunkerHalf.chunk(text) + + expect(clampedChunks.length).toBe(halfChunks.length) + }) + }) + + describe('word boundary snapping', () => { + it.concurrent('should produce trimmed chunks without leading or trailing spaces', async () => { + const chunker = new TokenChunker({ chunkSize: 20 }) + const text = + 'the cat sat on the mat and the dog ran fast over the big red fox and then the bird flew high up in the clear blue sky above the green hill' + const chunks = await chunker.chunk(text) + + expect(chunks.length).toBeGreaterThan(1) + for (const chunk of chunks) { + const trimmed = chunk.text.trim() + expect(trimmed).toBe(chunk.text) + expect(trimmed.length).toBeGreaterThan(0) + } + }) + + it.concurrent('should produce chunks that start and end on word boundaries', async () => { + const chunker = new TokenChunker({ chunkSize: 15 }) + const text = + 'The quick brown fox jumps over the lazy dog and then runs away quickly into the forest' + const chunks = await chunker.chunk(text) + + for (const chunk of chunks) { + const trimmed = chunk.text.trim() + expect(trimmed).toBe(chunk.text) + } + }) + }) + + describe('consistent coverage', () => { + it.concurrent('should represent all content from original text across chunks', async () => { + const chunker = new TokenChunker({ chunkSize: 30, chunkOverlap: 0 }) + const originalText = + 'The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs.' 
+ const chunks = await chunker.chunk(originalText) + + const allText = chunks.map((c) => c.text).join(' ') + expect(allText).toContain('quick') + expect(allText).toContain('fox') + expect(allText).toContain('lazy') + expect(allText).toContain('dog') + expect(allText).toContain('liquor') + expect(allText).toContain('jugs') + }) + + it.concurrent('should preserve all words across chunks for longer text', async () => { + const chunker = new TokenChunker({ chunkSize: 20, chunkOverlap: 0 }) + const words = [ + 'alpha', + 'bravo', + 'charlie', + 'delta', + 'echo', + 'foxtrot', + 'golf', + 'hotel', + 'india', + 'juliet', + ] + const originalText = `${words.join(' is a word and ')} is also a word.` + const chunks = await chunker.chunk(originalText) + + const combined = chunks.map((c) => c.text).join(' ') + for (const word of words) { + expect(combined).toContain(word) + } + }) + }) +}) diff --git a/apps/sim/lib/chunkers/token-chunker.ts b/apps/sim/lib/chunkers/token-chunker.ts new file mode 100644 index 00000000000..d98b4d1651a --- /dev/null +++ b/apps/sim/lib/chunkers/token-chunker.ts @@ -0,0 +1,54 @@ +import { createLogger } from '@sim/logger' +import type { Chunk, ChunkerOptions } from '@/lib/chunkers/types' +import { + buildChunks, + cleanText, + estimateTokens, + resolveChunkerOptions, + splitAtWordBoundaries, + tokensToChars, +} from '@/lib/chunkers/utils' + +const logger = createLogger('TokenChunker') + +export class TokenChunker { + private readonly chunkSize: number + private readonly chunkOverlap: number + private readonly minCharactersPerChunk: number + + constructor(options: ChunkerOptions = {}) { + const resolved = resolveChunkerOptions(options) + this.chunkSize = resolved.chunkSize + this.chunkOverlap = resolved.chunkOverlap + this.minCharactersPerChunk = resolved.minCharactersPerChunk + } + + async chunk(content: string): Promise<Chunk[]> { + if (!content?.trim()) { + return [] + } + + const cleaned = cleanText(content) + + if (estimateTokens(cleaned) <=
this.chunkSize) { + logger.info('Content fits in single chunk') + return buildChunks([cleaned], 0) + } + + const chunkSizeChars = tokensToChars(this.chunkSize) + const overlapChars = tokensToChars(this.chunkOverlap) + const stepChars = this.chunkOverlap > 0 ? chunkSizeChars - overlapChars : undefined + + const rawChunks = splitAtWordBoundaries(cleaned, chunkSizeChars, stepChars) + + const filtered = + rawChunks.length > 1 + ? rawChunks.filter((c) => c.length >= this.minCharactersPerChunk) + : rawChunks + + const chunks = filtered.length > 0 ? filtered : rawChunks + + logger.info(`Chunked into ${chunks.length} token-based chunks`) + return buildChunks(chunks, this.chunkOverlap) + } +} diff --git a/apps/sim/lib/chunkers/types.ts b/apps/sim/lib/chunkers/types.ts index a316d643f03..692e84d12fc 100644 --- a/apps/sim/lib/chunkers/types.ts +++ b/apps/sim/lib/chunkers/types.ts @@ -1,17 +1,11 @@ /** - * Options for configuring text chunkers - * * Units: - * - chunkSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters) - * - chunkOverlap: Overlap between chunks in TOKENS - * - minCharactersPerChunk: Minimum chunk size in CHARACTERS (filters tiny fragments) + * - chunkSize/chunkOverlap: TOKENS (1 token ≈ 4 characters) + * - minCharactersPerChunk: CHARACTERS */ export interface ChunkerOptions { - /** Maximum chunk size in tokens (default: 1024) */ chunkSize?: number - /** Overlap between chunks in tokens (default: 0) */ chunkOverlap?: number - /** Minimum chunk size in characters to avoid tiny fragments (default: 100) */ minCharactersPerChunk?: number } @@ -51,3 +45,26 @@ export interface DocChunk { export interface DocsChunkerOptions extends ChunkerOptions { baseUrl?: string } + +export type ChunkingStrategy = 'auto' | 'text' | 'regex' | 'recursive' | 'sentence' | 'token' + +export type RecursiveRecipe = 'plain' | 'markdown' | 'code' + +export interface StrategyOptions { + pattern?: string + separators?: string[] + recipe?: RecursiveRecipe +} + +export interface 
SentenceChunkerOptions extends ChunkerOptions { + minSentencesPerChunk?: number +} + +export interface RecursiveChunkerOptions extends ChunkerOptions { + separators?: string[] + recipe?: RecursiveRecipe +} + +export interface RegexChunkerOptions extends ChunkerOptions { + pattern: string +} diff --git a/apps/sim/lib/chunkers/utils.test.ts b/apps/sim/lib/chunkers/utils.test.ts new file mode 100644 index 00000000000..bc88bc0e46a --- /dev/null +++ b/apps/sim/lib/chunkers/utils.test.ts @@ -0,0 +1,217 @@ +/** + * @vitest-environment node + */ + +import { describe, expect, it } from 'vitest' +import { + addOverlap, + buildChunks, + cleanText, + estimateTokens, + resolveChunkerOptions, + splitAtWordBoundaries, + tokensToChars, +} from './utils' + +describe('estimateTokens', () => { + it('returns 0 for empty string', () => { + expect(estimateTokens('')).toBe(0) + }) + + it('returns 0 for whitespace-only string', () => { + expect(estimateTokens(' ')).toBe(0) + }) + + it('returns 0 for null or undefined via optional chaining', () => { + expect(estimateTokens(null as unknown as string)).toBe(0) + expect(estimateTokens(undefined as unknown as string)).toBe(0) + }) + + it('returns Math.ceil(text.length / 4) for normal text', () => { + const text = 'Hello world' + expect(estimateTokens(text)).toBe(Math.ceil(text.length / 4)) + }) + + it('estimates "Hello world" (11 chars) as 3 tokens', () => { + expect(estimateTokens('Hello world')).toBe(3) + }) +}) + +describe('tokensToChars', () => { + it('returns tokens * 4', () => { + expect(tokensToChars(1)).toBe(4) + expect(tokensToChars(5)).toBe(20) + }) + + it('converts 10 tokens to 40 chars', () => { + expect(tokensToChars(10)).toBe(40) + }) +}) + +describe('cleanText', () => { + it('normalizes \\r\\n to \\n', () => { + expect(cleanText('hello\r\nworld')).toBe('hello\nworld') + }) + + it('normalizes \\r to \\n', () => { + expect(cleanText('hello\rworld')).toBe('hello\nworld') + }) + + it('collapses 3+ newlines to \\n\\n', () => { + 
expect(cleanText('hello\n\n\n\nworld')).toBe('hello\n\nworld') + }) + + it('replaces tabs with spaces', () => { + expect(cleanText('hello\tworld')).toBe('hello world') + }) + + it('collapses multiple spaces to single space', () => { + expect(cleanText('hello world')).toBe('hello world') + }) + + it('trims leading and trailing whitespace', () => { + expect(cleanText(' hello world ')).toBe('hello world') + }) +}) + +describe('addOverlap', () => { + it('returns unchanged chunks when overlapChars <= 0', () => { + const chunks = ['chunk one', 'chunk two'] + expect(addOverlap(chunks, 0)).toEqual(chunks) + expect(addOverlap(chunks, -5)).toEqual(chunks) + }) + + it('returns unchanged chunks when only 1 chunk', () => { + const chunks = ['only chunk'] + expect(addOverlap(chunks, 10)).toEqual(chunks) + }) + + it('prepends tail of previous chunk to next chunk with overlap > 0', () => { + const chunks = ['first chunk here', 'second chunk here'] + const result = addOverlap(chunks, 10) + expect(result[0]).toBe('first chunk here') + expect(result[1]).toContain('second chunk here') + expect(result[1].length).toBeGreaterThan('second chunk here'.length) + }) + + it('joins overlap text with space', () => { + const chunks = ['first chunk here', 'second chunk here'] + const result = addOverlap(chunks, 10) + expect(result[1]).toContain('here second') + }) + + it('snaps overlap to word boundary', () => { + const chunks = ['hello beautiful world', 'next chunk'] + const result = addOverlap(chunks, 15) + expect(result[1]).toBe('beautiful world next chunk') + }) +}) + +describe('splitAtWordBoundaries', () => { + it('returns single element for short text', () => { + const result = splitAtWordBoundaries('short text', 100) + expect(result).toHaveLength(1) + expect(result[0]).toBe('short text') + }) + + it('produces multiple chunks for long text', () => { + const text = 'word '.repeat(100).trim() + const result = splitAtWordBoundaries(text, 20) + expect(result.length).toBeGreaterThan(1) + }) + + 
it('respects chunk size limit', () => { + const text = 'word '.repeat(100).trim() + const chunkSize = 25 + const result = splitAtWordBoundaries(text, chunkSize) + for (const chunk of result) { + expect(chunk.length).toBeLessThanOrEqual(chunkSize) + } + }) + + it('does not break mid-word', () => { + const text = 'internationalization globalization modernization' + const result = splitAtWordBoundaries(text, 25) + for (const chunk of result) { + expect(chunk).not.toMatch(/^\S+\s\S+$.*\S$/) + const words = chunk.split(' ') + for (const word of words) { + expect(text).toContain(word) + } + } + }) + + it('produces overlapping chunks with stepChars < chunkSizeChars', () => { + const text = 'one two three four five six seven eight nine ten' + const result = splitAtWordBoundaries(text, 20, 10) + expect(result.length).toBeGreaterThan(1) + const combined = result.join(' ') + for (const word of text.split(' ')) { + expect(combined).toContain(word) + } + }) + + it('ensures step is at least 1 to prevent infinite loops', () => { + const text = 'hello world test' + const result = splitAtWordBoundaries(text, 10, 0) + expect(result.length).toBeGreaterThan(0) + }) +}) + +describe('buildChunks', () => { + it('creates Chunk objects with text, tokenCount, and metadata', () => { + const texts = ['hello world', 'foo bar'] + const chunks = buildChunks(texts, 0) + for (const chunk of chunks) { + expect(chunk).toHaveProperty('text') + expect(chunk).toHaveProperty('tokenCount') + expect(chunk).toHaveProperty('metadata') + expect(chunk.metadata).toHaveProperty('startIndex') + expect(chunk.metadata).toHaveProperty('endIndex') + } + }) + + it('sets metadata with startIndex and endIndex', () => { + const texts = ['chunk one', 'chunk two'] + const chunks = buildChunks(texts, 0) + expect(typeof chunks[0].metadata.startIndex).toBe('number') + expect(typeof chunks[0].metadata.endIndex).toBe('number') + }) + + it('sets startIndex of first chunk to 0', () => { + const texts = ['first chunk', 'second 
chunk'] + const chunks = buildChunks(texts, 0) + expect(chunks[0].metadata.startIndex).toBe(0) + }) + + it('produces contiguous chunks with overlapTokens=0', () => { + const texts = ['hello world', 'foo bar baz'] + const chunks = buildChunks(texts, 0) + expect(chunks[0].metadata.endIndex).toBe(chunks[1].metadata.startIndex) + }) +}) + +describe('resolveChunkerOptions', () => { + it('applies defaults: chunkSize=1024, chunkOverlap=0, minCharactersPerChunk=100', () => { + const result = resolveChunkerOptions({}) + expect(result.chunkSize).toBe(1024) + expect(result.chunkOverlap).toBe(0) + expect(result.minCharactersPerChunk).toBe(100) + }) + + it('clamps overlap to max 50% of chunkSize', () => { + const result = resolveChunkerOptions({ chunkSize: 100, chunkOverlap: 80 }) + expect(result.chunkOverlap).toBe(50) + }) + + it('respects provided values when within limits', () => { + const result = resolveChunkerOptions({ + chunkSize: 500, + chunkOverlap: 100, + minCharactersPerChunk: 50, + }) + expect(result.chunkSize).toBe(500) + expect(result.chunkOverlap).toBe(100) + expect(result.minCharactersPerChunk).toBe(50) + }) +}) diff --git a/apps/sim/lib/chunkers/utils.ts b/apps/sim/lib/chunkers/utils.ts new file mode 100644 index 00000000000..ded68dbc192 --- /dev/null +++ b/apps/sim/lib/chunkers/utils.ts @@ -0,0 +1,143 @@ +import type { Chunk } from '@/lib/chunkers/types' + +/** 1 token ≈ 4 characters for English text */ +export function estimateTokens(text: string): number { + if (!text?.trim()) return 0 + return Math.ceil(text.length / 4) +} + +export function tokensToChars(tokens: number): number { + return tokens * 4 +} + +export function cleanText(text: string): string { + return text + .replace(/\r\n/g, '\n') + .replace(/\r/g, '\n') + .replace(/\n{3,}/g, '\n\n') + .replace(/\t/g, ' ') + .replace(/ {2,}/g, ' ') + .trim() +} + +export function addOverlap(chunks: string[], overlapChars: number): string[] { + if (overlapChars <= 0 || chunks.length <= 1) { + return chunks + } 
+ + const result: string[] = [] + + for (let i = 0; i < chunks.length; i++) { + let chunk = chunks[i] + + if (i > 0) { + const prevChunk = chunks[i - 1] + const overlapLength = Math.min(overlapChars, prevChunk.length) + const overlapText = prevChunk.slice(-overlapLength) + + const wordBoundaryMatch = overlapText.match(/^\s*\S/) + const cleanOverlap = wordBoundaryMatch + ? overlapText.slice(overlapText.indexOf(wordBoundaryMatch[0].trim())) + : overlapText + + if (cleanOverlap.trim()) { + chunk = `${cleanOverlap.trim()} ${chunk}` + } + } + + result.push(chunk) + } + + return result +} + +/** + * When stepChars is provided (< chunkSizeChars), produces overlapping chunks + * using a sliding window where chunks stay within the size limit. + */ +export function splitAtWordBoundaries( + text: string, + chunkSizeChars: number, + stepChars?: number +): string[] { + const parts: string[] = [] + let pos = 0 + + while (pos < text.length) { + let end = Math.min(pos + chunkSizeChars, text.length) + + if (end < text.length) { + const lastSpace = text.lastIndexOf(' ', end) + if (lastSpace > pos) { + end = lastSpace + } + } + + const part = text.slice(pos, end).trim() + if (part) { + parts.push(part) + } + + if (stepChars !== undefined) { + // Sliding window: advance by step for predictable overlap + const nextPos = pos + Math.max(1, stepChars) + if (nextPos >= text.length) break + pos = nextPos + } else { + // Non-overlapping: advance from end of extracted content + if (end >= text.length) break + pos = end + } + while (pos < text.length && text[pos] === ' ') pos++ + } + + return parts +} + +export function buildChunks(texts: string[], overlapTokens: number): Chunk[] { + let previousEndIndex = 0 + const overlapChars = tokensToChars(overlapTokens) + + return texts.map((text, index) => { + let startIndex: number + let actualContentLength: number + + if (index === 0 || overlapTokens <= 0) { + startIndex = previousEndIndex + actualContentLength = text.length + } else { + const 
prevChunk = texts[index - 1] + const overlapLength = Math.min(overlapChars, prevChunk.length, text.length) + startIndex = previousEndIndex - overlapLength + actualContentLength = text.length - overlapLength + } + + const safeStart = Math.max(0, startIndex) + const endIndex = safeStart + Math.max(0, actualContentLength) + + previousEndIndex = endIndex + + return { + text, + tokenCount: estimateTokens(text), + metadata: { + startIndex: safeStart, + endIndex, + }, + } + }) +} + +export function resolveChunkerOptions(options: { + chunkSize?: number + chunkOverlap?: number + minCharactersPerChunk?: number +}): { chunkSize: number; chunkOverlap: number; minCharactersPerChunk: number } { + const chunkSize = options.chunkSize ?? 1024 + const maxOverlap = Math.floor(chunkSize * 0.5) + return { + chunkSize, + chunkOverlap: Math.min(options.chunkOverlap ?? 0, maxOverlap), + minCharactersPerChunk: options.minCharactersPerChunk ?? 100, + } +} diff --git a/apps/sim/lib/file-parsers/index.ts b/apps/sim/lib/file-parsers/index.ts index a69a8abdf26..28080e54667 100644 --- a/apps/sim/lib/file-parsers/index.ts +++ b/apps/sim/lib/file-parsers/index.ts @@ -86,12 +86,21 @@ function getParserInstances(): Record { } try { - const { parseJSON, parseJSONBuffer } = require('@/lib/file-parsers/json-parser') + const { + parseJSON, + parseJSONBuffer, + parseJSONL, + parseJSONLBuffer, + } = require('@/lib/file-parsers/json-parser') parserInstances.json = { parseFile: parseJSON, parseBuffer: parseJSONBuffer, } - logger.info('Loaded JSON parser') + parserInstances.jsonl = { + parseFile: parseJSONL, + parseBuffer: parseJSONLBuffer, + } + logger.info('Loaded JSON/JSONL parser') } catch (error) { logger.error('Failed to load JSON parser:', error) } diff --git a/apps/sim/lib/file-parsers/json-parser.ts b/apps/sim/lib/file-parsers/json-parser.ts index 15881131501..ac239fb6e71 100644 --- a/apps/sim/lib/file-parsers/json-parser.ts +++ b/apps/sim/lib/file-parsers/json-parser.ts @@ -59,6 +59,49 @@ export 
async function parseJSONBuffer(buffer: Buffer): Promise<FileParseResult> } } +/** + * Parse JSONL (JSON Lines) files — one JSON object per line + */ +export async function parseJSONL(filePath: string): Promise<FileParseResult> { + const fs = await import('fs/promises') + const content = await fs.readFile(filePath, 'utf-8') + return parseJSONLContent(content) +} + +/** + * Parse JSONL from buffer + */ +export async function parseJSONLBuffer(buffer: Buffer): Promise<FileParseResult> { + const content = buffer.toString('utf-8') + return parseJSONLContent(content) +} + +function parseJSONLContent(content: string): FileParseResult { + const lines = content.split('\n').filter((line) => line.trim()) + const items: unknown[] = [] + + for (const line of lines) { + try { + items.push(JSON.parse(line)) + } catch { + throw new Error(`Invalid JSONL: failed to parse line: ${line.slice(0, 100)}`) + } + } + + const formattedContent = JSON.stringify(items, null, 2) + + return { + content: formattedContent, + metadata: { + type: 'json', + isArray: true, + keys: [], + itemCount: items.length, + depth: items.length > 0 ?
1 + getJsonDepth(items[0]) : 1, + }, + } +} + /** * Calculate the depth of a JSON object */ diff --git a/apps/sim/lib/knowledge/documents/document-processor.ts b/apps/sim/lib/knowledge/documents/document-processor.ts index 5ca6de84c9e..2d652e9a11a 100644 --- a/apps/sim/lib/knowledge/documents/document-processor.ts +++ b/apps/sim/lib/knowledge/documents/document-processor.ts @@ -1,7 +1,17 @@ import { createLogger } from '@sim/logger' import { PDFDocument } from 'pdf-lib' import { getBYOKKey } from '@/lib/api-key/byok' -import { type Chunk, JsonYamlChunker, StructuredDataChunker, TextChunker } from '@/lib/chunkers' +import { + type Chunk, + JsonYamlChunker, + RecursiveChunker, + RegexChunker, + SentenceChunker, + StructuredDataChunker, + TextChunker, + TokenChunker, +} from '@/lib/chunkers' +import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types' import { env } from '@/lib/core/config/env' import { parseBuffer, parseFile } from '@/lib/file-parsers' import type { FileParseMetadata } from '@/lib/file-parsers/types' @@ -44,9 +54,6 @@ type OCRRequestBody = { const MISTRAL_MAX_PAGES = 1000 -/** - * Get page count from a PDF buffer using unpdf - */ async function getPdfPageCount(buffer: Buffer): Promise { try { const { getDocumentProxy } = await import('unpdf') @@ -59,10 +66,6 @@ async function getPdfPageCount(buffer: Buffer): Promise { } } -/** - * Split a PDF buffer into multiple smaller PDFs - * Returns an array of PDF buffers, each with at most maxPages pages - */ async function splitPdfIntoChunks( pdfBuffer: Buffer, maxPages: number @@ -112,6 +115,54 @@ class APIError extends Error { } } +async function applyStrategy( + strategy: ChunkingStrategy, + content: string, + chunkSize: number, + chunkOverlap: number, + minCharactersPerChunk: number, + strategyOptions?: StrategyOptions +): Promise { + const baseOptions = { chunkSize, chunkOverlap, minCharactersPerChunk } + + switch (strategy) { + case 'token': { + const chunker = new 
TokenChunker(baseOptions) + return chunker.chunk(content) + } + case 'sentence': { + const chunker = new SentenceChunker(baseOptions) + return chunker.chunk(content) + } + case 'recursive': { + const chunker = new RecursiveChunker({ + ...baseOptions, + separators: strategyOptions?.separators, + recipe: strategyOptions?.recipe, + }) + return chunker.chunk(content) + } + case 'regex': { + if (!strategyOptions?.pattern) { + logger.warn( + 'Regex strategy requested but no pattern provided, falling back to text chunker' + ) + const chunker = new TextChunker(baseOptions) + return chunker.chunk(content) + } + const chunker = new RegexChunker({ + ...baseOptions, + pattern: strategyOptions.pattern, + }) + return chunker.chunk(content) + } + default: { + const chunker = new TextChunker(baseOptions) + return chunker.chunk(content) + } + } +} + export async function processDocument( fileUrl: string, filename: string, @@ -120,7 +171,9 @@ export async function processDocument( chunkOverlap = 200, minCharactersPerChunk = 100, userId?: string, - workspaceId?: string | null + workspaceId?: string | null, + strategy?: ChunkingStrategy, + strategyOptions?: StrategyOptions ): Promise<{ chunks: Chunk[] metadata: { @@ -144,30 +197,42 @@ export async function processDocument( let chunks: Chunk[] const metadata: FileParseMetadata = parseResult.metadata ?? 
{} - const isJsonYaml = - metadata.type === 'json' || - metadata.type === 'yaml' || - mimeType.includes('json') || - mimeType.includes('yaml') - - if (isJsonYaml && JsonYamlChunker.isStructuredData(content)) { - logger.info('Using JSON/YAML chunker for structured data') - chunks = await JsonYamlChunker.chunkJsonYaml(content, { + if (strategy && strategy !== 'auto') { + logger.info(`Using explicit chunking strategy: ${strategy}`) + chunks = await applyStrategy( + strategy, + content, chunkSize, + chunkOverlap, minCharactersPerChunk, - }) - } else if (StructuredDataChunker.isStructuredData(content, mimeType)) { - logger.info('Using structured data chunker for spreadsheet/CSV content') - const rowCount = metadata.totalRows ?? metadata.rowCount - chunks = await StructuredDataChunker.chunkStructuredData(content, { - chunkSize, - headers: metadata.headers, - totalRows: typeof rowCount === 'number' ? rowCount : undefined, - sheetName: metadata.sheetNames?.[0], - }) + strategyOptions + ) } else { - const chunker = new TextChunker({ chunkSize, chunkOverlap, minCharactersPerChunk }) - chunks = await chunker.chunk(content) + const isJsonYaml = + metadata.type === 'json' || + metadata.type === 'yaml' || + mimeType.includes('json') || + mimeType.includes('yaml') + + if (isJsonYaml && JsonYamlChunker.isStructuredData(content)) { + logger.info('Using JSON/YAML chunker for structured data') + chunks = await JsonYamlChunker.chunkJsonYaml(content, { + chunkSize, + minCharactersPerChunk, + }) + } else if (StructuredDataChunker.isStructuredData(content, mimeType)) { + logger.info('Using structured data chunker for spreadsheet/CSV content') + const rowCount = metadata.totalRows ?? metadata.rowCount + chunks = await StructuredDataChunker.chunkStructuredData(content, { + chunkSize, + headers: metadata.headers, + totalRows: typeof rowCount === 'number' ? 
rowCount : undefined, + sheetName: metadata.sheetNames?.[0], + }) + } else { + const chunker = new TextChunker({ chunkSize, chunkOverlap, minCharactersPerChunk }) + chunks = await chunker.chunk(content) + } } const characterCount = content.length @@ -565,9 +630,6 @@ async function executeMistralOCRRequest( ) } -/** - * Process a single PDF chunk: upload to S3, OCR, cleanup - */ async function processChunk( chunk: { buffer: Buffer; startPage: number; endPage: number }, chunkIndex: number, @@ -585,7 +647,6 @@ async function processChunk( let uploadedKey: string | null = null try { - // Upload the chunk to S3 const timestamp = Date.now() const uniqueId = Math.random().toString(36).substring(2, 9) const safeFileName = filename.replace(/[^a-zA-Z0-9.-]/g, '_') @@ -617,7 +678,6 @@ async function processChunk( logger.info(`Uploaded chunk ${chunkIndex + 1} to S3: ${chunkKey}`) - // Process the chunk with Mistral OCR const params = { filePath: chunkUrl, apiKey, @@ -639,7 +699,6 @@ async function processChunk( }) return { index: chunkIndex, content: null } } finally { - // Clean up the chunk file from S3 after processing if (uploadedKey) { try { await StorageService.deleteFile({ key: uploadedKey, context: 'knowledge-base' }) @@ -674,7 +733,6 @@ async function processMistralOCRInBatches( `Split into ${pdfChunks.length} chunks, processing with concurrency ${MAX_CONCURRENT_CHUNKS}` ) - // Process chunks concurrently with limited concurrency const results: { index: number; content: string | null }[] = [] for (let i = 0; i < pdfChunks.length; i += MAX_CONCURRENT_CHUNKS) { @@ -693,15 +751,12 @@ async function processMistralOCRInBatches( ) } - // Sort by index to maintain page order and filter out nulls const sortedResults = results .sort((a, b) => a.index - b.index) .filter((r) => r.content !== null) .map((r) => r.content as string) if (sortedResults.length === 0) { - // Don't fall back to file parser for large PDFs - it produces poor results - // Better to fail clearly than return 
low-quality extraction throw new Error( `OCR failed for all ${pdfChunks.length} chunks of ${filename}. ` + `Large PDFs require OCR - file parser fallback would produce poor results.` diff --git a/apps/sim/lib/knowledge/documents/service.ts b/apps/sim/lib/knowledge/documents/service.ts index ff613b4e8cd..c37aa22a53d 100644 --- a/apps/sim/lib/knowledge/documents/service.ts +++ b/apps/sim/lib/knowledge/documents/service.ts @@ -27,6 +27,7 @@ import { } from 'drizzle-orm' import { recordUsage } from '@/lib/billing/core/usage-log' import { checkAndBillOverageThreshold } from '@/lib/billing/threshold-billing' +import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types' import { createBullMQJobData, isBullMQEnabled } from '@/lib/core/bullmq' import { env } from '@/lib/core/config/env' import { getCostMultiplier, isTriggerDevEnabled } from '@/lib/core/config/feature-flags' @@ -51,10 +52,9 @@ import { calculateCost } from '@/providers/utils' const logger = createLogger('DocumentService') const TIMEOUTS = { - OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, // Default 10 minutes for KB document processing + OVERALL_PROCESSING: (env.KB_CONFIG_MAX_DURATION || 600) * 1000, } as const -// Configuration for handling large documents const LARGE_DOC_CONFIG = { MAX_CHUNKS_PER_BATCH: 500, MAX_EMBEDDING_BATCH: env.KB_CONFIG_BATCH_SIZE || 2000, @@ -62,9 +62,6 @@ MAX_CHUNKS_PER_DOCUMENT: 100000, } -/** - * Create a timeout wrapper for async operations - */ function withTimeout<T>( promise: Promise<T>, timeoutMs: number, @@ -173,10 +170,6 @@ export interface DocumentTagData { value: string } -/** - * Process structured document tags and validate them against existing definitions - * Throws an error if a tag doesn't exist or if the value doesn't match the expected type - */ export async function processDocumentTags( knowledgeBaseId: string, tagData: DocumentTagData[], @@ -354,9 +347,6 @@ return
result } -/** - * Process documents with the configured background execution backend. - */ export async function processDocumentsWithQueue( createdDocuments: DocumentData[], knowledgeBaseId: string, @@ -407,9 +397,6 @@ export async function processDocumentsWithQueue( return } -/** - * Process a document asynchronously with full error handling - */ export async function processDocumentAsync( knowledgeBaseId: string, documentId: string, @@ -457,6 +444,8 @@ export async function processDocumentAsync( maxSize?: number minSize?: number overlap?: number + strategy?: ChunkingStrategy + strategyOptions?: StrategyOptions } | null const kbConfig = { maxSize: rawConfig?.maxSize ?? 1024, @@ -478,7 +467,9 @@ export async function processDocumentAsync( kbConfig.overlap, kbConfig.minSize, kb[0].userId, - kb[0].workspaceId + kb[0].workspaceId, + rawConfig?.strategy, + rawConfig?.strategyOptions ) if (processed.chunks.length > LARGE_DOC_CONFIG.MAX_CHUNKS_PER_DOCUMENT) { @@ -529,7 +520,6 @@ export async function processDocumentAsync( const documentRecord = await db .select({ - // Text tags (7 slots) tag1: document.tag1, tag2: document.tag2, tag3: document.tag3, @@ -537,16 +527,13 @@ export async function processDocumentAsync( tag5: document.tag5, tag6: document.tag6, tag7: document.tag7, - // Number tags (5 slots) number1: document.number1, number2: document.number2, number3: document.number3, number4: document.number4, number5: document.number5, - // Date tags (2 slots) date1: document.date1, date2: document.date2, - // Boolean tags (3 slots) boolean1: document.boolean1, boolean2: document.boolean2, boolean3: document.boolean3, @@ -578,7 +565,6 @@ export async function processDocumentAsync( embeddingModel: 'text-embedding-3-small', startOffset: chunk.metadata.startIndex, endOffset: chunk.metadata.endIndex, - // Copy text tags from document (7 slots) tag1: documentTags.tag1, tag2: documentTags.tag2, tag3: documentTags.tag3, @@ -586,16 +572,13 @@ export async function 
processDocumentAsync( tag5: documentTags.tag5, tag6: documentTags.tag6, tag7: documentTags.tag7, - // Copy number tags from document (5 slots) number1: documentTags.number1, number2: documentTags.number2, number3: documentTags.number3, number4: documentTags.number4, number5: documentTags.number5, - // Copy date tags from document (2 slots) date1: documentTags.date1, date2: documentTags.date2, - // Copy boolean tags from document (3 slots) boolean1: documentTags.boolean1, boolean2: documentTags.boolean2, boolean3: documentTags.boolean3, @@ -719,16 +702,10 @@ export async function processDocumentAsync( } } -/** - * Check if Trigger.dev is available and configured - */ export function isTriggerAvailable(): boolean { return Boolean(env.TRIGGER_SECRET_KEY) && isTriggerDevEnabled } -/** - * Process documents using Trigger.dev - */ export async function processDocumentsWithTrigger( documents: DocumentProcessingPayload[], requestId: string @@ -777,9 +754,6 @@ export async function processDocumentsWithTrigger( } } -/** - * Create document records in database with tags - */ export async function createDocumentRecords( documents: Array<{ filename: string @@ -848,7 +822,6 @@ export async function createDocumentRecords( processingStatus: 'pending' as const, enabled: true, uploadedAt: now, - // Text tags - use processed tags if available, otherwise fall back to individual tag fields tag1: processedTags.tag1 ?? docData.tag1 ?? null, tag2: processedTags.tag2 ?? docData.tag2 ?? null, tag3: processedTags.tag3 ?? docData.tag3 ?? null, @@ -856,16 +829,13 @@ export async function createDocumentRecords( tag5: processedTags.tag5 ?? docData.tag5 ?? null, tag6: processedTags.tag6 ?? docData.tag6 ?? null, tag7: processedTags.tag7 ?? docData.tag7 ?? null, - // Number tags (5 slots) number1: processedTags.number1 ?? null, number2: processedTags.number2 ?? null, number3: processedTags.number3 ?? null, number4: processedTags.number4 ?? null, number5: processedTags.number5 ?? 
null, - // Date tags (2 slots) date1: processedTags.date1 ?? null, date2: processedTags.date2 ?? null, - // Boolean tags (3 slots) boolean1: processedTags.boolean1 ?? null, boolean2: processedTags.boolean2 ?? null, boolean3: processedTags.boolean3 ?? null, @@ -897,9 +867,6 @@ export async function createDocumentRecords( }) } -/** - * A single tag filter condition passed from the API layer. - */ export interface TagFilterCondition { tagSlot: string fieldType: 'text' | 'number' | 'date' | 'boolean' @@ -908,9 +875,6 @@ export interface TagFilterCondition { valueTo?: string } -/** - * Builds a Drizzle SQL condition from a tag filter. - */ const ALLOWED_TAG_SLOTS = new Set([ 'tag1', 'tag2', @@ -1039,9 +1003,6 @@ function buildTagFilterCondition(filter: TagFilterCondition): SQL | undefined { return undefined } -/** - * Get documents for a knowledge base with filtering and pagination - */ export async function getDocuments( knowledgeBaseId: string, options: { @@ -1070,7 +1031,6 @@ export async function getDocuments( processingError: string | null enabled: boolean uploadedAt: Date - // Text tags tag1: string | null tag2: string | null tag3: string | null @@ -1078,20 +1038,16 @@ export async function getDocuments( tag5: string | null tag6: string | null tag7: string | null - // Number tags number1: number | null number2: number | null number3: number | null number4: number | null number5: number | null - // Date tags date1: Date | null date2: Date | null - // Boolean tags boolean1: boolean | null boolean2: boolean | null boolean3: boolean | null - // Connector fields connectorId: string | null connectorType: string | null sourceUrl: string | null @@ -1188,7 +1144,6 @@ export async function getDocuments( processingError: document.processingError, enabled: document.enabled, uploadedAt: document.uploadedAt, - // Text tags (7 slots) tag1: document.tag1, tag2: document.tag2, tag3: document.tag3, @@ -1196,20 +1151,16 @@ export async function getDocuments( tag5: document.tag5, 
tag6: document.tag6, tag7: document.tag7, - // Number tags (5 slots) number1: document.number1, number2: document.number2, number3: document.number3, number4: document.number4, number5: document.number5, - // Date tags (2 slots) date1: document.date1, date2: document.date2, - // Boolean tags (3 slots) boolean1: document.boolean1, boolean2: document.boolean2, boolean3: document.boolean3, - // Connector fields connectorId: document.connectorId, connectorType: knowledgeConnector.connectorType, sourceUrl: document.sourceUrl, @@ -1241,7 +1192,6 @@ export async function getDocuments( processingError: doc.processingError, enabled: doc.enabled, uploadedAt: doc.uploadedAt, - // Text tags tag1: doc.tag1, tag2: doc.tag2, tag3: doc.tag3, @@ -1249,20 +1199,16 @@ export async function getDocuments( tag5: doc.tag5, tag6: doc.tag6, tag7: doc.tag7, - // Number tags number1: doc.number1, number2: doc.number2, number3: doc.number3, number4: doc.number4, number5: doc.number5, - // Date tags date1: doc.date1, date2: doc.date2, - // Boolean tags boolean1: doc.boolean1, boolean2: doc.boolean2, boolean3: doc.boolean3, - // Connector fields connectorId: doc.connectorId, connectorType: doc.connectorType ?? null, sourceUrl: doc.sourceUrl, @@ -1276,9 +1222,6 @@ export async function getDocuments( } } -/** - * Create a single document record - */ export async function createSingleDocument( documentData: { filename: string @@ -1320,7 +1263,6 @@ export async function createSingleDocument( const now = new Date() let processedTags: ProcessedDocumentTags = { - // Text tags (7 slots) tag1: documentData.tag1 ?? null, tag2: documentData.tag2 ?? null, tag3: documentData.tag3 ?? null, @@ -1328,16 +1270,13 @@ export async function createSingleDocument( tag5: documentData.tag5 ?? null, tag6: documentData.tag6 ?? null, tag7: documentData.tag7 ?? 
null, - // Number tags (5 slots) number1: null, number2: null, number3: null, number4: null, number5: null, - // Date tags (2 slots) date1: null, date2: null, - // Boolean tags (3 slots) boolean1: null, boolean2: null, boolean3: null, @@ -1417,9 +1356,6 @@ export async function createSingleDocument( } } -/** - * Perform bulk operations on documents - */ export async function bulkDocumentOperation( knowledgeBaseId: string, operation: 'enable' | 'disable' | 'delete', @@ -1509,9 +1445,6 @@ export async function bulkDocumentOperation( } } -/** - * Perform bulk operations on all documents matching a filter - */ export async function bulkDocumentOperationByFilter( knowledgeBaseId: string, operation: 'enable' | 'disable' | 'delete', @@ -1583,9 +1516,6 @@ export async function bulkDocumentOperationByFilter( } } -/** - * Mark a document as failed due to timeout - */ export async function markDocumentAsFailedTimeout( documentId: string, processingStartedAt: Date, @@ -1618,9 +1548,6 @@ export async function markDocumentAsFailedTimeout( } } -/** - * Retry processing a failed document - */ export async function retryDocumentProcessing( knowledgeBaseId: string, documentId: string, @@ -1673,9 +1600,6 @@ export async function retryDocumentProcessing( } } -/** - * Update a document with specified fields - */ export async function updateDocument( documentId: string, updateData: { @@ -1686,7 +1610,6 @@ export async function updateDocument( characterCount?: number processingStatus?: 'pending' | 'processing' | 'completed' | 'failed' processingError?: string - // Text tags tag1?: string tag2?: string tag3?: string @@ -1694,16 +1617,13 @@ export async function updateDocument( tag5?: string tag6?: string tag7?: string - // Number tags number1?: string number2?: string number3?: string number4?: string number5?: string - // Date tags date1?: string date2?: string - // Boolean tags boolean1?: string boolean2?: string boolean3?: string @@ -1772,7 +1692,6 @@ export async function 
updateDocument( boolean2: boolean | null boolean3: boolean | null }> = {} - // All tag slots across all field types const ALL_TAG_SLOTS = [ 'tag1', 'tag2', @@ -1794,7 +1713,6 @@ export async function updateDocument( ] as const type TagSlot = (typeof ALL_TAG_SLOTS)[number] - // Regular field updates if (updateData.filename !== undefined) dbUpdateData.filename = updateData.filename if (updateData.enabled !== undefined) dbUpdateData.enabled = updateData.enabled if (updateData.chunkCount !== undefined) dbUpdateData.chunkCount = updateData.chunkCount @@ -1812,26 +1730,21 @@ export async function updateDocument( ): string | number | Date | boolean | null => { if (value === undefined || value === '') return null - // Number slots if (slot.startsWith('number')) { return parseNumberValue(value) } - // Date slots if (slot.startsWith('date')) { return parseDateValue(value) } - // Boolean slots if (slot.startsWith('boolean')) { return parseBooleanValue(value) ?? false } - // Text slots: keep as string return value || null } - // Type-safe access to tag slots in updateData type UpdateDataWithTags = typeof updateData & Record const typedUpdateData = updateData as UpdateDataWithTags @@ -2044,9 +1957,6 @@ export async function hardDeleteDocuments( return existingIds.length } -/** - * Hard delete a document. 
- */ export async function deleteDocument( documentId: string, requestId: string diff --git a/apps/sim/lib/knowledge/types.ts b/apps/sim/lib/knowledge/types.ts index b761597c790..6fe1a8bbaff 100644 --- a/apps/sim/lib/knowledge/types.ts +++ b/apps/sim/lib/knowledge/types.ts @@ -1,18 +1,16 @@ +import type { ChunkingStrategy, StrategyOptions } from '@/lib/chunkers/types' + /** - * Configuration for document chunking in knowledge bases - * * Units: - * - maxSize: Maximum chunk size in TOKENS (1 token ≈ 4 characters) - * - minSize: Minimum chunk size in CHARACTERS (floor to avoid tiny fragments) - * - overlap: Overlap between chunks in TOKENS (1 token ≈ 4 characters) + * - maxSize/overlap: TOKENS (1 token ≈ 4 characters) + * - minSize: CHARACTERS */ export interface ChunkingConfig { - /** Maximum chunk size in tokens (default: 1024, range: 100-4000) */ maxSize: number - /** Minimum chunk size in characters (default: 100, range: 1-2000) */ minSize: number - /** Overlap between chunks in tokens (default: 200, range: 0-500) */ overlap: number + strategy?: ChunkingStrategy + strategyOptions?: StrategyOptions } export interface KnowledgeBaseWithCounts { @@ -63,19 +61,16 @@ export interface UpdateTagDefinitionData { fieldType?: string } -/** Tag filter for knowledge base search */ export interface StructuredFilter { - tagName?: string // Human-readable name (input from frontend) - tagSlot: string // Database column (resolved from tagName) + tagName?: string + tagSlot: string fieldType: string operator: string value: string | number | boolean valueTo?: string | number } -/** Processed document tags ready for database storage */ export interface ProcessedDocumentTags { - // Text tags tag1: string | null tag2: string | null tag3: string | null @@ -83,39 +78,29 @@ export interface ProcessedDocumentTags { tag5: string | null tag6: string | null tag7: string | null - // Number tags number1: number | null number2: number | null number3: number | null number4: number | null number5: 
number | null - // Date tags date1: Date | null date2: Date | null - // Boolean tags boolean1: boolean | null boolean2: boolean | null boolean3: boolean | null - // Index signature for dynamic access [key: string]: string | number | Date | boolean | null } -/** - * Frontend/API Types - * These types use string dates for JSON serialization - */ +/** These types use string dates for JSON serialization */ -/** Extended chunking config with optional fields */ export interface ExtendedChunkingConfig extends ChunkingConfig { chunkSize?: number minCharactersPerChunk?: number recipe?: string lang?: string - strategy?: 'recursive' | 'semantic' | 'sentence' | 'paragraph' [key: string]: unknown } -/** Knowledge base data for API responses */ export interface KnowledgeBaseData { id: string userId: string @@ -132,7 +117,6 @@ export interface KnowledgeBaseData { connectorTypes?: string[] } -/** Document data for API responses */ export interface DocumentData { id: string knowledgeBaseId: string @@ -171,7 +155,6 @@ export interface DocumentData { sourceUrl?: string | null } -/** Chunk data for API responses */ export interface ChunkData { id: string chunkIndex: number @@ -202,7 +185,6 @@ export interface ChunkData { updatedAt: string } -/** Pagination info for chunks */ export interface ChunksPagination { total: number limit: number @@ -210,7 +192,6 @@ export interface ChunksPagination { hasMore: boolean } -/** Pagination info for documents */ export interface DocumentsPagination { total: number limit: number diff --git a/apps/sim/lib/uploads/utils/file-utils.ts b/apps/sim/lib/uploads/utils/file-utils.ts index 007014f5f42..95dd217c297 100644 --- a/apps/sim/lib/uploads/utils/file-utils.ts +++ b/apps/sim/lib/uploads/utils/file-utils.ts @@ -366,7 +366,7 @@ export function validateKnowledgeBaseFile( return null } - return `File "${file.name}" has an unsupported format. 
Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML, or YML files.` + return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, JSONL, YAML, or YML files.` } /** diff --git a/apps/sim/lib/uploads/utils/validation.ts b/apps/sim/lib/uploads/utils/validation.ts index 3752e421d79..10ce9364bec 100644 --- a/apps/sim/lib/uploads/utils/validation.ts +++ b/apps/sim/lib/uploads/utils/validation.ts @@ -28,6 +28,7 @@ export const SUPPORTED_DOCUMENT_EXTENSIONS = [ 'html', 'htm', 'json', + 'jsonl', 'yaml', 'yml', ] as const @@ -135,6 +136,7 @@ export const SUPPORTED_MIME_TYPES: Record html: ['text/html', 'application/xhtml+xml'], htm: ['text/html', 'application/xhtml+xml'], json: ['application/json', 'text/json', 'application/x-json'], + jsonl: ['application/jsonl', 'application/x-jsonlines', 'text/jsonl', 'application/octet-stream'], yaml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'], yml: ['text/yaml', 'text/x-yaml', 'application/yaml', 'application/x-yaml'], }