import { createHash } from 'node:crypto';
import type { Database } from 'bun:sqlite';
import type {
  ResearchJournalEntry,
  SearchAnswerResponse,
  SearchCitation,
  SearchResult,
  SearchSource,
  TaskStageContext
} from '@/lib/types';
import { runAiAnalysis, runAiEmbeddings } from '@/lib/server/ai';
import { __dbInternals, getSqliteClient } from '@/lib/server/db';
import { fetchPrimaryFilingText } from '@/lib/server/sec';
import { getFilingByAccession, listFilingsRecords } from '@/lib/server/repos/filings';
import {
  getResearchJournalEntryRecord,
  listResearchJournalEntries,
  listResearchJournalEntriesForUser
} from '@/lib/server/repos/research-journal';

/**
 * Hybrid (vector + FTS5 lexical) search over three materialized source kinds:
 * primary filing documents, AI filing briefs, and per-user research notes.
 * Indexing chunks each document, embeds the chunks, and persists them into
 * `search_document` / `search_chunk` / `search_chunk_fts` / `search_chunk_vec`.
 * Query-time results from both retrievers are fused with Reciprocal Rank Fusion.
 */

type SearchDocumentScope = 'global' | 'user';
type SearchDocumentSourceKind = 'filing_document' | 'filing_brief' | 'research_note';

/** A source record flattened into plain text, ready for chunking/embedding. */
type MaterializedSearchDocument = {
  sourceKind: SearchDocumentSourceKind;
  sourceRef: string;
  scope: SearchDocumentScope;
  userId: string | null;
  ticker: string | null;
  accessionNumber: string | null;
  filingDate: string | null;
  title: string | null;
  contentText: string;
  metadata: Record<string, unknown>;
};

type SearchChunkRecord = {
  chunkIndex: number;
  chunkText: string;
  charCount: number;
  startOffset: number;
  endOffset: number;
  headingPath: string | null;
  citationLabel: string;
};

type SearchDocumentRow = {
  id: number;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  scope: SearchDocumentScope;
  user_id: string | null;
  ticker: string | null;
  accession_number: string | null;
  title: string | null;
  content_text: string;
  content_hash: string;
  metadata: Record<string, unknown> | null;
  index_status: 'pending' | 'indexed' | 'failed';
  indexed_at: string | null;
  last_error: string | null;
  created_at: string;
  updated_at: string;
};

type SearchChunkJoinRow = {
  chunk_id: number;
  document_id: number;
  chunk_text: string;
  heading_path: string | null;
  citation_label: string;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  title: string | null;
  ticker: string | null;
  accession_number: string | null;
  metadata: Record<string, unknown> | null;
};

type DeleteSourceRef = {
  sourceKind: SearchDocumentSourceKind;
  sourceRef: string;
  scope: SearchDocumentScope;
  userId?: string | null;
};

type IndexSearchDocumentsInput = {
  userId: string;
  ticker?: string | null;
  accessionNumber?: string | null;
  journalEntryId?: number | null;
  sourceKinds?: SearchDocumentSourceKind[];
  deleteSourceRefs?: DeleteSourceRef[];
  onStage?: (
    stage: 'collect' | 'fetch' | 'chunk' | 'embed' | 'persist',
    detail: string,
    context?: TaskStageContext | null
  ) => Promise<void> | void;
};

type SearchInput = {
  userId: string;
  query: string;
  ticker?: string | null;
  sources?: SearchSource[];
  limit?: number;
};

/** Ranks a chunk earned in each retriever; null means it did not surface there. */
type SearchMatch = {
  chunkId: number;
  vectorRank: number | null;
  lexicalRank: number | null;
  snippet: string | null;
};

// Reciprocal Rank Fusion constant (standard value from the RRF literature).
const RRF_K = 60;
const SEARCH_RESULT_LIMIT_DEFAULT = 10;
const SEARCH_RESULT_LIMIT_MIN = 4;
const SEARCH_RESULT_LIMIT_MAX = 12;
const MAX_RESULTS_PER_DOCUMENT = 2;
const MAX_CONTEXT_RESULTS = 6;
const MAX_CONTEXT_CHARS = 8_000;

const SOURCE_KIND_BY_SEARCH_SOURCE: Record<SearchSource, SearchDocumentSourceKind> = {
  documents: 'filing_document',
  filings: 'filing_brief',
  research: 'research_note'
};

// NOTE(review): currently unused in this module; kept for callers/tests elsewhere.
function escapeLike(value: string) {
  return value.replace(/[%_]/g, (match) => `\\${match}`);
}

function normalizeTicker(value: string | null | undefined) {
  const normalized = value?.trim().toUpperCase() ?? '';
  return normalized.length > 0 ? normalized : null;
}

/** Validates and dedupes the requested sources; defaults to all three. */
function normalizeSearchSources(sources?: SearchSource[]) {
  const normalized = new Set<SearchSource>();
  for (const source of sources ?? ['documents', 'filings', 'research']) {
    if (source === 'documents' || source === 'filings' || source === 'research') {
      normalized.add(source);
    }
  }
  return normalized.size > 0 ? [...normalized] : ['documents', 'filings', 'research'] as SearchSource[];
}

/** Clamps the result limit into [SEARCH_RESULT_LIMIT_MIN, SEARCH_RESULT_LIMIT_MAX]. */
function clampLimit(limit?: number) {
  const value = Number.isFinite(limit) ? Number(limit) : SEARCH_RESULT_LIMIT_DEFAULT;
  const intValue = Math.trunc(value);
  return Math.min(Math.max(intValue, SEARCH_RESULT_LIMIT_MIN), SEARCH_RESULT_LIMIT_MAX);
}

function hashContent(content: string) {
  return createHash('sha256').update(content).digest('hex');
}

/** Strips common Markdown syntax, leaving plain prose for indexing. */
function stripMarkdown(markdown: string) {
  return markdown
    .replace(/```[\s\S]*?```/g, ' ')
    .replace(/`([^`]+)`/g, '$1')
    .replace(/!\[[^\]]*\]\([^)]+\)/g, ' ')
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    .replace(/^>\s?/gm, '')
    .replace(/^#{1,6}\s+/gm, '')
    .replace(/[*_~]/g, '')
    .replace(/^\s*[-+]\s+/gm, '')
    .replace(/\r/g, '\n')
    .replace(/[ \t]+\n/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/** Collapses runs of spaces/blank lines so chunk offsets are stable. */
function normalizeWhitespace(value: string) {
  return value
    .replace(/\r/g, '\n')
    .replace(/[ \t]+\n/g, '\n')
    .replace(/\n[ \t]+/g, '\n')
    .replace(/[ \t]{2,}/g, ' ')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/** Renders a compact, searchable text brief for a filing (metrics + AI summary). */
function buildFilingBriefContent(input: {
  ticker: string;
  companyName: string;
  accessionNumber: string;
  filingDate: string;
  filingType: string;
  metrics: Record<string, unknown> | null;
  analysis: Record<string, unknown> | null;
}) {
  const extraction = input.analysis?.extraction;
  const extractionLines = extraction && typeof extraction === 'object'
    ? Object.entries(extraction as Record<string, unknown>)
        .map(([key, value]) => `${key}: ${Array.isArray(value) ? value.join(' | ') : String(value ?? '')}`)
        .filter((line) => !line.endsWith(': '))
    : [];
  // Prefer the current analysis text; fall back to the legacy field.
  const reportText = typeof input.analysis?.text === 'string'
    ? input.analysis.text
    : typeof input.analysis?.legacyInsights === 'string'
      ? input.analysis.legacyInsights
      : null;
  return normalizeWhitespace([
    `${input.companyName} (${input.ticker}) filing brief`,
    `Form: ${input.filingType}`,
    `Filed: ${input.filingDate}`,
    `Accession: ${input.accessionNumber}`,
    input.metrics ? `Key metrics: ${JSON.stringify(input.metrics)}` : null,
    reportText ? `AI summary:\n${reportText}` : null,
    extractionLines.length > 0 ? `Structured extraction:\n${extractionLines.join('\n')}` : null
  ].filter((entry): entry is string => Boolean(entry)).join('\n\n'));
}

/** Human-readable citation label, e.g. "AAPL · 0000320193-24-000001 · 2024-01-01 [3]". */
function buildCitationLabel(document: MaterializedSearchDocument, chunkIndex: number) {
  if (document.sourceKind === 'research_note') {
    return `${document.ticker ?? 'Research'} journal note [${chunkIndex + 1}]`;
  }
  const parts = [
    document.ticker,
    document.accessionNumber,
    document.filingDate
  ].filter((entry): entry is string => Boolean(entry));
  return `${parts.join(' · ')} [${chunkIndex + 1}]`;
}

/**
 * Heuristically finds the nearest preceding heading-like line within a 600-char
 * window before `offset`. Returns null when nothing looks like a heading.
 */
function inferHeadingPath(text: string, offset: number) {
  const windowStart = Math.max(0, offset - 600);
  const context = text.slice(windowStart, offset);
  const lines = context
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
  for (let index = lines.length - 1; index >= 0; index -= 1) {
    const candidate = lines[index]!;
    const looksLikeHeading = candidate.length <= 100 && (
      /:$/.test(candidate) ||
      /^[A-Z0-9][A-Z0-9\s/&,-]{4,}$/.test(candidate) ||
      /^\d+(\.\d+)*\s+[A-Z]/.test(candidate)
    );
    if (looksLikeHeading) {
      return candidate;
    }
  }
  return null;
}

/**
 * Splits text into overlapping chunks around `targetChars`, preferring
 * paragraph/sentence boundaries found within a 180-char lookahead window.
 * When `maxSingleChunk` is set and the text fits, returns a single chunk.
 */
function chunkText(
  text: string,
  options: { targetChars: number; overlapChars: number; maxSingleChunk?: number },
  document: MaterializedSearchDocument
) {
  const normalized = normalizeWhitespace(text);
  if (!normalized) {
    return [] satisfies SearchChunkRecord[];
  }
  const maxSingleChunk = options.maxSingleChunk ?? 0;
  if (maxSingleChunk > 0 && normalized.length <= maxSingleChunk) {
    return [{
      chunkIndex: 0,
      chunkText: normalized,
      charCount: normalized.length,
      startOffset: 0,
      endOffset: normalized.length,
      headingPath: inferHeadingPath(normalized, 0),
      citationLabel: buildCitationLabel(document, 0)
    }];
  }
  const chunks: SearchChunkRecord[] = [];
  let start = 0;
  let chunkIndex = 0;
  while (start < normalized.length) {
    const tentativeEnd = Math.min(start + options.targetChars, normalized.length);
    let end = tentativeEnd;
    if (tentativeEnd < normalized.length) {
      const localWindow = normalized.slice(start, Math.min(normalized.length, tentativeEnd + 180));
      const paragraphBreak = localWindow.lastIndexOf('\n\n');
      const sentenceBreak = Math.max(localWindow.lastIndexOf('. '), localWindow.lastIndexOf('\n'));
      const boundary = Math.max(paragraphBreak, sentenceBreak);
      // Only snap to a boundary if it keeps the chunk reasonably full (>= 55%).
      if (boundary >= options.targetChars * 0.55) {
        end = start + boundary + (paragraphBreak === boundary ? 0 : 1);
      }
    }
    const chunkTextValue = normalized.slice(start, end).trim();
    if (chunkTextValue) {
      chunks.push({
        chunkIndex,
        chunkText: chunkTextValue,
        charCount: chunkTextValue.length,
        startOffset: start,
        endOffset: end,
        headingPath: inferHeadingPath(normalized, start),
        citationLabel: buildCitationLabel(document, chunkIndex)
      });
      chunkIndex += 1;
    }
    if (end >= normalized.length) {
      break;
    }
    // Overlap backward, but always advance at least one char to avoid loops.
    start = Math.max(end - options.overlapChars, start + 1);
  }
  return chunks;
}

/** Chooses chunking parameters per source kind. */
function chunkDocument(document: MaterializedSearchDocument) {
  switch (document.sourceKind) {
    case 'filing_document':
      return chunkText(document.contentText, { targetChars: 1400, overlapChars: 200 }, document);
    case 'filing_brief':
      return chunkText(document.contentText, { targetChars: 1000, overlapChars: 100 }, document);
    case 'research_note':
      return chunkText(document.contentText, { targetChars: 1000, overlapChars: 100, maxSingleChunk: 2000 }, document);
    default:
      return [];
  }
}

function mapSourceKindToSearchSource(sourceKind: SearchDocumentSourceKind): SearchSource {
  switch (sourceKind) {
    case 'filing_document':
      return 'documents';
    case 'filing_brief':
      return 'filings';
    case 'research_note':
      return 'research';
    default:
      return 'documents';
  }
}

/** Builds the in-app link for a search result row. */
function buildSearchHref(row: SearchChunkJoinRow) {
  if (row.source_kind === 'research_note') {
    return `/analysis?ticker=${encodeURIComponent(row.ticker ?? '')}&journalId=${encodeURIComponent(row.source_ref)}`;
  }
  const hasAnalysis = Boolean((row.metadata as { hasAnalysis?: unknown } | null)?.hasAnalysis);
  if (hasAnalysis && row.ticker && row.accession_number) {
    return `/analysis/reports/${encodeURIComponent(row.ticker)}/${encodeURIComponent(row.accession_number)}`;
  }
  if (row.ticker) {
    return `/filings?ticker=${encodeURIComponent(row.ticker)}`;
  }
  return '/filings';
}

/** Fallback snippet builder for chunks that lack an FTS-generated snippet. */
function manualSnippet(text: string, query: string) {
  const tokens = query.toLowerCase().split(/\W+/).filter((token) => token.length > 1);
  const lower = text.toLowerCase();
  const matchIndex = tokens
    .map((token) => lower.indexOf(token))
    .find((index) => index >= 0) ?? 0;
  const start = Math.max(0, matchIndex - 90);
  const end = Math.min(text.length, start + 220);
  const prefix = start > 0 ? '... ' : '';
  const suffix = end < text.length ? ' ...' : '';
  return `${prefix}${text.slice(start, end).trim()}${suffix}`.trim();
}

/** Reciprocal Rank Fusion of the vector and lexical ranks. */
function scoreSearchMatch(match: SearchMatch) {
  let score = 0;
  if (match.vectorRank !== null) {
    score += 1 / (RRF_K + match.vectorRank);
  }
  if (match.lexicalRank !== null) {
    score += 1 / (RRF_K + match.lexicalRank);
  }
  return score;
}

/** Converts a free-text query into a quoted AND-of-terms FTS5 MATCH expression. */
function toFtsQuery(query: string) {
  const tokens = query
    .trim()
    .split(/\W+/)
    .map((token) => token.trim().toLowerCase())
    .filter((token) => token.length > 1);
  if (tokens.length === 0) {
    return null;
  }
  return tokens.map((token) => `"${token.replace(/"/g, '""')}"`).join(' AND ');
}

function createPlaceholders(length: number) {
  return new Array(length).fill('?').join(', ');
}

/** Looks up the one document matching (scope, user, kind, ref), or null. */
function queryOneDocument(
  client: Database,
  input: Pick<MaterializedSearchDocument, 'scope' | 'userId' | 'sourceKind' | 'sourceRef'>
) {
  return client
    .query(`
      SELECT *
      FROM search_document
      WHERE scope = ?
        AND ifnull(user_id, '') = ?
        AND source_kind = ?
        AND source_ref = ?
      LIMIT 1
    `)
    .get(
      input.scope,
      input.userId ?? '',
      input.sourceKind,
      input.sourceRef
    ) as SearchDocumentRow | null;
}

function listDocumentsForScope(client: Database, input: {
  scope: SearchDocumentScope;
  userId?: string | null;
  sourceKind: SearchDocumentSourceKind;
  ticker?: string | null;
}) {
  const conditions = [
    'scope = ?',
    "ifnull(user_id, '') = ?",
    'source_kind = ?'
  ];
  const values: Array<string> = [input.scope, input.userId ?? '', input.sourceKind];
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  return client
    .query(`
      SELECT *
      FROM search_document
      WHERE ${conditions.join(' AND ')}
    `)
    .all(...values) as SearchDocumentRow[];
}

/** Deletes a document plus its chunks and their FTS/vector index rows. */
function deleteDocumentCascade(client: Database, documentId: number) {
  const chunkRows = client
    .query('SELECT id FROM search_chunk WHERE document_id = ?')
    .all(documentId) as Array<{ id: number }>;
  for (const row of chunkRows) {
    client.query('DELETE FROM search_chunk_vec WHERE chunk_id = ?').run(row.id);
    client.query('DELETE FROM search_chunk_fts WHERE chunk_id = ?').run(row.id);
  }
  client.query('DELETE FROM search_chunk WHERE document_id = ?').run(documentId);
  client.query('DELETE FROM search_document WHERE id = ?').run(documentId);
}

/** Runs `fn` inside a BEGIN IMMEDIATE transaction, rolling back on throw. */
function withTransaction<T>(client: Database, fn: () => T) {
  client.exec('BEGIN IMMEDIATE');
  try {
    const result = fn();
    client.exec('COMMIT');
    return result;
  } catch (error) {
    client.exec('ROLLBACK');
    throw error;
  }
}

/** Materializes primary filing text documents (best effort; fetch failures skipped). */
async function collectFilingDocuments(ticker?: string | null, accessionNumber?: string | null) {
  const filings = accessionNumber
    ? await Promise.all([getFilingByAccession(accessionNumber)]).then((rows) => rows.filter(Boolean))
    : await listFilingsRecords({ ticker: ticker ?? undefined, limit: 250 });
  const documents: MaterializedSearchDocument[] = [];
  for (const filing of filings) {
    if (!filing) {
      continue;
    }
    const filingText = await fetchPrimaryFilingText({
      filingUrl: filing.filing_url,
      cik: filing.cik,
      accessionNumber: filing.accession_number,
      primaryDocument: filing.primary_document ?? null
    }).catch(() => null);
    if (!filingText?.text) {
      continue;
    }
    documents.push({
      sourceKind: 'filing_document',
      sourceRef: filing.accession_number,
      scope: 'global',
      userId: null,
      ticker: filing.ticker,
      accessionNumber: filing.accession_number,
      filingDate: filing.filing_date,
      title: `${filing.ticker} ${filing.filing_type} primary filing`,
      contentText: filingText.text,
      metadata: {
        filingType: filing.filing_type,
        filingDate: filing.filing_date,
        filingUrl: filing.filing_url,
        submissionUrl: filing.submission_url ?? null,
        primaryDocument: filing.primary_document ?? null,
        hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
      }
    });
  }
  return documents;
}

/** Materializes compact filing briefs (metrics + AI summary) for indexing. */
async function collectFilingBriefs(ticker?: string | null, accessionNumber?: string | null) {
  const filings = accessionNumber
    ? await Promise.all([getFilingByAccession(accessionNumber)]).then((rows) => rows.filter(Boolean))
    : await listFilingsRecords({ ticker: ticker ?? undefined, limit: 250 });
  return filings
    .filter((filing): filing is NonNullable<typeof filing> => Boolean(filing))
    .map((filing) => ({
      sourceKind: 'filing_brief' as const,
      sourceRef: filing.accession_number,
      scope: 'global' as const,
      userId: null,
      ticker: filing.ticker,
      accessionNumber: filing.accession_number,
      filingDate: filing.filing_date,
      title: `${filing.ticker} ${filing.filing_type} filing brief`,
      contentText: buildFilingBriefContent({
        ticker: filing.ticker,
        companyName: filing.company_name,
        accessionNumber: filing.accession_number,
        filingDate: filing.filing_date,
        filingType: filing.filing_type,
        metrics: filing.metrics,
        analysis: filing.analysis as Record<string, unknown> | null
      }),
      metadata: {
        filingType: filing.filing_type,
        filingDate: filing.filing_date,
        hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
      }
    }));
}

/** Converts a journal entry to a user-scoped document; null when body is empty. */
function materializeResearchNote(entry: ResearchJournalEntry): MaterializedSearchDocument | null {
  const text = normalizeWhitespace(stripMarkdown(entry.body_markdown));
  if (!text) {
    return null;
  }
  return {
    sourceKind: 'research_note',
    sourceRef: String(entry.id),
    scope: 'user',
    userId: entry.user_id,
    ticker: entry.ticker,
    accessionNumber: entry.accession_number,
    filingDate: null,
    title: entry.title ?? `${entry.ticker} research note`,
    contentText: text,
    metadata: {
      entryType: entry.entry_type,
      createdAt: entry.created_at,
      updatedAt: entry.updated_at
    }
  };
}

async function collectResearchNotes(userId: string, ticker?: string | null, journalEntryId?: number | null) {
  if (journalEntryId) {
    const entry = await getResearchJournalEntryRecord(userId, journalEntryId);
    const materialized = entry ? materializeResearchNote(entry) : null;
    return materialized ? [materialized] : [];
  }
  const entries = ticker
    ? await listResearchJournalEntries(userId, ticker, 250)
    : await listResearchJournalEntriesForUser(userId, 250);
  return entries
    .map(materializeResearchNote)
    .filter((entry): entry is MaterializedSearchDocument => Boolean(entry));
}

/** Gathers all requested source kinds into one flat document list. */
async function collectMaterializedDocuments(input: IndexSearchDocumentsInput) {
  const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];
  const documents: MaterializedSearchDocument[] = [];
  for (const sourceKind of sourceKinds) {
    if (sourceKind === 'filing_document') {
      documents.push(...await collectFilingDocuments(input.ticker ?? null, input.accessionNumber ?? null));
      continue;
    }
    if (sourceKind === 'filing_brief') {
      documents.push(...await collectFilingBriefs(input.ticker ?? null, input.accessionNumber ?? null));
      continue;
    }
    if (sourceKind === 'research_note') {
      documents.push(...await collectResearchNotes(
        input.userId,
        input.ticker ?? null,
        input.journalEntryId ?? null
      ));
    }
  }
  return documents;
}

/**
 * Persists one document's chunks + embeddings transactionally.
 * Skips entirely when the content hash is unchanged and already indexed;
 * otherwise replaces the previous index rows via a cascading delete.
 */
function persistDocumentIndex(
  client: Database,
  document: MaterializedSearchDocument,
  chunks: SearchChunkRecord[],
  embeddings: number[][]
) {
  const now = new Date().toISOString();
  const contentHash = hashContent(document.contentText);
  const existing = queryOneDocument(client, document);
  if (existing && existing.content_hash === contentHash && existing.index_status === 'indexed') {
    return { indexed: false, skipped: true, chunkCount: 0 };
  }
  const documentId = withTransaction(client, () => {
    if (existing) {
      deleteDocumentCascade(client, existing.id);
    }
    const inserted = client
      .query(`
        INSERT INTO search_document (
          source_kind, source_ref, scope, user_id, ticker, accession_number, title,
          content_text, content_hash, metadata, index_status, indexed_at, last_error,
          created_at, updated_at
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'indexed', ?, NULL, ?, ?)
        RETURNING id
      `)
      .get(
        document.sourceKind,
        document.sourceRef,
        document.scope,
        document.userId,
        document.ticker,
        document.accessionNumber,
        document.title,
        document.contentText,
        contentHash,
        JSON.stringify(document.metadata),
        now,
        existing?.created_at ?? now, // preserve the original creation timestamp
        now
      ) as { id: number };
    for (let index = 0; index < chunks.length; index += 1) {
      const chunk = chunks[index]!;
      const embedding = embeddings[index]!;
      const insertedChunk = client
        .query(`
          INSERT INTO search_chunk (
            document_id, chunk_index, chunk_text, char_count, start_offset, end_offset,
            heading_path, citation_label, created_at
          )
          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
          RETURNING id
        `)
        .get(
          inserted.id,
          chunk.chunkIndex,
          chunk.chunkText,
          chunk.charCount,
          chunk.startOffset,
          chunk.endOffset,
          chunk.headingPath,
          chunk.citationLabel,
          now
        ) as { id: number };
      client
        .query(`
          INSERT INTO search_chunk_fts (
            rowid, chunk_text, citation_label, heading_path, chunk_id, document_id,
            chunk_index, scope, user_id, source_kind, ticker, accession_number, filing_date
          )
          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          chunk.chunkText,
          chunk.citationLabel,
          chunk.headingPath,
          insertedChunk.id,
          inserted.id,
          chunk.chunkIndex,
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate
        );
      client
        .query(`
          INSERT INTO search_chunk_vec (
            chunk_id, embedding, scope, user_id, source_kind, ticker, accession_number,
            filing_date, document_id, chunk_index, citation_label
          )
          VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          JSON.stringify(embedding),
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate,
          inserted.id,
          chunk.chunkIndex,
          chunk.citationLabel
        );
    }
    return inserted.id;
  });
  return { indexed: true, skipped: false, chunkCount: chunks.length, documentId };
}

/** Deletes each referenced document (if present); returns the count removed. */
function deleteSourceRefs(client: Database, refs: DeleteSourceRef[]) {
  let deleted = 0;
  for (const ref of refs) {
    const row = queryOneDocument(client, {
      scope: ref.scope,
      userId: ref.userId ?? null,
      sourceKind: ref.sourceKind,
      sourceRef: ref.sourceRef
    });
    if (!row) {
      continue;
    }
    withTransaction(client, () => {
      deleteDocumentCascade(client, row.id);
    });
    deleted += 1;
  }
  return deleted;
}

/**
 * Collects, chunks, embeds, and persists search documents.
 * Reports progress through `input.onStage`. When indexing a whole ticker
 * (no accession/journal filter), stale documents for that ticker are pruned.
 */
export async function indexSearchDocuments(input: IndexSearchDocumentsInput) {
  const client = getSqliteClient();
  await input.onStage?.('collect', 'Collecting materialized search sources');
  const materialized = await collectMaterializedDocuments(input);
  const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];
  let indexed = 0;
  let skipped = 0;
  let deleted = 0;
  let chunksEmbedded = 0;
  const totalDocuments = materialized.length;
  const stageContext = (current: number, subject?: TaskStageContext['subject'] | null): TaskStageContext => ({
    progress: { current, total: totalDocuments || 1, unit: 'sources' },
    counters: { sourcesCollected: totalDocuments, indexed, skipped, deleted, chunksEmbedded },
    subject: subject ?? (input.ticker
      ? { ticker: input.ticker }
      : input.accessionNumber
        ? { accessionNumber: input.accessionNumber }
        : null)
  });
  if (input.deleteSourceRefs && input.deleteSourceRefs.length > 0) {
    deleted += deleteSourceRefs(client, input.deleteSourceRefs);
  }
  await input.onStage?.(
    'collect',
    `Collected ${materialized.length} source records for search indexing`,
    {
      counters: { sourcesCollected: materialized.length, deleted },
      subject: input.ticker
        ? { ticker: input.ticker }
        : input.accessionNumber
          ? { accessionNumber: input.accessionNumber }
          : null
    }
  );
  for (let index = 0; index < materialized.length; index += 1) {
    const document = materialized[index]!;
    await input.onStage?.(
      'fetch',
      `Preparing ${document.sourceKind} ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    const chunks = chunkDocument(document);
    if (chunks.length === 0) {
      continue;
    }
    await input.onStage?.(
      'chunk',
      `Chunking ${document.sourceKind} ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    await input.onStage?.(
      'embed',
      `Embedding ${chunks.length} chunks for ${document.sourceRef}`,
      {
        ...stageContext(index + 1, {
          ticker: document.ticker ?? undefined,
          accessionNumber: document.accessionNumber ?? undefined,
          label: document.sourceRef
        }),
        counters: { sourcesCollected: totalDocuments, indexed, skipped, deleted, chunksEmbedded }
      }
    );
    const embeddings = await runAiEmbeddings(chunks.map((chunk) => chunk.chunkText));
    await input.onStage?.(
      'persist',
      `Persisting indexed chunks for ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    const result = persistDocumentIndex(client, document, chunks, embeddings);
    if (result.skipped) {
      skipped += 1;
      continue;
    }
    indexed += 1;
    chunksEmbedded += result.chunkCount;
  }
  // Full-ticker reindex: prune documents no longer present at the source.
  if (input.ticker && !input.accessionNumber && !input.journalEntryId) {
    for (const sourceKind of sourceKinds) {
      const scope = sourceKind === 'research_note' ? 'user' : 'global';
      const expectedRefs = new Set(
        materialized
          .filter((document) => document.sourceKind === sourceKind)
          .map((document) => document.sourceRef)
      );
      const existingRows = listDocumentsForScope(client, {
        scope,
        userId: scope === 'user' ? input.userId : null,
        sourceKind,
        ticker: input.ticker
      });
      for (const row of existingRows) {
        if (expectedRefs.has(row.source_ref)) {
          continue;
        }
        withTransaction(client, () => {
          deleteDocumentCascade(client, row.id);
        });
        deleted += 1;
      }
    }
  }
  return { sourcesCollected: materialized.length, indexed, skipped, deleted, chunksEmbedded };
}

/**
 * Nearest-neighbor search over chunk embeddings.
 * Uses the sqlite vector extension when loaded; otherwise falls back to an
 * in-process cosine-distance scan over JSON-encoded embeddings.
 */
function vectorSearch(
  client: Database,
  input: {
    embedding: number[];
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  if (!__dbInternals.isVectorExtensionLoaded(client)) {
    const conditions = [
      'scope = ?',
      'source_kind = ?'
    ];
    const values: Array<string | number | null> = [input.scope, input.sourceKind];
    if (input.scope === 'user') {
      conditions.push('user_id = ?');
      values.push(input.userId ?? null);
    }
    if (input.ticker) {
      conditions.push('ticker = ?');
      values.push(input.ticker);
    }
    const rows = client
      .query(`
        SELECT chunk_id, embedding
        FROM search_chunk_vec
        WHERE ${conditions.join(' AND ')}
      `)
      .all(...values) as Array<{ chunk_id: number; embedding: string }>;
    const queryNorm = Math.hypot(...input.embedding) || 1;
    return rows
      .map((row) => {
        const candidate = JSON.parse(row.embedding) as number[];
        const dot = candidate.reduce((sum, value, index) => sum + (value * (input.embedding[index] ?? 0)), 0);
        const candidateNorm = Math.hypot(...candidate) || 1;
        const cosineDistance = 1 - (dot / (candidateNorm * queryNorm));
        return { chunk_id: row.chunk_id, distance: cosineDistance };
      })
      .sort((left, right) => left.distance - right.distance)
      .slice(0, Math.max(input.limit, 4));
  }
  const conditions = [
    'embedding MATCH ?',
    'k = ?',
    'scope = ?',
    'source_kind = ?'
  ];
  const values: Array<string | number | null> = [
    JSON.stringify(input.embedding),
    Math.max(input.limit, 4),
    input.scope,
    input.sourceKind
  ];
  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    values.push(input.userId ?? null);
  }
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  const rows = client
    .query(`
      SELECT chunk_id, distance
      FROM search_chunk_vec
      WHERE ${conditions.join(' AND ')}
      ORDER BY distance ASC
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{ chunk_id: number; distance: number }>;
  return rows;
}

/** BM25-ranked FTS5 search with a server-generated snippet per hit. */
function lexicalSearch(
  client: Database,
  input: {
    ftsQuery: string;
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  const conditions = [
    'search_chunk_fts MATCH ?',
    'source_kind = ?',
    'scope = ?'
  ];
  const values: Array<string | null> = [
    input.ftsQuery,
    input.sourceKind,
    input.scope
  ];
  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    values.push(input.userId ?? null);
  }
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  const rows = client
    .query(`
      SELECT
        chunk_id,
        bm25(search_chunk_fts) AS bm25,
        snippet(search_chunk_fts, 0, '[', ']', ' ... ', 18) AS snippet
      FROM search_chunk_fts
      WHERE ${conditions.join(' AND ')}
      ORDER BY bm25(search_chunk_fts)
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{
      chunk_id: number;
      bm25: number;
      snippet: string | null;
    }>;
  return rows;
}

/**
 * Joins matched chunk IDs back to their documents, scores them via RRF,
 * and applies the per-document cap and overall limit.
 */
function hydrateResults(
  client: Database,
  query: string,
  matches: Map<number, SearchMatch>,
  limit: number
) {
  const chunkIds = [...matches.keys()];
  if (chunkIds.length === 0) {
    return [] satisfies SearchResult[];
  }
  const placeholders = createPlaceholders(chunkIds.length);
  const rows = client
    .query(`
      SELECT
        c.id AS chunk_id,
        c.document_id,
        c.chunk_text,
        c.heading_path,
        c.citation_label,
        d.source_kind,
        d.source_ref,
        d.title,
        d.ticker,
        d.accession_number,
        d.metadata
      FROM search_chunk c
      INNER JOIN search_document d ON d.id = c.document_id
      WHERE c.id IN (${placeholders})
    `)
    .all(...chunkIds) as SearchChunkJoinRow[];
  const dedupePerDocument = new Map<number, number>();
  const enriched = rows
    .map((row) => {
      const match = matches.get(row.chunk_id);
      if (!match) {
        return null;
      }
      return {
        chunkId: row.chunk_id,
        documentId: row.document_id,
        source: mapSourceKindToSearchSource(row.source_kind),
        sourceKind: row.source_kind,
        sourceRef: row.source_ref,
        title: row.title,
        ticker: row.ticker,
        accessionNumber: row.accession_number,
        filingDate: typeof row.metadata?.filingDate === 'string' ? row.metadata.filingDate : null,
        citationLabel: row.citation_label,
        headingPath: row.heading_path,
        chunkText: row.chunk_text,
        snippet: match.snippet ?? manualSnippet(row.chunk_text, query),
        score: scoreSearchMatch(match),
        vectorRank: match.vectorRank,
        lexicalRank: match.lexicalRank,
        href: buildSearchHref(row)
      } satisfies SearchResult;
    })
    .filter((row): row is SearchResult => Boolean(row))
    .sort((left, right) => right.score - left.score);
  const results: SearchResult[] = [];
  for (const row of enriched) {
    const count = dedupePerDocument.get(row.documentId) ?? 0;
    if (count >= MAX_RESULTS_PER_DOCUMENT) {
      continue;
    }
    dedupePerDocument.set(row.documentId, count + 1);
    results.push(row);
    if (results.length >= limit) {
      break;
    }
  }
  return results;
}

/**
 * Runs hybrid search across the requested sources, fusing vector and lexical
 * rankings per source kind. Queries shorter than 2 chars return no results.
 */
export async function searchKnowledgeBase(input: SearchInput) {
  const normalizedQuery = input.query.trim();
  if (normalizedQuery.length < 2) {
    return [] satisfies SearchResult[];
  }
  const limit = clampLimit(input.limit);
  const normalizedTicker = normalizeTicker(input.ticker);
  const includedSources = normalizeSearchSources(input.sources);
  const client = getSqliteClient();
  const [queryEmbedding] = await runAiEmbeddings([normalizedQuery]);
  const ftsQuery = toFtsQuery(normalizedQuery);
  const matches = new Map<number, SearchMatch>();
  for (const source of includedSources) {
    const sourceKind = SOURCE_KIND_BY_SEARCH_SOURCE[source];
    const scope = sourceKind === 'research_note' ? 'user' : 'global';
    const vectorRows = vectorSearch(client, {
      embedding: queryEmbedding,
      limit: limit * 3, // over-fetch so RRF + dedupe still fill the limit
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });
    vectorRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? index + 1,
        lexicalRank: existing?.lexicalRank ?? null,
        snippet: existing?.snippet ?? null
      });
    });
    if (!ftsQuery) {
      continue;
    }
    const lexicalRows = lexicalSearch(client, {
      ftsQuery,
      limit: limit * 3,
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });
    lexicalRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? null,
        lexicalRank: existing?.lexicalRank ?? index + 1,
        snippet: existing?.snippet ?? row.snippet ?? null
      });
    });
  }
  return hydrateResults(client, normalizedQuery, matches, limit);
}

/** Builds the grounded-answer prompt with numbered evidence excerpts. */
function buildAnswerPrompt(query: string, evidence: SearchResult[]) {
  const evidenceText = evidence.map((result, index) => {
    const reference = index + 1;
    return [
      `[${reference}] ${result.citationLabel}`,
      `Source: ${result.title ?? result.sourceRef}`,
      `Ticker: ${result.ticker ?? 'n/a'}`,
      `Excerpt: ${result.chunkText}`
    ].join('\n');
  }).join('\n\n');
  return [
    'Answer the question using only the evidence below.',
    'Every factual claim must include at least one citation like [1] or [2].',
    'If the evidence is insufficient, respond with exactly INSUFFICIENT_EVIDENCE.',
    `Question: ${query}`,
    '',
    'Evidence:',
    evidenceText
  ].join('\n');
}

/**
 * Validates the model answer: extracts [n] citations that map to real evidence.
 * An answer with no valid citations (or the sentinel) is treated as insufficient.
 */
function finalizeAnswer(answer: string, evidence: SearchResult[]) {
  const trimmed = answer.trim();
  if (!trimmed || trimmed === 'INSUFFICIENT_EVIDENCE') {
    return {
      answer: 'Insufficient evidence to answer from the indexed sources.',
      citations: [] satisfies SearchCitation[]
    };
  }
  const matches = [...trimmed.matchAll(/\[(\d+)\]/g)];
  const seen = new Set<number>();
  const citations: SearchCitation[] = [];
  for (const match of matches) {
    const index = Number(match[1]);
    if (!Number.isInteger(index) || seen.has(index) || index < 1 || index > evidence.length) {
      continue;
    }
    seen.add(index);
    const result = evidence[index - 1]!;
    citations.push({
      index,
      label: result.citationLabel,
      chunkId: result.chunkId,
      href: result.href
    });
  }
  if (citations.length === 0) {
    return {
      answer: 'Insufficient evidence to answer from the indexed sources.',
      citations: [] satisfies SearchCitation[]
    };
  }
  return { answer: trimmed, citations };
}

/**
 * End-to-end RAG query: retrieve, cap the evidence window, ask the model for a
 * citation-grounded answer, and validate the citations before returning.
 */
export async function answerSearchQuery(input: SearchInput): Promise<SearchAnswerResponse> {
  const results = await searchKnowledgeBase({ ...input, limit: clampLimit(input.limit) });
  if (results.length === 0) {
    return {
      answer: 'Insufficient evidence to answer from the indexed sources.',
      citations: [],
      results
    };
  }
  const evidence: SearchResult[] = [];
  let totalChars = 0;
  for (const result of results) {
    if (evidence.length >= MAX_CONTEXT_RESULTS) {
      break;
    }
    // Always keep at least one result even if it alone exceeds the char budget.
    if (totalChars + result.chunkText.length > MAX_CONTEXT_CHARS && evidence.length > 0) {
      break;
    }
    evidence.push(result);
    totalChars += result.chunkText.length;
  }
  const response = await runAiAnalysis(
    buildAnswerPrompt(input.query, evidence),
    'Use neutral analyst prose. Do not use outside knowledge.',
    { workload: 'report' }
  );
  const finalized = finalizeAnswer(response.text, evidence);
  return { answer: finalized.answer, citations: finalized.citations, results };
}

// Exposed for unit tests only.
export const __searchInternals = {
  buildCitationLabel,
  buildFilingBriefContent,
  chunkDocument,
  chunkText,
  deleteSourceRefs,
  finalizeAnswer,
  hashContent,
  hydrateResults,
  lexicalSearch,
  normalizeSearchSources,
  persistDocumentIndex,
  queryOneDocument,
  scoreSearchMatch,
  stripMarkdown,
  toFtsQuery,
  vectorSearch
};