// Neon-Desk/lib/server/search.ts
import { createHash } from 'node:crypto';
import type { Database } from 'bun:sqlite';
import type {
ResearchJournalEntry,
SearchAnswerResponse,
SearchCitation,
SearchResult,
SearchSource,
TaskStageContext
} from '@/lib/types';
import { runAiAnalysis, runAiEmbeddings } from '@/lib/server/ai';
import { __dbInternals, getSqliteClient } from '@/lib/server/db';
import { fetchPrimaryFilingText } from '@/lib/server/sec';
import { getFilingByAccession, listFilingsRecords } from '@/lib/server/repos/filings';
import {
getResearchJournalEntryRecord,
listResearchJournalEntries,
listResearchJournalEntriesForUser
} from '@/lib/server/repos/research-journal';
/** Visibility of an indexed document: shared across all users or owned by one user. */
type SearchDocumentScope = 'global' | 'user';
/** Origin of an indexed document within the app's data model. */
type SearchDocumentSourceKind = 'filing_document' | 'filing_brief' | 'research_note';
/** A source record flattened to plain text, ready to be chunked and embedded. */
type MaterializedSearchDocument = {
  sourceKind: SearchDocumentSourceKind;
  // Stable identifier within the source kind (accession number or journal entry id).
  sourceRef: string;
  scope: SearchDocumentScope;
  // Owning user; null for globally scoped documents.
  userId: string | null;
  ticker: string | null;
  accessionNumber: string | null;
  filingDate: string | null;
  title: string | null;
  // Normalized plain text that gets chunked and embedded.
  contentText: string;
  // Free-form attributes persisted as JSON alongside the document.
  metadata: Record<string, unknown>;
};
/** One contiguous slice of a document's content, as stored in `search_chunk`. */
type SearchChunkRecord = {
  chunkIndex: number;
  chunkText: string;
  charCount: number;
  // Character offsets of the chunk within the normalized document text.
  startOffset: number;
  endOffset: number;
  // Nearest preceding heading-like line, when one could be inferred.
  headingPath: string | null;
  citationLabel: string;
};
/** Row shape of the `search_document` table. */
type SearchDocumentRow = {
  id: number;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  scope: SearchDocumentScope;
  user_id: string | null;
  ticker: string | null;
  accession_number: string | null;
  title: string | null;
  content_text: string;
  // SHA-256 of content_text; used to skip re-indexing unchanged documents.
  content_hash: string;
  metadata: Record<string, unknown> | null;
  index_status: 'pending' | 'indexed' | 'failed';
  indexed_at: string | null;
  last_error: string | null;
  created_at: string;
  updated_at: string;
};
/** Joined chunk + parent document row used when hydrating search results. */
type SearchChunkJoinRow = {
  chunk_id: number;
  document_id: number;
  chunk_text: string;
  heading_path: string | null;
  citation_label: string;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  title: string | null;
  ticker: string | null;
  accession_number: string | null;
  metadata: Record<string, unknown> | null;
};
/** Identifies a single indexed document to remove during (re-)indexing. */
type DeleteSourceRef = {
  sourceKind: SearchDocumentSourceKind;
  sourceRef: string;
  scope: SearchDocumentScope;
  userId?: string | null;
};
/** Options accepted by indexSearchDocuments. */
type IndexSearchDocumentsInput = {
  userId: string;
  ticker?: string | null;
  accessionNumber?: string | null;
  journalEntryId?: number | null;
  // Restrict indexing to these kinds; defaults to all three.
  sourceKinds?: SearchDocumentSourceKind[];
  // Documents to delete before indexing (e.g. removed journal entries).
  deleteSourceRefs?: DeleteSourceRef[];
  // Optional progress callback invoked once per pipeline stage per document.
  onStage?: (
    stage: 'collect' | 'fetch' | 'chunk' | 'embed' | 'persist',
    detail: string,
    context?: TaskStageContext | null
  ) => Promise<void> | void;
};
/** Options accepted by searchKnowledgeBase / answerSearchQuery. */
type SearchInput = {
  userId: string;
  query: string;
  ticker?: string | null;
  sources?: SearchSource[];
  limit?: number;
};
/** Per-chunk rank bookkeeping feeding reciprocal-rank-fusion scoring. */
type SearchMatch = {
  chunkId: number;
  // 1-based rank in the vector result list, or null if not a vector hit.
  vectorRank: number | null;
  // 1-based rank in the lexical (FTS) result list, or null if not a lexical hit.
  lexicalRank: number | null;
  snippet: string | null;
};
// Reciprocal rank fusion constant: each hit contributes 1 / (RRF_K + rank).
const RRF_K = 60;
// Result-count bounds applied by clampLimit.
const SEARCH_RESULT_LIMIT_DEFAULT = 10;
const SEARCH_RESULT_LIMIT_MIN = 4;
const SEARCH_RESULT_LIMIT_MAX = 12;
// At most this many chunks from the same document appear in one result set.
const MAX_RESULTS_PER_DOCUMENT = 2;
// Caps on how much evidence is packed into the answer prompt.
const MAX_CONTEXT_RESULTS = 6;
const MAX_CONTEXT_CHARS = 8_000;
// Maps the public SearchSource names to internal document kinds.
const SOURCE_KIND_BY_SEARCH_SOURCE: Record<SearchSource, SearchDocumentSourceKind> = {
  documents: 'filing_document',
  filings: 'filing_brief',
  research: 'research_note'
};
/** Backslash-escapes the SQL LIKE wildcards `%` and `_` in a literal value. */
function escapeLike(value: string) {
  let escaped = '';
  for (const char of value) {
    escaped += char === '%' || char === '_' ? `\\${char}` : char;
  }
  return escaped;
}
/** Upper-cases and trims a ticker symbol; returns null for blank/missing input. */
function normalizeTicker(value: string | null | undefined) {
  const trimmed = (value ?? '').trim();
  if (trimmed.length === 0) {
    return null;
  }
  return trimmed.toUpperCase();
}
/**
 * Validates and de-duplicates the requested search sources, preserving the
 * caller's order. Falls back to all three sources when nothing valid remains.
 */
function normalizeSearchSources(sources?: SearchSource[]) {
  const fallback: SearchSource[] = ['documents', 'filings', 'research'];
  const valid = (sources ?? fallback).filter(
    (candidate): candidate is SearchSource =>
      candidate === 'documents' || candidate === 'filings' || candidate === 'research'
  );
  // Keep only the first occurrence of each source.
  const deduped = valid.filter((candidate, index) => valid.indexOf(candidate) === index);
  return deduped.length > 0 ? deduped : fallback;
}
/** Coerces a requested result limit to an integer within the allowed bounds. */
function clampLimit(limit?: number) {
  const base = Number.isFinite(limit)
    ? Math.trunc(Number(limit))
    : SEARCH_RESULT_LIMIT_DEFAULT;
  if (base < SEARCH_RESULT_LIMIT_MIN) {
    return SEARCH_RESULT_LIMIT_MIN;
  }
  if (base > SEARCH_RESULT_LIMIT_MAX) {
    return SEARCH_RESULT_LIMIT_MAX;
  }
  return base;
}
/** Returns the hex SHA-256 digest of the given content string. */
function hashContent(content: string) {
  const hasher = createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Removes common Markdown syntax (code fences, inline code, images, links,
 * blockquotes, headings, emphasis, list bullets) and normalizes newlines,
 * leaving plain text suitable for indexing.
 */
function stripMarkdown(markdown: string) {
  // Applied in order; order matters (e.g. images before links).
  const transforms: Array<[RegExp, string]> = [
    [/```[\s\S]*?```/g, ' '],
    [/`([^`]+)`/g, '$1'],
    [/!\[[^\]]*\]\([^)]+\)/g, ' '],
    [/\[([^\]]+)\]\([^)]+\)/g, '$1'],
    [/^>\s?/gm, ''],
    [/^#{1,6}\s+/gm, ''],
    [/[*_~]/g, ''],
    [/^\s*[-+]\s+/gm, ''],
    [/\r/g, '\n'],
    [/[ \t]+\n/g, '\n'],
    [/\n{3,}/g, '\n\n']
  ];
  let result = markdown;
  for (const [pattern, replacement] of transforms) {
    result = result.replace(pattern, replacement);
  }
  return result.trim();
}
/**
 * Collapses runs of spaces/tabs, trims whitespace around newlines, converts
 * carriage returns, and limits blank-line runs to a single blank line.
 */
function normalizeWhitespace(value: string) {
  const passes: Array<[RegExp, string]> = [
    [/\r/g, '\n'],
    [/[ \t]+\n/g, '\n'],
    [/\n[ \t]+/g, '\n'],
    [/[ \t]{2,}/g, ' '],
    [/\n{3,}/g, '\n\n']
  ];
  return passes
    .reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), value)
    .trim();
}
/**
 * Assembles the plain-text "filing brief" for a filing: header facts, key
 * metrics, AI summary (falling back to legacy insights), and any structured
 * extraction fields, joined with blank lines and whitespace-normalized.
 */
function buildFilingBriefContent(input: {
  ticker: string;
  companyName: string;
  accessionNumber: string;
  filingDate: string;
  filingType: string;
  metrics: Record<string, number | null> | null;
  analysis: Record<string, unknown> | null;
}) {
  const analysis = input.analysis;
  // Flatten analysis.extraction into "key: value" lines, dropping empty values.
  const structuredLines: string[] = [];
  const extractionValue = analysis?.extraction;
  if (extractionValue && typeof extractionValue === 'object') {
    for (const [key, value] of Object.entries(extractionValue as Record<string, unknown>)) {
      const rendered = Array.isArray(value) ? value.join(' | ') : String(value ?? '');
      const line = `${key}: ${rendered}`;
      if (!line.endsWith(': ')) {
        structuredLines.push(line);
      }
    }
  }
  // Prefer the current analysis text; fall back to the legacy insights field.
  let summary: string | null = null;
  if (typeof analysis?.text === 'string') {
    summary = analysis.text;
  } else if (typeof analysis?.legacyInsights === 'string') {
    summary = analysis.legacyInsights;
  }
  const sections: string[] = [
    `${input.companyName} (${input.ticker}) filing brief`,
    `Form: ${input.filingType}`,
    `Filed: ${input.filingDate}`,
    `Accession: ${input.accessionNumber}`
  ];
  if (input.metrics) {
    sections.push(`Key metrics: ${JSON.stringify(input.metrics)}`);
  }
  if (summary) {
    sections.push(`AI summary:\n${summary}`);
  }
  if (structuredLines.length > 0) {
    sections.push(`Structured extraction:\n${structuredLines.join('\n')}`);
  }
  return normalizeWhitespace(sections.join('\n\n'));
}
/**
 * Builds the human-readable citation label for a chunk: "<ticker> journal
 * note [n]" for research notes, otherwise "ticker · accession · date [n]"
 * with missing parts omitted. `n` is the 1-based chunk number.
 */
function buildCitationLabel(document: MaterializedSearchDocument, chunkIndex: number) {
  const ordinal = `[${chunkIndex + 1}]`;
  if (document.sourceKind === 'research_note') {
    return `${document.ticker ?? 'Research'} journal note ${ordinal}`;
  }
  const segments: string[] = [];
  if (document.ticker) {
    segments.push(document.ticker);
  }
  if (document.accessionNumber) {
    segments.push(document.accessionNumber);
  }
  if (document.filingDate) {
    segments.push(document.filingDate);
  }
  return `${segments.join(' · ')} ${ordinal}`;
}
/**
 * Scans up to 600 characters before `offset` for the closest line that looks
 * like a heading (ends with a colon, is ALL-CAPS, or is a numbered section
 * title). Returns the heading text, or null when none is found.
 */
function inferHeadingPath(text: string, offset: number) {
  const contextStart = Math.max(0, offset - 600);
  const candidates = text
    .slice(contextStart, offset)
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .reverse();
  for (const candidate of candidates) {
    if (candidate.length > 100) {
      continue;
    }
    const endsWithColon = /:$/.test(candidate);
    const allCaps = /^[A-Z0-9][A-Z0-9\s/&,-]{4,}$/.test(candidate);
    const numberedSection = /^\d+(\.\d+)*\s+[A-Z]/.test(candidate);
    if (endsWithColon || allCaps || numberedSection) {
      return candidate;
    }
  }
  return null;
}
/**
 * Splits normalized text into overlapping chunks of roughly `targetChars`
 * characters, preferring to cut at paragraph or sentence boundaries.
 *
 * When `maxSingleChunk` is positive and the whole text fits within it, the
 * entire text is emitted as a single chunk. Returns [] for blank input.
 */
function chunkText(
  text: string,
  options: { targetChars: number; overlapChars: number; maxSingleChunk?: number },
  document: MaterializedSearchDocument
) {
  const normalized = normalizeWhitespace(text);
  if (!normalized) {
    return [] satisfies SearchChunkRecord[];
  }
  const maxSingleChunk = options.maxSingleChunk ?? 0;
  if (maxSingleChunk > 0 && normalized.length <= maxSingleChunk) {
    // Short document: keep it whole rather than splitting.
    return [{
      chunkIndex: 0,
      chunkText: normalized,
      charCount: normalized.length,
      startOffset: 0,
      endOffset: normalized.length,
      headingPath: inferHeadingPath(normalized, 0),
      citationLabel: buildCitationLabel(document, 0)
    }];
  }
  const chunks: SearchChunkRecord[] = [];
  let start = 0;
  let chunkIndex = 0;
  while (start < normalized.length) {
    const tentativeEnd = Math.min(start + options.targetChars, normalized.length);
    let end = tentativeEnd;
    if (tentativeEnd < normalized.length) {
      // Look up to 180 chars past the target size for a natural break:
      // a paragraph break ('\n\n'), a sentence end ('. '), or a newline.
      const localWindow = normalized.slice(start, Math.min(normalized.length, tentativeEnd + 180));
      const paragraphBreak = localWindow.lastIndexOf('\n\n');
      const sentenceBreak = Math.max(localWindow.lastIndexOf('. '), localWindow.lastIndexOf('\n'));
      const boundary = Math.max(paragraphBreak, sentenceBreak);
      // Only accept the boundary if the chunk stays >= 55% of the target
      // size; otherwise cut hard at the tentative end.
      if (boundary >= options.targetChars * 0.55) {
        // Sentence/newline breaks keep the boundary character inside the
        // chunk (+1); a paragraph break cuts exactly at the blank line.
        end = start + boundary + (paragraphBreak === boundary ? 0 : 1);
      }
    }
    const chunkTextValue = normalized.slice(start, end).trim();
    if (chunkTextValue) {
      chunks.push({
        chunkIndex,
        chunkText: chunkTextValue,
        charCount: chunkTextValue.length,
        startOffset: start,
        endOffset: end,
        headingPath: inferHeadingPath(normalized, start),
        citationLabel: buildCitationLabel(document, chunkIndex)
      });
      chunkIndex += 1;
    }
    if (end >= normalized.length) {
      break;
    }
    // Step forward with overlap; the `start + 1` floor guarantees progress
    // even when the overlap would otherwise rewind past the previous start.
    start = Math.max(end - options.overlapChars, start + 1);
  }
  return chunks;
}
/**
 * Chunks a materialized document with sizing tuned per source kind: large
 * overlapping chunks for primary filings, smaller ones for briefs, and
 * single-chunk treatment for short research notes (<= 2000 chars).
 */
function chunkDocument(document: MaterializedSearchDocument) {
  if (document.sourceKind === 'filing_document') {
    return chunkText(document.contentText, { targetChars: 1400, overlapChars: 200 }, document);
  }
  if (document.sourceKind === 'filing_brief') {
    return chunkText(document.contentText, { targetChars: 1000, overlapChars: 100 }, document);
  }
  if (document.sourceKind === 'research_note') {
    return chunkText(
      document.contentText,
      { targetChars: 1000, overlapChars: 100, maxSingleChunk: 2000 },
      document
    );
  }
  return [];
}
/** Translates an internal document kind back to its public SearchSource name. */
function mapSourceKindToSearchSource(sourceKind: SearchDocumentSourceKind): SearchSource {
  const mapping: Partial<Record<SearchDocumentSourceKind, SearchSource>> = {
    filing_document: 'documents',
    filing_brief: 'filings',
    research_note: 'research'
  };
  // Unknown kinds default to 'documents', matching the original switch default.
  return mapping[sourceKind] ?? 'documents';
}
/**
 * Computes the in-app link for a search hit: journal deep-link for research
 * notes, the analysis report when one exists, otherwise the filings list
 * (ticker-filtered when possible).
 */
function buildSearchHref(row: SearchChunkJoinRow) {
  const ticker = row.ticker;
  if (row.source_kind === 'research_note') {
    const tickerParam = encodeURIComponent(ticker ?? '');
    return `/analysis?ticker=${tickerParam}&journalId=${encodeURIComponent(row.source_ref)}`;
  }
  const metadata = row.metadata as { hasAnalysis?: unknown } | null;
  if (metadata?.hasAnalysis && ticker && row.accession_number) {
    return `/analysis/reports/${encodeURIComponent(ticker)}/${encodeURIComponent(row.accession_number)}`;
  }
  return ticker ? `/filings?ticker=${encodeURIComponent(ticker)}` : '/filings';
}
/**
 * Fallback snippet builder: finds the first query token present in the text
 * and extracts a ~220-char window around it, with ellipses when truncated.
 * Falls back to the start of the text when no token matches.
 */
function manualSnippet(text: string, query: string) {
  const lowerText = text.toLowerCase();
  let anchor = 0;
  for (const token of query.toLowerCase().split(/\W+/)) {
    if (token.length <= 1) {
      continue;
    }
    const found = lowerText.indexOf(token);
    if (found >= 0) {
      anchor = found;
      break;
    }
  }
  const start = Math.max(0, anchor - 90);
  const end = Math.min(text.length, start + 220);
  const head = start > 0 ? '... ' : '';
  const tail = end < text.length ? ' ...' : '';
  return `${head}${text.slice(start, end).trim()}${tail}`.trim();
}
/**
 * Reciprocal-rank-fusion score for a chunk: each list it appears in (vector,
 * lexical) contributes 1 / (RRF_K + rank); missing ranks contribute nothing.
 */
function scoreSearchMatch(match: SearchMatch) {
  const ranks = [match.vectorRank, match.lexicalRank];
  return ranks.reduce<number>(
    (total, rank) => (rank === null ? total : total + 1 / (RRF_K + rank)),
    0
  );
}
/**
 * Converts free text into an FTS5 MATCH expression: lowercased tokens of
 * length > 1, each double-quoted, joined with AND. Returns null when the
 * query yields no usable tokens.
 */
function toFtsQuery(query: string) {
  const terms: string[] = [];
  for (const raw of query.trim().split(/\W+/)) {
    const token = raw.trim().toLowerCase();
    if (token.length > 1) {
      // Quote each token; embedded quotes are doubled per FTS5 string rules.
      terms.push(`"${token.replace(/"/g, '""')}"`);
    }
  }
  return terms.length > 0 ? terms.join(' AND ') : null;
}
/** Builds a comma-separated list of `?` SQL placeholders of the given length. */
function createPlaceholders(length: number) {
  return Array.from({ length }, () => '?').join(', ');
}
/**
 * Fetches the single `search_document` row matching the (scope, user, kind,
 * ref) identity, or null when none exists.
 *
 * `ifnull(user_id, '')` lets one query serve both global rows (user_id NULL,
 * matched against '') and user-scoped rows (matched against the actual id).
 */
function queryOneDocument(
  client: Database,
  input: Pick<MaterializedSearchDocument, 'scope' | 'userId' | 'sourceKind' | 'sourceRef'>
) {
  return client
    .query(`
      SELECT *
      FROM search_document
      WHERE scope = ?
        AND ifnull(user_id, '') = ?
        AND source_kind = ?
        AND source_ref = ?
      LIMIT 1
    `)
    .get(
      input.scope,
      input.userId ?? '',
      input.sourceKind,
      input.sourceRef
    ) as SearchDocumentRow | null;
}
/**
 * Lists `search_document` rows for one (scope, user, source kind), optionally
 * restricted to a ticker. Used to find stale rows to prune after re-indexing.
 */
function listDocumentsForScope(client: Database, input: {
  scope: SearchDocumentScope;
  userId?: string | null;
  sourceKind: SearchDocumentSourceKind;
  ticker?: string | null;
}) {
  // Base filters always apply; the ticker filter is appended only when given.
  const conditions = [
    'scope = ?',
    "ifnull(user_id, '') = ?",
    'source_kind = ?'
  ];
  const values: Array<string | null> = [input.scope, input.userId ?? '', input.sourceKind];
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  return client
    .query(`
      SELECT *
      FROM search_document
      WHERE ${conditions.join(' AND ')}
    `)
    .all(...values) as SearchDocumentRow[];
}
/**
 * Removes a document and all of its derived rows: per-chunk vector and FTS
 * entries first, then the chunks, then the document itself. Callers are
 * expected to wrap this in a transaction.
 */
function deleteDocumentCascade(client: Database, documentId: number) {
  const chunkRows = client
    .query('SELECT id FROM search_chunk WHERE document_id = ?')
    .all(documentId) as Array<{ id: number }>;
  const chunkIds = chunkRows.map((row) => row.id);
  for (const chunkId of chunkIds) {
    client.query('DELETE FROM search_chunk_vec WHERE chunk_id = ?').run(chunkId);
    client.query('DELETE FROM search_chunk_fts WHERE chunk_id = ?').run(chunkId);
  }
  client.query('DELETE FROM search_chunk WHERE document_id = ?').run(documentId);
  client.query('DELETE FROM search_document WHERE id = ?').run(documentId);
}
/**
 * Runs `fn` inside a BEGIN IMMEDIATE transaction, committing on success and
 * rolling back (then rethrowing) on any error.
 */
function withTransaction<T>(client: Database, fn: () => T) {
  client.exec('BEGIN IMMEDIATE');
  let result: T;
  try {
    result = fn();
  } catch (error) {
    client.exec('ROLLBACK');
    throw error;
  }
  client.exec('COMMIT');
  return result;
}
/**
 * Materializes primary filing documents for indexing. When an accession
 * number is given, only that filing is considered; otherwise up to 250 recent
 * filings (optionally restricted by ticker) are fetched. Filings whose primary
 * text cannot be retrieved are skipped silently (best-effort).
 */
async function collectFilingDocuments(ticker?: string | null, accessionNumber?: string | null) {
  // Note: the original wrapped the single-accession lookup in
  // Promise.all([...]).then(...); a plain await of the one promise is clearer.
  const filings = accessionNumber
    ? [await getFilingByAccession(accessionNumber)]
    : await listFilingsRecords({
        ticker: ticker ?? undefined,
        limit: 250
      });
  const documents: MaterializedSearchDocument[] = [];
  for (const filing of filings) {
    if (!filing) {
      continue;
    }
    // Fetch the primary document text; errors degrade to "skip this filing".
    const filingText = await fetchPrimaryFilingText({
      filingUrl: filing.filing_url,
      cik: filing.cik,
      accessionNumber: filing.accession_number,
      primaryDocument: filing.primary_document ?? null
    }).catch(() => null);
    if (!filingText?.text) {
      continue;
    }
    documents.push({
      sourceKind: 'filing_document',
      sourceRef: filing.accession_number,
      scope: 'global',
      userId: null,
      ticker: filing.ticker,
      accessionNumber: filing.accession_number,
      filingDate: filing.filing_date,
      title: `${filing.ticker} ${filing.filing_type} primary filing`,
      contentText: filingText.text,
      metadata: {
        filingType: filing.filing_type,
        filingDate: filing.filing_date,
        filingUrl: filing.filing_url,
        submissionUrl: filing.submission_url ?? null,
        primaryDocument: filing.primary_document ?? null,
        // Lets buildSearchHref deep-link to the analysis report when present.
        hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
      }
    });
  }
  return documents;
}
/**
 * Materializes compact "filing brief" documents (header facts + metrics + AI
 * summary) for indexing. Scoped to one accession number when given, otherwise
 * up to 250 recent filings, optionally restricted by ticker.
 */
async function collectFilingBriefs(ticker?: string | null, accessionNumber?: string | null) {
  // Note: the original wrapped the single-accession lookup in
  // Promise.all([...]).then(...); a plain await of the one promise is clearer.
  const filings = accessionNumber
    ? [await getFilingByAccession(accessionNumber)]
    : await listFilingsRecords({
        ticker: ticker ?? undefined,
        limit: 250
      });
  return filings
    .filter((filing): filing is NonNullable<typeof filing> => Boolean(filing))
    .map((filing) => ({
      sourceKind: 'filing_brief' as const,
      sourceRef: filing.accession_number,
      scope: 'global' as const,
      userId: null,
      ticker: filing.ticker,
      accessionNumber: filing.accession_number,
      filingDate: filing.filing_date,
      title: `${filing.ticker} ${filing.filing_type} filing brief`,
      contentText: buildFilingBriefContent({
        ticker: filing.ticker,
        companyName: filing.company_name,
        accessionNumber: filing.accession_number,
        filingDate: filing.filing_date,
        filingType: filing.filing_type,
        metrics: filing.metrics,
        analysis: filing.analysis as Record<string, unknown> | null
      }),
      metadata: {
        filingType: filing.filing_type,
        filingDate: filing.filing_date,
        hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
      }
    }));
}
/**
 * Converts a journal entry into a user-scoped search document by stripping
 * its Markdown body to plain text. Returns null when nothing remains.
 */
function materializeResearchNote(entry: ResearchJournalEntry): MaterializedSearchDocument | null {
  const contentText = normalizeWhitespace(stripMarkdown(entry.body_markdown));
  if (contentText.length === 0) {
    return null;
  }
  return {
    sourceKind: 'research_note',
    sourceRef: `${entry.id}`,
    scope: 'user',
    userId: entry.user_id,
    ticker: entry.ticker,
    accessionNumber: entry.accession_number,
    filingDate: null,
    title: entry.title ?? `${entry.ticker} research note`,
    contentText,
    metadata: {
      entryType: entry.entry_type,
      createdAt: entry.created_at,
      updatedAt: entry.updated_at
    }
  };
}
/**
 * Materializes research notes for indexing: a single entry when
 * `journalEntryId` is given, otherwise up to 250 entries for the ticker
 * (or all of the user's entries). Empty notes are dropped.
 */
async function collectResearchNotes(userId: string, ticker?: string | null, journalEntryId?: number | null) {
  if (journalEntryId) {
    const record = await getResearchJournalEntryRecord(userId, journalEntryId);
    if (!record) {
      return [];
    }
    const document = materializeResearchNote(record);
    return document ? [document] : [];
  }
  const records = ticker
    ? await listResearchJournalEntries(userId, ticker, 250)
    : await listResearchJournalEntriesForUser(userId, 250);
  const documents: MaterializedSearchDocument[] = [];
  for (const record of records) {
    const document = materializeResearchNote(record);
    if (document) {
      documents.push(document);
    }
  }
  return documents;
}
/**
 * Gathers all documents to index for the requested source kinds (defaulting
 * to every kind), delegating to the per-kind collectors.
 */
async function collectMaterializedDocuments(input: IndexSearchDocumentsInput) {
  const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];
  const documents: MaterializedSearchDocument[] = [];
  for (const sourceKind of sourceKinds) {
    switch (sourceKind) {
      case 'filing_document':
        documents.push(...await collectFilingDocuments(input.ticker ?? null, input.accessionNumber ?? null));
        break;
      case 'filing_brief':
        documents.push(...await collectFilingBriefs(input.ticker ?? null, input.accessionNumber ?? null));
        break;
      case 'research_note':
        documents.push(...await collectResearchNotes(
          input.userId,
          input.ticker ?? null,
          input.journalEntryId ?? null
        ));
        break;
      default:
        break;
    }
  }
  return documents;
}
/**
 * Writes one document and its chunks (plus FTS and vector rows) to the index
 * inside a single transaction.
 *
 * If an identical document (same content hash, status 'indexed') already
 * exists, the write is skipped entirely. Otherwise any previous version is
 * cascaded away and the document is re-inserted; the original created_at is
 * preserved across re-indexing.
 *
 * `chunks` and `embeddings` are expected to be parallel arrays (one embedding
 * per chunk), as produced by the caller.
 */
function persistDocumentIndex(
  client: Database,
  document: MaterializedSearchDocument,
  chunks: SearchChunkRecord[],
  embeddings: number[][]
) {
  const now = new Date().toISOString();
  const contentHash = hashContent(document.contentText);
  const existing = queryOneDocument(client, document);
  // Unchanged content already indexed: nothing to do.
  if (existing && existing.content_hash === contentHash && existing.index_status === 'indexed') {
    return { indexed: false, skipped: true, chunkCount: 0 };
  }
  const documentId = withTransaction(client, () => {
    if (existing) {
      // Replace-in-place: drop the old document and all derived rows first.
      deleteDocumentCascade(client, existing.id);
    }
    const inserted = client
      .query(`
        INSERT INTO search_document (
          source_kind,
          source_ref,
          scope,
          user_id,
          ticker,
          accession_number,
          title,
          content_text,
          content_hash,
          metadata,
          index_status,
          indexed_at,
          last_error,
          created_at,
          updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'indexed', ?, NULL, ?, ?)
        RETURNING id
      `)
      .get(
        document.sourceKind,
        document.sourceRef,
        document.scope,
        document.userId,
        document.ticker,
        document.accessionNumber,
        document.title,
        document.contentText,
        contentHash,
        JSON.stringify(document.metadata),
        now,
        // Keep the original creation time when re-indexing an existing doc.
        existing?.created_at ?? now,
        now
      ) as { id: number };
    for (let index = 0; index < chunks.length; index += 1) {
      const chunk = chunks[index]!;
      const embedding = embeddings[index]!;
      const insertedChunk = client
        .query(`
          INSERT INTO search_chunk (
            document_id,
            chunk_index,
            chunk_text,
            char_count,
            start_offset,
            end_offset,
            heading_path,
            citation_label,
            created_at
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
          RETURNING id
        `)
        .get(
          inserted.id,
          chunk.chunkIndex,
          chunk.chunkText,
          chunk.charCount,
          chunk.startOffset,
          chunk.endOffset,
          chunk.headingPath,
          chunk.citationLabel,
          now
        ) as { id: number };
      // FTS row: rowid is pinned to the chunk id so lexical hits map back
      // directly; the scope/user/ticker columns enable filtered MATCH queries.
      client
        .query(`
          INSERT INTO search_chunk_fts (
            rowid,
            chunk_text,
            citation_label,
            heading_path,
            chunk_id,
            document_id,
            chunk_index,
            scope,
            user_id,
            source_kind,
            ticker,
            accession_number,
            filing_date
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          chunk.chunkText,
          chunk.citationLabel,
          chunk.headingPath,
          insertedChunk.id,
          inserted.id,
          chunk.chunkIndex,
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate
        );
      // Vector row: embedding serialized as JSON, matching the fallback
      // reader in vectorSearch.
      client
        .query(`
          INSERT INTO search_chunk_vec (
            chunk_id,
            embedding,
            scope,
            user_id,
            source_kind,
            ticker,
            accession_number,
            filing_date,
            document_id,
            chunk_index,
            citation_label
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          JSON.stringify(embedding),
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate,
          inserted.id,
          chunk.chunkIndex,
          chunk.citationLabel
        );
    }
    return inserted.id;
  });
  return {
    indexed: true,
    skipped: false,
    chunkCount: chunks.length,
    documentId
  };
}
/**
 * Deletes each referenced document (and its derived rows) when it exists,
 * one transaction per document. Returns how many documents were removed.
 */
function deleteSourceRefs(client: Database, refs: DeleteSourceRef[]) {
  let removed = 0;
  for (const ref of refs) {
    const existing = queryOneDocument(client, {
      scope: ref.scope,
      userId: ref.userId ?? null,
      sourceKind: ref.sourceKind,
      sourceRef: ref.sourceRef
    });
    if (existing) {
      withTransaction(client, () => deleteDocumentCascade(client, existing.id));
      removed += 1;
    }
  }
  return removed;
}
/**
 * Full indexing pipeline: collect source records, chunk them, embed the
 * chunks, and persist everything, reporting progress through `input.onStage`.
 *
 * Also handles cleanup: explicit `deleteSourceRefs` are removed up front, and
 * when indexing a whole ticker (no accession/journal pin) any previously
 * indexed documents for that ticker that were not re-collected are pruned as
 * stale. Returns counters summarizing the run.
 */
export async function indexSearchDocuments(input: IndexSearchDocumentsInput) {
  const client = getSqliteClient();
  await input.onStage?.('collect', 'Collecting materialized search sources');
  const materialized = await collectMaterializedDocuments(input);
  const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];
  let indexed = 0;
  let skipped = 0;
  let deleted = 0;
  let chunksEmbedded = 0;
  const totalDocuments = materialized.length;
  // Builds a progress context snapshot; reads the mutable counters above via
  // closure so later calls reflect up-to-date totals.
  const stageContext = (current: number, subject?: TaskStageContext['subject'] | null): TaskStageContext => ({
    progress: {
      current,
      total: totalDocuments || 1,
      unit: 'sources'
    },
    counters: {
      sourcesCollected: totalDocuments,
      indexed,
      skipped,
      deleted,
      chunksEmbedded
    },
    subject: subject ?? (input.ticker ? { ticker: input.ticker } : input.accessionNumber ? { accessionNumber: input.accessionNumber } : null)
  });
  // Explicit deletions (e.g. removed journal entries) happen before indexing.
  if (input.deleteSourceRefs && input.deleteSourceRefs.length > 0) {
    deleted += deleteSourceRefs(client, input.deleteSourceRefs);
  }
  await input.onStage?.(
    'collect',
    `Collected ${materialized.length} source records for search indexing`,
    {
      counters: {
        sourcesCollected: materialized.length,
        deleted
      },
      subject: input.ticker ? { ticker: input.ticker } : input.accessionNumber ? { accessionNumber: input.accessionNumber } : null
    }
  );
  // Process documents sequentially: chunk, embed, persist.
  for (let index = 0; index < materialized.length; index += 1) {
    const document = materialized[index];
    await input.onStage?.(
      'fetch',
      `Preparing ${document.sourceKind} ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    const chunks = chunkDocument(document);
    // Nothing chunkable (e.g. empty content): skip without counting.
    if (chunks.length === 0) {
      continue;
    }
    await input.onStage?.(
      'chunk',
      `Chunking ${document.sourceKind} ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    await input.onStage?.(
      'embed',
      `Embedding ${chunks.length} chunks for ${document.sourceRef}`,
      {
        ...stageContext(index + 1, {
          ticker: document.ticker ?? undefined,
          accessionNumber: document.accessionNumber ?? undefined,
          label: document.sourceRef
        }),
        counters: {
          sourcesCollected: totalDocuments,
          indexed,
          skipped,
          deleted,
          chunksEmbedded
        }
      }
    );
    const embeddings = await runAiEmbeddings(chunks.map((chunk) => chunk.chunkText));
    await input.onStage?.(
      'persist',
      `Persisting indexed chunks for ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    const result = persistDocumentIndex(client, document, chunks, embeddings);
    if (result.skipped) {
      skipped += 1;
      continue;
    }
    indexed += 1;
    chunksEmbedded += result.chunkCount;
  }
  // Whole-ticker runs prune previously indexed documents that no longer exist
  // in the collected set (stale rows for deleted/renamed sources).
  if (input.ticker && !input.accessionNumber && !input.journalEntryId) {
    for (const sourceKind of sourceKinds) {
      // Research notes are user-scoped; filing data is global.
      const scope = sourceKind === 'research_note' ? 'user' : 'global';
      const expectedRefs = new Set(
        materialized
          .filter((document) => document.sourceKind === sourceKind)
          .map((document) => document.sourceRef)
      );
      const existingRows = listDocumentsForScope(client, {
        scope,
        userId: scope === 'user' ? input.userId : null,
        sourceKind,
        ticker: input.ticker
      });
      for (const row of existingRows) {
        if (expectedRefs.has(row.source_ref)) {
          continue;
        }
        withTransaction(client, () => {
          deleteDocumentCascade(client, row.id);
        });
        deleted += 1;
      }
    }
  }
  return {
    sourcesCollected: materialized.length,
    indexed,
    skipped,
    deleted,
    chunksEmbedded
  };
}
/**
 * K-nearest-neighbour search over `search_chunk_vec`, filtered by scope,
 * source kind, and (optionally) user and ticker.
 *
 * Two code paths:
 *  - Vector extension loaded: a native `embedding MATCH ?` KNN query with a
 *    `k = ?` bound, ordered by distance.
 *  - Fallback: embeddings are stored as JSON, so all candidate rows are read
 *    and cosine distance is computed in JS, then sorted and truncated.
 * Both return rows of { chunk_id, distance? } ordered nearest-first.
 */
function vectorSearch(
  client: Database,
  input: {
    embedding: number[];
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  if (!__dbInternals.isVectorExtensionLoaded(client)) {
    // Fallback path: brute-force cosine distance over JSON embeddings.
    const conditions = [
      'scope = ?',
      'source_kind = ?'
    ];
    const values: Array<string | null> = [input.scope, input.sourceKind];
    if (input.scope === 'user') {
      conditions.push('user_id = ?');
      values.push(input.userId ?? null);
    }
    if (input.ticker) {
      conditions.push('ticker = ?');
      values.push(input.ticker);
    }
    const rows = client
      .query(`
        SELECT chunk_id, embedding
        FROM search_chunk_vec
        WHERE ${conditions.join(' AND ')}
      `)
      .all(...values) as Array<{ chunk_id: number; embedding: string }>;
    // Guard against zero-norm vectors with `|| 1` to avoid division by zero.
    const queryNorm = Math.hypot(...input.embedding) || 1;
    return rows
      .map((row) => {
        const candidate = JSON.parse(row.embedding) as number[];
        const dot = candidate.reduce((sum, value, index) => sum + (value * (input.embedding[index] ?? 0)), 0);
        const candidateNorm = Math.hypot(...candidate) || 1;
        const cosineDistance = 1 - (dot / (candidateNorm * queryNorm));
        return {
          chunk_id: row.chunk_id,
          distance: cosineDistance
        };
      })
      .sort((left, right) => left.distance - right.distance)
      // Enforce a floor of 4 candidates, mirroring the native path's k bound.
      .slice(0, Math.max(input.limit, 4));
  }
  // Native path: KNN via the vector extension's MATCH + k constraints.
  const conditions = [
    'embedding MATCH ?',
    'k = ?',
    'scope = ?',
    'source_kind = ?'
  ];
  const values: Array<string | number | null> = [
    JSON.stringify(input.embedding),
    Math.max(input.limit, 4),
    input.scope,
    input.sourceKind
  ];
  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    values.push(input.userId ?? null);
  }
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  // The same cap is bound twice: once as the KNN `k` and once as LIMIT.
  const rows = client
    .query(`
      SELECT chunk_id, distance
      FROM search_chunk_vec
      WHERE ${conditions.join(' AND ')}
      ORDER BY distance ASC
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{ chunk_id: number; distance: number }>;
  return rows;
}
/**
 * Full-text (FTS5) search over `search_chunk_fts`, filtered by source kind,
 * scope, and optionally user and ticker. Rows are ranked by bm25 (lower is
 * better) and include a pre-built `[...]`-highlighted snippet.
 */
function lexicalSearch(
  client: Database,
  input: {
    ftsQuery: string;
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  const conditions = [
    'search_chunk_fts MATCH ?',
    'source_kind = ?',
    'scope = ?'
  ];
  const values: Array<string | number | null> = [
    input.ftsQuery,
    input.sourceKind,
    input.scope
  ];
  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    values.push(input.userId ?? null);
  }
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  // snippet(): column 0 (chunk_text), '['/']' highlight markers, ' ... '
  // ellipsis, up to 18 tokens. A floor of 4 results mirrors vectorSearch.
  const rows = client
    .query(`
      SELECT
        chunk_id,
        bm25(search_chunk_fts) AS bm25,
        snippet(search_chunk_fts, 0, '[', ']', ' ... ', 18) AS snippet
      FROM search_chunk_fts
      WHERE ${conditions.join(' AND ')}
      ORDER BY bm25(search_chunk_fts)
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{
      chunk_id: number;
      bm25: number;
      snippet: string | null;
    }>;
  return rows;
}
/**
 * Loads full chunk + document rows for the matched chunk ids, builds scored
 * SearchResult objects, sorts them by fused score, and truncates to `limit`
 * while allowing at most MAX_RESULTS_PER_DOCUMENT hits per document.
 */
function hydrateResults(
  client: Database,
  query: string,
  matches: Map<number, SearchMatch>,
  limit: number
) {
  const chunkIds = [...matches.keys()];
  if (chunkIds.length === 0) {
    return [] satisfies SearchResult[];
  }
  const placeholders = createPlaceholders(chunkIds.length);
  const rows = client
    .query(`
      SELECT
        c.id AS chunk_id,
        c.document_id,
        c.chunk_text,
        c.heading_path,
        c.citation_label,
        d.source_kind,
        d.source_ref,
        d.title,
        d.ticker,
        d.accession_number,
        d.metadata
      FROM search_chunk c
      INNER JOIN search_document d ON d.id = c.document_id
      WHERE c.id IN (${placeholders})
    `)
    .all(...chunkIds) as SearchChunkJoinRow[];
  // Tracks how many results each document has contributed so far.
  const dedupePerDocument = new Map<number, number>();
  const enriched = rows
    .map((row) => {
      const match = matches.get(row.chunk_id);
      if (!match) {
        return null;
      }
      return {
        chunkId: row.chunk_id,
        documentId: row.document_id,
        source: mapSourceKindToSearchSource(row.source_kind),
        sourceKind: row.source_kind,
        sourceRef: row.source_ref,
        title: row.title,
        ticker: row.ticker,
        accessionNumber: row.accession_number,
        // filingDate lives in the JSON metadata, not a dedicated column.
        filingDate: typeof row.metadata?.filingDate === 'string' ? row.metadata.filingDate : null,
        citationLabel: row.citation_label,
        headingPath: row.heading_path,
        chunkText: row.chunk_text,
        // Prefer the FTS-generated snippet; fall back to a manual excerpt.
        snippet: match.snippet ?? manualSnippet(row.chunk_text, query),
        score: scoreSearchMatch(match),
        vectorRank: match.vectorRank,
        lexicalRank: match.lexicalRank,
        href: buildSearchHref(row)
      } satisfies SearchResult;
    })
    .filter((row): row is SearchResult => Boolean(row))
    .sort((left, right) => right.score - left.score);
  // Take results in score order, capping per-document contributions.
  const results: SearchResult[] = [];
  for (const row of enriched) {
    const count = dedupePerDocument.get(row.documentId) ?? 0;
    if (count >= MAX_RESULTS_PER_DOCUMENT) {
      continue;
    }
    dedupePerDocument.set(row.documentId, count + 1);
    results.push(row);
    if (results.length >= limit) {
      break;
    }
  }
  return results;
}
/**
 * Hybrid search entry point: for each requested source, runs a vector KNN
 * query and a lexical FTS query, merges per-chunk ranks into a single match
 * map, and hydrates the fused, deduplicated result list.
 *
 * Queries shorter than 2 characters return no results. Research notes are
 * searched in the caller's user scope; filing data is global.
 */
export async function searchKnowledgeBase(input: SearchInput) {
  const normalizedQuery = input.query.trim();
  if (normalizedQuery.length < 2) {
    return [] satisfies SearchResult[];
  }
  const limit = clampLimit(input.limit);
  const normalizedTicker = normalizeTicker(input.ticker);
  const includedSources = normalizeSearchSources(input.sources);
  const client = getSqliteClient();
  const [queryEmbedding] = await runAiEmbeddings([normalizedQuery]);
  const ftsQuery = toFtsQuery(normalizedQuery);
  const matches = new Map<number, SearchMatch>();
  for (const source of includedSources) {
    const sourceKind = SOURCE_KIND_BY_SEARCH_SOURCE[source];
    const scope = sourceKind === 'research_note' ? 'user' : 'global';
    // Over-fetch (3x limit) per retriever so fusion has candidates to rank.
    const vectorRows = vectorSearch(client, {
      embedding: queryEmbedding,
      limit: limit * 3,
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });
    // Record 1-based vector ranks; keep any rank already set for this chunk.
    vectorRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? index + 1,
        lexicalRank: existing?.lexicalRank ?? null,
        snippet: existing?.snippet ?? null
      });
    });
    // Skip lexical search when the query produced no usable FTS tokens.
    if (!ftsQuery) {
      continue;
    }
    const lexicalRows = lexicalSearch(client, {
      ftsQuery,
      limit: limit * 3,
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });
    // Merge lexical ranks and snippets into the same match entries.
    lexicalRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? null,
        lexicalRank: existing?.lexicalRank ?? index + 1,
        snippet: existing?.snippet ?? row.snippet ?? null
      });
    });
  }
  return hydrateResults(client, normalizedQuery, matches, limit);
}
/**
 * Builds the grounded-answer prompt: instructions requiring citations, the
 * question, and numbered evidence blocks ([1], [2], ...) whose numbers the
 * model must cite.
 */
function buildAnswerPrompt(query: string, evidence: SearchResult[]) {
  const evidenceBlocks = evidence.map((result, index) =>
    [
      `[${index + 1}] ${result.citationLabel}`,
      `Source: ${result.title ?? result.sourceRef}`,
      `Ticker: ${result.ticker ?? 'n/a'}`,
      `Excerpt: ${result.chunkText}`
    ].join('\n')
  );
  const lines = [
    'Answer the question using only the evidence below.',
    'Every factual claim must include at least one citation like [1] or [2].',
    'If the evidence is insufficient, respond with exactly INSUFFICIENT_EVIDENCE.',
    `Question: ${query}`,
    '',
    'Evidence:',
    evidenceBlocks.join('\n\n')
  ];
  return lines.join('\n');
}
/**
 * Validates a model answer: extracts [n] citation markers, keeps the first
 * occurrence of each in-range reference, and maps them to evidence. Returns
 * the insufficient-evidence response when the answer is empty, is the
 * INSUFFICIENT_EVIDENCE sentinel, or carries no valid citations.
 */
function finalizeAnswer(answer: string, evidence: SearchResult[]) {
  const insufficient = () => ({
    answer: 'Insufficient evidence to answer from the indexed sources.',
    citations: [] satisfies SearchCitation[]
  });
  const trimmed = answer.trim();
  if (trimmed.length === 0 || trimmed === 'INSUFFICIENT_EVIDENCE') {
    return insufficient();
  }
  const citations: SearchCitation[] = [];
  const used = new Set<number>();
  for (const match of trimmed.matchAll(/\[(\d+)\]/g)) {
    const reference = Number(match[1]);
    const inRange = Number.isInteger(reference)
      && reference >= 1
      && reference <= evidence.length;
    if (!inRange || used.has(reference)) {
      continue;
    }
    used.add(reference);
    const source = evidence[reference - 1]!;
    citations.push({
      index: reference,
      label: source.citationLabel,
      chunkId: source.chunkId,
      href: source.href
    });
  }
  if (citations.length === 0) {
    return insufficient();
  }
  return {
    answer: trimmed,
    citations
  };
}
/**
 * RAG answer endpoint: retrieves results, selects a bounded evidence window
 * (at most MAX_CONTEXT_RESULTS chunks and MAX_CONTEXT_CHARS characters, with
 * at least one chunk), asks the model for a cited answer, and validates the
 * citations before returning.
 */
export async function answerSearchQuery(input: SearchInput): Promise<SearchAnswerResponse> {
  const results = await searchKnowledgeBase({
    ...input,
    limit: clampLimit(input.limit)
  });
  if (results.length === 0) {
    return {
      answer: 'Insufficient evidence to answer from the indexed sources.',
      citations: [],
      results
    };
  }
  // Greedily pack evidence against a character budget; the first chunk is
  // always admitted even when it alone exceeds the budget.
  const evidence: SearchResult[] = [];
  let remainingChars = MAX_CONTEXT_CHARS;
  for (const result of results) {
    if (evidence.length >= MAX_CONTEXT_RESULTS) {
      break;
    }
    if (evidence.length > 0 && result.chunkText.length > remainingChars) {
      break;
    }
    evidence.push(result);
    remainingChars -= result.chunkText.length;
  }
  const aiResponse = await runAiAnalysis(
    buildAnswerPrompt(input.query, evidence),
    'Use neutral analyst prose. Do not use outside knowledge.',
    {
      workload: 'report'
    }
  );
  const finalized = finalizeAnswer(aiResponse.text, evidence);
  return {
    answer: finalized.answer,
    citations: finalized.citations,
    results
  };
}
// Internal helpers exported solely for unit testing; not part of the module's
// public API surface.
export const __searchInternals = {
  buildCitationLabel,
  buildFilingBriefContent,
  chunkDocument,
  chunkText,
  deleteSourceRefs,
  finalizeAnswer,
  hashContent,
  hydrateResults,
  lexicalSearch,
  normalizeSearchSources,
  persistDocumentIndex,
  queryOneDocument,
  scoreSearchMatch,
  stripMarkdown,
  toFtsQuery,
  vectorSearch
};