// Hybrid search service: indexes SEC filing text, filing briefs, and research
// notes into SQLite (FTS5 + vector tables) and serves fused lexical/vector queries.
import { createHash } from 'node:crypto';
|
|
import type { Database } from 'bun:sqlite';
|
|
import type {
|
|
ResearchJournalEntry,
|
|
SearchAnswerResponse,
|
|
SearchCitation,
|
|
SearchResult,
|
|
SearchSource
|
|
} from '@/lib/types';
|
|
import { runAiAnalysis, runAiEmbeddings } from '@/lib/server/ai';
|
|
import { __dbInternals, getSqliteClient } from '@/lib/server/db';
|
|
import { fetchPrimaryFilingText } from '@/lib/server/sec';
|
|
import { getFilingByAccession, listFilingsRecords } from '@/lib/server/repos/filings';
|
|
import {
|
|
getResearchJournalEntryRecord,
|
|
listResearchJournalEntries,
|
|
listResearchJournalEntriesForUser
|
|
} from '@/lib/server/repos/research-journal';
|
|
|
|
/** Visibility of an indexed document: shared across all users or owned by one user. */
type SearchDocumentScope = 'global' | 'user';

/** Origin of an indexed document's text. */
type SearchDocumentSourceKind = 'filing_document' | 'filing_brief' | 'research_note';

/**
 * A document prepared for indexing: identifying metadata plus the full
 * normalized text that will be chunked, embedded, and persisted.
 */
type MaterializedSearchDocument = {
  sourceKind: SearchDocumentSourceKind;
  // Stable reference into the source store (accession number or journal entry id).
  sourceRef: string;
  scope: SearchDocumentScope;
  // Owning user for 'user'-scoped documents; null for 'global' scope.
  userId: string | null;
  ticker: string | null;
  accessionNumber: string | null;
  filingDate: string | null;
  title: string | null;
  contentText: string;
  // Free-form metadata persisted as JSON alongside the document row.
  metadata: Record<string, unknown>;
};

/** One chunk of a document's text, ready to be embedded and persisted. */
type SearchChunkRecord = {
  chunkIndex: number;
  chunkText: string;
  charCount: number;
  // Character offsets into the normalized document text.
  startOffset: number;
  endOffset: number;
  // Nearest preceding heading-like line, if one was detected.
  headingPath: string | null;
  // Human-readable label used when citing this chunk in answers.
  citationLabel: string;
};
|
|
|
|
/** Raw row shape of the search_document table. */
type SearchDocumentRow = {
  id: number;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  scope: SearchDocumentScope;
  user_id: string | null;
  ticker: string | null;
  accession_number: string | null;
  title: string | null;
  content_text: string;
  // sha256 hex of content_text; used to skip reindexing unchanged documents.
  content_hash: string;
  // NOTE(review): typed as parsed JSON — confirm the driver deserializes this
  // column rather than returning the raw string.
  metadata: Record<string, unknown> | null;
  index_status: 'pending' | 'indexed' | 'failed';
  indexed_at: string | null;
  last_error: string | null;
  created_at: string;
  updated_at: string;
};

/**
 * Row shape returned by the search_chunk ⋈ search_document hydration query
 * (see hydrateResults).
 */
type SearchChunkJoinRow = {
  chunk_id: number;
  document_id: number;
  chunk_text: string;
  heading_path: string | null;
  citation_label: string;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  title: string | null;
  ticker: string | null;
  accession_number: string | null;
  metadata: Record<string, unknown> | null;
};
|
|
|
|
/** Identifies one indexed document to remove. */
type DeleteSourceRef = {
  sourceKind: SearchDocumentSourceKind;
  sourceRef: string;
  scope: SearchDocumentScope;
  userId?: string | null;
};

/** Options for indexSearchDocuments. */
type IndexSearchDocumentsInput = {
  userId: string;
  // Restrict indexing to one ticker / filing / journal entry; omit for all.
  ticker?: string | null;
  accessionNumber?: string | null;
  journalEntryId?: number | null;
  // Restrict which source kinds are (re)indexed; defaults to all three.
  sourceKinds?: SearchDocumentSourceKind[];
  // Documents to remove before indexing begins.
  deleteSourceRefs?: DeleteSourceRef[];
  // Optional progress callback invoked at each pipeline stage.
  onStage?: (stage: 'collect' | 'fetch' | 'chunk' | 'embed' | 'persist', detail: string) => Promise<void> | void;
};

/** Options for searchKnowledgeBase. */
type SearchInput = {
  userId: string;
  query: string;
  ticker?: string | null;
  sources?: SearchSource[];
  limit?: number;
};

/** Per-chunk ranks from the vector and lexical passes (1-based; null = unranked). */
type SearchMatch = {
  chunkId: number;
  vectorRank: number | null;
  lexicalRank: number | null;
  // FTS-generated snippet, when the lexical pass produced one.
  snippet: string | null;
};
|
|
|
|
// Reciprocal rank fusion constant: each ranking contributes 1 / (RRF_K + rank).
// 60 is the conventional default from the original RRF formulation.
const RRF_K = 60;

// Bounds applied to caller-supplied result limits (see clampLimit).
const SEARCH_RESULT_LIMIT_DEFAULT = 10;

const SEARCH_RESULT_LIMIT_MIN = 4;

const SEARCH_RESULT_LIMIT_MAX = 12;

// Cap chunks per document so one long filing cannot dominate the result list.
const MAX_RESULTS_PER_DOCUMENT = 2;

// Bounds on evidence passed to answer generation (used later in the file).
const MAX_CONTEXT_RESULTS = 6;

const MAX_CONTEXT_CHARS = 8_000;

// Maps the public SearchSource names to the internal document source kinds.
const SOURCE_KIND_BY_SEARCH_SOURCE: Record<SearchSource, SearchDocumentSourceKind> = {
  documents: 'filing_document',
  filings: 'filing_brief',
  research: 'research_note'
};
|
|
|
|
function escapeLike(value: string) {
|
|
return value.replace(/[%_]/g, (match) => `\\${match}`);
|
|
}
|
|
|
|
function normalizeTicker(value: string | null | undefined) {
|
|
const normalized = value?.trim().toUpperCase() ?? '';
|
|
return normalized.length > 0 ? normalized : null;
|
|
}
|
|
|
|
function normalizeSearchSources(sources?: SearchSource[]) {
|
|
const normalized = new Set<SearchSource>();
|
|
|
|
for (const source of sources ?? ['documents', 'filings', 'research']) {
|
|
if (source === 'documents' || source === 'filings' || source === 'research') {
|
|
normalized.add(source);
|
|
}
|
|
}
|
|
|
|
return normalized.size > 0
|
|
? [...normalized]
|
|
: ['documents', 'filings', 'research'] as SearchSource[];
|
|
}
|
|
|
|
function clampLimit(limit?: number) {
|
|
const value = Number.isFinite(limit) ? Number(limit) : SEARCH_RESULT_LIMIT_DEFAULT;
|
|
const intValue = Math.trunc(value);
|
|
return Math.min(Math.max(intValue, SEARCH_RESULT_LIMIT_MIN), SEARCH_RESULT_LIMIT_MAX);
|
|
}
|
|
|
|
function hashContent(content: string) {
|
|
return createHash('sha256').update(content).digest('hex');
|
|
}
|
|
|
|
function stripMarkdown(markdown: string) {
|
|
return markdown
|
|
.replace(/```[\s\S]*?```/g, ' ')
|
|
.replace(/`([^`]+)`/g, '$1')
|
|
.replace(/!\[[^\]]*\]\([^)]+\)/g, ' ')
|
|
.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
|
|
.replace(/^>\s?/gm, '')
|
|
.replace(/^#{1,6}\s+/gm, '')
|
|
.replace(/[*_~]/g, '')
|
|
.replace(/^\s*[-+]\s+/gm, '')
|
|
.replace(/\r/g, '\n')
|
|
.replace(/[ \t]+\n/g, '\n')
|
|
.replace(/\n{3,}/g, '\n\n')
|
|
.trim();
|
|
}
|
|
|
|
function normalizeWhitespace(value: string) {
|
|
return value
|
|
.replace(/\r/g, '\n')
|
|
.replace(/[ \t]+\n/g, '\n')
|
|
.replace(/\n[ \t]+/g, '\n')
|
|
.replace(/[ \t]{2,}/g, ' ')
|
|
.replace(/\n{3,}/g, '\n\n')
|
|
.trim();
|
|
}
|
|
|
|
function buildFilingBriefContent(input: {
|
|
ticker: string;
|
|
companyName: string;
|
|
accessionNumber: string;
|
|
filingDate: string;
|
|
filingType: string;
|
|
metrics: Record<string, number | null> | null;
|
|
analysis: Record<string, unknown> | null;
|
|
}) {
|
|
const extraction = input.analysis?.extraction;
|
|
const extractionLines = extraction && typeof extraction === 'object'
|
|
? Object.entries(extraction as Record<string, unknown>)
|
|
.map(([key, value]) => `${key}: ${Array.isArray(value) ? value.join(' | ') : String(value ?? '')}`)
|
|
.filter((line) => !line.endsWith(': '))
|
|
: [];
|
|
|
|
const reportText = typeof input.analysis?.text === 'string'
|
|
? input.analysis.text
|
|
: typeof input.analysis?.legacyInsights === 'string'
|
|
? input.analysis.legacyInsights
|
|
: null;
|
|
|
|
return normalizeWhitespace([
|
|
`${input.companyName} (${input.ticker}) filing brief`,
|
|
`Form: ${input.filingType}`,
|
|
`Filed: ${input.filingDate}`,
|
|
`Accession: ${input.accessionNumber}`,
|
|
input.metrics ? `Key metrics: ${JSON.stringify(input.metrics)}` : null,
|
|
reportText ? `AI summary:\n${reportText}` : null,
|
|
extractionLines.length > 0 ? `Structured extraction:\n${extractionLines.join('\n')}` : null
|
|
].filter((entry): entry is string => Boolean(entry)).join('\n\n'));
|
|
}
|
|
|
|
function buildCitationLabel(document: MaterializedSearchDocument, chunkIndex: number) {
|
|
if (document.sourceKind === 'research_note') {
|
|
return `${document.ticker ?? 'Research'} journal note [${chunkIndex + 1}]`;
|
|
}
|
|
|
|
const parts = [
|
|
document.ticker,
|
|
document.accessionNumber,
|
|
document.filingDate
|
|
].filter((entry): entry is string => Boolean(entry));
|
|
|
|
return `${parts.join(' · ')} [${chunkIndex + 1}]`;
|
|
}
|
|
|
|
function inferHeadingPath(text: string, offset: number) {
|
|
const windowStart = Math.max(0, offset - 600);
|
|
const context = text.slice(windowStart, offset);
|
|
const lines = context
|
|
.split('\n')
|
|
.map((line) => line.trim())
|
|
.filter((line) => line.length > 0);
|
|
|
|
for (let index = lines.length - 1; index >= 0; index -= 1) {
|
|
const candidate = lines[index]!;
|
|
const looksLikeHeading = candidate.length <= 100 && (
|
|
/:$/.test(candidate)
|
|
|| /^[A-Z0-9][A-Z0-9\s/&,-]{4,}$/.test(candidate)
|
|
|| /^\d+(\.\d+)*\s+[A-Z]/.test(candidate)
|
|
);
|
|
|
|
if (looksLikeHeading) {
|
|
return candidate;
|
|
}
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
/**
 * Split text into overlapping chunks of roughly `targetChars` characters,
 * preferring to break at paragraph or sentence boundaries.
 *
 * If `maxSingleChunk` is set and the normalized text fits within it, the
 * whole text is returned as a single chunk (used for short research notes).
 */
function chunkText(
  text: string,
  options: { targetChars: number; overlapChars: number; maxSingleChunk?: number },
  document: MaterializedSearchDocument
) {
  const normalized = normalizeWhitespace(text);
  if (!normalized) {
    return [] satisfies SearchChunkRecord[];
  }

  // Short-circuit: emit the entire text as one chunk when allowed and small enough.
  const maxSingleChunk = options.maxSingleChunk ?? 0;
  if (maxSingleChunk > 0 && normalized.length <= maxSingleChunk) {
    return [{
      chunkIndex: 0,
      chunkText: normalized,
      charCount: normalized.length,
      startOffset: 0,
      endOffset: normalized.length,
      headingPath: inferHeadingPath(normalized, 0),
      citationLabel: buildCitationLabel(document, 0)
    }];
  }

  const chunks: SearchChunkRecord[] = [];
  let start = 0;
  let chunkIndex = 0;

  while (start < normalized.length) {
    const tentativeEnd = Math.min(start + options.targetChars, normalized.length);
    let end = tentativeEnd;

    if (tentativeEnd < normalized.length) {
      // Look up to 180 chars past the target for a natural break point.
      const localWindow = normalized.slice(start, Math.min(normalized.length, tentativeEnd + 180));
      const paragraphBreak = localWindow.lastIndexOf('\n\n');
      const sentenceBreak = Math.max(localWindow.lastIndexOf('. '), localWindow.lastIndexOf('\n'));
      const boundary = Math.max(paragraphBreak, sentenceBreak);

      // Only honor a boundary if the chunk stays reasonably full (>= 55% of
      // target); sentence/newline breaks keep the trailing '.' or '\n' char,
      // paragraph breaks do not.
      if (boundary >= options.targetChars * 0.55) {
        end = start + boundary + (paragraphBreak === boundary ? 0 : 1);
      }
    }

    const chunkTextValue = normalized.slice(start, end).trim();
    if (chunkTextValue) {
      chunks.push({
        chunkIndex,
        chunkText: chunkTextValue,
        charCount: chunkTextValue.length,
        startOffset: start,
        endOffset: end,
        headingPath: inferHeadingPath(normalized, start),
        citationLabel: buildCitationLabel(document, chunkIndex)
      });
      chunkIndex += 1;
    }

    if (end >= normalized.length) {
      break;
    }

    // Overlap consecutive chunks; `start + 1` guarantees forward progress
    // even when the overlap would rewind past the previous start.
    start = Math.max(end - options.overlapChars, start + 1);
  }

  return chunks;
}
|
|
|
|
function chunkDocument(document: MaterializedSearchDocument) {
|
|
switch (document.sourceKind) {
|
|
case 'filing_document':
|
|
return chunkText(document.contentText, { targetChars: 1400, overlapChars: 200 }, document);
|
|
case 'filing_brief':
|
|
return chunkText(document.contentText, { targetChars: 1000, overlapChars: 100 }, document);
|
|
case 'research_note':
|
|
return chunkText(document.contentText, {
|
|
targetChars: 1000,
|
|
overlapChars: 100,
|
|
maxSingleChunk: 2000
|
|
}, document);
|
|
default:
|
|
return [];
|
|
}
|
|
}
|
|
|
|
function mapSourceKindToSearchSource(sourceKind: SearchDocumentSourceKind): SearchSource {
|
|
switch (sourceKind) {
|
|
case 'filing_document':
|
|
return 'documents';
|
|
case 'filing_brief':
|
|
return 'filings';
|
|
case 'research_note':
|
|
return 'research';
|
|
default:
|
|
return 'documents';
|
|
}
|
|
}
|
|
|
|
function buildSearchHref(row: SearchChunkJoinRow) {
|
|
if (row.source_kind === 'research_note') {
|
|
return `/analysis?ticker=${encodeURIComponent(row.ticker ?? '')}&journalId=${encodeURIComponent(row.source_ref)}`;
|
|
}
|
|
|
|
const hasAnalysis = Boolean((row.metadata as { hasAnalysis?: unknown } | null)?.hasAnalysis);
|
|
if (hasAnalysis && row.ticker && row.accession_number) {
|
|
return `/analysis/reports/${encodeURIComponent(row.ticker)}/${encodeURIComponent(row.accession_number)}`;
|
|
}
|
|
|
|
if (row.ticker) {
|
|
return `/filings?ticker=${encodeURIComponent(row.ticker)}`;
|
|
}
|
|
|
|
return '/filings';
|
|
}
|
|
|
|
function manualSnippet(text: string, query: string) {
|
|
const tokens = query.toLowerCase().split(/\W+/).filter((token) => token.length > 1);
|
|
const lower = text.toLowerCase();
|
|
const matchIndex = tokens
|
|
.map((token) => lower.indexOf(token))
|
|
.find((index) => index >= 0) ?? 0;
|
|
const start = Math.max(0, matchIndex - 90);
|
|
const end = Math.min(text.length, start + 220);
|
|
const prefix = start > 0 ? '... ' : '';
|
|
const suffix = end < text.length ? ' ...' : '';
|
|
return `${prefix}${text.slice(start, end).trim()}${suffix}`.trim();
|
|
}
|
|
|
|
function scoreSearchMatch(match: SearchMatch) {
|
|
let score = 0;
|
|
if (match.vectorRank !== null) {
|
|
score += 1 / (RRF_K + match.vectorRank);
|
|
}
|
|
|
|
if (match.lexicalRank !== null) {
|
|
score += 1 / (RRF_K + match.lexicalRank);
|
|
}
|
|
|
|
return score;
|
|
}
|
|
|
|
function toFtsQuery(query: string) {
|
|
const tokens = query
|
|
.trim()
|
|
.split(/\W+/)
|
|
.map((token) => token.trim().toLowerCase())
|
|
.filter((token) => token.length > 1);
|
|
|
|
if (tokens.length === 0) {
|
|
return null;
|
|
}
|
|
|
|
return tokens.map((token) => `"${token.replace(/"/g, '""')}"`).join(' AND ');
|
|
}
|
|
|
|
function createPlaceholders(length: number) {
|
|
return new Array(length).fill('?').join(', ');
|
|
}
|
|
|
|
function queryOneDocument(
|
|
client: Database,
|
|
input: Pick<MaterializedSearchDocument, 'scope' | 'userId' | 'sourceKind' | 'sourceRef'>
|
|
) {
|
|
return client
|
|
.query(`
|
|
SELECT *
|
|
FROM search_document
|
|
WHERE scope = ?
|
|
AND ifnull(user_id, '') = ?
|
|
AND source_kind = ?
|
|
AND source_ref = ?
|
|
LIMIT 1
|
|
`)
|
|
.get(
|
|
input.scope,
|
|
input.userId ?? '',
|
|
input.sourceKind,
|
|
input.sourceRef
|
|
) as SearchDocumentRow | null;
|
|
}
|
|
|
|
function listDocumentsForScope(client: Database, input: {
|
|
scope: SearchDocumentScope;
|
|
userId?: string | null;
|
|
sourceKind: SearchDocumentSourceKind;
|
|
ticker?: string | null;
|
|
}) {
|
|
const conditions = [
|
|
'scope = ?',
|
|
"ifnull(user_id, '') = ?",
|
|
'source_kind = ?'
|
|
];
|
|
const values: Array<string | null> = [input.scope, input.userId ?? '', input.sourceKind];
|
|
|
|
if (input.ticker) {
|
|
conditions.push('ticker = ?');
|
|
values.push(input.ticker);
|
|
}
|
|
|
|
return client
|
|
.query(`
|
|
SELECT *
|
|
FROM search_document
|
|
WHERE ${conditions.join(' AND ')}
|
|
`)
|
|
.all(...values) as SearchDocumentRow[];
|
|
}
|
|
|
|
function deleteDocumentCascade(client: Database, documentId: number) {
|
|
const chunkRows = client
|
|
.query('SELECT id FROM search_chunk WHERE document_id = ?')
|
|
.all(documentId) as Array<{ id: number }>;
|
|
|
|
for (const row of chunkRows) {
|
|
client.query('DELETE FROM search_chunk_vec WHERE chunk_id = ?').run(row.id);
|
|
client.query('DELETE FROM search_chunk_fts WHERE chunk_id = ?').run(row.id);
|
|
}
|
|
|
|
client.query('DELETE FROM search_chunk WHERE document_id = ?').run(documentId);
|
|
client.query('DELETE FROM search_document WHERE id = ?').run(documentId);
|
|
}
|
|
|
|
function withTransaction<T>(client: Database, fn: () => T) {
|
|
client.exec('BEGIN IMMEDIATE');
|
|
|
|
try {
|
|
const result = fn();
|
|
client.exec('COMMIT');
|
|
return result;
|
|
} catch (error) {
|
|
client.exec('ROLLBACK');
|
|
throw error;
|
|
}
|
|
}
|
|
|
|
async function collectFilingDocuments(ticker?: string | null, accessionNumber?: string | null) {
|
|
const filings = accessionNumber
|
|
? await Promise.all([getFilingByAccession(accessionNumber)]).then((rows) => rows.filter(Boolean))
|
|
: await listFilingsRecords({
|
|
ticker: ticker ?? undefined,
|
|
limit: 250
|
|
});
|
|
|
|
const documents: MaterializedSearchDocument[] = [];
|
|
|
|
for (const filing of filings) {
|
|
if (!filing) {
|
|
continue;
|
|
}
|
|
|
|
const filingText = await fetchPrimaryFilingText({
|
|
filingUrl: filing.filing_url,
|
|
cik: filing.cik,
|
|
accessionNumber: filing.accession_number,
|
|
primaryDocument: filing.primary_document ?? null
|
|
}).catch(() => null);
|
|
|
|
if (!filingText?.text) {
|
|
continue;
|
|
}
|
|
|
|
documents.push({
|
|
sourceKind: 'filing_document',
|
|
sourceRef: filing.accession_number,
|
|
scope: 'global',
|
|
userId: null,
|
|
ticker: filing.ticker,
|
|
accessionNumber: filing.accession_number,
|
|
filingDate: filing.filing_date,
|
|
title: `${filing.ticker} ${filing.filing_type} primary filing`,
|
|
contentText: filingText.text,
|
|
metadata: {
|
|
filingType: filing.filing_type,
|
|
filingDate: filing.filing_date,
|
|
filingUrl: filing.filing_url,
|
|
submissionUrl: filing.submission_url ?? null,
|
|
primaryDocument: filing.primary_document ?? null,
|
|
hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
|
|
}
|
|
});
|
|
}
|
|
|
|
return documents;
|
|
}
|
|
|
|
async function collectFilingBriefs(ticker?: string | null, accessionNumber?: string | null) {
|
|
const filings = accessionNumber
|
|
? await Promise.all([getFilingByAccession(accessionNumber)]).then((rows) => rows.filter(Boolean))
|
|
: await listFilingsRecords({
|
|
ticker: ticker ?? undefined,
|
|
limit: 250
|
|
});
|
|
|
|
return filings
|
|
.filter((filing): filing is NonNullable<typeof filing> => Boolean(filing))
|
|
.map((filing) => ({
|
|
sourceKind: 'filing_brief' as const,
|
|
sourceRef: filing.accession_number,
|
|
scope: 'global' as const,
|
|
userId: null,
|
|
ticker: filing.ticker,
|
|
accessionNumber: filing.accession_number,
|
|
filingDate: filing.filing_date,
|
|
title: `${filing.ticker} ${filing.filing_type} filing brief`,
|
|
contentText: buildFilingBriefContent({
|
|
ticker: filing.ticker,
|
|
companyName: filing.company_name,
|
|
accessionNumber: filing.accession_number,
|
|
filingDate: filing.filing_date,
|
|
filingType: filing.filing_type,
|
|
metrics: filing.metrics,
|
|
analysis: filing.analysis as Record<string, unknown> | null
|
|
}),
|
|
metadata: {
|
|
filingType: filing.filing_type,
|
|
filingDate: filing.filing_date,
|
|
hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
|
|
}
|
|
}));
|
|
}
|
|
|
|
function materializeResearchNote(entry: ResearchJournalEntry): MaterializedSearchDocument | null {
|
|
const text = normalizeWhitespace(stripMarkdown(entry.body_markdown));
|
|
if (!text) {
|
|
return null;
|
|
}
|
|
|
|
return {
|
|
sourceKind: 'research_note',
|
|
sourceRef: String(entry.id),
|
|
scope: 'user',
|
|
userId: entry.user_id,
|
|
ticker: entry.ticker,
|
|
accessionNumber: entry.accession_number,
|
|
filingDate: null,
|
|
title: entry.title ?? `${entry.ticker} research note`,
|
|
contentText: text,
|
|
metadata: {
|
|
entryType: entry.entry_type,
|
|
createdAt: entry.created_at,
|
|
updatedAt: entry.updated_at
|
|
}
|
|
};
|
|
}
|
|
|
|
async function collectResearchNotes(userId: string, ticker?: string | null, journalEntryId?: number | null) {
|
|
if (journalEntryId) {
|
|
const entry = await getResearchJournalEntryRecord(userId, journalEntryId);
|
|
const materialized = entry ? materializeResearchNote(entry) : null;
|
|
return materialized ? [materialized] : [];
|
|
}
|
|
|
|
const entries = ticker
|
|
? await listResearchJournalEntries(userId, ticker, 250)
|
|
: await listResearchJournalEntriesForUser(userId, 250);
|
|
|
|
return entries
|
|
.map(materializeResearchNote)
|
|
.filter((entry): entry is MaterializedSearchDocument => Boolean(entry));
|
|
}
|
|
|
|
async function collectMaterializedDocuments(input: IndexSearchDocumentsInput) {
|
|
const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];
|
|
const documents: MaterializedSearchDocument[] = [];
|
|
|
|
for (const sourceKind of sourceKinds) {
|
|
if (sourceKind === 'filing_document') {
|
|
documents.push(...await collectFilingDocuments(input.ticker ?? null, input.accessionNumber ?? null));
|
|
continue;
|
|
}
|
|
|
|
if (sourceKind === 'filing_brief') {
|
|
documents.push(...await collectFilingBriefs(input.ticker ?? null, input.accessionNumber ?? null));
|
|
continue;
|
|
}
|
|
|
|
if (sourceKind === 'research_note') {
|
|
documents.push(...await collectResearchNotes(
|
|
input.userId,
|
|
input.ticker ?? null,
|
|
input.journalEntryId ?? null
|
|
));
|
|
}
|
|
}
|
|
|
|
return documents;
|
|
}
|
|
|
|
/**
 * Persist one document and its chunks across the search tables:
 * search_document, search_chunk, search_chunk_fts, and search_chunk_vec.
 *
 * Idempotent on content: if an 'indexed' row with the same content hash
 * already exists, nothing is written. Otherwise any stale document is
 * cascade-deleted and all rows are rewritten inside a single transaction.
 *
 * NOTE(review): assumes embeddings.length === chunks.length — presumably
 * runAiEmbeddings returns one vector per input chunk; confirm upstream.
 */
function persistDocumentIndex(
  client: Database,
  document: MaterializedSearchDocument,
  chunks: SearchChunkRecord[],
  embeddings: number[][]
) {
  const now = new Date().toISOString();
  const contentHash = hashContent(document.contentText);
  const existing = queryOneDocument(client, document);

  // Skip: content unchanged and already cleanly indexed.
  if (existing && existing.content_hash === contentHash && existing.index_status === 'indexed') {
    return { indexed: false, skipped: true, chunkCount: 0 };
  }

  const documentId = withTransaction(client, () => {
    // Replace rather than update: drop the stale document and derived rows.
    if (existing) {
      deleteDocumentCascade(client, existing.id);
    }

    const inserted = client
      .query(`
        INSERT INTO search_document (
          source_kind,
          source_ref,
          scope,
          user_id,
          ticker,
          accession_number,
          title,
          content_text,
          content_hash,
          metadata,
          index_status,
          indexed_at,
          last_error,
          created_at,
          updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'indexed', ?, NULL, ?, ?)
        RETURNING id
      `)
      .get(
        document.sourceKind,
        document.sourceRef,
        document.scope,
        document.userId,
        document.ticker,
        document.accessionNumber,
        document.title,
        document.contentText,
        contentHash,
        JSON.stringify(document.metadata),
        now,
        existing?.created_at ?? now, // preserve original creation time on reindex
        now
      ) as { id: number };

    for (let index = 0; index < chunks.length; index += 1) {
      const chunk = chunks[index]!;
      const embedding = embeddings[index]!;

      // 1) Canonical chunk row.
      const insertedChunk = client
        .query(`
          INSERT INTO search_chunk (
            document_id,
            chunk_index,
            chunk_text,
            char_count,
            start_offset,
            end_offset,
            heading_path,
            citation_label,
            created_at
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
          RETURNING id
        `)
        .get(
          inserted.id,
          chunk.chunkIndex,
          chunk.chunkText,
          chunk.charCount,
          chunk.startOffset,
          chunk.endOffset,
          chunk.headingPath,
          chunk.citationLabel,
          now
        ) as { id: number };

      // 2) FTS row; rowid is pinned to the chunk id so deletes can target it.
      client
        .query(`
          INSERT INTO search_chunk_fts (
            rowid,
            chunk_text,
            citation_label,
            heading_path,
            chunk_id,
            document_id,
            chunk_index,
            scope,
            user_id,
            source_kind,
            ticker,
            accession_number,
            filing_date
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          chunk.chunkText,
          chunk.citationLabel,
          chunk.headingPath,
          insertedChunk.id,
          inserted.id,
          chunk.chunkIndex,
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate
        );

      // 3) Vector row; embedding stored as JSON text.
      client
        .query(`
          INSERT INTO search_chunk_vec (
            chunk_id,
            embedding,
            scope,
            user_id,
            source_kind,
            ticker,
            accession_number,
            filing_date,
            document_id,
            chunk_index,
            citation_label
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          JSON.stringify(embedding),
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate,
          inserted.id,
          chunk.chunkIndex,
          chunk.citationLabel
        );
    }

    return inserted.id;
  });

  return {
    indexed: true,
    skipped: false,
    chunkCount: chunks.length,
    documentId
  };
}
|
|
|
|
function deleteSourceRefs(client: Database, refs: DeleteSourceRef[]) {
|
|
let deleted = 0;
|
|
|
|
for (const ref of refs) {
|
|
const row = queryOneDocument(client, {
|
|
scope: ref.scope,
|
|
userId: ref.userId ?? null,
|
|
sourceKind: ref.sourceKind,
|
|
sourceRef: ref.sourceRef
|
|
});
|
|
|
|
if (!row) {
|
|
continue;
|
|
}
|
|
|
|
withTransaction(client, () => {
|
|
deleteDocumentCascade(client, row.id);
|
|
});
|
|
deleted += 1;
|
|
}
|
|
|
|
return deleted;
|
|
}
|
|
|
|
/**
 * (Re)index search documents for a user: collect source material, chunk it,
 * embed the chunks, and persist everything into the search tables. Progress
 * is reported through input.onStage. On a full-ticker run (no accession
 * number or journal entry id), documents that no longer exist upstream are
 * pruned from the index.
 */
export async function indexSearchDocuments(input: IndexSearchDocumentsInput) {
  const client = getSqliteClient();
  await input.onStage?.('collect', 'Collecting materialized search sources');
  const materialized = await collectMaterializedDocuments(input);
  const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];

  let indexed = 0;
  let skipped = 0;
  let deleted = 0;
  let chunksEmbedded = 0;

  // Explicit deletions requested by the caller happen before indexing.
  if (input.deleteSourceRefs && input.deleteSourceRefs.length > 0) {
    deleted += deleteSourceRefs(client, input.deleteSourceRefs);
  }

  for (const document of materialized) {
    await input.onStage?.('fetch', `Preparing ${document.sourceKind} ${document.sourceRef}`);
    const chunks = chunkDocument(document);
    if (chunks.length === 0) {
      continue;
    }

    await input.onStage?.('chunk', `Chunking ${document.sourceKind} ${document.sourceRef}`);
    await input.onStage?.('embed', `Embedding ${chunks.length} chunks for ${document.sourceRef}`);
    // NOTE(review): persistDocumentIndex assumes one embedding per chunk —
    // presumably runAiEmbeddings preserves input cardinality; confirm.
    const embeddings = await runAiEmbeddings(chunks.map((chunk) => chunk.chunkText));
    await input.onStage?.('persist', `Persisting indexed chunks for ${document.sourceRef}`);
    const result = persistDocumentIndex(client, document, chunks, embeddings);

    if (result.skipped) {
      skipped += 1;
      continue;
    }

    indexed += 1;
    chunksEmbedded += result.chunkCount;
  }

  // Full-ticker reindex: prune rows whose source no longer exists upstream.
  if (input.ticker && !input.accessionNumber && !input.journalEntryId) {
    for (const sourceKind of sourceKinds) {
      // Research notes are per-user; filing data is shared globally.
      const scope = sourceKind === 'research_note' ? 'user' : 'global';
      const expectedRefs = new Set(
        materialized
          .filter((document) => document.sourceKind === sourceKind)
          .map((document) => document.sourceRef)
      );
      const existingRows = listDocumentsForScope(client, {
        scope,
        userId: scope === 'user' ? input.userId : null,
        sourceKind,
        ticker: input.ticker
      });

      for (const row of existingRows) {
        if (expectedRefs.has(row.source_ref)) {
          continue;
        }

        withTransaction(client, () => {
          deleteDocumentCascade(client, row.id);
        });
        deleted += 1;
      }
    }
  }

  return {
    sourcesCollected: materialized.length,
    indexed,
    skipped,
    deleted,
    chunksEmbedded
  };
}
|
|
|
|
/**
 * Nearest-neighbor search over chunk embeddings. Uses the SQLite vector
 * extension's MATCH/k query when loaded; otherwise falls back to scanning the
 * candidate rows and computing cosine distance in JS. Returns at least 4
 * candidates (Math.max(limit, 4)) ordered by ascending distance.
 */
function vectorSearch(
  client: Database,
  input: {
    embedding: number[];
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  if (!__dbInternals.isVectorExtensionLoaded(client)) {
    // Fallback path: brute-force cosine distance over the filtered rows.
    const conditions = [
      'scope = ?',
      'source_kind = ?'
    ];
    const values: Array<string | null> = [input.scope, input.sourceKind];

    if (input.scope === 'user') {
      conditions.push('user_id = ?');
      // NOTE(review): binding SQL NULL here makes `user_id = NULL` match
      // nothing; other queries use ifnull(user_id, ''). Callers appear to
      // always pass a userId for 'user' scope — confirm.
      values.push(input.userId ?? null);
    }

    if (input.ticker) {
      conditions.push('ticker = ?');
      values.push(input.ticker);
    }

    const rows = client
      .query(`
        SELECT chunk_id, embedding
        FROM search_chunk_vec
        WHERE ${conditions.join(' AND ')}
      `)
      .all(...values) as Array<{ chunk_id: number; embedding: string }>;

    // `|| 1` guards against division by zero for all-zero vectors.
    const queryNorm = Math.hypot(...input.embedding) || 1;
    return rows
      .map((row) => {
        const candidate = JSON.parse(row.embedding) as number[];
        const dot = candidate.reduce((sum, value, index) => sum + (value * (input.embedding[index] ?? 0)), 0);
        const candidateNorm = Math.hypot(...candidate) || 1;
        const cosineDistance = 1 - (dot / (candidateNorm * queryNorm));

        return {
          chunk_id: row.chunk_id,
          distance: cosineDistance
        };
      })
      .sort((left, right) => left.distance - right.distance)
      .slice(0, Math.max(input.limit, 4));
  }

  // Extension path: MATCH runs the k-nearest-neighbor query in SQL.
  const conditions = [
    'embedding MATCH ?',
    'k = ?',
    'scope = ?',
    'source_kind = ?'
  ];
  const values: Array<string | number | null> = [
    JSON.stringify(input.embedding),
    Math.max(input.limit, 4),
    input.scope,
    input.sourceKind
  ];

  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    values.push(input.userId ?? null);
  }

  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }

  const rows = client
    .query(`
      SELECT chunk_id, distance
      FROM search_chunk_vec
      WHERE ${conditions.join(' AND ')}
      ORDER BY distance ASC
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{ chunk_id: number; distance: number }>;

  return rows;
}
|
|
|
|
/**
 * FTS5 keyword search over chunk text, ranked by bm25 (lower is better).
 * Returns up to Math.max(limit, 4) rows with an FTS-generated snippet
 * (matches bracketed, up to 18 tokens).
 */
function lexicalSearch(
  client: Database,
  input: {
    ftsQuery: string;
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  const conditions = [
    'search_chunk_fts MATCH ?',
    'source_kind = ?',
    'scope = ?'
  ];
  const values: Array<string | number | null> = [
    input.ftsQuery,
    input.sourceKind,
    input.scope
  ];

  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    // NOTE(review): SQL NULL here matches nothing; callers appear to always
    // pass a userId for 'user' scope — confirm.
    values.push(input.userId ?? null);
  }

  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }

  const rows = client
    .query(`
      SELECT
        chunk_id,
        bm25(search_chunk_fts) AS bm25,
        snippet(search_chunk_fts, 0, '[', ']', ' ... ', 18) AS snippet
      FROM search_chunk_fts
      WHERE ${conditions.join(' AND ')}
      ORDER BY bm25(search_chunk_fts)
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{
      chunk_id: number;
      bm25: number;
      snippet: string | null;
    }>;

  return rows;
}
|
|
|
|
/**
 * Load chunk + document rows for every matched chunk id, score them with RRF,
 * and return the top `limit` results, admitting at most
 * MAX_RESULTS_PER_DOCUMENT chunks from any single document.
 */
function hydrateResults(
  client: Database,
  query: string,
  matches: Map<number, SearchMatch>,
  limit: number
) {
  const chunkIds = [...matches.keys()];
  if (chunkIds.length === 0) {
    return [] satisfies SearchResult[];
  }

  const placeholders = createPlaceholders(chunkIds.length);
  const rows = client
    .query(`
      SELECT
        c.id AS chunk_id,
        c.document_id,
        c.chunk_text,
        c.heading_path,
        c.citation_label,
        d.source_kind,
        d.source_ref,
        d.title,
        d.ticker,
        d.accession_number,
        d.metadata
      FROM search_chunk c
      INNER JOIN search_document d ON d.id = c.document_id
      WHERE c.id IN (${placeholders})
    `)
    .all(...chunkIds) as SearchChunkJoinRow[];

  const dedupePerDocument = new Map<number, number>();
  const enriched = rows
    .map((row) => {
      const match = matches.get(row.chunk_id);
      if (!match) {
        return null;
      }

      return {
        chunkId: row.chunk_id,
        documentId: row.document_id,
        source: mapSourceKindToSearchSource(row.source_kind),
        sourceKind: row.source_kind,
        sourceRef: row.source_ref,
        title: row.title,
        ticker: row.ticker,
        accessionNumber: row.accession_number,
        // filingDate lives in the JSON metadata, not a dedicated column.
        filingDate: typeof row.metadata?.filingDate === 'string' ? row.metadata.filingDate : null,
        citationLabel: row.citation_label,
        headingPath: row.heading_path,
        chunkText: row.chunk_text,
        // Prefer the FTS-generated snippet; fall back to a manual excerpt.
        snippet: match.snippet ?? manualSnippet(row.chunk_text, query),
        score: scoreSearchMatch(match),
        vectorRank: match.vectorRank,
        lexicalRank: match.lexicalRank,
        href: buildSearchHref(row)
      } satisfies SearchResult;
    })
    .filter((row): row is SearchResult => Boolean(row))
    .sort((left, right) => right.score - left.score);

  const results: SearchResult[] = [];

  // Walk best-first, capping chunks per document so one filing can't dominate.
  for (const row of enriched) {
    const count = dedupePerDocument.get(row.documentId) ?? 0;
    if (count >= MAX_RESULTS_PER_DOCUMENT) {
      continue;
    }

    dedupePerDocument.set(row.documentId, count + 1);
    results.push(row);

    if (results.length >= limit) {
      break;
    }
  }

  return results;
}
|
|
|
|
/**
 * Hybrid search entry point: for each requested source, run a vector search
 * and an FTS5 lexical search, record each chunk's 1-based rank in both, fuse
 * via reciprocal rank fusion, and hydrate the winners into SearchResult rows.
 * Queries shorter than 2 characters return no results.
 */
export async function searchKnowledgeBase(input: SearchInput) {
  const normalizedQuery = input.query.trim();
  if (normalizedQuery.length < 2) {
    return [] satisfies SearchResult[];
  }

  const limit = clampLimit(input.limit);
  const normalizedTicker = normalizeTicker(input.ticker);
  const includedSources = normalizeSearchSources(input.sources);
  const client = getSqliteClient();
  // NOTE(review): queryEmbedding is undefined if runAiEmbeddings returns an
  // empty array; vectorSearch would then receive undefined — confirm upstream.
  const [queryEmbedding] = await runAiEmbeddings([normalizedQuery]);
  const ftsQuery = toFtsQuery(normalizedQuery);
  const matches = new Map<number, SearchMatch>();

  for (const source of includedSources) {
    const sourceKind = SOURCE_KIND_BY_SEARCH_SOURCE[source];
    // Research notes are per-user; filing data is shared globally.
    const scope = sourceKind === 'research_note' ? 'user' : 'global';
    const vectorRows = vectorSearch(client, {
      embedding: queryEmbedding,
      limit: limit * 3, // over-fetch so fusion + per-document dedupe still fill `limit`
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });

    // Record 1-based vector ranks; keep any rank already seen for the chunk.
    vectorRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? index + 1,
        lexicalRank: existing?.lexicalRank ?? null,
        snippet: existing?.snippet ?? null
      });
    });

    // Skip the lexical pass when the query yields no usable FTS tokens.
    if (!ftsQuery) {
      continue;
    }

    const lexicalRows = lexicalSearch(client, {
      ftsQuery,
      limit: limit * 3,
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });

    // Record 1-based lexical ranks and keep the first snippet seen.
    lexicalRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? null,
        lexicalRank: existing?.lexicalRank ?? index + 1,
        snippet: existing?.snippet ?? row.snippet ?? null
      });
    });
  }

  return hydrateResults(client, normalizedQuery, matches, limit);
}
function buildAnswerPrompt(query: string, evidence: SearchResult[]) {
|
|
const evidenceText = evidence.map((result, index) => {
|
|
const reference = index + 1;
|
|
return [
|
|
`[${reference}] ${result.citationLabel}`,
|
|
`Source: ${result.title ?? result.sourceRef}`,
|
|
`Ticker: ${result.ticker ?? 'n/a'}`,
|
|
`Excerpt: ${result.chunkText}`
|
|
].join('\n');
|
|
}).join('\n\n');
|
|
|
|
return [
|
|
'Answer the question using only the evidence below.',
|
|
'Every factual claim must include at least one citation like [1] or [2].',
|
|
'If the evidence is insufficient, respond with exactly INSUFFICIENT_EVIDENCE.',
|
|
`Question: ${query}`,
|
|
'',
|
|
'Evidence:',
|
|
evidenceText
|
|
].join('\n');
|
|
}
function finalizeAnswer(answer: string, evidence: SearchResult[]) {
|
|
const trimmed = answer.trim();
|
|
if (!trimmed || trimmed === 'INSUFFICIENT_EVIDENCE') {
|
|
return {
|
|
answer: 'Insufficient evidence to answer from the indexed sources.',
|
|
citations: [] satisfies SearchCitation[]
|
|
};
|
|
}
|
|
|
|
const matches = [...trimmed.matchAll(/\[(\d+)\]/g)];
|
|
const seen = new Set<number>();
|
|
const citations: SearchCitation[] = [];
|
|
|
|
for (const match of matches) {
|
|
const index = Number(match[1]);
|
|
if (!Number.isInteger(index) || seen.has(index) || index < 1 || index > evidence.length) {
|
|
continue;
|
|
}
|
|
|
|
seen.add(index);
|
|
const result = evidence[index - 1]!;
|
|
citations.push({
|
|
index,
|
|
label: result.citationLabel,
|
|
chunkId: result.chunkId,
|
|
href: result.href
|
|
});
|
|
}
|
|
|
|
if (citations.length === 0) {
|
|
return {
|
|
answer: 'Insufficient evidence to answer from the indexed sources.',
|
|
citations: [] satisfies SearchCitation[]
|
|
};
|
|
}
|
|
|
|
return {
|
|
answer: trimmed,
|
|
citations
|
|
};
|
|
}
export async function answerSearchQuery(input: SearchInput): Promise<SearchAnswerResponse> {
|
|
const results = await searchKnowledgeBase({
|
|
...input,
|
|
limit: clampLimit(input.limit)
|
|
});
|
|
|
|
if (results.length === 0) {
|
|
return {
|
|
answer: 'Insufficient evidence to answer from the indexed sources.',
|
|
citations: [],
|
|
results
|
|
};
|
|
}
|
|
|
|
const evidence: SearchResult[] = [];
|
|
let totalChars = 0;
|
|
|
|
for (const result of results) {
|
|
if (evidence.length >= MAX_CONTEXT_RESULTS) {
|
|
break;
|
|
}
|
|
|
|
if (totalChars + result.chunkText.length > MAX_CONTEXT_CHARS && evidence.length > 0) {
|
|
break;
|
|
}
|
|
|
|
evidence.push(result);
|
|
totalChars += result.chunkText.length;
|
|
}
|
|
|
|
const response = await runAiAnalysis(
|
|
buildAnswerPrompt(input.query, evidence),
|
|
'Use neutral analyst prose. Do not use outside knowledge.',
|
|
{
|
|
workload: 'report'
|
|
}
|
|
);
|
|
|
|
const finalized = finalizeAnswer(response.text, evidence);
|
|
|
|
return {
|
|
answer: finalized.answer,
|
|
citations: finalized.citations,
|
|
results
|
|
};
|
|
}
// Test-only surface: re-exports this module's private helpers so unit tests
// can exercise them directly without widening the public API. The double
// underscore mirrors the `__dbInternals` convention imported at the top of
// the file. Do not use from production code paths.
export const __searchInternals = {
  buildCitationLabel,
  buildFilingBriefContent,
  chunkDocument,
  chunkText,
  deleteSourceRefs,
  finalizeAnswer,
  hashContent,
  hydrateResults,
  lexicalSearch,
  normalizeSearchSources,
  persistDocumentIndex,
  queryOneDocument,
  scoreSearchMatch,
  stripMarkdown,
  toFtsQuery,
  vectorSearch
};