// Neon-Desk/lib/server/search.ts
import { createHash } from 'node:crypto';
import type { Database } from 'bun:sqlite';
import type {
ResearchJournalEntry,
SearchAnswerResponse,
SearchCitation,
SearchResult,
SearchSource,
TaskStageContext
} from '@/lib/types';
import { runAiAnalysis, runAiEmbeddings } from '@/lib/server/ai';
import { __dbInternals, getSqliteClient } from '@/lib/server/db';
import { fetchPrimaryFilingText } from '@/lib/server/sec';
import { getFilingByAccession, listFilingsRecords } from '@/lib/server/repos/filings';
import {
getResearchJournalEntryRecord,
listResearchJournalEntries,
listResearchJournalEntriesForUser
} from '@/lib/server/repos/research-journal';
/** Visibility of an indexed document: shared across all users or owned by one user. */
type SearchDocumentScope = 'global' | 'user';
/** Origin of an indexed document within the app's data model. */
type SearchDocumentSourceKind = 'filing_document' | 'filing_brief' | 'research_note';
/** A source record flattened to plain text, ready to be chunked and embedded. */
type MaterializedSearchDocument = {
  sourceKind: SearchDocumentSourceKind;
  // Stable identifier within the source kind (accession number or journal entry id).
  sourceRef: string;
  scope: SearchDocumentScope;
  // Owning user; null for globally scoped documents.
  userId: string | null;
  ticker: string | null;
  accessionNumber: string | null;
  filingDate: string | null;
  title: string | null;
  // Normalized plain text that gets chunked and embedded.
  contentText: string;
  // Free-form attributes persisted as JSON alongside the document.
  metadata: Record<string, unknown>;
};
/** One contiguous slice of a document's content, as stored in `search_chunk`. */
type SearchChunkRecord = {
  chunkIndex: number;
  chunkText: string;
  charCount: number;
  // Character offsets of the chunk within the normalized document text.
  startOffset: number;
  endOffset: number;
  // Nearest preceding heading-like line, when one could be inferred.
  headingPath: string | null;
  citationLabel: string;
};
/** Row shape of the `search_document` table. */
type SearchDocumentRow = {
  id: number;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  scope: SearchDocumentScope;
  user_id: string | null;
  ticker: string | null;
  accession_number: string | null;
  title: string | null;
  content_text: string;
  // SHA-256 of content_text; used to skip re-indexing unchanged documents.
  content_hash: string;
  metadata: Record<string, unknown> | null;
  index_status: 'pending' | 'indexed' | 'failed';
  indexed_at: string | null;
  last_error: string | null;
  created_at: string;
  updated_at: string;
};
/** Joined chunk + parent document row used when hydrating search results. */
type SearchChunkJoinRow = {
  chunk_id: number;
  document_id: number;
  chunk_text: string;
  heading_path: string | null;
  citation_label: string;
  source_kind: SearchDocumentSourceKind;
  source_ref: string;
  title: string | null;
  ticker: string | null;
  accession_number: string | null;
  metadata: Record<string, unknown> | null;
};
/** Identifies a single indexed document to remove during (re-)indexing. */
type DeleteSourceRef = {
  sourceKind: SearchDocumentSourceKind;
  sourceRef: string;
  scope: SearchDocumentScope;
  userId?: string | null;
};
/** Options accepted by indexSearchDocuments. */
type IndexSearchDocumentsInput = {
  userId: string;
  ticker?: string | null;
  accessionNumber?: string | null;
  journalEntryId?: number | null;
  // Restrict indexing to these kinds; defaults to all three.
  sourceKinds?: SearchDocumentSourceKind[];
  // Documents to delete before indexing (e.g. removed journal entries).
  deleteSourceRefs?: DeleteSourceRef[];
  // Optional progress callback invoked once per pipeline stage per document.
  onStage?: (
    stage: 'collect' | 'fetch' | 'chunk' | 'embed' | 'persist',
    detail: string,
    context?: TaskStageContext | null
  ) => Promise<void> | void;
};
/** Options accepted by searchKnowledgeBase / answerSearchQuery. */
type SearchInput = {
  userId: string;
  query: string;
  ticker?: string | null;
  sources?: SearchSource[];
  limit?: number;
};
/** Per-chunk rank bookkeeping feeding reciprocal-rank-fusion scoring. */
type SearchMatch = {
  chunkId: number;
  // 1-based rank in the vector result list, or null if not a vector hit.
  vectorRank: number | null;
  // 1-based rank in the lexical (FTS) result list, or null if not a lexical hit.
  lexicalRank: number | null;
  snippet: string | null;
};
// Reciprocal rank fusion constant: each hit contributes 1 / (RRF_K + rank).
const RRF_K = 60;
// Result-count bounds applied by clampLimit.
const SEARCH_RESULT_LIMIT_DEFAULT = 10;
const SEARCH_RESULT_LIMIT_MIN = 4;
const SEARCH_RESULT_LIMIT_MAX = 12;
// At most this many chunks from the same document appear in one result set.
const MAX_RESULTS_PER_DOCUMENT = 2;
// Caps on how much evidence is packed into the answer prompt.
const MAX_CONTEXT_RESULTS = 6;
const MAX_CONTEXT_CHARS = 8_000;
// Maps the public SearchSource names to internal document kinds.
const SOURCE_KIND_BY_SEARCH_SOURCE: Record<SearchSource, SearchDocumentSourceKind> = {
  documents: 'filing_document',
  filings: 'filing_brief',
  research: 'research_note'
};
/** Backslash-escapes the SQL LIKE wildcards `%` and `_` in a literal value. */
function escapeLike(value: string) {
  let escaped = '';
  for (const char of value) {
    escaped += char === '%' || char === '_' ? `\\${char}` : char;
  }
  return escaped;
}
/** Upper-cases and trims a ticker symbol; returns null for blank/missing input. */
function normalizeTicker(value: string | null | undefined) {
  const trimmed = (value ?? '').trim();
  if (trimmed.length === 0) {
    return null;
  }
  return trimmed.toUpperCase();
}
/**
 * Validates and de-duplicates the requested search sources, preserving the
 * caller's order. Falls back to all three sources when nothing valid remains.
 */
function normalizeSearchSources(sources?: SearchSource[]) {
  const fallback: SearchSource[] = ['documents', 'filings', 'research'];
  const valid = (sources ?? fallback).filter(
    (candidate): candidate is SearchSource =>
      candidate === 'documents' || candidate === 'filings' || candidate === 'research'
  );
  // Keep only the first occurrence of each source.
  const deduped = valid.filter((candidate, index) => valid.indexOf(candidate) === index);
  return deduped.length > 0 ? deduped : fallback;
}
/** Coerces a requested result limit to an integer within the allowed bounds. */
function clampLimit(limit?: number) {
  const base = Number.isFinite(limit)
    ? Math.trunc(Number(limit))
    : SEARCH_RESULT_LIMIT_DEFAULT;
  if (base < SEARCH_RESULT_LIMIT_MIN) {
    return SEARCH_RESULT_LIMIT_MIN;
  }
  if (base > SEARCH_RESULT_LIMIT_MAX) {
    return SEARCH_RESULT_LIMIT_MAX;
  }
  return base;
}
/** Returns the hex SHA-256 digest of the given content string. */
function hashContent(content: string) {
  const hasher = createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Removes common Markdown syntax (code fences, inline code, images, links,
 * blockquotes, headings, emphasis, list bullets) and normalizes newlines,
 * leaving plain text suitable for indexing.
 */
function stripMarkdown(markdown: string) {
  // Applied in order; order matters (e.g. images before links).
  const transforms: Array<[RegExp, string]> = [
    [/```[\s\S]*?```/g, ' '],
    [/`([^`]+)`/g, '$1'],
    [/!\[[^\]]*\]\([^)]+\)/g, ' '],
    [/\[([^\]]+)\]\([^)]+\)/g, '$1'],
    [/^>\s?/gm, ''],
    [/^#{1,6}\s+/gm, ''],
    [/[*_~]/g, ''],
    [/^\s*[-+]\s+/gm, ''],
    [/\r/g, '\n'],
    [/[ \t]+\n/g, '\n'],
    [/\n{3,}/g, '\n\n']
  ];
  let result = markdown;
  for (const [pattern, replacement] of transforms) {
    result = result.replace(pattern, replacement);
  }
  return result.trim();
}
/**
 * Collapses runs of spaces/tabs, trims whitespace around newlines, converts
 * carriage returns, and limits blank-line runs to a single blank line.
 */
function normalizeWhitespace(value: string) {
  const passes: Array<[RegExp, string]> = [
    [/\r/g, '\n'],
    [/[ \t]+\n/g, '\n'],
    [/\n[ \t]+/g, '\n'],
    [/[ \t]{2,}/g, ' '],
    [/\n{3,}/g, '\n\n']
  ];
  return passes
    .reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), value)
    .trim();
}
/**
 * Assembles the plain-text "filing brief" for a filing: header facts, key
 * metrics, AI summary (falling back to legacy insights), and any structured
 * extraction fields, joined with blank lines and whitespace-normalized.
 */
function buildFilingBriefContent(input: {
  ticker: string;
  companyName: string;
  accessionNumber: string;
  filingDate: string;
  filingType: string;
  metrics: Record<string, number | null> | null;
  analysis: Record<string, unknown> | null;
}) {
  const analysis = input.analysis;
  // Flatten analysis.extraction into "key: value" lines, dropping empty values.
  const structuredLines: string[] = [];
  const extractionValue = analysis?.extraction;
  if (extractionValue && typeof extractionValue === 'object') {
    for (const [key, value] of Object.entries(extractionValue as Record<string, unknown>)) {
      const rendered = Array.isArray(value) ? value.join(' | ') : String(value ?? '');
      const line = `${key}: ${rendered}`;
      if (!line.endsWith(': ')) {
        structuredLines.push(line);
      }
    }
  }
  // Prefer the current analysis text; fall back to the legacy insights field.
  let summary: string | null = null;
  if (typeof analysis?.text === 'string') {
    summary = analysis.text;
  } else if (typeof analysis?.legacyInsights === 'string') {
    summary = analysis.legacyInsights;
  }
  const sections: string[] = [
    `${input.companyName} (${input.ticker}) filing brief`,
    `Form: ${input.filingType}`,
    `Filed: ${input.filingDate}`,
    `Accession: ${input.accessionNumber}`
  ];
  if (input.metrics) {
    sections.push(`Key metrics: ${JSON.stringify(input.metrics)}`);
  }
  if (summary) {
    sections.push(`AI summary:\n${summary}`);
  }
  if (structuredLines.length > 0) {
    sections.push(`Structured extraction:\n${structuredLines.join('\n')}`);
  }
  return normalizeWhitespace(sections.join('\n\n'));
}
/**
 * Builds the human-readable citation label for a chunk: "<ticker> journal
 * note [n]" for research notes, otherwise "ticker · accession · date [n]"
 * with missing parts omitted. `n` is the 1-based chunk number.
 */
function buildCitationLabel(document: MaterializedSearchDocument, chunkIndex: number) {
  const ordinal = `[${chunkIndex + 1}]`;
  if (document.sourceKind === 'research_note') {
    return `${document.ticker ?? 'Research'} journal note ${ordinal}`;
  }
  const segments: string[] = [];
  if (document.ticker) {
    segments.push(document.ticker);
  }
  if (document.accessionNumber) {
    segments.push(document.accessionNumber);
  }
  if (document.filingDate) {
    segments.push(document.filingDate);
  }
  return `${segments.join(' · ')} ${ordinal}`;
}
/**
 * Scans up to 600 characters before `offset` for the closest line that looks
 * like a heading (ends with a colon, is ALL-CAPS, or is a numbered section
 * title). Returns the heading text, or null when none is found.
 */
function inferHeadingPath(text: string, offset: number) {
  const contextStart = Math.max(0, offset - 600);
  const candidates = text
    .slice(contextStart, offset)
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0)
    .reverse();
  for (const candidate of candidates) {
    if (candidate.length > 100) {
      continue;
    }
    const endsWithColon = /:$/.test(candidate);
    const allCaps = /^[A-Z0-9][A-Z0-9\s/&,-]{4,}$/.test(candidate);
    const numberedSection = /^\d+(\.\d+)*\s+[A-Z]/.test(candidate);
    if (endsWithColon || allCaps || numberedSection) {
      return candidate;
    }
  }
  return null;
}
/**
 * Splits normalized text into overlapping chunks of roughly `targetChars`
 * characters, preferring to cut at paragraph or sentence boundaries.
 *
 * When `maxSingleChunk` is positive and the whole text fits within it, the
 * entire text is emitted as a single chunk. Returns [] for blank input.
 */
function chunkText(
  text: string,
  options: { targetChars: number; overlapChars: number; maxSingleChunk?: number },
  document: MaterializedSearchDocument
) {
  const normalized = normalizeWhitespace(text);
  if (!normalized) {
    return [] satisfies SearchChunkRecord[];
  }
  const maxSingleChunk = options.maxSingleChunk ?? 0;
  if (maxSingleChunk > 0 && normalized.length <= maxSingleChunk) {
    // Short document: keep it whole rather than splitting.
    return [{
      chunkIndex: 0,
      chunkText: normalized,
      charCount: normalized.length,
      startOffset: 0,
      endOffset: normalized.length,
      headingPath: inferHeadingPath(normalized, 0),
      citationLabel: buildCitationLabel(document, 0)
    }];
  }
  const chunks: SearchChunkRecord[] = [];
  let start = 0;
  let chunkIndex = 0;
  while (start < normalized.length) {
    const tentativeEnd = Math.min(start + options.targetChars, normalized.length);
    let end = tentativeEnd;
    if (tentativeEnd < normalized.length) {
      // Look up to 180 chars past the target size for a natural break:
      // a paragraph break ('\n\n'), a sentence end ('. '), or a newline.
      const localWindow = normalized.slice(start, Math.min(normalized.length, tentativeEnd + 180));
      const paragraphBreak = localWindow.lastIndexOf('\n\n');
      const sentenceBreak = Math.max(localWindow.lastIndexOf('. '), localWindow.lastIndexOf('\n'));
      const boundary = Math.max(paragraphBreak, sentenceBreak);
      // Only accept the boundary if the chunk stays >= 55% of the target
      // size; otherwise cut hard at the tentative end.
      if (boundary >= options.targetChars * 0.55) {
        // Sentence/newline breaks keep the boundary character inside the
        // chunk (+1); a paragraph break cuts exactly at the blank line.
        end = start + boundary + (paragraphBreak === boundary ? 0 : 1);
      }
    }
    const chunkTextValue = normalized.slice(start, end).trim();
    if (chunkTextValue) {
      chunks.push({
        chunkIndex,
        chunkText: chunkTextValue,
        charCount: chunkTextValue.length,
        startOffset: start,
        endOffset: end,
        headingPath: inferHeadingPath(normalized, start),
        citationLabel: buildCitationLabel(document, chunkIndex)
      });
      chunkIndex += 1;
    }
    if (end >= normalized.length) {
      break;
    }
    // Step forward with overlap; the `start + 1` floor guarantees progress
    // even when the overlap would otherwise rewind past the previous start.
    start = Math.max(end - options.overlapChars, start + 1);
  }
  return chunks;
}
/**
 * Chunks a materialized document with sizing tuned per source kind: large
 * overlapping chunks for primary filings, smaller ones for briefs, and
 * single-chunk treatment for short research notes (<= 2000 chars).
 */
function chunkDocument(document: MaterializedSearchDocument) {
  if (document.sourceKind === 'filing_document') {
    return chunkText(document.contentText, { targetChars: 1400, overlapChars: 200 }, document);
  }
  if (document.sourceKind === 'filing_brief') {
    return chunkText(document.contentText, { targetChars: 1000, overlapChars: 100 }, document);
  }
  if (document.sourceKind === 'research_note') {
    return chunkText(
      document.contentText,
      { targetChars: 1000, overlapChars: 100, maxSingleChunk: 2000 },
      document
    );
  }
  return [];
}
/** Translates an internal document kind back to its public SearchSource name. */
function mapSourceKindToSearchSource(sourceKind: SearchDocumentSourceKind): SearchSource {
  const mapping: Partial<Record<SearchDocumentSourceKind, SearchSource>> = {
    filing_document: 'documents',
    filing_brief: 'filings',
    research_note: 'research'
  };
  // Unknown kinds default to 'documents', matching the original switch default.
  return mapping[sourceKind] ?? 'documents';
}
/**
 * Computes the in-app link for a search hit: journal deep-link for research
 * notes, the analysis report when one exists, otherwise the filings list
 * (ticker-filtered when possible).
 */
function buildSearchHref(row: SearchChunkJoinRow) {
  const ticker = row.ticker;
  if (row.source_kind === 'research_note') {
    const tickerParam = encodeURIComponent(ticker ?? '');
    return `/analysis?ticker=${tickerParam}&journalId=${encodeURIComponent(row.source_ref)}`;
  }
  const metadata = row.metadata as { hasAnalysis?: unknown } | null;
  if (metadata?.hasAnalysis && ticker && row.accession_number) {
    return `/analysis/reports/${encodeURIComponent(ticker)}/${encodeURIComponent(row.accession_number)}`;
  }
  return ticker ? `/filings?ticker=${encodeURIComponent(ticker)}` : '/filings';
}
/**
 * Fallback snippet builder: finds the first query token present in the text
 * and extracts a ~220-char window around it, with ellipses when truncated.
 * Falls back to the start of the text when no token matches.
 */
function manualSnippet(text: string, query: string) {
  const lowerText = text.toLowerCase();
  let anchor = 0;
  for (const token of query.toLowerCase().split(/\W+/)) {
    if (token.length <= 1) {
      continue;
    }
    const found = lowerText.indexOf(token);
    if (found >= 0) {
      anchor = found;
      break;
    }
  }
  const start = Math.max(0, anchor - 90);
  const end = Math.min(text.length, start + 220);
  const head = start > 0 ? '... ' : '';
  const tail = end < text.length ? ' ...' : '';
  return `${head}${text.slice(start, end).trim()}${tail}`.trim();
}
/**
 * Reciprocal-rank-fusion score for a chunk: each list it appears in (vector,
 * lexical) contributes 1 / (RRF_K + rank); missing ranks contribute nothing.
 */
function scoreSearchMatch(match: SearchMatch) {
  const ranks = [match.vectorRank, match.lexicalRank];
  return ranks.reduce<number>(
    (total, rank) => (rank === null ? total : total + 1 / (RRF_K + rank)),
    0
  );
}
/**
 * Converts free text into an FTS5 MATCH expression: lowercased tokens of
 * length > 1, each double-quoted, joined with AND. Returns null when the
 * query yields no usable tokens.
 */
function toFtsQuery(query: string) {
  const terms: string[] = [];
  for (const raw of query.trim().split(/\W+/)) {
    const token = raw.trim().toLowerCase();
    if (token.length > 1) {
      // Quote each token; embedded quotes are doubled per FTS5 string rules.
      terms.push(`"${token.replace(/"/g, '""')}"`);
    }
  }
  return terms.length > 0 ? terms.join(' AND ') : null;
}
/** Builds a comma-separated list of `?` SQL placeholders of the given length. */
function createPlaceholders(length: number) {
  return Array.from({ length }, () => '?').join(', ');
}
/**
 * Fetches the single `search_document` row matching the (scope, user, kind,
 * ref) identity, or null when none exists.
 *
 * `ifnull(user_id, '')` lets one query serve both global rows (user_id NULL,
 * matched against '') and user-scoped rows (matched against the actual id).
 */
function queryOneDocument(
  client: Database,
  input: Pick<MaterializedSearchDocument, 'scope' | 'userId' | 'sourceKind' | 'sourceRef'>
) {
  return client
    .query(`
      SELECT *
      FROM search_document
      WHERE scope = ?
        AND ifnull(user_id, '') = ?
        AND source_kind = ?
        AND source_ref = ?
      LIMIT 1
    `)
    .get(
      input.scope,
      input.userId ?? '',
      input.sourceKind,
      input.sourceRef
    ) as SearchDocumentRow | null;
}
/**
 * Lists `search_document` rows for one (scope, user, source kind), optionally
 * restricted to a ticker. Used to find stale rows to prune after re-indexing.
 */
function listDocumentsForScope(client: Database, input: {
  scope: SearchDocumentScope;
  userId?: string | null;
  sourceKind: SearchDocumentSourceKind;
  ticker?: string | null;
}) {
  // Base filters always apply; the ticker filter is appended only when given.
  const conditions = [
    'scope = ?',
    "ifnull(user_id, '') = ?",
    'source_kind = ?'
  ];
  const values: Array<string | null> = [input.scope, input.userId ?? '', input.sourceKind];
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  return client
    .query(`
      SELECT *
      FROM search_document
      WHERE ${conditions.join(' AND ')}
    `)
    .all(...values) as SearchDocumentRow[];
}
/**
 * Removes a document and all of its derived rows: per-chunk vector and FTS
 * entries first, then the chunks, then the document itself. Callers are
 * expected to wrap this in a transaction.
 */
function deleteDocumentCascade(client: Database, documentId: number) {
  const chunkRows = client
    .query('SELECT id FROM search_chunk WHERE document_id = ?')
    .all(documentId) as Array<{ id: number }>;
  const chunkIds = chunkRows.map((row) => row.id);
  for (const chunkId of chunkIds) {
    client.query('DELETE FROM search_chunk_vec WHERE chunk_id = ?').run(chunkId);
    client.query('DELETE FROM search_chunk_fts WHERE chunk_id = ?').run(chunkId);
  }
  client.query('DELETE FROM search_chunk WHERE document_id = ?').run(documentId);
  client.query('DELETE FROM search_document WHERE id = ?').run(documentId);
}
/**
 * Runs `fn` inside a BEGIN IMMEDIATE transaction, committing on success and
 * rolling back (then rethrowing) on any error.
 */
function withTransaction<T>(client: Database, fn: () => T) {
  client.exec('BEGIN IMMEDIATE');
  let result: T;
  try {
    result = fn();
  } catch (error) {
    client.exec('ROLLBACK');
    throw error;
  }
  client.exec('COMMIT');
  return result;
}
/**
 * Materializes primary filing documents for indexing. When an accession
 * number is given, only that filing is considered; otherwise up to 250 recent
 * filings (optionally restricted by ticker) are fetched. Filings whose primary
 * text cannot be retrieved are skipped silently (best-effort).
 */
async function collectFilingDocuments(ticker?: string | null, accessionNumber?: string | null) {
  // Note: the original wrapped the single-accession lookup in
  // Promise.all([...]).then(...); a plain await of the one promise is clearer.
  const filings = accessionNumber
    ? [await getFilingByAccession(accessionNumber)]
    : await listFilingsRecords({
        ticker: ticker ?? undefined,
        limit: 250
      });
  const documents: MaterializedSearchDocument[] = [];
  for (const filing of filings) {
    if (!filing) {
      continue;
    }
    // Fetch the primary document text; errors degrade to "skip this filing".
    const filingText = await fetchPrimaryFilingText({
      filingUrl: filing.filing_url,
      cik: filing.cik,
      accessionNumber: filing.accession_number,
      primaryDocument: filing.primary_document ?? null
    }).catch(() => null);
    if (!filingText?.text) {
      continue;
    }
    documents.push({
      sourceKind: 'filing_document',
      sourceRef: filing.accession_number,
      scope: 'global',
      userId: null,
      ticker: filing.ticker,
      accessionNumber: filing.accession_number,
      filingDate: filing.filing_date,
      title: `${filing.ticker} ${filing.filing_type} primary filing`,
      contentText: filingText.text,
      metadata: {
        filingType: filing.filing_type,
        filingDate: filing.filing_date,
        filingUrl: filing.filing_url,
        submissionUrl: filing.submission_url ?? null,
        primaryDocument: filing.primary_document ?? null,
        // Lets buildSearchHref deep-link to the analysis report when present.
        hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
      }
    });
  }
  return documents;
}
/**
 * Materializes compact "filing brief" documents (header facts + metrics + AI
 * summary) for indexing. Scoped to one accession number when given, otherwise
 * up to 250 recent filings, optionally restricted by ticker.
 */
async function collectFilingBriefs(ticker?: string | null, accessionNumber?: string | null) {
  // Note: the original wrapped the single-accession lookup in
  // Promise.all([...]).then(...); a plain await of the one promise is clearer.
  const filings = accessionNumber
    ? [await getFilingByAccession(accessionNumber)]
    : await listFilingsRecords({
        ticker: ticker ?? undefined,
        limit: 250
      });
  return filings
    .filter((filing): filing is NonNullable<typeof filing> => Boolean(filing))
    .map((filing) => ({
      sourceKind: 'filing_brief' as const,
      sourceRef: filing.accession_number,
      scope: 'global' as const,
      userId: null,
      ticker: filing.ticker,
      accessionNumber: filing.accession_number,
      filingDate: filing.filing_date,
      title: `${filing.ticker} ${filing.filing_type} filing brief`,
      contentText: buildFilingBriefContent({
        ticker: filing.ticker,
        companyName: filing.company_name,
        accessionNumber: filing.accession_number,
        filingDate: filing.filing_date,
        filingType: filing.filing_type,
        metrics: filing.metrics,
        analysis: filing.analysis as Record<string, unknown> | null
      }),
      metadata: {
        filingType: filing.filing_type,
        filingDate: filing.filing_date,
        hasAnalysis: Boolean(filing.analysis?.text || filing.analysis?.legacyInsights)
      }
    }));
}
/**
 * Converts a journal entry into a user-scoped search document by stripping
 * its Markdown body to plain text. Returns null when nothing remains.
 */
function materializeResearchNote(entry: ResearchJournalEntry): MaterializedSearchDocument | null {
  const contentText = normalizeWhitespace(stripMarkdown(entry.body_markdown));
  if (contentText.length === 0) {
    return null;
  }
  return {
    sourceKind: 'research_note',
    sourceRef: `${entry.id}`,
    scope: 'user',
    userId: entry.user_id,
    ticker: entry.ticker,
    accessionNumber: entry.accession_number,
    filingDate: null,
    title: entry.title ?? `${entry.ticker} research note`,
    contentText,
    metadata: {
      entryType: entry.entry_type,
      createdAt: entry.created_at,
      updatedAt: entry.updated_at
    }
  };
}
/**
 * Materializes research notes for indexing: a single entry when
 * `journalEntryId` is given, otherwise up to 250 entries for the ticker
 * (or all of the user's entries). Empty notes are dropped.
 */
async function collectResearchNotes(userId: string, ticker?: string | null, journalEntryId?: number | null) {
  if (journalEntryId) {
    const record = await getResearchJournalEntryRecord(userId, journalEntryId);
    if (!record) {
      return [];
    }
    const document = materializeResearchNote(record);
    return document ? [document] : [];
  }
  const records = ticker
    ? await listResearchJournalEntries(userId, ticker, 250)
    : await listResearchJournalEntriesForUser(userId, 250);
  const documents: MaterializedSearchDocument[] = [];
  for (const record of records) {
    const document = materializeResearchNote(record);
    if (document) {
      documents.push(document);
    }
  }
  return documents;
}
/**
 * Gathers all documents to index for the requested source kinds (defaulting
 * to every kind), delegating to the per-kind collectors.
 */
async function collectMaterializedDocuments(input: IndexSearchDocumentsInput) {
  const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];
  const documents: MaterializedSearchDocument[] = [];
  for (const sourceKind of sourceKinds) {
    switch (sourceKind) {
      case 'filing_document':
        documents.push(...await collectFilingDocuments(input.ticker ?? null, input.accessionNumber ?? null));
        break;
      case 'filing_brief':
        documents.push(...await collectFilingBriefs(input.ticker ?? null, input.accessionNumber ?? null));
        break;
      case 'research_note':
        documents.push(...await collectResearchNotes(
          input.userId,
          input.ticker ?? null,
          input.journalEntryId ?? null
        ));
        break;
      default:
        break;
    }
  }
  return documents;
}
/**
 * Writes one document and its chunks (plus FTS and vector rows) to the index
 * inside a single transaction.
 *
 * If an identical document (same content hash, status 'indexed') already
 * exists, the write is skipped entirely. Otherwise any previous version is
 * cascaded away and the document is re-inserted; the original created_at is
 * preserved across re-indexing.
 *
 * `chunks` and `embeddings` are expected to be parallel arrays (one embedding
 * per chunk), as produced by the caller.
 */
function persistDocumentIndex(
  client: Database,
  document: MaterializedSearchDocument,
  chunks: SearchChunkRecord[],
  embeddings: number[][]
) {
  const now = new Date().toISOString();
  const contentHash = hashContent(document.contentText);
  const existing = queryOneDocument(client, document);
  // Unchanged content already indexed: nothing to do.
  if (existing && existing.content_hash === contentHash && existing.index_status === 'indexed') {
    return { indexed: false, skipped: true, chunkCount: 0 };
  }
  const documentId = withTransaction(client, () => {
    if (existing) {
      // Replace-in-place: drop the old document and all derived rows first.
      deleteDocumentCascade(client, existing.id);
    }
    const inserted = client
      .query(`
        INSERT INTO search_document (
          source_kind,
          source_ref,
          scope,
          user_id,
          ticker,
          accession_number,
          title,
          content_text,
          content_hash,
          metadata,
          index_status,
          indexed_at,
          last_error,
          created_at,
          updated_at
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'indexed', ?, NULL, ?, ?)
        RETURNING id
      `)
      .get(
        document.sourceKind,
        document.sourceRef,
        document.scope,
        document.userId,
        document.ticker,
        document.accessionNumber,
        document.title,
        document.contentText,
        contentHash,
        JSON.stringify(document.metadata),
        now,
        // Keep the original creation time when re-indexing an existing doc.
        existing?.created_at ?? now,
        now
      ) as { id: number };
    for (let index = 0; index < chunks.length; index += 1) {
      const chunk = chunks[index]!;
      const embedding = embeddings[index]!;
      const insertedChunk = client
        .query(`
          INSERT INTO search_chunk (
            document_id,
            chunk_index,
            chunk_text,
            char_count,
            start_offset,
            end_offset,
            heading_path,
            citation_label,
            created_at
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
          RETURNING id
        `)
        .get(
          inserted.id,
          chunk.chunkIndex,
          chunk.chunkText,
          chunk.charCount,
          chunk.startOffset,
          chunk.endOffset,
          chunk.headingPath,
          chunk.citationLabel,
          now
        ) as { id: number };
      // FTS row: rowid is pinned to the chunk id so lexical hits map back
      // directly; the scope/user/ticker columns enable filtered MATCH queries.
      client
        .query(`
          INSERT INTO search_chunk_fts (
            rowid,
            chunk_text,
            citation_label,
            heading_path,
            chunk_id,
            document_id,
            chunk_index,
            scope,
            user_id,
            source_kind,
            ticker,
            accession_number,
            filing_date
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          chunk.chunkText,
          chunk.citationLabel,
          chunk.headingPath,
          insertedChunk.id,
          inserted.id,
          chunk.chunkIndex,
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate
        );
      // Vector row: embedding serialized as JSON, matching the fallback
      // reader in vectorSearch.
      client
        .query(`
          INSERT INTO search_chunk_vec (
            chunk_id,
            embedding,
            scope,
            user_id,
            source_kind,
            ticker,
            accession_number,
            filing_date,
            document_id,
            chunk_index,
            citation_label
          ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `)
        .run(
          insertedChunk.id,
          JSON.stringify(embedding),
          document.scope,
          document.userId,
          document.sourceKind,
          document.ticker,
          document.accessionNumber,
          document.filingDate,
          inserted.id,
          chunk.chunkIndex,
          chunk.citationLabel
        );
    }
    return inserted.id;
  });
  return {
    indexed: true,
    skipped: false,
    chunkCount: chunks.length,
    documentId
  };
}
/**
 * Deletes each referenced document (and its derived rows) when it exists,
 * one transaction per document. Returns how many documents were removed.
 */
function deleteSourceRefs(client: Database, refs: DeleteSourceRef[]) {
  let removed = 0;
  for (const ref of refs) {
    const existing = queryOneDocument(client, {
      scope: ref.scope,
      userId: ref.userId ?? null,
      sourceKind: ref.sourceKind,
      sourceRef: ref.sourceRef
    });
    if (existing) {
      withTransaction(client, () => deleteDocumentCascade(client, existing.id));
      removed += 1;
    }
  }
  return removed;
}
/**
 * Full indexing pipeline: collect source records, chunk them, embed the
 * chunks, and persist everything, reporting progress through `input.onStage`.
 *
 * Also handles cleanup: explicit `deleteSourceRefs` are removed up front, and
 * when indexing a whole ticker (no accession/journal pin) any previously
 * indexed documents for that ticker that were not re-collected are pruned as
 * stale. Returns counters summarizing the run.
 */
export async function indexSearchDocuments(input: IndexSearchDocumentsInput) {
  const client = getSqliteClient();
  await input.onStage?.('collect', 'Collecting materialized search sources');
  const materialized = await collectMaterializedDocuments(input);
  const sourceKinds = input.sourceKinds ?? ['filing_document', 'filing_brief', 'research_note'];
  let indexed = 0;
  let skipped = 0;
  let deleted = 0;
  let chunksEmbedded = 0;
  const totalDocuments = materialized.length;
  // Builds a progress context snapshot; reads the mutable counters above via
  // closure so later calls reflect up-to-date totals.
  const stageContext = (current: number, subject?: TaskStageContext['subject'] | null): TaskStageContext => ({
    progress: {
      current,
      total: totalDocuments || 1,
      unit: 'sources'
    },
    counters: {
      sourcesCollected: totalDocuments,
      indexed,
      skipped,
      deleted,
      chunksEmbedded
    },
    subject: subject ?? (input.ticker ? { ticker: input.ticker } : input.accessionNumber ? { accessionNumber: input.accessionNumber } : null)
  });
  // Explicit deletions (e.g. removed journal entries) happen before indexing.
  if (input.deleteSourceRefs && input.deleteSourceRefs.length > 0) {
    deleted += deleteSourceRefs(client, input.deleteSourceRefs);
  }
  await input.onStage?.(
    'collect',
    `Collected ${materialized.length} source records for search indexing`,
    {
      counters: {
        sourcesCollected: materialized.length,
        deleted
      },
      subject: input.ticker ? { ticker: input.ticker } : input.accessionNumber ? { accessionNumber: input.accessionNumber } : null
    }
  );
  // Process documents sequentially: chunk, embed, persist.
  for (let index = 0; index < materialized.length; index += 1) {
    const document = materialized[index];
    await input.onStage?.(
      'fetch',
      `Preparing ${document.sourceKind} ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    const chunks = chunkDocument(document);
    // Nothing chunkable (e.g. empty content): skip without counting.
    if (chunks.length === 0) {
      continue;
    }
    await input.onStage?.(
      'chunk',
      `Chunking ${document.sourceKind} ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    await input.onStage?.(
      'embed',
      `Embedding ${chunks.length} chunks for ${document.sourceRef}`,
      {
        ...stageContext(index + 1, {
          ticker: document.ticker ?? undefined,
          accessionNumber: document.accessionNumber ?? undefined,
          label: document.sourceRef
        }),
        counters: {
          sourcesCollected: totalDocuments,
          indexed,
          skipped,
          deleted,
          chunksEmbedded
        }
      }
    );
    const embeddings = await runAiEmbeddings(chunks.map((chunk) => chunk.chunkText));
    await input.onStage?.(
      'persist',
      `Persisting indexed chunks for ${document.sourceRef}`,
      stageContext(index + 1, {
        ticker: document.ticker ?? undefined,
        accessionNumber: document.accessionNumber ?? undefined,
        label: document.sourceRef
      })
    );
    const result = persistDocumentIndex(client, document, chunks, embeddings);
    if (result.skipped) {
      skipped += 1;
      continue;
    }
    indexed += 1;
    chunksEmbedded += result.chunkCount;
  }
  // Whole-ticker runs prune previously indexed documents that no longer exist
  // in the collected set (stale rows for deleted/renamed sources).
  if (input.ticker && !input.accessionNumber && !input.journalEntryId) {
    for (const sourceKind of sourceKinds) {
      // Research notes are user-scoped; filing data is global.
      const scope = sourceKind === 'research_note' ? 'user' : 'global';
      const expectedRefs = new Set(
        materialized
          .filter((document) => document.sourceKind === sourceKind)
          .map((document) => document.sourceRef)
      );
      const existingRows = listDocumentsForScope(client, {
        scope,
        userId: scope === 'user' ? input.userId : null,
        sourceKind,
        ticker: input.ticker
      });
      for (const row of existingRows) {
        if (expectedRefs.has(row.source_ref)) {
          continue;
        }
        withTransaction(client, () => {
          deleteDocumentCascade(client, row.id);
        });
        deleted += 1;
      }
    }
  }
  return {
    sourcesCollected: materialized.length,
    indexed,
    skipped,
    deleted,
    chunksEmbedded
  };
}
/**
 * K-nearest-neighbour search over `search_chunk_vec`, filtered by scope,
 * source kind, and (optionally) user and ticker.
 *
 * Two code paths:
 *  - Vector extension loaded: a native `embedding MATCH ?` KNN query with a
 *    `k = ?` bound, ordered by distance.
 *  - Fallback: embeddings are stored as JSON, so all candidate rows are read
 *    and cosine distance is computed in JS, then sorted and truncated.
 * Both return rows of { chunk_id, distance? } ordered nearest-first.
 */
function vectorSearch(
  client: Database,
  input: {
    embedding: number[];
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  if (!__dbInternals.isVectorExtensionLoaded(client)) {
    // Fallback path: brute-force cosine distance over JSON embeddings.
    const conditions = [
      'scope = ?',
      'source_kind = ?'
    ];
    const values: Array<string | null> = [input.scope, input.sourceKind];
    if (input.scope === 'user') {
      conditions.push('user_id = ?');
      values.push(input.userId ?? null);
    }
    if (input.ticker) {
      conditions.push('ticker = ?');
      values.push(input.ticker);
    }
    const rows = client
      .query(`
        SELECT chunk_id, embedding
        FROM search_chunk_vec
        WHERE ${conditions.join(' AND ')}
      `)
      .all(...values) as Array<{ chunk_id: number; embedding: string }>;
    // Guard against zero-norm vectors with `|| 1` to avoid division by zero.
    const queryNorm = Math.hypot(...input.embedding) || 1;
    return rows
      .map((row) => {
        const candidate = JSON.parse(row.embedding) as number[];
        const dot = candidate.reduce((sum, value, index) => sum + (value * (input.embedding[index] ?? 0)), 0);
        const candidateNorm = Math.hypot(...candidate) || 1;
        const cosineDistance = 1 - (dot / (candidateNorm * queryNorm));
        return {
          chunk_id: row.chunk_id,
          distance: cosineDistance
        };
      })
      .sort((left, right) => left.distance - right.distance)
      // Enforce a floor of 4 candidates, mirroring the native path's k bound.
      .slice(0, Math.max(input.limit, 4));
  }
  // Native path: KNN via the vector extension's MATCH + k constraints.
  const conditions = [
    'embedding MATCH ?',
    'k = ?',
    'scope = ?',
    'source_kind = ?'
  ];
  const values: Array<string | number | null> = [
    JSON.stringify(input.embedding),
    Math.max(input.limit, 4),
    input.scope,
    input.sourceKind
  ];
  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    values.push(input.userId ?? null);
  }
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  // The same cap is bound twice: once as the KNN `k` and once as LIMIT.
  const rows = client
    .query(`
      SELECT chunk_id, distance
      FROM search_chunk_vec
      WHERE ${conditions.join(' AND ')}
      ORDER BY distance ASC
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{ chunk_id: number; distance: number }>;
  return rows;
}
/**
 * Full-text (FTS5) search over `search_chunk_fts`, filtered by source kind,
 * scope, and optionally user and ticker. Rows are ranked by bm25 (lower is
 * better) and include a pre-built `[...]`-highlighted snippet.
 */
function lexicalSearch(
  client: Database,
  input: {
    ftsQuery: string;
    limit: number;
    sourceKind: SearchDocumentSourceKind;
    scope: SearchDocumentScope;
    userId?: string | null;
    ticker?: string | null;
  }
) {
  const conditions = [
    'search_chunk_fts MATCH ?',
    'source_kind = ?',
    'scope = ?'
  ];
  const values: Array<string | number | null> = [
    input.ftsQuery,
    input.sourceKind,
    input.scope
  ];
  if (input.scope === 'user') {
    conditions.push('user_id = ?');
    values.push(input.userId ?? null);
  }
  if (input.ticker) {
    conditions.push('ticker = ?');
    values.push(input.ticker);
  }
  // snippet(): column 0 (chunk_text), '['/']' highlight markers, ' ... '
  // ellipsis, up to 18 tokens. A floor of 4 results mirrors vectorSearch.
  const rows = client
    .query(`
      SELECT
        chunk_id,
        bm25(search_chunk_fts) AS bm25,
        snippet(search_chunk_fts, 0, '[', ']', ' ... ', 18) AS snippet
      FROM search_chunk_fts
      WHERE ${conditions.join(' AND ')}
      ORDER BY bm25(search_chunk_fts)
      LIMIT ?
    `)
    .all(...values, Math.max(input.limit, 4)) as Array<{
      chunk_id: number;
      bm25: number;
      snippet: string | null;
    }>;
  return rows;
}
/**
 * Loads full chunk + document rows for the matched chunk ids, builds scored
 * SearchResult objects, sorts them by fused score, and truncates to `limit`
 * while allowing at most MAX_RESULTS_PER_DOCUMENT hits per document.
 */
function hydrateResults(
  client: Database,
  query: string,
  matches: Map<number, SearchMatch>,
  limit: number
) {
  const chunkIds = [...matches.keys()];
  if (chunkIds.length === 0) {
    return [] satisfies SearchResult[];
  }
  const placeholders = createPlaceholders(chunkIds.length);
  const rows = client
    .query(`
      SELECT
        c.id AS chunk_id,
        c.document_id,
        c.chunk_text,
        c.heading_path,
        c.citation_label,
        d.source_kind,
        d.source_ref,
        d.title,
        d.ticker,
        d.accession_number,
        d.metadata
      FROM search_chunk c
      INNER JOIN search_document d ON d.id = c.document_id
      WHERE c.id IN (${placeholders})
    `)
    .all(...chunkIds) as SearchChunkJoinRow[];
  // Tracks how many results each document has contributed so far.
  const dedupePerDocument = new Map<number, number>();
  const enriched = rows
    .map((row) => {
      const match = matches.get(row.chunk_id);
      if (!match) {
        return null;
      }
      return {
        chunkId: row.chunk_id,
        documentId: row.document_id,
        source: mapSourceKindToSearchSource(row.source_kind),
        sourceKind: row.source_kind,
        sourceRef: row.source_ref,
        title: row.title,
        ticker: row.ticker,
        accessionNumber: row.accession_number,
        // filingDate lives in the JSON metadata, not a dedicated column.
        filingDate: typeof row.metadata?.filingDate === 'string' ? row.metadata.filingDate : null,
        citationLabel: row.citation_label,
        headingPath: row.heading_path,
        chunkText: row.chunk_text,
        // Prefer the FTS-generated snippet; fall back to a manual excerpt.
        snippet: match.snippet ?? manualSnippet(row.chunk_text, query),
        score: scoreSearchMatch(match),
        vectorRank: match.vectorRank,
        lexicalRank: match.lexicalRank,
        href: buildSearchHref(row)
      } satisfies SearchResult;
    })
    .filter((row): row is SearchResult => Boolean(row))
    .sort((left, right) => right.score - left.score);
  // Take results in score order, capping per-document contributions.
  const results: SearchResult[] = [];
  for (const row of enriched) {
    const count = dedupePerDocument.get(row.documentId) ?? 0;
    if (count >= MAX_RESULTS_PER_DOCUMENT) {
      continue;
    }
    dedupePerDocument.set(row.documentId, count + 1);
    results.push(row);
    if (results.length >= limit) {
      break;
    }
  }
  return results;
}
/**
 * Hybrid search entry point: for each requested source, runs a vector KNN
 * query and a lexical FTS query, merges per-chunk ranks into a single match
 * map, and hydrates the fused, deduplicated result list.
 *
 * Queries shorter than 2 characters return no results. Research notes are
 * searched in the caller's user scope; filing data is global.
 */
export async function searchKnowledgeBase(input: SearchInput) {
  const normalizedQuery = input.query.trim();
  if (normalizedQuery.length < 2) {
    return [] satisfies SearchResult[];
  }
  const limit = clampLimit(input.limit);
  const normalizedTicker = normalizeTicker(input.ticker);
  const includedSources = normalizeSearchSources(input.sources);
  const client = getSqliteClient();
  const [queryEmbedding] = await runAiEmbeddings([normalizedQuery]);
  const ftsQuery = toFtsQuery(normalizedQuery);
  const matches = new Map<number, SearchMatch>();
  for (const source of includedSources) {
    const sourceKind = SOURCE_KIND_BY_SEARCH_SOURCE[source];
    const scope = sourceKind === 'research_note' ? 'user' : 'global';
    // Over-fetch (3x limit) per retriever so fusion has candidates to rank.
    const vectorRows = vectorSearch(client, {
      embedding: queryEmbedding,
      limit: limit * 3,
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });
    // Record 1-based vector ranks; keep any rank already set for this chunk.
    vectorRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? index + 1,
        lexicalRank: existing?.lexicalRank ?? null,
        snippet: existing?.snippet ?? null
      });
    });
    // Skip lexical search when the query produced no usable FTS tokens.
    if (!ftsQuery) {
      continue;
    }
    const lexicalRows = lexicalSearch(client, {
      ftsQuery,
      limit: limit * 3,
      sourceKind,
      scope,
      userId: scope === 'user' ? input.userId : null,
      ticker: normalizedTicker
    });
    // Merge lexical ranks and snippets into the same match entries.
    lexicalRows.forEach((row, index) => {
      const existing = matches.get(row.chunk_id);
      matches.set(row.chunk_id, {
        chunkId: row.chunk_id,
        vectorRank: existing?.vectorRank ?? null,
        lexicalRank: existing?.lexicalRank ?? index + 1,
        snippet: existing?.snippet ?? row.snippet ?? null
      });
    });
  }
  return hydrateResults(client, normalizedQuery, matches, limit);
}
/**
 * Builds the grounded-answer prompt: instructions requiring citations, the
 * question, and numbered evidence blocks ([1], [2], ...) whose numbers the
 * model must cite.
 */
function buildAnswerPrompt(query: string, evidence: SearchResult[]) {
  const evidenceBlocks = evidence.map((result, index) =>
    [
      `[${index + 1}] ${result.citationLabel}`,
      `Source: ${result.title ?? result.sourceRef}`,
      `Ticker: ${result.ticker ?? 'n/a'}`,
      `Excerpt: ${result.chunkText}`
    ].join('\n')
  );
  const lines = [
    'Answer the question using only the evidence below.',
    'Every factual claim must include at least one citation like [1] or [2].',
    'If the evidence is insufficient, respond with exactly INSUFFICIENT_EVIDENCE.',
    `Question: ${query}`,
    '',
    'Evidence:',
    evidenceBlocks.join('\n\n')
  ];
  return lines.join('\n');
}
/**
 * Validates a model answer: extracts [n] citation markers, keeps the first
 * occurrence of each in-range reference, and maps them to evidence. Returns
 * the insufficient-evidence response when the answer is empty, is the
 * INSUFFICIENT_EVIDENCE sentinel, or carries no valid citations.
 */
function finalizeAnswer(answer: string, evidence: SearchResult[]) {
  const insufficient = () => ({
    answer: 'Insufficient evidence to answer from the indexed sources.',
    citations: [] satisfies SearchCitation[]
  });
  const trimmed = answer.trim();
  if (trimmed.length === 0 || trimmed === 'INSUFFICIENT_EVIDENCE') {
    return insufficient();
  }
  const citations: SearchCitation[] = [];
  const used = new Set<number>();
  for (const match of trimmed.matchAll(/\[(\d+)\]/g)) {
    const reference = Number(match[1]);
    const inRange = Number.isInteger(reference)
      && reference >= 1
      && reference <= evidence.length;
    if (!inRange || used.has(reference)) {
      continue;
    }
    used.add(reference);
    const source = evidence[reference - 1]!;
    citations.push({
      index: reference,
      label: source.citationLabel,
      chunkId: source.chunkId,
      href: source.href
    });
  }
  if (citations.length === 0) {
    return insufficient();
  }
  return {
    answer: trimmed,
    citations
  };
}
/**
 * RAG answer endpoint: retrieves results, selects a bounded evidence window
 * (at most MAX_CONTEXT_RESULTS chunks and MAX_CONTEXT_CHARS characters, with
 * at least one chunk), asks the model for a cited answer, and validates the
 * citations before returning.
 */
export async function answerSearchQuery(input: SearchInput): Promise<SearchAnswerResponse> {
  const results = await searchKnowledgeBase({
    ...input,
    limit: clampLimit(input.limit)
  });
  if (results.length === 0) {
    return {
      answer: 'Insufficient evidence to answer from the indexed sources.',
      citations: [],
      results
    };
  }
  // Greedily pack evidence against a character budget; the first chunk is
  // always admitted even when it alone exceeds the budget.
  const evidence: SearchResult[] = [];
  let remainingChars = MAX_CONTEXT_CHARS;
  for (const result of results) {
    if (evidence.length >= MAX_CONTEXT_RESULTS) {
      break;
    }
    if (evidence.length > 0 && result.chunkText.length > remainingChars) {
      break;
    }
    evidence.push(result);
    remainingChars -= result.chunkText.length;
  }
  const aiResponse = await runAiAnalysis(
    buildAnswerPrompt(input.query, evidence),
    'Use neutral analyst prose. Do not use outside knowledge.',
    {
      workload: 'report'
    }
  );
  const finalized = finalizeAnswer(aiResponse.text, evidence);
  return {
    answer: finalized.answer,
    citations: finalized.citations,
    results
  };
}
// Internal helpers exported solely for unit testing; not part of the module's
// public API surface.
export const __searchInternals = {
  buildCitationLabel,
  buildFilingBriefContent,
  chunkDocument,
  chunkText,
  deleteSourceRefs,
  finalizeAnswer,
  hashContent,
  hydrateResults,
  lexicalSearch,
  normalizeSearchSources,
  persistDocumentIndex,
  queryOneDocument,
  scoreSearchMatch,
  stripMarkdown,
  toFtsQuery,
  vectorSearch
};