import { load } from 'cheerio';
import type { FinancialStatementPeriod, StructuredKpiRow } from '@/lib/types';
import { resolvePrimaryFilingUrl } from '@/lib/server/sec';
import type { KpiDefinition } from '@/lib/server/financials/kpi-registry';

/** Fields needed to resolve and download a filing's primary document. */
type FilingDocumentRef = {
  filingId: number;
  cik: string;
  accessionNumber: string;
  filingUrl: string | null;
  primaryDocument: string | null;
};

/**
 * Parses the text of a table cell into a number.
 *
 * `$`, `,` and `%` are stripped before parsing. A value that starts with an
 * opening parenthesis, e.g. `(1,234)` or a split cell such as `(1,234`, is
 * treated as negative, following the accounting convention for negatives.
 *
 * @param value Raw cell text.
 * @returns The parsed number, or `null` when the cell is empty or not a finite number.
 */
function parseNumericCell(value: string): number | null {
  const withoutSymbols = value.replace(/[$,%]/g, '').trim();
  const isParenthesized = withoutSymbols.startsWith('(');
  const normalized = withoutSymbols.replace(/[()]/g, '').trim();
  if (!normalized) {
    return null;
  }

  const numeric = Number(normalized);
  if (!Number.isFinite(numeric)) {
    return null;
  }

  // `numeric !== 0` avoids producing -0; Math.abs keeps "(-5)" negative
  // instead of flipping it back to positive.
  return isParenthesized && numeric !== 0 ? -Math.abs(numeric) : numeric;
}

/**
 * Builds the map key for a row extracted from a note table.
 *
 * @param definition KPI definition the row matched.
 * @param label Row label; pass an empty string to get the definition's bare key.
 * @returns `<definition.key>__note__<slug>` when the label yields a non-empty
 *   slug (lower-cased, non-alphanumeric runs collapsed to `_`), else `definition.key`.
 */
function buildRowKey(definition: KpiDefinition, label: string): string {
  const normalized = label
    .trim()
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '_')
    .replace(/^_+|_+$/g, '');

  return normalized ? `${definition.key}__note__${normalized}` : definition.key;
}

/**
 * Downloads the primary document of a filing as text.
 *
 * This is deliberately best-effort: an unresolvable URL, a non-OK response,
 * or any thrown error yields `null` rather than an exception.
 *
 * @param ref Filing whose document should be fetched.
 * @returns The response body, or `null` on any failure.
 */
async function fetchHtml(ref: FilingDocumentRef): Promise<string | null> {
  const url = resolvePrimaryFilingUrl({
    filingUrl: ref.filingUrl,
    cik: ref.cik,
    accessionNumber: ref.accessionNumber,
    primaryDocument: ref.primaryDocument
  });
  if (!url) {
    return null;
  }

  try {
    const response = await fetch(url, {
      // NOTE(review): the fallback value carries no contact details; confirm
      // SEC_USER_AGENT is configured wherever this runs.
      headers: { 'User-Agent': process.env.SEC_USER_AGENT || 'Fiscal Clone ' },
      cache: 'no-store'
    });
    if (!response.ok) {
      return null;
    }

    return await response.text();
  } catch {
    return null;
  }
}

/**
 * Extracts the non-empty cell texts of every table row in an HTML document.
 *
 * Whitespace inside each cell is collapsed to single spaces, empty cells are
 * dropped, and rows left with fewer than two cells (no label + value pair)
 * are omitted.
 *
 * @param html Document markup.
 * @returns One string array per retained row, in document order.
 */
function extractTableRows(html: string): string[][] {
  const $ = load(html);
  const tableRows: string[][] = [];

  for (const element of $('table tr').toArray()) {
    const cells = $(element)
      .find('th,td')
      .toArray()
      .map((node) => $(node).text().replace(/\s+/g, ' ').trim())
      .filter(Boolean);

    if (cells.length >= 2) {
      tableRows.push(cells);
    }
  }

  return tableRows;
}

/**
 * Scans the HTML tables of each period's filing for rows whose label contains
 * one of a definition's `noteLabelIncludes` tokens (case-insensitive) and
 * collects the first numeric cell of each matching row.
 *
 * Definitions without `noteLabelIncludes` tokens are skipped, as are periods
 * whose filing is not in `input.filings` or whose document cannot be fetched.
 * Each filing is fetched and parsed at most once per call, including failed
 * fetches, which are not retried.
 *
 * @param input.ticker Not read by this function.
 * @param input.periods Periods to populate; each is matched to a filing by `filingId`.
 * @param input.filings Filing references used to locate the documents.
 * @param input.definitions KPI definitions to look for.
 * @returns Rows keyed by definition and label, with one value per period id,
 *   sorted by label.
 */
export async function extractStructuredKpisFromNotes(input: {
  ticker: string;
  periods: FinancialStatementPeriod[];
  filings: FilingDocumentRef[];
  definitions: KpiDefinition[];
}): Promise<StructuredKpiRow[]> {
  const rows = new Map<string, StructuredKpiRow>();

  // Table rows per filing id; `null` records a fetch that produced no HTML so
  // the same filing is not requested again for every definition.
  const tableRowsByFilingId = new Map<number, string[][] | null>();
  const loadTableRows = async (filing: FilingDocumentRef): Promise<string[][] | null> => {
    const cached = tableRowsByFilingId.get(filing.filingId);
    if (cached !== undefined) {
      return cached;
    }

    const html = await fetchHtml(filing);
    const tableRows = html ? extractTableRows(html) : null;
    tableRowsByFilingId.set(filing.filingId, tableRows);
    return tableRows;
  };

  for (const definition of input.definitions) {
    if (!definition.noteLabelIncludes || definition.noteLabelIncludes.length === 0) {
      continue;
    }

    // Lower-case the tokens once per definition instead of once per table row.
    const tokens = definition.noteLabelIncludes.map((token: string) => token.toLowerCase());

    for (const period of input.periods) {
      const filing = input.filings.find((entry) => entry.filingId === period.filingId);
      if (!filing) {
        continue;
      }

      const tableRows = await loadTableRows(filing);
      if (!tableRows) {
        continue;
      }

      for (const cells of tableRows) {
        const label = cells[0] ?? '';
        const normalizedLabel = label.toLowerCase();
        if (!tokens.some((token: string) => normalizedLabel.includes(token))) {
          continue;
        }

        const numericCell = cells.slice(1).map(parseNumericCell).find((value) => value !== null) ?? null;
        if (numericCell === null) {
          continue;
        }

        // A label identical to the definition's own label maps to the bare definition key.
        const key = buildRowKey(definition, label === definition.label ? '' : label);
        const existing = rows.get(key);
        if (existing) {
          // Later matches for the same key and period overwrite earlier ones.
          existing.values[period.id] = numericCell;
          continue;
        }

        rows.set(key, {
          key,
          label: label || definition.label,
          category: definition.category,
          unit: definition.unit,
          order: 500,
          segment: null,
          axis: null,
          member: null,
          values: { [period.id]: numericCell },
          sourceConcepts: [],
          sourceFactIds: [],
          provenanceType: 'structured_note',
          hasDimensions: false
        });
      }
    }
  }

  return [...rows.values()].sort((left, right) => left.label.localeCompare(right.label));
}