From 8e62c66677970e3ba4965535e13cae7e031feae9 Mon Sep 17 00:00:00 2001 From: francy51 Date: Fri, 6 Mar 2026 14:40:43 -0500 Subject: [PATCH] Run playwright UI tests --- app/financials/page.tsx | 264 ++++--- .../notifications/task-stage-helpers.ts | 14 +- drizzle/0005_financial_taxonomy_v3.sql | 124 ++++ drizzle/meta/_journal.json | 7 + hooks/use-link-prefetch.ts | 1 - hooks/use-task-notifications-center.ts | 2 +- lib/api.ts | 13 +- lib/query/keys.ts | 6 +- lib/query/options.ts | 18 +- lib/server/api/app.ts | 42 +- .../api/task-workflow-hybrid.e2e.test.ts | 3 +- lib/server/db/schema.ts | 183 +++++ lib/server/financial-statements.test.ts | 130 ---- lib/server/financial-statements.ts | 317 +------- lib/server/financial-taxonomy.test.ts | 142 ++++ lib/server/financial-taxonomy.ts | 384 ++++++++++ lib/server/repos/filing-statements.ts | 1 + lib/server/repos/filing-taxonomy.ts | 676 ++++++++++++++++++ lib/server/repos/filings.ts | 16 + lib/server/sec.ts | 1 + lib/server/task-processors.ts | 120 ++-- lib/server/taxonomy/asset-discovery.test.ts | 73 ++ lib/server/taxonomy/asset-discovery.ts | 283 ++++++++ lib/server/taxonomy/engine.ts | 185 +++++ lib/server/taxonomy/linkbase-parser.test.ts | 63 ++ lib/server/taxonomy/linkbase-parser.ts | 310 ++++++++ lib/server/taxonomy/materialize.ts | 374 ++++++++++ lib/server/taxonomy/metrics.test.ts | 55 ++ lib/server/taxonomy/metrics.ts | 106 +++ lib/server/taxonomy/pdf-validation.test.ts | 49 ++ lib/server/taxonomy/pdf-validation.ts | 336 +++++++++ lib/server/taxonomy/types.ts | 136 ++++ lib/server/taxonomy/xbrl-parser.test.ts | 60 ++ lib/server/taxonomy/xbrl-parser.ts | 264 +++++++ lib/types.ts | 87 ++- package.json | 1 + scripts/backfill-taxonomy-snapshots.ts | 227 ++++++ 37 files changed, 4430 insertions(+), 643 deletions(-) create mode 100644 drizzle/0005_financial_taxonomy_v3.sql create mode 100644 lib/server/financial-taxonomy.test.ts create mode 100644 lib/server/financial-taxonomy.ts create mode 100644 
lib/server/repos/filing-taxonomy.ts create mode 100644 lib/server/taxonomy/asset-discovery.test.ts create mode 100644 lib/server/taxonomy/asset-discovery.ts create mode 100644 lib/server/taxonomy/engine.ts create mode 100644 lib/server/taxonomy/linkbase-parser.test.ts create mode 100644 lib/server/taxonomy/linkbase-parser.ts create mode 100644 lib/server/taxonomy/materialize.ts create mode 100644 lib/server/taxonomy/metrics.test.ts create mode 100644 lib/server/taxonomy/metrics.ts create mode 100644 lib/server/taxonomy/pdf-validation.test.ts create mode 100644 lib/server/taxonomy/pdf-validation.ts create mode 100644 lib/server/taxonomy/types.ts create mode 100644 lib/server/taxonomy/xbrl-parser.test.ts create mode 100644 lib/server/taxonomy/xbrl-parser.ts create mode 100644 scripts/backfill-taxonomy-snapshots.ts diff --git a/app/financials/page.tsx b/app/financials/page.tsx index 632ded4..96f7666 100644 --- a/app/financials/page.tsx +++ b/app/financials/page.tsx @@ -17,7 +17,7 @@ import { XAxis, YAxis } from 'recharts'; -import { ChartNoAxesCombined, ChevronDown, Download, RefreshCcw, Search } from 'lucide-react'; +import { AlertTriangle, ChartNoAxesCombined, ChevronDown, Download, RefreshCcw, Search } from 'lucide-react'; import { AppShell } from '@/components/shell/app-shell'; import { MetricCard } from '@/components/dashboard/metric-card'; import { @@ -41,11 +41,9 @@ import { companyFinancialStatementsQueryOptions } from '@/lib/query/options'; import type { CompanyFinancialStatementsResponse, DimensionBreakdownRow, - FilingFaithfulStatementRow, FinancialHistoryWindow, FinancialStatementKind, - FinancialStatementMode, - StandardizedStatementRow + TaxonomyStatementRow } from '@/lib/types'; type LoadOptions = { @@ -59,11 +57,6 @@ const FINANCIAL_VALUE_SCALE_OPTIONS: Array<{ value: NumberScaleUnit; label: stri { value: 'billions', label: 'Billions (B)' } ]; -const MODE_OPTIONS: Array<{ value: FinancialStatementMode; label: string }> = [ - { value: 'standardized', 
label: 'Standardized' }, - { value: 'filing_faithful', label: 'Filing-faithful' } -]; - const STATEMENT_OPTIONS: Array<{ value: FinancialStatementKind; label: string }> = [ { value: 'income', label: 'Income' }, { value: 'balance', label: 'Balance Sheet' }, @@ -85,6 +78,7 @@ const CHART_TOOLTIP_BORDER = 'rgba(123, 255, 217, 0.45)'; type OverviewPoint = { periodId: string; filingDate: string; + periodEnd: string | null; label: string; revenue: number | null; netIncome: number | null; @@ -136,22 +130,6 @@ function ratioPercent(numerator: number | null, denominator: number | null) { return (numerator / denominator) * 100; } -function toStandardizedRows(data: CompanyFinancialStatementsResponse | null) { - if (!data || data.mode !== 'standardized') { - return []; - } - - return data.rows as StandardizedStatementRow[]; -} - -function toFilingFaithfulRows(data: CompanyFinancialStatementsResponse | null) { - if (!data || data.mode !== 'filing_faithful') { - return []; - } - - return data.rows as FilingFaithfulStatementRow[]; -} - function rowValue(row: { values: Record }, periodId: string) { return periodId in row.values ? 
row.values[periodId] : null; } @@ -168,30 +146,39 @@ function mergeFinancialPages( .filter((period, index, list) => list.findIndex((item) => item.id === period.id) === index) .sort((a, b) => Date.parse(a.filingDate) - Date.parse(b.filingDate)); - const rowMap = new Map(); + const rowMap = new Map(); for (const row of [...base.rows, ...next.rows]) { const existing = rowMap.get(row.key); if (!existing) { rowMap.set(row.key, { ...row, - values: { ...row.values } + values: { ...row.values }, + units: { ...row.units }, + sourceFactIds: [...row.sourceFactIds] }); continue; } existing.hasDimensions = existing.hasDimensions || row.hasDimensions; + existing.order = Math.min(existing.order, row.order); + existing.depth = Math.min(existing.depth, row.depth); + for (const [periodId, value] of Object.entries(row.values)) { if (!(periodId in existing.values)) { existing.values[periodId] = value; } } - if ('sourceConcepts' in existing && 'sourceConcepts' in row) { - for (const concept of row.sourceConcepts) { - if (!existing.sourceConcepts.includes(concept)) { - existing.sourceConcepts.push(concept); - } + for (const [periodId, unit] of Object.entries(row.units)) { + if (!(periodId in existing.units)) { + existing.units[periodId] = unit; + } + } + + for (const factId of row.sourceFactIds) { + if (!existing.sourceFactIds.includes(factId)) { + existing.sourceFactIds.push(factId); } } } @@ -220,21 +207,19 @@ function mergeFinancialPages( return Object.fromEntries(map.entries()); })(); - const mergedRows = [...rowMap.values()]; - return { ...next, periods, - rows: next.mode === 'standardized' - ? mergedRows as StandardizedStatementRow[] - : mergedRows as FilingFaithfulStatementRow[], + rows: [...rowMap.values()], nextCursor: next.nextCursor, coverage: { + ...next.coverage, filings: periods.length, rows: rowMap.size, dimensions: dimensionBreakdown ? 
Object.values(dimensionBreakdown).reduce((total, rows) => total + rows.length, 0) - : 0 + : 0, + facts: next.coverage.facts }, dataSourceStatus: { ...next.dataSourceStatus, @@ -244,20 +229,20 @@ function mergeFinancialPages( }; } -function findStandardizedRowValue( - data: CompanyFinancialStatementsResponse | null, - preferredKey: string, - fallbackIncludes: string[] -) { - const rows = toStandardizedRows(data); - const exact = rows.find((row) => row.key === preferredKey); +function findRowByLocalNames(rows: TaxonomyStatementRow[], localNames: string[]) { + const exact = rows.find((row) => localNames.some((name) => row.localName.toLowerCase() === name.toLowerCase())); if (exact) { return exact; } + const exactLabel = rows.find((row) => localNames.some((name) => row.label.toLowerCase() === name.toLowerCase())); + if (exactLabel) { + return exactLabel; + } + return rows.find((row) => { - const haystack = `${row.key} ${row.label} ${row.concept}`.toLowerCase(); - return fallbackIncludes.some((needle) => haystack.includes(needle)); + const haystack = `${row.key} ${row.label} ${row.localName} ${row.qname}`.toLowerCase(); + return localNames.some((name) => haystack.includes(name.toLowerCase())); }) ?? null; } @@ -265,28 +250,49 @@ function buildOverviewSeries( incomeData: CompanyFinancialStatementsResponse | null, balanceData: CompanyFinancialStatementsResponse | null ): OverviewPoint[] { - const periodMap = new Map(); + const periodMap = new Map(); for (const source of [incomeData, balanceData]) { for (const period of source?.periods ?? 
[]) { - periodMap.set(period.id, { filingDate: period.filingDate }); + periodMap.set(period.id, { + filingDate: period.filingDate, + periodEnd: period.periodEnd + }); } } const periods = [...periodMap.entries()] - .map(([periodId, data]) => ({ periodId, filingDate: data.filingDate })) - .sort((a, b) => Date.parse(a.filingDate) - Date.parse(b.filingDate)); + .map(([periodId, data]) => ({ + periodId, + filingDate: data.filingDate, + periodEnd: data.periodEnd + })) + .sort((a, b) => Date.parse(a.periodEnd ?? a.filingDate) - Date.parse(b.periodEnd ?? b.filingDate)); - const revenueRow = findStandardizedRowValue(incomeData, 'revenue', ['revenue', 'sales']); - const netIncomeRow = findStandardizedRowValue(incomeData, 'net-income', ['net income', 'profit']); - const assetsRow = findStandardizedRowValue(balanceData, 'total-assets', ['total assets']); - const cashRow = findStandardizedRowValue(balanceData, 'cash-and-equivalents', ['cash']); - const debtRow = findStandardizedRowValue(balanceData, 'total-debt', ['debt', 'borrowings']); + const incomeRows = incomeData?.rows ?? []; + const balanceRows = balanceData?.rows ?? []; + + const revenueRow = findRowByLocalNames(incomeRows, [ + 'RevenueFromContractWithCustomerExcludingAssessedTax', + 'Revenues', + 'SalesRevenueNet', + 'Revenue' + ]); + const netIncomeRow = findRowByLocalNames(incomeRows, ['NetIncomeLoss', 'ProfitLoss']); + const assetsRow = findRowByLocalNames(balanceRows, ['Assets']); + const cashRow = findRowByLocalNames(balanceRows, [ + 'CashAndCashEquivalentsAtCarryingValue', + 'CashCashEquivalentsAndShortTermInvestments', + 'CashAndShortTermInvestments', + 'Cash' + ]); + const debtRow = findRowByLocalNames(balanceRows, ['LongTermDebt', 'Debt', 'LongTermDebtNoncurrent']); return periods.map((period) => ({ periodId: period.periodId, filingDate: period.filingDate, - label: formatShortDate(period.filingDate), + periodEnd: period.periodEnd, + label: formatShortDate(period.periodEnd ?? 
period.filingDate), revenue: revenueRow ? rowValue(revenueRow, period.periodId) : null, netIncome: netIncomeRow ? rowValue(netIncomeRow, period.periodId) : null, totalAssets: assetsRow ? rowValue(assetsRow, period.periodId) : null, @@ -311,7 +317,6 @@ function FinancialsPageContent() { const [tickerInput, setTickerInput] = useState('MSFT'); const [ticker, setTicker] = useState('MSFT'); - const [mode, setMode] = useState('standardized'); const [statement, setStatement] = useState('income'); const [window, setWindow] = useState('10y'); const [valueScale, setValueScale] = useState('millions'); @@ -344,18 +349,18 @@ function FinancialsPageContent() { const [incomeResponse, balanceResponse] = await Promise.all([ queryClient.ensureQueryData(companyFinancialStatementsQueryOptions({ ticker: symbol, - mode: 'standardized', statement: 'income', window: selectedWindow, includeDimensions: false, + includeFacts: false, limit: selectedWindow === 'all' ? 120 : 80 })), queryClient.ensureQueryData(companyFinancialStatementsQueryOptions({ ticker: symbol, - mode: 'standardized', statement: 'balance', window: selectedWindow, includeDimensions: false, + includeFacts: false, limit: selectedWindow === 'all' ? 120 : 80 })) ]); @@ -380,10 +385,10 @@ function FinancialsPageContent() { try { const response = await queryClient.ensureQueryData(companyFinancialStatementsQueryOptions({ ticker: normalizedTicker, - mode, statement, window, includeDimensions, + includeFacts: false, cursor: nextCursor, limit: window === 'all' ? 
60 : 80 })); @@ -408,7 +413,6 @@ function FinancialsPageContent() { } }, [ queryClient, - mode, statement, window, dimensionsEnabled, @@ -429,7 +433,7 @@ function FinancialsPageContent() { await queueFilingSync({ ticker: targetTicker, limit: 20 }); void queryClient.invalidateQueries({ queryKey: queryKeys.recentTasks(20) }); void queryClient.invalidateQueries({ queryKey: ['filings'] }); - void queryClient.invalidateQueries({ queryKey: ['financials-v2'] }); + void queryClient.invalidateQueries({ queryKey: ['financials-v3'] }); await loadFinancials(targetTicker); } catch (err) { setError(err instanceof Error ? err.message : `Failed to queue financial sync for ${targetTicker}`); @@ -442,25 +446,30 @@ function FinancialsPageContent() { if (!isPending && isAuthenticated) { void loadFinancials(ticker); } - }, [isPending, isAuthenticated, ticker, mode, statement, window, dimensionsEnabled, loadFinancials]); + }, [isPending, isAuthenticated, ticker, statement, window, dimensionsEnabled, loadFinancials]); const periods = useMemo(() => { return [...(financials?.periods ?? [])] .sort((a, b) => Date.parse(a.filingDate) - Date.parse(b.filingDate)); }, [financials?.periods]); - const standardizedRows = useMemo(() => toStandardizedRows(financials), [financials]); - const filingFaithfulRows = useMemo(() => toFilingFaithfulRows(financials), [financials]); - - const statementRows = mode === 'standardized' - ? standardizedRows - : filingFaithfulRows; + const statementRows = useMemo(() => financials?.rows ?? [], [financials?.rows]); const overviewSeries = useMemo(() => { return buildOverviewSeries(overviewIncome, overviewBalance); }, [overviewIncome, overviewBalance]); const latestOverview = overviewSeries[overviewSeries.length - 1] ?? null; + const latestTaxonomyMetrics = financials?.metrics.taxonomy + ?? overviewIncome?.metrics.taxonomy + ?? overviewBalance?.metrics.taxonomy + ?? null; + const latestRevenue = latestOverview?.revenue ?? latestTaxonomyMetrics?.revenue ?? 
null; + const latestNetIncome = latestOverview?.netIncome ?? latestTaxonomyMetrics?.netIncome ?? null; + const latestTotalAssets = latestOverview?.totalAssets ?? latestTaxonomyMetrics?.totalAssets ?? null; + const latestCash = latestOverview?.cash ?? latestTaxonomyMetrics?.cash ?? null; + const latestDebt = latestOverview?.debt ?? latestTaxonomyMetrics?.debt ?? null; + const latestReferenceDate = latestOverview?.filingDate ?? periods[periods.length - 1]?.filingDate ?? null; const selectedRow = useMemo(() => { if (!selectedRowKey) { @@ -480,13 +489,11 @@ function FinancialsPageContent() { return direct; } - if ('concept' in selectedRow && selectedRow.concept) { - const conceptKey = selectedRow.concept.toLowerCase(); - for (const rows of Object.values(financials.dimensionBreakdown)) { - const matched = rows.filter((row) => (row.concept ?? '').toLowerCase() === conceptKey); - if (matched.length > 0) { - return matched; - } + const conceptKey = selectedRow.qname.toLowerCase(); + for (const rows of Object.values(financials.dimensionBreakdown)) { + const matched = rows.filter((row) => (row.concept ?? '').toLowerCase() === conceptKey); + if (matched.length > 0) { + return matched; } } @@ -498,16 +505,6 @@ function FinancialsPageContent() { }, [valueScale]); const controlSections = useMemo(() => [ - { - id: 'mode', - label: 'Mode', - value: mode, - options: MODE_OPTIONS, - onChange: (nextValue) => { - setMode(nextValue as FinancialStatementMode); - setSelectedRowKey(null); - } - }, { id: 'statement', label: 'Statement', @@ -535,7 +532,7 @@ function FinancialsPageContent() { options: FINANCIAL_VALUE_SCALE_OPTIONS, onChange: (nextValue) => setValueScale(nextValue as NumberScaleUnit) } - ], [mode, statement, window, valueScale]); + ], [statement, window, valueScale]); const controlActions = useMemo(() => { const actions: FinancialControlAction[] = []; @@ -581,7 +578,7 @@ function FinancialsPageContent() { return ( @@ -651,42 +648,42 @@ function FinancialsPageContent() {
= 0} + value={asDisplayCurrency(latestNetIncome, valueScale)} + delta={latestReferenceDate ? `Net margin ${formatPercent(ratioPercent(latestNetIncome, latestRevenue) ?? 0)}` : 'No history'} + positive={(latestNetIncome ?? 0) >= 0} />
- + {loading ? (

Loading overview chart...

) : overviewSeries.length === 0 ? ( -

No standardized income history available yet.

+

No income history available yet.

) : (
@@ -714,11 +711,11 @@ function FinancialsPageContent() { )} - + {loading ? (

Loading balance chart...

) : overviewSeries.length === 0 ? ( -

No standardized balance history available yet.

+

No balance history available yet.

) : (
@@ -750,7 +747,7 @@ function FinancialsPageContent() { {loading ? (

Loading statement matrix...

@@ -785,17 +782,11 @@ function FinancialsPageContent() { }} > - {'depth' in row ? ( -
- {row.label} - {row.hasDimensions ? : null} -
- ) : ( -
- {row.label} - {row.hasDimensions ? : null} -
- )} +
+ {row.label} + {row.isExtension ? Ext : null} + {row.hasDimensions ? : null} +
{periods.map((period) => ( @@ -850,7 +841,44 @@ function FinancialsPageContent() {
{financials ? ( - + +
+ + Overall status: {financials.metrics.validation?.status ?? 'not_run'} +
+ {(financials.metrics.validation?.checks.length ?? 0) === 0 ? ( +

No validation checks available yet.

+ ) : ( +
+ + + + + + + + + + + + {financials.metrics.validation?.checks.map((check) => ( + + + + + + + + ))} + +
MetricTaxonomyLLM (PDF)StatusPages
{check.metricKey}{asDisplayCurrency(check.taxonomyValue, valueScale)}{asDisplayCurrency(check.llmValue, valueScale)}{check.status}{check.evidencePages.join(', ') || 'n/a'}
+
+ )} +
+ ) : null} + + {financials ? ( +

Hydrated

@@ -879,7 +907,7 @@ function FinancialsPageContent() {
- Financial Statements V2: standardized + filing-faithful history + Financial Statements V3: taxonomy + PDF LLM validation
diff --git a/components/notifications/task-stage-helpers.ts b/components/notifications/task-stage-helpers.ts index b026351..5c834b3 100644 --- a/components/notifications/task-stage-helpers.ts +++ b/components/notifications/task-stage-helpers.ts @@ -21,6 +21,12 @@ const STAGE_LABELS: Record = { completed: 'Completed', failed: 'Failed', 'sync.fetch_filings': 'Fetch filings', + 'sync.discover_assets': 'Discover taxonomy assets', + 'sync.extract_taxonomy': 'Extract taxonomy', + 'sync.normalize_taxonomy': 'Normalize taxonomy', + 'sync.derive_metrics': 'Derive metrics', + 'sync.validate_pdf_metrics': 'Validate PDF metrics', + 'sync.persist_taxonomy': 'Persist taxonomy', 'sync.fetch_metrics': 'Fetch filing metrics', 'sync.persist_filings': 'Persist filings', 'sync.hydrate_statements': 'Hydrate statements', @@ -42,9 +48,13 @@ const TASK_STAGE_ORDER: Record = { 'queued', 'running', 'sync.fetch_filings', - 'sync.fetch_metrics', 'sync.persist_filings', - 'sync.hydrate_statements', + 'sync.discover_assets', + 'sync.extract_taxonomy', + 'sync.normalize_taxonomy', + 'sync.derive_metrics', + 'sync.validate_pdf_metrics', + 'sync.persist_taxonomy', 'completed' ], refresh_prices: [ diff --git a/drizzle/0005_financial_taxonomy_v3.sql b/drizzle/0005_financial_taxonomy_v3.sql new file mode 100644 index 0000000..907a1b5 --- /dev/null +++ b/drizzle/0005_financial_taxonomy_v3.sql @@ -0,0 +1,124 @@ +CREATE TABLE `filing_taxonomy_snapshot` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `filing_id` integer NOT NULL, + `ticker` text NOT NULL, + `filing_date` text NOT NULL, + `filing_type` text NOT NULL, + `parse_status` text NOT NULL, + `parse_error` text, + `source` text NOT NULL, + `periods` text, + `statement_rows` text, + `derived_metrics` text, + `validation_result` text, + `facts_count` integer DEFAULT 0 NOT NULL, + `concepts_count` integer DEFAULT 0 NOT NULL, + `dimensions_count` integer DEFAULT 0 NOT NULL, + `created_at` text NOT NULL, + `updated_at` text NOT NULL, + FOREIGN 
KEY (`filing_id`) REFERENCES `filing`(`id`) ON UPDATE no action ON DELETE cascade +); +--> statement-breakpoint +CREATE UNIQUE INDEX `filing_taxonomy_snapshot_filing_uidx` ON `filing_taxonomy_snapshot` (`filing_id`); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_snapshot_ticker_date_idx` ON `filing_taxonomy_snapshot` (`ticker`,`filing_date`); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_snapshot_status_idx` ON `filing_taxonomy_snapshot` (`parse_status`); +--> statement-breakpoint +CREATE TABLE `filing_taxonomy_asset` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `snapshot_id` integer NOT NULL, + `asset_type` text NOT NULL, + `name` text NOT NULL, + `url` text NOT NULL, + `size_bytes` integer, + `score` numeric, + `is_selected` integer DEFAULT false NOT NULL, + `created_at` text NOT NULL, + FOREIGN KEY (`snapshot_id`) REFERENCES `filing_taxonomy_snapshot`(`id`) ON UPDATE no action ON DELETE cascade +); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_asset_snapshot_idx` ON `filing_taxonomy_asset` (`snapshot_id`); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_asset_type_idx` ON `filing_taxonomy_asset` (`snapshot_id`,`asset_type`); +--> statement-breakpoint +CREATE TABLE `filing_taxonomy_concept` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `snapshot_id` integer NOT NULL, + `concept_key` text NOT NULL, + `qname` text NOT NULL, + `namespace_uri` text NOT NULL, + `local_name` text NOT NULL, + `label` text, + `is_extension` integer DEFAULT false NOT NULL, + `statement_kind` text, + `role_uri` text, + `presentation_order` numeric, + `presentation_depth` integer, + `parent_concept_key` text, + `is_abstract` integer DEFAULT false NOT NULL, + `created_at` text NOT NULL, + FOREIGN KEY (`snapshot_id`) REFERENCES `filing_taxonomy_snapshot`(`id`) ON UPDATE no action ON DELETE cascade +); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_concept_snapshot_idx` ON `filing_taxonomy_concept` (`snapshot_id`); +--> 
statement-breakpoint +CREATE INDEX `filing_taxonomy_concept_statement_idx` ON `filing_taxonomy_concept` (`snapshot_id`,`statement_kind`); +--> statement-breakpoint +CREATE UNIQUE INDEX `filing_taxonomy_concept_uidx` ON `filing_taxonomy_concept` (`snapshot_id`,`concept_key`,`role_uri`,`presentation_order`); +--> statement-breakpoint +CREATE TABLE `filing_taxonomy_fact` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `snapshot_id` integer NOT NULL, + `concept_key` text NOT NULL, + `qname` text NOT NULL, + `namespace_uri` text NOT NULL, + `local_name` text NOT NULL, + `statement_kind` text, + `role_uri` text, + `context_id` text NOT NULL, + `unit` text, + `decimals` text, + `value_num` numeric NOT NULL, + `period_start` text, + `period_end` text, + `period_instant` text, + `dimensions` text NOT NULL, + `is_dimensionless` integer DEFAULT true NOT NULL, + `source_file` text, + `created_at` text NOT NULL, + FOREIGN KEY (`snapshot_id`) REFERENCES `filing_taxonomy_snapshot`(`id`) ON UPDATE no action ON DELETE cascade +); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_fact_snapshot_idx` ON `filing_taxonomy_fact` (`snapshot_id`); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_fact_concept_idx` ON `filing_taxonomy_fact` (`snapshot_id`,`concept_key`); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_fact_period_idx` ON `filing_taxonomy_fact` (`snapshot_id`,`period_end`,`period_instant`); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_fact_statement_idx` ON `filing_taxonomy_fact` (`snapshot_id`,`statement_kind`); +--> statement-breakpoint +CREATE TABLE `filing_taxonomy_metric_validation` ( + `id` integer PRIMARY KEY AUTOINCREMENT NOT NULL, + `snapshot_id` integer NOT NULL, + `metric_key` text NOT NULL, + `taxonomy_value` numeric, + `llm_value` numeric, + `absolute_diff` numeric, + `relative_diff` numeric, + `status` text NOT NULL, + `evidence_pages` text NOT NULL, + `pdf_url` text, + `provider` text, + `model` text, + `error` text, + 
`created_at` text NOT NULL, + `updated_at` text NOT NULL, + FOREIGN KEY (`snapshot_id`) REFERENCES `filing_taxonomy_snapshot`(`id`) ON UPDATE no action ON DELETE cascade +); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_metric_validation_snapshot_idx` ON `filing_taxonomy_metric_validation` (`snapshot_id`); +--> statement-breakpoint +CREATE INDEX `filing_taxonomy_metric_validation_status_idx` ON `filing_taxonomy_metric_validation` (`snapshot_id`,`status`); +--> statement-breakpoint +CREATE UNIQUE INDEX `filing_taxonomy_metric_validation_uidx` ON `filing_taxonomy_metric_validation` (`snapshot_id`,`metric_key`); diff --git a/drizzle/meta/_journal.json b/drizzle/meta/_journal.json index 04c4830..ec559ef 100644 --- a/drizzle/meta/_journal.json +++ b/drizzle/meta/_journal.json @@ -36,6 +36,13 @@ "when": 1772568000000, "tag": "0004_watchlist_company_taxonomy", "breakpoints": true + }, + { + "idx": 5, + "version": "6", + "when": 1772668800000, + "tag": "0005_financial_taxonomy_v3", + "breakpoints": true } ] } diff --git a/hooks/use-link-prefetch.ts b/hooks/use-link-prefetch.ts index ad73092..0ce3a3c 100644 --- a/hooks/use-link-prefetch.ts +++ b/hooks/use-link-prefetch.ts @@ -40,7 +40,6 @@ export function useLinkPrefetch() { void queryClient.prefetchQuery(companyAnalysisQueryOptions(normalizedTicker)); void queryClient.prefetchQuery(companyFinancialStatementsQueryOptions({ ticker: normalizedTicker, - mode: 'standardized', statement: 'income', window: '10y', includeDimensions: false diff --git a/hooks/use-task-notifications-center.ts b/hooks/use-task-notifications-center.ts index bb238c2..f1449cb 100644 --- a/hooks/use-task-notifications-center.ts +++ b/hooks/use-task-notifications-center.ts @@ -123,7 +123,7 @@ export function useTaskNotificationsCenter(): UseTaskNotificationsCenterResult { case 'sync_filings': { void queryClient.invalidateQueries({ queryKey: ['filings'] }); void queryClient.invalidateQueries({ queryKey: ['analysis'] }); - void 
queryClient.invalidateQueries({ queryKey: ['financials-v2'] }); + void queryClient.invalidateQueries({ queryKey: ['financials-v3'] }); break; } case 'analyze_filing': { diff --git a/lib/api.ts b/lib/api.ts index c1ba561..03a265d 100644 --- a/lib/api.ts +++ b/lib/api.ts @@ -8,7 +8,6 @@ import type { Holding, FinancialHistoryWindow, FinancialStatementKind, - FinancialStatementMode, PortfolioInsight, PortfolioSummary, Task, @@ -199,24 +198,32 @@ export async function getCompanyAnalysis(ticker: string) { export async function getCompanyFinancialStatements(input: { ticker: string; - mode: FinancialStatementMode; statement: FinancialStatementKind; window: FinancialHistoryWindow; includeDimensions?: boolean; + includeFacts?: boolean; + factsCursor?: string | null; + factsLimit?: number; cursor?: string | null; limit?: number; }) { const query = { ticker: input.ticker.trim().toUpperCase(), - mode: input.mode, statement: input.statement, window: input.window, includeDimensions: input.includeDimensions ? 'true' : 'false', + includeFacts: input.includeFacts ? 'true' : 'false', ...(typeof input.cursor === 'string' && input.cursor.trim().length > 0 ? { cursor: input.cursor.trim() } : {}), ...(typeof input.limit === 'number' && Number.isFinite(input.limit) ? { limit: input.limit } + : {}), + ...(typeof input.factsCursor === 'string' && input.factsCursor.trim().length > 0 + ? { factsCursor: input.factsCursor.trim() } + : {}), + ...(typeof input.factsLimit === 'number' && Number.isFinite(input.factsLimit) + ? 
{ factsLimit: input.factsLimit } : {}) }; diff --git a/lib/query/keys.ts b/lib/query/keys.ts index 6446e93..0bba6b6 100644 --- a/lib/query/keys.ts +++ b/lib/query/keys.ts @@ -2,13 +2,15 @@ export const queryKeys = { companyAnalysis: (ticker: string) => ['analysis', ticker] as const, companyFinancialStatements: ( ticker: string, - mode: string, statement: string, window: string, includeDimensions: boolean, + includeFacts: boolean, + factsCursor: string | null, + factsLimit: number, cursor: string | null, limit: number - ) => ['financials-v2', ticker, mode, statement, window, includeDimensions ? 'dims' : 'no-dims', cursor ?? '', limit] as const, + ) => ['financials-v3', ticker, statement, window, includeDimensions ? 'dims' : 'no-dims', includeFacts ? 'facts' : 'rows', factsCursor ?? '', factsLimit, cursor ?? '', limit] as const, filings: (ticker: string | null, limit: number) => ['filings', ticker ?? '', limit] as const, report: (accessionNumber: string) => ['report', accessionNumber] as const, watchlist: () => ['watchlist'] as const, diff --git a/lib/query/options.ts b/lib/query/options.ts index 1278c75..095b04d 100644 --- a/lib/query/options.ts +++ b/lib/query/options.ts @@ -15,8 +15,7 @@ import { import { queryKeys } from '@/lib/query/keys'; import type { FinancialHistoryWindow, - FinancialStatementKind, - FinancialStatementMode + FinancialStatementKind } from '@/lib/types'; export function companyAnalysisQueryOptions(ticker: string) { @@ -31,34 +30,43 @@ export function companyAnalysisQueryOptions(ticker: string) { export function companyFinancialStatementsQueryOptions(input: { ticker: string; - mode: FinancialStatementMode; statement: FinancialStatementKind; window: FinancialHistoryWindow; includeDimensions?: boolean; + includeFacts?: boolean; + factsCursor?: string | null; + factsLimit?: number; cursor?: string | null; limit?: number; }) { const normalizedTicker = input.ticker.trim().toUpperCase(); const includeDimensions = input.includeDimensions ?? 
false; + const includeFacts = input.includeFacts ?? false; + const factsCursor = input.factsCursor ?? null; + const factsLimit = input.factsLimit ?? 500; const cursor = input.cursor ?? null; const limit = input.limit ?? 40; return queryOptions({ queryKey: queryKeys.companyFinancialStatements( normalizedTicker, - input.mode, input.statement, input.window, includeDimensions, + includeFacts, + factsCursor, + factsLimit, cursor, limit ), queryFn: () => getCompanyFinancialStatements({ ticker: normalizedTicker, - mode: input.mode, statement: input.statement, window: input.window, includeDimensions, + includeFacts, + factsCursor, + factsLimit, cursor, limit }), diff --git a/lib/server/api/app.ts b/lib/server/api/app.ts index fe22f84..f5cc26c 100644 --- a/lib/server/api/app.ts +++ b/lib/server/api/app.ts @@ -4,7 +4,6 @@ import type { Filing, FinancialHistoryWindow, FinancialStatementKind, - FinancialStatementMode, TaskStatus } from '@/lib/types'; import { auth } from '@/lib/auth'; @@ -13,8 +12,8 @@ import { asErrorMessage, jsonError } from '@/lib/server/http'; import { buildPortfolioSummary } from '@/lib/server/portfolio'; import { defaultFinancialSyncLimit, - getCompanyFinancialStatements -} from '@/lib/server/financial-statements'; + getCompanyFinancialTaxonomy +} from '@/lib/server/financial-taxonomy'; import { redactInternalFilingAnalysisFields } from '@/lib/server/api/filing-redaction'; import { getFilingByAccession, listFilingsRecords } from '@/lib/server/repos/filings'; import { @@ -44,8 +43,7 @@ import { const ALLOWED_STATUSES: TaskStatus[] = ['queued', 'running', 'completed', 'failed']; const FINANCIAL_FORMS: ReadonlySet = new Set(['10-K', '10-Q']); const AUTO_FILING_SYNC_LIMIT = 20; -const FINANCIALS_V2_ENABLED = process.env.FINANCIALS_V2?.trim().toLowerCase() !== 'false'; -const FINANCIAL_STATEMENT_MODES: FinancialStatementMode[] = ['standardized', 'filing_faithful']; +const FINANCIALS_V3_ENABLED = process.env.FINANCIALS_V3?.trim().toLowerCase() !== 'false'; 
const FINANCIAL_STATEMENT_KINDS: FinancialStatementKind[] = [ 'income', 'balance', @@ -120,12 +118,6 @@ function asTags(value: unknown) { return [...unique]; } -function asStatementMode(value: unknown): FinancialStatementMode { - return FINANCIAL_STATEMENT_MODES.includes(value as FinancialStatementMode) - ? value as FinancialStatementMode - : 'standardized'; -} - function asStatementKind(value: unknown): FinancialStatementKind { return FINANCIAL_STATEMENT_KINDS.includes(value as FinancialStatementKind) ? value as FinancialStatementKind @@ -613,8 +605,8 @@ export const app = new Elysia({ prefix: '/api' }) return response; } - if (!FINANCIALS_V2_ENABLED) { - return jsonError('Financial statements v2 is disabled', 404); + if (!FINANCIALS_V3_ENABLED) { + return jsonError('Financial statements v3 is disabled', 404); } const ticker = typeof query.ticker === 'string' @@ -624,26 +616,34 @@ export const app = new Elysia({ prefix: '/api' }) return jsonError('ticker is required'); } - const mode = asStatementMode(query.mode); const statement = asStatementKind(query.statement); const window = asHistoryWindow(query.window); const includeDimensions = asBoolean(query.includeDimensions, false); + const includeFacts = asBoolean(query.includeFacts, false); const cursor = typeof query.cursor === 'string' && query.cursor.trim().length > 0 ? query.cursor.trim() : null; const limit = Number.isFinite(Number(query.limit)) ? Number(query.limit) : undefined; + const factsCursor = typeof query.factsCursor === 'string' && query.factsCursor.trim().length > 0 + ? query.factsCursor.trim() + : null; + const factsLimit = Number.isFinite(Number(query.factsLimit)) + ? 
Number(query.factsLimit) + : undefined; - let payload = await getCompanyFinancialStatements({ + let payload = await getCompanyFinancialTaxonomy({ ticker, - mode, statement, window, includeDimensions, + includeFacts, + factsCursor, + factsLimit, cursor, limit, - v2Enabled: FINANCIALS_V2_ENABLED, + v3Enabled: FINANCIALS_V3_ENABLED, queuedSync: false }); @@ -671,7 +671,7 @@ export const app = new Elysia({ prefix: '/api' }) }); queuedSync = true; } catch (error) { - console.error(`[financials-v2-sync] failed for ${ticker}:`, error); + console.error(`[financials-v3-sync] failed for ${ticker}:`, error); } } @@ -689,7 +689,6 @@ export const app = new Elysia({ prefix: '/api' }) }, { query: t.Object({ ticker: t.String({ minLength: 1 }), - mode: t.Optional(t.Union([t.Literal('standardized'), t.Literal('filing_faithful')])), statement: t.Optional(t.Union([ t.Literal('income'), t.Literal('balance'), @@ -699,8 +698,11 @@ export const app = new Elysia({ prefix: '/api' }) ])), window: t.Optional(t.Union([t.Literal('10y'), t.Literal('all')])), includeDimensions: t.Optional(t.Union([t.String(), t.Boolean()])), + includeFacts: t.Optional(t.Union([t.String(), t.Boolean()])), cursor: t.Optional(t.String()), - limit: t.Optional(t.Numeric()) + limit: t.Optional(t.Numeric()), + factsCursor: t.Optional(t.String()), + factsLimit: t.Optional(t.Numeric()) }) }) .get('/analysis/reports/:accessionNumber', async ({ params }) => { diff --git a/lib/server/api/task-workflow-hybrid.e2e.test.ts b/lib/server/api/task-workflow-hybrid.e2e.test.ts index e1ebd10..b1773c4 100644 --- a/lib/server/api/task-workflow-hybrid.e2e.test.ts +++ b/lib/server/api/task-workflow-hybrid.e2e.test.ts @@ -86,7 +86,8 @@ function applySqlMigrations(client: { exec: (query: string) => void }) { '0001_glossy_statement_snapshots.sql', '0002_workflow_task_projection_metadata.sql', '0003_task_stage_event_timeline.sql', - '0004_watchlist_company_taxonomy.sql' + '0004_watchlist_company_taxonomy.sql', + 
'0005_financial_taxonomy_v3.sql' ]; for (const file of migrationFiles) { diff --git a/lib/server/db/schema.ts b/lib/server/db/schema.ts index 39804c7..83c3e0f 100644 --- a/lib/server/db/schema.ts +++ b/lib/server/db/schema.ts @@ -15,6 +15,19 @@ type FilingMetrics = { debt: number | null; }; +type TaxonomyAssetType = + | 'instance' + | 'schema' + | 'presentation' + | 'label' + | 'calculation' + | 'definition' + | 'pdf' + | 'other'; + +type TaxonomyParseStatus = 'ready' | 'partial' | 'failed'; +type TaxonomyMetricValidationStatus = 'not_run' | 'matched' | 'mismatch' | 'error'; + type FilingAnalysis = { provider?: string; model?: string; @@ -47,6 +60,7 @@ type FilingStatementPeriod = { filingId: number; accessionNumber: string; filingDate: string; + periodStart: string | null; periodEnd: string | null; filingType: '10-K' | '10-Q'; periodLabel: string; @@ -97,6 +111,55 @@ type DimensionStatementBundle = { statements: Record; }; +type TaxonomyDimensionMember = { + axis: string; + member: string; +}; + +type TaxonomyStatementSnapshotRow = { + key: string; + label: string; + conceptKey: string; + qname: string; + namespaceUri: string; + localName: string; + isExtension: boolean; + statement: FinancialStatementKind; + roleUri: string | null; + order: number; + depth: number; + parentKey: string | null; + values: Record; + units: Record; + hasDimensions: boolean; + sourceFactIds: number[]; +}; + +type TaxonomyStatementBundle = { + periods: FilingStatementPeriod[]; + statements: Record; +}; + +type TaxonomyMetricValidationCheck = { + metricKey: keyof FilingMetrics; + taxonomyValue: number | null; + llmValue: number | null; + absoluteDiff: number | null; + relativeDiff: number | null; + status: TaxonomyMetricValidationStatus; + evidencePages: number[]; + pdfUrl: string | null; + provider: string | null; + model: string | null; + error: string | null; +}; + +type TaxonomyMetricValidationResult = { + status: TaxonomyMetricValidationStatus; + checks: 
TaxonomyMetricValidationCheck[]; + validatedAt: string | null; +}; + const authDateColumn = { mode: 'timestamp_ms' } as const; @@ -273,6 +336,121 @@ export const filingStatementSnapshot = sqliteTable('filing_statement_snapshot', filingStatementStatusIndex: index('filing_stmt_status_idx').on(table.parse_status) })); +export const filingTaxonomySnapshot = sqliteTable('filing_taxonomy_snapshot', { + id: integer('id').primaryKey({ autoIncrement: true }), + filing_id: integer('filing_id').notNull().references(() => filing.id, { onDelete: 'cascade' }), + ticker: text('ticker').notNull(), + filing_date: text('filing_date').notNull(), + filing_type: text('filing_type').$type<'10-K' | '10-Q'>().notNull(), + parse_status: text('parse_status').$type().notNull(), + parse_error: text('parse_error'), + source: text('source').$type<'xbrl_instance' | 'xbrl_instance_with_linkbase' | 'legacy_html_fallback'>().notNull(), + periods: text('periods', { mode: 'json' }).$type(), + statement_rows: text('statement_rows', { mode: 'json' }).$type(), + derived_metrics: text('derived_metrics', { mode: 'json' }).$type(), + validation_result: text('validation_result', { mode: 'json' }).$type(), + facts_count: integer('facts_count').notNull().default(0), + concepts_count: integer('concepts_count').notNull().default(0), + dimensions_count: integer('dimensions_count').notNull().default(0), + created_at: text('created_at').notNull(), + updated_at: text('updated_at').notNull() +}, (table) => ({ + filingTaxonomySnapshotFilingUnique: uniqueIndex('filing_taxonomy_snapshot_filing_uidx').on(table.filing_id), + filingTaxonomySnapshotTickerDateIndex: index('filing_taxonomy_snapshot_ticker_date_idx').on(table.ticker, table.filing_date), + filingTaxonomySnapshotStatusIndex: index('filing_taxonomy_snapshot_status_idx').on(table.parse_status) +})); + +export const filingTaxonomyAsset = sqliteTable('filing_taxonomy_asset', { + id: integer('id').primaryKey({ autoIncrement: true }), + snapshot_id: 
integer('snapshot_id').notNull().references(() => filingTaxonomySnapshot.id, { onDelete: 'cascade' }), + asset_type: text('asset_type').$type().notNull(), + name: text('name').notNull(), + url: text('url').notNull(), + size_bytes: integer('size_bytes'), + score: numeric('score'), + is_selected: integer('is_selected', { mode: 'boolean' }).notNull().default(false), + created_at: text('created_at').notNull() +}, (table) => ({ + filingTaxonomyAssetSnapshotIndex: index('filing_taxonomy_asset_snapshot_idx').on(table.snapshot_id), + filingTaxonomyAssetTypeIndex: index('filing_taxonomy_asset_type_idx').on(table.snapshot_id, table.asset_type) +})); + +export const filingTaxonomyConcept = sqliteTable('filing_taxonomy_concept', { + id: integer('id').primaryKey({ autoIncrement: true }), + snapshot_id: integer('snapshot_id').notNull().references(() => filingTaxonomySnapshot.id, { onDelete: 'cascade' }), + concept_key: text('concept_key').notNull(), + qname: text('qname').notNull(), + namespace_uri: text('namespace_uri').notNull(), + local_name: text('local_name').notNull(), + label: text('label'), + is_extension: integer('is_extension', { mode: 'boolean' }).notNull().default(false), + statement_kind: text('statement_kind').$type(), + role_uri: text('role_uri'), + presentation_order: numeric('presentation_order'), + presentation_depth: integer('presentation_depth'), + parent_concept_key: text('parent_concept_key'), + is_abstract: integer('is_abstract', { mode: 'boolean' }).notNull().default(false), + created_at: text('created_at').notNull() +}, (table) => ({ + filingTaxonomyConceptSnapshotIndex: index('filing_taxonomy_concept_snapshot_idx').on(table.snapshot_id), + filingTaxonomyConceptStatementIndex: index('filing_taxonomy_concept_statement_idx').on(table.snapshot_id, table.statement_kind), + filingTaxonomyConceptUnique: uniqueIndex('filing_taxonomy_concept_uidx').on( + table.snapshot_id, + table.concept_key, + table.role_uri, + table.presentation_order + ) +})); + +export 
const filingTaxonomyFact = sqliteTable('filing_taxonomy_fact', { + id: integer('id').primaryKey({ autoIncrement: true }), + snapshot_id: integer('snapshot_id').notNull().references(() => filingTaxonomySnapshot.id, { onDelete: 'cascade' }), + concept_key: text('concept_key').notNull(), + qname: text('qname').notNull(), + namespace_uri: text('namespace_uri').notNull(), + local_name: text('local_name').notNull(), + statement_kind: text('statement_kind').$type(), + role_uri: text('role_uri'), + context_id: text('context_id').notNull(), + unit: text('unit'), + decimals: text('decimals'), + value_num: numeric('value_num').notNull(), + period_start: text('period_start'), + period_end: text('period_end'), + period_instant: text('period_instant'), + dimensions: text('dimensions', { mode: 'json' }).$type().notNull(), + is_dimensionless: integer('is_dimensionless', { mode: 'boolean' }).notNull().default(true), + source_file: text('source_file'), + created_at: text('created_at').notNull() +}, (table) => ({ + filingTaxonomyFactSnapshotIndex: index('filing_taxonomy_fact_snapshot_idx').on(table.snapshot_id), + filingTaxonomyFactConceptIndex: index('filing_taxonomy_fact_concept_idx').on(table.snapshot_id, table.concept_key), + filingTaxonomyFactPeriodIndex: index('filing_taxonomy_fact_period_idx').on(table.snapshot_id, table.period_end, table.period_instant), + filingTaxonomyFactStatementIndex: index('filing_taxonomy_fact_statement_idx').on(table.snapshot_id, table.statement_kind) +})); + +export const filingTaxonomyMetricValidation = sqliteTable('filing_taxonomy_metric_validation', { + id: integer('id').primaryKey({ autoIncrement: true }), + snapshot_id: integer('snapshot_id').notNull().references(() => filingTaxonomySnapshot.id, { onDelete: 'cascade' }), + metric_key: text('metric_key').$type().notNull(), + taxonomy_value: numeric('taxonomy_value'), + llm_value: numeric('llm_value'), + absolute_diff: numeric('absolute_diff'), + relative_diff: numeric('relative_diff'), + status: 
text('status').$type().notNull(), + evidence_pages: text('evidence_pages', { mode: 'json' }).$type().notNull(), + pdf_url: text('pdf_url'), + provider: text('provider'), + model: text('model'), + error: text('error'), + created_at: text('created_at').notNull(), + updated_at: text('updated_at').notNull() +}, (table) => ({ + filingTaxonomyMetricValidationSnapshotIndex: index('filing_taxonomy_metric_validation_snapshot_idx').on(table.snapshot_id), + filingTaxonomyMetricValidationStatusIndex: index('filing_taxonomy_metric_validation_status_idx').on(table.snapshot_id, table.status), + filingTaxonomyMetricValidationUnique: uniqueIndex('filing_taxonomy_metric_validation_uidx').on(table.snapshot_id, table.metric_key) +})); + export const filingLink = sqliteTable('filing_link', { id: integer('id').primaryKey({ autoIncrement: true }), filing_id: integer('filing_id').notNull().references(() => filing.id, { onDelete: 'cascade' }), @@ -357,6 +535,11 @@ export const appSchema = { holding, filing, filingStatementSnapshot, + filingTaxonomySnapshot, + filingTaxonomyAsset, + filingTaxonomyConcept, + filingTaxonomyFact, + filingTaxonomyMetricValidation, filingLink, taskRun, taskStageEvent, diff --git a/lib/server/financial-statements.test.ts b/lib/server/financial-statements.test.ts index f0c40f3..28cf508 100644 --- a/lib/server/financial-statements.test.ts +++ b/lib/server/financial-statements.test.ts @@ -1,137 +1,7 @@ import { describe, expect, it } from 'bun:test'; import { __financialStatementsInternals } from './financial-statements'; -import type { FilingStatementSnapshotRecord } from '@/lib/server/repos/filing-statements'; - -function sampleSnapshot(): FilingStatementSnapshotRecord { - return { - id: 10, - filing_id: 44, - ticker: 'MSFT', - filing_date: '2025-12-31', - filing_type: '10-K', - period_end: '2025-12-31', - statement_bundle: { - periods: [ - { - id: '2025-12-31-0001', - filingId: 44, - accessionNumber: '0001', - filingDate: '2025-12-31', - periodEnd: '2025-12-31', 
- filingType: '10-K', - periodLabel: 'Fiscal Year End' - } - ], - statements: { - income: [ - { - key: 'revenue-line', - label: 'Revenue', - concept: 'us-gaap:Revenues', - order: 1, - depth: 0, - isSubtotal: false, - values: { '2025-12-31-0001': 120_000 } - } - ], - balance: [], - cash_flow: [], - equity: [], - comprehensive_income: [] - } - }, - standardized_bundle: { - periods: [ - { - id: '2025-12-31-0001', - filingId: 44, - accessionNumber: '0001', - filingDate: '2025-12-31', - periodEnd: '2025-12-31', - filingType: '10-K', - periodLabel: 'Fiscal Year End' - } - ], - statements: { - income: [ - { - key: 'revenue', - label: 'Revenue', - concept: 'us-gaap:Revenues', - category: 'core', - sourceConcepts: ['us-gaap:Revenues'], - values: { '2025-12-31-0001': 120_000 } - } - ], - balance: [], - cash_flow: [], - equity: [], - comprehensive_income: [] - } - }, - dimension_bundle: { - statements: { - income: [ - { - rowKey: 'revenue-line', - concept: 'us-gaap:Revenues', - periodId: '2025-12-31-0001', - axis: 'srt:StatementBusinessSegmentsAxis', - member: 'acme:CloudMember', - value: 55_000, - unit: 'USD' - } - ], - balance: [], - cash_flow: [], - equity: [], - comprehensive_income: [] - } - }, - parse_status: 'ready', - parse_error: null, - source: 'sec_filing_summary', - created_at: '2026-01-01T00:00:00.000Z', - updated_at: '2026-01-01T00:00:00.000Z' - }; -} describe('financial statements service internals', () => { - it('builds sorted periods for selected mode/statement', () => { - const snapshot = sampleSnapshot(); - - const periods = __financialStatementsInternals.buildPeriods( - [snapshot], - 'standardized', - 'income' - ); - - expect(periods.length).toBe(1); - expect(periods[0]?.id).toBe('2025-12-31-0001'); - }); - - it('builds standardized rows and includes dimensions when requested', () => { - const snapshot = sampleSnapshot(); - const periods = __financialStatementsInternals.buildPeriods( - [snapshot], - 'standardized', - 'income' - ); - - const result = 
__financialStatementsInternals.buildRows( - [snapshot], - periods, - 'standardized', - 'income', - true - ); - - expect(result.rows.length).toBe(1); - expect(result.rows[0]?.hasDimensions).toBe(true); - expect(result.dimensions).not.toBeNull(); - expect(result.dimensions?.['revenue-line']?.length).toBe(1); - }); - it('returns default sync limits by window', () => { expect(__financialStatementsInternals.defaultFinancialSyncLimit('10y')).toBe(60); expect(__financialStatementsInternals.defaultFinancialSyncLimit('all')).toBe(120); diff --git a/lib/server/financial-statements.ts b/lib/server/financial-statements.ts index 1abfa57..fd73084 100644 --- a/lib/server/financial-statements.ts +++ b/lib/server/financial-statements.ts @@ -1,315 +1,48 @@ import type { CompanyFinancialStatementsResponse, - DimensionBreakdownRow, - FilingFaithfulStatementRow, FinancialHistoryWindow, - FinancialStatementKind, - FinancialStatementMode, - FinancialStatementPeriod, - StandardizedStatementRow + FinancialStatementKind } from '@/lib/types'; -import { listFilingsRecords } from '@/lib/server/repos/filings'; import { - countFilingStatementSnapshotStatuses, - type DimensionStatementSnapshotRow, - type FilingFaithfulStatementSnapshotRow, - type FilingStatementSnapshotRecord, - listFilingStatementSnapshotsByTicker, - type StandardizedStatementSnapshotRow -} from '@/lib/server/repos/filing-statements'; + defaultFinancialSyncLimit, + getCompanyFinancialTaxonomy +} from '@/lib/server/financial-taxonomy'; type GetCompanyFinancialStatementsInput = { ticker: string; - mode: FinancialStatementMode; statement: FinancialStatementKind; window: FinancialHistoryWindow; includeDimensions: boolean; + includeFacts?: boolean; + factsCursor?: string | null; + factsLimit?: number; cursor?: string | null; limit?: number; - v2Enabled: boolean; + v2Enabled?: boolean; + v3Enabled?: boolean; queuedSync: boolean; }; -type FinancialStatementRowByMode = StandardizedStatementRow | FilingFaithfulStatementRow; - -function 
safeTicker(input: string) { - return input.trim().toUpperCase(); -} - -function isFinancialForm(type: string): type is '10-K' | '10-Q' { - return type === '10-K' || type === '10-Q'; -} - -function rowDimensionMatcher(row: { key: string; concept: string | null }, item: DimensionStatementSnapshotRow) { - const concept = row.concept?.toLowerCase() ?? ''; - const itemConcept = item.concept?.toLowerCase() ?? ''; - if (item.rowKey === row.key) { - return true; - } - - return Boolean(concept && itemConcept && concept === itemConcept); -} - -function periodSorter(left: FinancialStatementPeriod, right: FinancialStatementPeriod) { - const byDate = Date.parse(left.filingDate) - Date.parse(right.filingDate); - if (Number.isFinite(byDate) && byDate !== 0) { - return byDate; - } - - return left.id.localeCompare(right.id); -} - -function resolveDimensionPeriodId(rawPeriodId: string, periods: FinancialStatementPeriod[]) { - const exact = periods.find((period) => period.id === rawPeriodId); - if (exact) { - return exact.id; - } - - const byDate = periods.find((period) => period.filingDate === rawPeriodId || period.periodEnd === rawPeriodId); - return byDate?.id ?? null; -} - -function getRowsForSnapshot( - snapshot: FilingStatementSnapshotRecord, - mode: FinancialStatementMode, - statement: FinancialStatementKind -) { - if (mode === 'standardized') { - return snapshot.standardized_bundle?.statements?.[statement] ?? []; - } - - return snapshot.statement_bundle?.statements?.[statement] ?? []; -} - -function buildPeriods( - snapshots: FilingStatementSnapshotRecord[], - mode: FinancialStatementMode, - statement: FinancialStatementKind -) { - const map = new Map(); - - for (const snapshot of snapshots) { - const rows = getRowsForSnapshot(snapshot, mode, statement); - if (rows.length === 0) { - continue; - } - - const sourcePeriods = mode === 'standardized' - ? snapshot.standardized_bundle?.periods - : snapshot.statement_bundle?.periods; - - for (const period of sourcePeriods ?? 
[]) { - if (!map.has(period.id)) { - map.set(period.id, { - id: period.id, - filingId: period.filingId, - accessionNumber: period.accessionNumber, - filingDate: period.filingDate, - periodEnd: period.periodEnd, - filingType: period.filingType, - periodLabel: period.periodLabel - }); - } - } - } - - return [...map.values()].sort(periodSorter); -} - -function buildRows( - snapshots: FilingStatementSnapshotRecord[], - periods: FinancialStatementPeriod[], - mode: FinancialStatementMode, - statement: FinancialStatementKind, - includeDimensions: boolean -) { - const rowMap = new Map(); - const dimensionMap = includeDimensions - ? new Map() - : null; - - for (const snapshot of snapshots) { - const rows = getRowsForSnapshot(snapshot, mode, statement); - const dimensions = snapshot.dimension_bundle?.statements?.[statement] ?? []; - - if (mode === 'standardized') { - for (const sourceRow of rows as StandardizedStatementSnapshotRow[]) { - const existing = rowMap.get(sourceRow.key) as StandardizedStatementRow | undefined; - const hasDimensions = dimensions.some((item) => rowDimensionMatcher(sourceRow, item)); - - if (!existing) { - rowMap.set(sourceRow.key, { - key: sourceRow.key, - label: sourceRow.label, - concept: sourceRow.concept, - category: sourceRow.category, - sourceConcepts: [...sourceRow.sourceConcepts], - values: { ...sourceRow.values }, - hasDimensions - }); - continue; - } - - existing.hasDimensions = existing.hasDimensions || hasDimensions; - for (const concept of sourceRow.sourceConcepts) { - if (!existing.sourceConcepts.includes(concept)) { - existing.sourceConcepts.push(concept); - } - } - - for (const [periodId, value] of Object.entries(sourceRow.values)) { - if (!(periodId in existing.values)) { - existing.values[periodId] = value; - } - } - } - } else { - for (const sourceRow of rows as FilingFaithfulStatementSnapshotRow[]) { - const rowKey = sourceRow.concept ? 
`concept-${sourceRow.concept.toLowerCase()}` : `label-${sourceRow.key}`; - const existing = rowMap.get(rowKey) as FilingFaithfulStatementRow | undefined; - const hasDimensions = dimensions.some((item) => rowDimensionMatcher(sourceRow, item)); - - if (!existing) { - rowMap.set(rowKey, { - key: rowKey, - label: sourceRow.label, - concept: sourceRow.concept, - order: sourceRow.order, - depth: sourceRow.depth, - isSubtotal: sourceRow.isSubtotal, - values: { ...sourceRow.values }, - hasDimensions - }); - continue; - } - - existing.hasDimensions = existing.hasDimensions || hasDimensions; - existing.order = Math.min(existing.order, sourceRow.order); - existing.depth = Math.min(existing.depth, sourceRow.depth); - existing.isSubtotal = existing.isSubtotal || sourceRow.isSubtotal; - for (const [periodId, value] of Object.entries(sourceRow.values)) { - if (!(periodId in existing.values)) { - existing.values[periodId] = value; - } - } - } - } - - if (dimensionMap) { - for (const item of dimensions) { - const periodId = resolveDimensionPeriodId(item.periodId, periods); - if (!periodId) { - continue; - } - - const entry: DimensionBreakdownRow = { - rowKey: item.rowKey, - concept: item.concept, - periodId, - axis: item.axis, - member: item.member, - value: item.value, - unit: item.unit - }; - - const group = dimensionMap.get(item.rowKey); - if (group) { - group.push(entry); - } else { - dimensionMap.set(item.rowKey, [entry]); - } - } - } - } - - const rows = [...rowMap.values()].sort((a, b) => { - const left = mode === 'standardized' ? a.label : `${(a as FilingFaithfulStatementRow).order.toString().padStart(5, '0')}::${a.label}`; - const right = mode === 'standardized' ? 
b.label : `${(b as FilingFaithfulStatementRow).order.toString().padStart(5, '0')}::${b.label}`; - return left.localeCompare(right); - }); - - if (mode === 'standardized') { - const standardized = rows as StandardizedStatementRow[]; - const core = standardized.filter((row) => row.category === 'core'); - const nonCore = standardized.filter((row) => row.category !== 'core'); - const orderedRows = [...core, ...nonCore]; - - return { - rows: orderedRows, - dimensions: dimensionMap ? Object.fromEntries(dimensionMap.entries()) : null - }; - } - - return { - rows: rows as FilingFaithfulStatementRow[], - dimensions: dimensionMap ? Object.fromEntries(dimensionMap.entries()) : null - }; -} - -export function defaultFinancialSyncLimit(window: FinancialHistoryWindow) { - return window === 'all' ? 120 : 60; -} - -export async function getCompanyFinancialStatements(input: GetCompanyFinancialStatementsInput): Promise { - const ticker = safeTicker(input.ticker); - const snapshotResult = await listFilingStatementSnapshotsByTicker({ - ticker, - window: input.window, - limit: input.limit, - cursor: input.cursor - }); - - const statuses = await countFilingStatementSnapshotStatuses(ticker); - const filings = await listFilingsRecords({ - ticker, - limit: input.window === 'all' ? 250 : 120 - }); - - const financialFilings = filings.filter((filing) => isFinancialForm(filing.filing_type)); - const periods = buildPeriods(snapshotResult.snapshots, input.mode, input.statement); - const rowResult = buildRows( - snapshotResult.snapshots, - periods, - input.mode, - input.statement, - input.includeDimensions - ); - - const latestFiling = filings[0] ?? null; - - return { - company: { - ticker, - companyName: latestFiling?.company_name ?? ticker, - cik: latestFiling?.cik ?? 
null - }, - mode: input.mode, +export async function getCompanyFinancialStatements( + input: GetCompanyFinancialStatementsInput +): Promise { + return await getCompanyFinancialTaxonomy({ + ticker: input.ticker, statement: input.statement, window: input.window, - periods, - rows: rowResult.rows, - nextCursor: snapshotResult.nextCursor, - coverage: { - filings: periods.length, - rows: rowResult.rows.length, - dimensions: rowResult.dimensions - ? Object.values(rowResult.dimensions).reduce((total, rows) => total + rows.length, 0) - : 0 - }, - dataSourceStatus: { - enabled: input.v2Enabled, - hydratedFilings: statuses.ready, - partialFilings: statuses.partial, - failedFilings: statuses.failed, - pendingFilings: Math.max(0, financialFilings.length - statuses.ready - statuses.partial - statuses.failed), - queuedSync: input.queuedSync - }, - dimensionBreakdown: rowResult.dimensions - }; + includeDimensions: input.includeDimensions, + includeFacts: input.includeFacts ?? false, + factsCursor: input.factsCursor, + factsLimit: input.factsLimit, + cursor: input.cursor, + limit: input.limit, + v3Enabled: input.v3Enabled ?? input.v2Enabled ?? 
true, + queuedSync: input.queuedSync + }); } +export { defaultFinancialSyncLimit }; + export const __financialStatementsInternals = { - buildPeriods, - buildRows, defaultFinancialSyncLimit }; diff --git a/lib/server/financial-taxonomy.test.ts b/lib/server/financial-taxonomy.test.ts new file mode 100644 index 0000000..29f284a --- /dev/null +++ b/lib/server/financial-taxonomy.test.ts @@ -0,0 +1,142 @@ +import { describe, expect, it } from 'bun:test'; +import { __financialTaxonomyInternals } from './financial-taxonomy'; +import type { FilingTaxonomySnapshotRecord } from './repos/filing-taxonomy'; +import type { FinancialStatementKind, TaxonomyStatementRow } from '@/lib/types'; + +function createRow(periodIds: string[]): TaxonomyStatementRow { + return { + key: 'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax', + label: 'Revenue From Contract With Customer Excluding Assessed Tax', + conceptKey: 'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax', + qname: 'us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax', + namespaceUri: 'http://fasb.org/us-gaap/2021-01-31', + localName: 'RevenueFromContractWithCustomerExcludingAssessedTax', + isExtension: false, + statement: 'income', + roleUri: 'income', + order: 1, + depth: 0, + parentKey: null, + values: Object.fromEntries(periodIds.map((periodId, index) => [periodId, 100 + index])), + units: Object.fromEntries(periodIds.map((periodId) => [periodId, 'iso4217:USD'])), + hasDimensions: false, + sourceFactIds: periodIds.map((_, index) => index + 1) + }; +} + +function createSnapshot(input: { + filingId: number; + filingType: '10-K' | '10-Q'; + filingDate: string; + periods: Array<{ + id: string; + periodStart: string | null; + periodEnd: string; + periodLabel: string; + }>; + statement: FinancialStatementKind; +}) { + const row = createRow(input.periods.map((period) => period.id)); + + return { + id: input.filingId, + filing_id: input.filingId, + ticker: 'MSFT', + filing_date: input.filingDate, + 
filing_type: input.filingType, + parse_status: 'ready', + parse_error: null, + source: 'xbrl_instance', + periods: input.periods.map((period) => ({ + id: period.id, + filingId: input.filingId, + accessionNumber: `0000-${input.filingId}`, + filingDate: input.filingDate, + periodStart: period.periodStart, + periodEnd: period.periodEnd, + filingType: input.filingType, + periodLabel: period.periodLabel + })), + statement_rows: { + income: input.statement === 'income' ? [row] : [], + balance: input.statement === 'balance' ? [{ ...row, statement: 'balance' }] : [], + cash_flow: [], + equity: [], + comprehensive_income: [] + }, + derived_metrics: null, + validation_result: null, + facts_count: 0, + concepts_count: 0, + dimensions_count: 0, + created_at: input.filingDate, + updated_at: input.filingDate + } satisfies FilingTaxonomySnapshotRecord; +} + +describe('financial taxonomy internals', () => { + it('selects the primary quarter duration for 10-Q income statements', () => { + const snapshot = createSnapshot({ + filingId: 1, + filingType: '10-Q', + filingDate: '2026-01-28', + statement: 'income', + periods: [ + { id: 'instant', periodStart: null, periodEnd: '2025-12-31', periodLabel: 'Instant' }, + { id: 'quarter', periodStart: '2025-10-01', periodEnd: '2025-12-31', periodLabel: '2025-10-01 to 2025-12-31' }, + { id: 'ytd', periodStart: '2025-07-01', periodEnd: '2025-12-31', periodLabel: '2025-07-01 to 2025-12-31' } + ] + }); + + const selection = __financialTaxonomyInternals.selectPrimaryPeriods([snapshot], 'income'); + + expect(selection.periods).toHaveLength(1); + expect(selection.periods[0]?.id).toBe('quarter'); + }); + + it('selects the latest instant for balance sheets', () => { + const snapshot = createSnapshot({ + filingId: 2, + filingType: '10-K', + filingDate: '2025-07-30', + statement: 'balance', + periods: [ + { id: 'prior', periodStart: null, periodEnd: '2024-06-30', periodLabel: 'Instant' }, + { id: 'current', periodStart: null, periodEnd: '2025-06-30', 
periodLabel: 'Instant' } + ] + }); + + const selection = __financialTaxonomyInternals.selectPrimaryPeriods([snapshot], 'balance'); + + expect(selection.periods).toHaveLength(1); + expect(selection.periods[0]?.id).toBe('current'); + }); + + it('builds one reporting period per filing for the selected statement', () => { + const annual = createSnapshot({ + filingId: 10, + filingType: '10-K', + filingDate: '2025-07-30', + statement: 'income', + periods: [ + { id: 'annual', periodStart: '2024-07-01', periodEnd: '2025-06-30', periodLabel: '2024-07-01 to 2025-06-30' }, + { id: 'quarter', periodStart: '2025-04-01', periodEnd: '2025-06-30', periodLabel: '2025-04-01 to 2025-06-30' } + ] + }); + const quarterly = createSnapshot({ + filingId: 11, + filingType: '10-Q', + filingDate: '2025-10-29', + statement: 'income', + periods: [ + { id: 'instant', periodStart: null, periodEnd: '2025-09-30', periodLabel: 'Instant' }, + { id: 'quarter', periodStart: '2025-07-01', periodEnd: '2025-09-30', periodLabel: '2025-07-01 to 2025-09-30' }, + { id: 'ytd', periodStart: '2025-01-01', periodEnd: '2025-09-30', periodLabel: '2025-01-01 to 2025-09-30' } + ] + }); + + const periods = __financialTaxonomyInternals.buildPeriods([annual, quarterly], 'income'); + + expect(periods.map((period) => period.id)).toEqual(['annual', 'quarter']); + }); +}); diff --git a/lib/server/financial-taxonomy.ts b/lib/server/financial-taxonomy.ts new file mode 100644 index 0000000..f9bd8b2 --- /dev/null +++ b/lib/server/financial-taxonomy.ts @@ -0,0 +1,384 @@ +import type { + CompanyFinancialStatementsResponse, + DimensionBreakdownRow, + FinancialHistoryWindow, + FinancialStatementKind, + FinancialStatementPeriod, + TaxonomyStatementRow +} from '@/lib/types'; +import { listFilingsRecords } from '@/lib/server/repos/filings'; +import { + countFilingTaxonomySnapshotStatuses, + listFilingTaxonomySnapshotsByTicker, + listTaxonomyFactsByTicker, + type FilingTaxonomySnapshotRecord +} from 
/** Arguments accepted by `getCompanyFinancialTaxonomy`. */
type GetCompanyFinancialTaxonomyInput = {
  ticker: string;
  statement: FinancialStatementKind;
  window: FinancialHistoryWindow;
  includeDimensions: boolean;
  includeFacts: boolean;
  factsCursor?: string | null;
  factsLimit?: number;
  cursor?: string | null;
  limit?: number;
  v3Enabled: boolean;
  queuedSync: boolean;
};

/** Trims and upper-cases a ticker symbol. */
function safeTicker(input: string) {
  return input.trim().toUpperCase();
}

/** Type guard: true only for the `10-K` and `10-Q` form types. */
function isFinancialForm(type: string): type is '10-K' | '10-Q' {
  return type === '10-K' || type === '10-Q';
}

/** Parses a date string to epoch milliseconds; `null` or empty input yields `NaN`. */
function parseEpoch(value: string | null) {
  if (!value) {
    return Number.NaN;
  }

  return Date.parse(value);
}

/**
 * Ascending comparator for periods: by `periodEnd` (falling back to `filingDate`),
 * then by `id` when the dates are equal or either one cannot be parsed.
 */
function periodSorter(left: FinancialStatementPeriod, right: FinancialStatementPeriod) {
  const leftDate = parseEpoch(left.periodEnd ?? left.filingDate);
  const rightDate = parseEpoch(right.periodEnd ?? right.filingDate);
  if (Number.isFinite(leftDate) && Number.isFinite(rightDate) && leftDate !== rightDate) {
    return leftDate - rightDate;
  }

  return left.id.localeCompare(right.id);
}

/** A period is treated as an instant (point in time) when it has no `periodStart`. */
function isInstantPeriod(period: FinancialStatementPeriod) {
  return period.periodStart === null;
}

/**
 * Inclusive length of a duration period in days.
 *
 * @returns `null` when either bound is missing or unparseable, or when the end
 *   precedes the start.
 */
function periodDurationDays(period: FinancialStatementPeriod) {
  if (!period.periodStart || !period.periodEnd) {
    return null;
  }

  const start = Date.parse(period.periodStart);
  const end = Date.parse(period.periodEnd);
  if (!Number.isFinite(start) || !Number.isFinite(end) || end < start) {
    return null;
  }

  // 86_400_000 ms per day; +1 because both the start and end dates are counted.
  return Math.round((end - start) / 86_400_000) + 1;
}

/** Target duration used to rank duration periods: 365 days for `10-K`, otherwise 90. */
function preferredDurationDays(filingType: FinancialStatementPeriod['filingType']) {
  return filingType === '10-K' ? 365 : 90;
}

/**
 * Picks one primary reporting period per snapshot for the requested statement.
 *
 * Only periods that actually carry values in the snapshot's rows for `statement`
 * are considered. For `balance` the latest instant period wins (falling back to the
 * latest period of any kind). For every other statement the duration period with the
 * latest end date wins; ties are broken by closeness to the preferred duration for
 * the filing type and then by `id`. Snapshots without rows or matching periods are
 * skipped.
 *
 * @returns the selected periods sorted ascending, the set of their ids, and a lookup
 *   from filing id to the selected period.
 */
function selectPrimaryPeriods(
  snapshots: FilingTaxonomySnapshotRecord[],
  statement: FinancialStatementKind
) {
  // Explicit type arguments keep `any` out of the returned selection.
  const selectedByFilingId = new Map<number, FinancialStatementPeriod>();

  for (const snapshot of snapshots) {
    const rows = snapshot.statement_rows?.[statement] ?? [];
    if (rows.length === 0) {
      continue;
    }

    const usedPeriodIds = new Set<string>();
    for (const row of rows) {
      for (const periodId of Object.keys(row.values)) {
        usedPeriodIds.add(periodId);
      }
    }

    // `filter` returns a fresh array, so the in-place sorts below never touch the snapshot.
    const candidates = (snapshot.periods ?? []).filter((period) => usedPeriodIds.has(period.id));
    if (candidates.length === 0) {
      continue;
    }

    const selected = (() => {
      if (statement === 'balance') {
        const instantCandidates = candidates.filter(isInstantPeriod);
        return (instantCandidates.length > 0 ? instantCandidates : candidates)
          .sort((left, right) => periodSorter(right, left))[0] ?? null;
      }

      const durationCandidates = candidates.filter((period) => !isInstantPeriod(period));
      if (durationCandidates.length === 0) {
        return candidates.sort((left, right) => periodSorter(right, left))[0] ?? null;
      }

      const targetDays = preferredDurationDays(snapshot.filing_type);
      return durationCandidates.sort((left, right) => {
        const leftDate = parseEpoch(left.periodEnd ?? left.filingDate);
        const rightDate = parseEpoch(right.periodEnd ?? right.filingDate);
        if (Number.isFinite(leftDate) && Number.isFinite(rightDate) && leftDate !== rightDate) {
          return rightDate - leftDate;
        }

        // An unknown duration is treated as exactly on target (distance 0).
        const leftDistance = Math.abs((periodDurationDays(left) ?? targetDays) - targetDays);
        const rightDistance = Math.abs((periodDurationDays(right) ?? targetDays) - targetDays);
        if (leftDistance !== rightDistance) {
          return leftDistance - rightDistance;
        }

        return left.id.localeCompare(right.id);
      })[0] ?? null;
    })();

    if (selected) {
      selectedByFilingId.set(selected.filingId, selected);
    }
  }

  const periods = [...selectedByFilingId.values()].sort(periodSorter);
  return {
    periods,
    // NOTE(review): ids are pooled across filings here — assumes period ids are unique
    // across filings; confirm against the code that materializes them.
    selectedPeriodIds: new Set<string>(periods.map((period) => period.id)),
    periodByFilingId: new Map<number, FinancialStatementPeriod>(
      periods.map((period): [number, FinancialStatementPeriod] => [period.filingId, period])
    )
  };
}

/** Convenience wrapper returning only the ordered primary periods. */
function buildPeriods(
  snapshots: FilingTaxonomySnapshotRecord[],
  statement: FinancialStatementKind
) {
  return selectPrimaryPeriods(snapshots, statement).periods;
}
/**
 * Groups dimensional facts by concept for the selected reporting periods.
 *
 * A fact contributes only when it has at least one dimension, its filing has a
 * selected period, and its own period matches that selected period: both start and
 * end for duration periods, or `periodInstant` (falling back to `periodEnd`) for
 * instant periods. One output row is produced per (fact, dimension) pair.
 *
 * @param facts facts as returned by `listTaxonomyFactsByTicker`.
 * @param periods the selected primary periods (one per filing).
 * @returns a record keyed by `conceptKey`, or `null` when nothing matched.
 */
function buildDimensionBreakdown(
  facts: Awaited<ReturnType<typeof listTaxonomyFactsByTicker>>['facts'],
  periods: FinancialStatementPeriod[]
) {
  const periodByFilingId = new Map<number, FinancialStatementPeriod>();
  for (const period of periods) {
    periodByFilingId.set(period.filingId, period);
  }

  const breakdown = new Map<string, DimensionBreakdownRow[]>();

  for (const fact of facts) {
    if (fact.dimensions.length === 0) {
      continue;
    }

    const period = periodByFilingId.get(fact.filingId) ?? null;
    if (!period) {
      continue;
    }

    const matchesPeriod = period.periodStart
      ? fact.periodStart === period.periodStart && fact.periodEnd === period.periodEnd
      : (fact.periodInstant ?? fact.periodEnd) === period.periodEnd;

    if (!matchesPeriod) {
      continue;
    }

    for (const dimension of fact.dimensions) {
      const row: DimensionBreakdownRow = {
        rowKey: fact.conceptKey,
        concept: fact.qname,
        periodId: period.id,
        axis: dimension.axis,
        member: dimension.member,
        value: fact.value,
        unit: fact.unit
      };

      const existing = breakdown.get(fact.conceptKey);
      if (existing) {
        existing.push(row);
      } else {
        breakdown.set(fact.conceptKey, [row]);
      }
    }
  }

  return breakdown.size > 0 ? Object.fromEntries(breakdown.entries()) : null;
}

/**
 * Returns the derived metrics and validation result of the first snapshot (in the
 * given order) that has `derived_metrics`; both fields are `null` when none does.
 */
function latestMetrics(snapshots: FilingTaxonomySnapshotRecord[]) {
  for (const snapshot of snapshots) {
    if (snapshot.derived_metrics) {
      return {
        taxonomy: snapshot.derived_metrics,
        validation: snapshot.validation_result
      };
    }
  }

  return {
    taxonomy: null,
    validation: null
  };
}

/** Default sync limit for a history window: 120 for `all`, otherwise 60. */
export function defaultFinancialSyncLimit(window: FinancialHistoryWindow) {
  return window === 'all' ? 120 : 60;
}
await listTaxonomyFactsByTicker({ + ticker, + window: input.window, + statement: input.statement, + cursor: input.factsCursor, + limit: input.factsLimit + }) + : { facts: [], nextCursor: null }; + + const dimensionFacts = input.includeDimensions + ? await listTaxonomyFactsByTicker({ + ticker, + window: input.window, + statement: input.statement, + limit: 1200 + }) + : { facts: [], nextCursor: null }; + + const latestFiling = filings[0] ?? null; + const metrics = latestMetrics(snapshotResult.snapshots); + const dimensionBreakdown = input.includeDimensions + ? buildDimensionBreakdown(dimensionFacts.facts, periods) + : null; + + const dimensionsCount = dimensionBreakdown + ? Object.values(dimensionBreakdown).reduce((total, entries) => total + entries.length, 0) + : 0; + + const factsCoverage = input.includeFacts + ? factsResult.facts.length + : snapshotResult.snapshots.reduce((total, snapshot) => total + snapshot.facts_count, 0); + + return { + company: { + ticker, + companyName: latestFiling?.company_name ?? ticker, + cik: latestFiling?.cik ?? null + }, + statement: input.statement, + window: input.window, + periods, + rows, + nextCursor: snapshotResult.nextCursor, + facts: input.includeFacts + ? 
{ + rows: factsResult.facts, + nextCursor: factsResult.nextCursor + } + : null, + coverage: { + filings: periods.length, + rows: rows.length, + dimensions: dimensionsCount, + facts: factsCoverage + }, + dataSourceStatus: { + enabled: input.v3Enabled, + hydratedFilings: statuses.ready, + partialFilings: statuses.partial, + failedFilings: statuses.failed, + pendingFilings: Math.max(0, financialFilings.length - statuses.ready - statuses.partial - statuses.failed), + queuedSync: input.queuedSync + }, + metrics, + dimensionBreakdown + }; +} + +export const __financialTaxonomyInternals = { + buildPeriods, + isInstantPeriod, + periodDurationDays, + selectPrimaryPeriods +}; diff --git a/lib/server/repos/filing-statements.ts b/lib/server/repos/filing-statements.ts index c808025..6d0e4ab 100644 --- a/lib/server/repos/filing-statements.ts +++ b/lib/server/repos/filing-statements.ts @@ -16,6 +16,7 @@ export type FilingStatementSnapshotPeriod = { filingId: number; accessionNumber: string; filingDate: string; + periodStart: string | null; periodEnd: string | null; filingType: '10-K' | '10-Q'; periodLabel: string; diff --git a/lib/server/repos/filing-taxonomy.ts b/lib/server/repos/filing-taxonomy.ts new file mode 100644 index 0000000..600eace --- /dev/null +++ b/lib/server/repos/filing-taxonomy.ts @@ -0,0 +1,676 @@ +import { and, desc, eq, gte, inArray, lt, sql } from 'drizzle-orm'; +import type { Filing, FinancialStatementKind, MetricValidationResult, TaxonomyDimensionMember, TaxonomyFactRow, TaxonomyStatementRow } from '@/lib/types'; +import { db } from '@/lib/server/db'; +import { + filingTaxonomyAsset, + filingTaxonomyConcept, + filingTaxonomyFact, + filingTaxonomyMetricValidation, + filingTaxonomySnapshot +} from '@/lib/server/db/schema'; + +export type FilingTaxonomyParseStatus = 'ready' | 'partial' | 'failed'; +export type FilingTaxonomySource = 'xbrl_instance' | 'xbrl_instance_with_linkbase' | 'legacy_html_fallback'; +export type FilingTaxonomyAssetType = + | 'instance' + 
| 'schema' + | 'presentation' + | 'label' + | 'calculation' + | 'definition' + | 'pdf' + | 'other'; + +export type FilingTaxonomyPeriod = { + id: string; + filingId: number; + accessionNumber: string; + filingDate: string; + periodStart: string | null; + periodEnd: string | null; + filingType: '10-K' | '10-Q'; + periodLabel: string; +}; + +export type FilingTaxonomySnapshotRecord = { + id: number; + filing_id: number; + ticker: string; + filing_date: string; + filing_type: '10-K' | '10-Q'; + parse_status: FilingTaxonomyParseStatus; + parse_error: string | null; + source: FilingTaxonomySource; + periods: FilingTaxonomyPeriod[]; + statement_rows: Record; + derived_metrics: Filing['metrics']; + validation_result: MetricValidationResult | null; + facts_count: number; + concepts_count: number; + dimensions_count: number; + created_at: string; + updated_at: string; +}; + +export type FilingTaxonomyAssetRecord = { + id: number; + snapshot_id: number; + asset_type: FilingTaxonomyAssetType; + name: string; + url: string; + size_bytes: number | null; + score: number | null; + is_selected: boolean; + created_at: string; +}; + +export type FilingTaxonomyConceptRecord = { + id: number; + snapshot_id: number; + concept_key: string; + qname: string; + namespace_uri: string; + local_name: string; + label: string | null; + is_extension: boolean; + statement_kind: FinancialStatementKind | null; + role_uri: string | null; + presentation_order: number | null; + presentation_depth: number | null; + parent_concept_key: string | null; + is_abstract: boolean; + created_at: string; +}; + +export type FilingTaxonomyFactRecord = { + id: number; + snapshot_id: number; + concept_key: string; + qname: string; + namespace_uri: string; + local_name: string; + statement_kind: FinancialStatementKind | null; + role_uri: string | null; + context_id: string; + unit: string | null; + decimals: string | null; + value_num: number; + period_start: string | null; + period_end: string | null; + 
period_instant: string | null; + dimensions: TaxonomyDimensionMember[]; + is_dimensionless: boolean; + source_file: string | null; + created_at: string; +}; + +export type FilingTaxonomyMetricValidationRecord = { + id: number; + snapshot_id: number; + metric_key: keyof NonNullable; + taxonomy_value: number | null; + llm_value: number | null; + absolute_diff: number | null; + relative_diff: number | null; + status: 'not_run' | 'matched' | 'mismatch' | 'error'; + evidence_pages: number[]; + pdf_url: string | null; + provider: string | null; + model: string | null; + error: string | null; + created_at: string; + updated_at: string; +}; + +export type UpsertFilingTaxonomySnapshotInput = { + filing_id: number; + ticker: string; + filing_date: string; + filing_type: '10-K' | '10-Q'; + parse_status: FilingTaxonomyParseStatus; + parse_error: string | null; + source: FilingTaxonomySource; + periods: FilingTaxonomyPeriod[]; + statement_rows: Record; + derived_metrics: Filing['metrics']; + validation_result: MetricValidationResult | null; + facts_count: number; + concepts_count: number; + dimensions_count: number; + assets: Array<{ + asset_type: FilingTaxonomyAssetType; + name: string; + url: string; + size_bytes: number | null; + score: number | null; + is_selected: boolean; + }>; + concepts: Array<{ + concept_key: string; + qname: string; + namespace_uri: string; + local_name: string; + label: string | null; + is_extension: boolean; + statement_kind: FinancialStatementKind | null; + role_uri: string | null; + presentation_order: number | null; + presentation_depth: number | null; + parent_concept_key: string | null; + is_abstract: boolean; + }>; + facts: Array<{ + concept_key: string; + qname: string; + namespace_uri: string; + local_name: string; + statement_kind: FinancialStatementKind | null; + role_uri: string | null; + context_id: string; + unit: string | null; + decimals: string | null; + value_num: number; + period_start: string | null; + period_end: string | null; + 
/** Returns the UTC calendar date ten years before now, formatted as `YYYY-MM-DD`. */
function tenYearsAgoIso() {
  const date = new Date();
  date.setUTCFullYear(date.getUTCFullYear() - 10);
  return date.toISOString().slice(0, 10);
}

/**
 * Coerces a numeric or numeric-text value to a finite number.
 *
 * @returns the finite number, or `null` for non-finite numbers, unparseable or
 *   blank strings, and any other input type.
 */
function asNumber(value: unknown) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }

  if (typeof value === 'string') {
    // `Number('')` and `Number('   ')` evaluate to 0; a blank string is "no value", not zero.
    if (value.trim() === '') {
      return null;
    }

    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }

  return null;
}

/** Serializes a finite number to text; `null` and non-finite values become `null`. */
function asNumericText(value: number | null) {
  if (value === null || !Number.isFinite(value)) {
    return null;
  }

  return String(value);
}

/** Creates a fresh statement-rows record with an empty list for every statement kind. */
function emptyStatementRows(): Record<FinancialStatementKind, TaxonomyStatementRow[]> {
  return {
    income: [],
    balance: [],
    cash_flow: [],
    equity: [],
    comprehensive_income: []
  };
}
null, + facts_count: row.facts_count, + concepts_count: row.concepts_count, + dimensions_count: row.dimensions_count, + created_at: row.created_at, + updated_at: row.updated_at + }; +} + +function toAssetRecord(row: typeof filingTaxonomyAsset.$inferSelect): FilingTaxonomyAssetRecord { + return { + id: row.id, + snapshot_id: row.snapshot_id, + asset_type: row.asset_type, + name: row.name, + url: row.url, + size_bytes: row.size_bytes, + score: asNumber(row.score), + is_selected: row.is_selected, + created_at: row.created_at + }; +} + +function toConceptRecord(row: typeof filingTaxonomyConcept.$inferSelect): FilingTaxonomyConceptRecord { + return { + id: row.id, + snapshot_id: row.snapshot_id, + concept_key: row.concept_key, + qname: row.qname, + namespace_uri: row.namespace_uri, + local_name: row.local_name, + label: row.label, + is_extension: row.is_extension, + statement_kind: row.statement_kind ?? null, + role_uri: row.role_uri, + presentation_order: asNumber(row.presentation_order), + presentation_depth: row.presentation_depth, + parent_concept_key: row.parent_concept_key, + is_abstract: row.is_abstract, + created_at: row.created_at + }; +} + +function toFactRecord(row: typeof filingTaxonomyFact.$inferSelect): FilingTaxonomyFactRecord { + const value = asNumber(row.value_num); + if (value === null) { + throw new Error(`Invalid value_num for taxonomy fact row ${row.id}`); + } + + return { + id: row.id, + snapshot_id: row.snapshot_id, + concept_key: row.concept_key, + qname: row.qname, + namespace_uri: row.namespace_uri, + local_name: row.local_name, + statement_kind: row.statement_kind ?? 
null, + role_uri: row.role_uri, + context_id: row.context_id, + unit: row.unit, + decimals: row.decimals, + value_num: value, + period_start: row.period_start, + period_end: row.period_end, + period_instant: row.period_instant, + dimensions: row.dimensions, + is_dimensionless: row.is_dimensionless, + source_file: row.source_file, + created_at: row.created_at + }; +} + +function toMetricValidationRecord(row: typeof filingTaxonomyMetricValidation.$inferSelect): FilingTaxonomyMetricValidationRecord { + return { + id: row.id, + snapshot_id: row.snapshot_id, + metric_key: row.metric_key, + taxonomy_value: asNumber(row.taxonomy_value), + llm_value: asNumber(row.llm_value), + absolute_diff: asNumber(row.absolute_diff), + relative_diff: asNumber(row.relative_diff), + status: row.status, + evidence_pages: row.evidence_pages ?? [], + pdf_url: row.pdf_url, + provider: row.provider, + model: row.model, + error: row.error, + created_at: row.created_at, + updated_at: row.updated_at + }; +} + +export async function getFilingTaxonomySnapshotByFilingId(filingId: number) { + const [row] = await db + .select() + .from(filingTaxonomySnapshot) + .where(eq(filingTaxonomySnapshot.filing_id, filingId)) + .limit(1); + + return row ? 
toSnapshotRecord(row) : null; +} + +export async function listFilingTaxonomyAssets(snapshotId: number) { + const rows = await db + .select() + .from(filingTaxonomyAsset) + .where(eq(filingTaxonomyAsset.snapshot_id, snapshotId)) + .orderBy(desc(filingTaxonomyAsset.id)); + + return rows.map(toAssetRecord); +} + +export async function listFilingTaxonomyConcepts(snapshotId: number) { + const rows = await db + .select() + .from(filingTaxonomyConcept) + .where(eq(filingTaxonomyConcept.snapshot_id, snapshotId)) + .orderBy(desc(filingTaxonomyConcept.id)); + + return rows.map(toConceptRecord); +} + +export async function listFilingTaxonomyFacts(snapshotId: number) { + const rows = await db + .select() + .from(filingTaxonomyFact) + .where(eq(filingTaxonomyFact.snapshot_id, snapshotId)) + .orderBy(desc(filingTaxonomyFact.id)); + + return rows.map(toFactRecord); +} + +export async function listFilingTaxonomyMetricValidations(snapshotId: number) { + const rows = await db + .select() + .from(filingTaxonomyMetricValidation) + .where(eq(filingTaxonomyMetricValidation.snapshot_id, snapshotId)) + .orderBy(desc(filingTaxonomyMetricValidation.id)); + + return rows.map(toMetricValidationRecord); +} + +export async function upsertFilingTaxonomySnapshot(input: UpsertFilingTaxonomySnapshotInput) { + const now = new Date().toISOString(); + + const [saved] = await db + .insert(filingTaxonomySnapshot) + .values({ + filing_id: input.filing_id, + ticker: input.ticker, + filing_date: input.filing_date, + filing_type: input.filing_type, + parse_status: input.parse_status, + parse_error: input.parse_error, + source: input.source, + periods: input.periods, + statement_rows: input.statement_rows, + derived_metrics: input.derived_metrics, + validation_result: input.validation_result, + facts_count: input.facts_count, + concepts_count: input.concepts_count, + dimensions_count: input.dimensions_count, + created_at: now, + updated_at: now + }) + .onConflictDoUpdate({ + target: 
filingTaxonomySnapshot.filing_id, + set: { + ticker: input.ticker, + filing_date: input.filing_date, + filing_type: input.filing_type, + parse_status: input.parse_status, + parse_error: input.parse_error, + source: input.source, + periods: input.periods, + statement_rows: input.statement_rows, + derived_metrics: input.derived_metrics, + validation_result: input.validation_result, + facts_count: input.facts_count, + concepts_count: input.concepts_count, + dimensions_count: input.dimensions_count, + updated_at: now + } + }) + .returning(); + + const snapshotId = saved.id; + + await db.delete(filingTaxonomyAsset).where(eq(filingTaxonomyAsset.snapshot_id, snapshotId)); + await db.delete(filingTaxonomyConcept).where(eq(filingTaxonomyConcept.snapshot_id, snapshotId)); + await db.delete(filingTaxonomyFact).where(eq(filingTaxonomyFact.snapshot_id, snapshotId)); + await db.delete(filingTaxonomyMetricValidation).where(eq(filingTaxonomyMetricValidation.snapshot_id, snapshotId)); + + if (input.assets.length > 0) { + await db.insert(filingTaxonomyAsset).values(input.assets.map((asset) => ({ + snapshot_id: snapshotId, + asset_type: asset.asset_type, + name: asset.name, + url: asset.url, + size_bytes: asset.size_bytes, + score: asNumericText(asset.score), + is_selected: asset.is_selected, + created_at: now + }))); + } + + if (input.concepts.length > 0) { + await db.insert(filingTaxonomyConcept).values(input.concepts.map((concept) => ({ + snapshot_id: snapshotId, + concept_key: concept.concept_key, + qname: concept.qname, + namespace_uri: concept.namespace_uri, + local_name: concept.local_name, + label: concept.label, + is_extension: concept.is_extension, + statement_kind: concept.statement_kind, + role_uri: concept.role_uri, + presentation_order: asNumericText(concept.presentation_order), + presentation_depth: concept.presentation_depth, + parent_concept_key: concept.parent_concept_key, + is_abstract: concept.is_abstract, + created_at: now + }))); + } + + if (input.facts.length > 
/**
 * Decodes a snapshot pagination cursor.
 *
 * Two formats are accepted: the composite `"<id>@<filing_date>"` form produced by
 * `formatSnapshotCursor`, and a bare positive integer id (legacy cursors).
 *
 * @returns the parsed cursor, or `null` when the input is empty or its id is not a
 *   positive integer.
 */
function parseSnapshotCursor(cursor: string | null | undefined) {
  if (!cursor) {
    return null;
  }

  const separatorIndex = cursor.indexOf('@');
  const idText = separatorIndex === -1 ? cursor : cursor.slice(0, separatorIndex);
  const id = Number.parseInt(idText, 10);
  if (!Number.isFinite(id) || id <= 0) {
    return null;
  }

  const filingDate = separatorIndex === -1 ? '' : cursor.slice(separatorIndex + 1);
  return { id, filingDate: filingDate.length > 0 ? filingDate : null };
}

/** Encodes the last row of a page as a composite `"<id>@<filing_date>"` cursor. */
function formatSnapshotCursor(row: { id: number; filing_date: string }) {
  return `${row.id}@${row.filing_date}`;
}

/**
 * Lists taxonomy snapshots for a ticker, newest filing first, with keyset pagination.
 *
 * Rows are ordered by `(filing_date desc, id desc)`. The cursor therefore carries both
 * columns: filtering on `id` alone would repeat or skip rows whenever id order differs
 * from filing-date order. A legacy id-only cursor still applies the old `id < cursor`
 * filter.
 *
 * @param input.ticker ticker symbol; trimmed and upper-cased before matching.
 * @param input.window `'10y'` restricts to filings dated within the last ten years.
 * @param input.limit page size; defaults to 40 and is clamped to 1..120.
 * @param input.cursor value of `nextCursor` from the previous page, if any.
 * @returns the page of snapshots and `nextCursor` (`null` when there are no more rows).
 */
export async function listFilingTaxonomySnapshotsByTicker(input: {
  ticker: string;
  window: '10y' | 'all';
  limit?: number;
  cursor?: string | null;
}) {
  const safeLimit = Math.min(Math.max(Math.trunc(input.limit ?? 40), 1), 120);
  const cursor = parseSnapshotCursor(input.cursor);
  const constraints = [eq(filingTaxonomySnapshot.ticker, input.ticker.trim().toUpperCase())];

  if (input.window === '10y') {
    constraints.push(gte(filingTaxonomySnapshot.filing_date, tenYearsAgoIso()));
  }

  if (cursor) {
    constraints.push(
      cursor.filingDate === null
        ? lt(filingTaxonomySnapshot.id, cursor.id)
        // Strictly "after" the cursor row in (filing_date desc, id desc) order.
        : sql`(${filingTaxonomySnapshot.filing_date} < ${cursor.filingDate} or (${filingTaxonomySnapshot.filing_date} = ${cursor.filingDate} and ${filingTaxonomySnapshot.id} < ${cursor.id}))`
    );
  }

  // Fetch one extra row to learn whether another page exists.
  const rows = await db
    .select()
    .from(filingTaxonomySnapshot)
    .where(and(...constraints))
    .orderBy(desc(filingTaxonomySnapshot.filing_date), desc(filingTaxonomySnapshot.id))
    .limit(safeLimit + 1);

  const hasMore = rows.length > safeLimit;
  const usedRows = hasMore ? rows.slice(0, safeLimit) : rows;
  const lastRow = usedRows[usedRows.length - 1];
  const nextCursor = hasMore && lastRow ? formatSnapshotCursor(lastRow) : null;

  return {
    snapshots: usedRows.map(toSnapshotRecord),
    nextCursor
  };
}

/**
 * Counts a ticker's taxonomy snapshots per parse status.
 *
 * @returns a record with `ready`, `partial` and `failed` counts; statuses with no
 *   snapshots stay at 0.
 */
export async function countFilingTaxonomySnapshotStatuses(ticker: string) {
  const rows = await db
    .select({
      status: filingTaxonomySnapshot.parse_status,
      count: sql<number>`count(*)`
    })
    .from(filingTaxonomySnapshot)
    .where(eq(filingTaxonomySnapshot.ticker, ticker.trim().toUpperCase()))
    .groupBy(filingTaxonomySnapshot.parse_status);

  return rows.reduce<Record<FilingTaxonomyParseStatus, number>>((acc, row) => {
    // Coerce with Number() in case the driver returns the aggregate as text.
    acc[row.status] = Number(row.count);
    return acc;
  }, {
    ready: 0,
    partial: 0,
    failed: 0
  });
}
Number.parseInt(input.cursor, 10) : null; + const conditions = [eq(filingTaxonomySnapshot.ticker, input.ticker.trim().toUpperCase())]; + + if (input.window === '10y') { + conditions.push(gte(filingTaxonomySnapshot.filing_date, tenYearsAgoIso())); + } + + if (input.statement) { + conditions.push(eq(filingTaxonomyFact.statement_kind, input.statement)); + } + + if (cursorId && Number.isFinite(cursorId) && cursorId > 0) { + conditions.push(lt(filingTaxonomyFact.id, cursorId)); + } + + const rows = await db + .select({ + id: filingTaxonomyFact.id, + snapshot_id: filingTaxonomyFact.snapshot_id, + filing_id: filingTaxonomySnapshot.filing_id, + filing_date: filingTaxonomySnapshot.filing_date, + statement_kind: filingTaxonomyFact.statement_kind, + role_uri: filingTaxonomyFact.role_uri, + concept_key: filingTaxonomyFact.concept_key, + qname: filingTaxonomyFact.qname, + namespace_uri: filingTaxonomyFact.namespace_uri, + local_name: filingTaxonomyFact.local_name, + value_num: filingTaxonomyFact.value_num, + context_id: filingTaxonomyFact.context_id, + unit: filingTaxonomyFact.unit, + decimals: filingTaxonomyFact.decimals, + period_start: filingTaxonomyFact.period_start, + period_end: filingTaxonomyFact.period_end, + period_instant: filingTaxonomyFact.period_instant, + dimensions: filingTaxonomyFact.dimensions, + is_dimensionless: filingTaxonomyFact.is_dimensionless, + source_file: filingTaxonomyFact.source_file + }) + .from(filingTaxonomyFact) + .innerJoin(filingTaxonomySnapshot, eq(filingTaxonomyFact.snapshot_id, filingTaxonomySnapshot.id)) + .where(and(...conditions)) + .orderBy(desc(filingTaxonomyFact.id)) + .limit(safeLimit + 1); + + const hasMore = rows.length > safeLimit; + const used = hasMore ? rows.slice(0, safeLimit) : rows; + const nextCursor = hasMore ? String(used[used.length - 1]?.id ?? 
'') : null; + + const facts: TaxonomyFactRow[] = used.map((row) => { + const value = asNumber(row.value_num); + if (value === null) { + throw new Error(`Invalid value_num in taxonomy fact ${row.id}`); + } + + return { + id: row.id, + snapshotId: row.snapshot_id, + filingId: row.filing_id, + filingDate: row.filing_date, + statement: row.statement_kind, + roleUri: row.role_uri, + conceptKey: row.concept_key, + qname: row.qname, + namespaceUri: row.namespace_uri, + localName: row.local_name, + value, + contextId: row.context_id, + unit: row.unit, + decimals: row.decimals, + periodStart: row.period_start, + periodEnd: row.period_end, + periodInstant: row.period_instant, + dimensions: row.dimensions, + isDimensionless: row.is_dimensionless, + sourceFile: row.source_file + }; + }); + + return { + facts, + nextCursor + }; +} + +export async function listTaxonomyAssetsBySnapshotIds(snapshotIds: number[]) { + if (snapshotIds.length === 0) { + return []; + } + + const rows = await db + .select() + .from(filingTaxonomyAsset) + .where(inArray(filingTaxonomyAsset.snapshot_id, snapshotIds)) + .orderBy(desc(filingTaxonomyAsset.id)); + + return rows.map(toAssetRecord); +} diff --git a/lib/server/repos/filings.ts b/lib/server/repos/filings.ts index 4ff8e04..a44088e 100644 --- a/lib/server/repos/filings.ts +++ b/lib/server/repos/filings.ts @@ -170,3 +170,19 @@ export async function saveFilingAnalysis( return updated ? toFiling(updated) : null; } + +export async function updateFilingMetricsById( + filingId: number, + metrics: Filing['metrics'] +) { + const [updated] = await db + .update(filing) + .set({ + metrics, + updated_at: new Date().toISOString() + }) + .where(eq(filing.id, filingId)) + .returning(); + + return updated ? 
toFiling(updated) : null; +} diff --git a/lib/server/sec.ts b/lib/server/sec.ts index b712944..58b945e 100644 --- a/lib/server/sec.ts +++ b/lib/server/sec.ts @@ -1378,6 +1378,7 @@ export async function hydrateFilingStatementSnapshot( filingId: input.filingId, accessionNumber: input.accessionNumber, filingDate: input.filingDate, + periodStart: null, periodEnd: input.filingDate, filingType: input.filingType, periodLabel: input.filingType === '10-Q' ? 'Quarter End' : 'Fiscal Year End' diff --git a/lib/server/task-processors.ts b/lib/server/task-processors.ts index 8ed0295..4a1c76e 100644 --- a/lib/server/task-processors.ts +++ b/lib/server/task-processors.ts @@ -13,12 +13,13 @@ import { getFilingByAccession, listFilingsRecords, saveFilingAnalysis, + updateFilingMetricsById, upsertFilingsRecords } from '@/lib/server/repos/filings'; import { - getFilingStatementSnapshotByFilingId, - upsertFilingStatementSnapshot -} from '@/lib/server/repos/filing-statements'; + getFilingTaxonomySnapshotByFilingId, + upsertFilingTaxonomySnapshot +} from '@/lib/server/repos/filing-taxonomy'; import { applyRefreshedPrices, listHoldingsForPriceRefresh, @@ -27,11 +28,10 @@ import { import { createPortfolioInsight } from '@/lib/server/repos/insights'; import { updateTaskStage } from '@/lib/server/repos/tasks'; import { - fetchFilingMetricsForFilings, fetchPrimaryFilingText, - fetchRecentFilings, - hydrateFilingStatementSnapshot + fetchRecentFilings } from '@/lib/server/sec'; +import { hydrateFilingTaxonomySnapshot } from '@/lib/server/taxonomy/engine'; const EXTRACTION_REQUIRED_KEYS = [ 'summary', @@ -88,6 +88,10 @@ const COMPANY_SPECIFIC_PATTERNS = [ type FilingMetricKey = keyof NonNullable; +function isFinancialMetricsForm(filingType: string): filingType is '10-K' | '10-Q' { + return filingType === '10-K' || filingType === '10-Q'; +} + const METRIC_CHECK_PATTERNS: Array<{ key: FilingMetricKey; label: string; @@ -120,10 +124,6 @@ const METRIC_CHECK_PATTERNS: Array<{ } ]; -function 
isFinancialMetricsForm(form: Filing['filing_type']) { - return form === '10-K' || form === '10-Q'; -} - function toTaskResult(value: unknown): Record { if (!value || typeof value !== 'object' || Array.isArray(value)) { return { value }; @@ -565,40 +565,6 @@ async function processSyncFilings(task: Task) { `Fetching up to ${limit} filings for ${ticker}${scopeLabel ? ` (${scopeLabel})` : ''}` ); const filings = await fetchRecentFilings(ticker, limit); - const metricsByAccession = new Map(); - const filingsByCik = new Map(); - - for (const filing of filings) { - const group = filingsByCik.get(filing.cik); - if (group) { - group.push(filing); - continue; - } - - filingsByCik.set(filing.cik, [filing]); - } - - await setProjectionStage(task, 'sync.fetch_metrics', `Computing financial metrics for ${filings.length} filings`); - for (const [cik, filingsForCik] of filingsByCik) { - const filingsForFinancialMetrics = filingsForCik.filter((filing) => isFinancialMetricsForm(filing.filingType)); - if (filingsForFinancialMetrics.length === 0) { - continue; - } - - const metricsMap = await fetchFilingMetricsForFilings( - cik, - filingsForCik[0]?.ticker ?? ticker, - filingsForFinancialMetrics.map((filing) => ({ - accessionNumber: filing.accessionNumber, - filingDate: filing.filingDate, - filingType: filing.filingType - })) - ); - - for (const [accessionNumber, metrics] of metricsMap.entries()) { - metricsByAccession.set(accessionNumber, metrics); - } - } await setProjectionStage(task, 'sync.persist_filings', 'Persisting filings and links'); const saveResult = await upsertFilingsRecords( @@ -612,24 +578,24 @@ async function processSyncFilings(task: Task) { filing_url: filing.filingUrl, submission_url: filing.submissionUrl, primary_document: filing.primaryDocument, - metrics: metricsByAccession.get(filing.accessionNumber) ?? 
null, + metrics: null, links: filingLinks(filing) })) ); - let statementSnapshotsHydrated = 0; - let statementSnapshotsFailed = 0; + let taxonomySnapshotsHydrated = 0; + let taxonomySnapshotsFailed = 0; const hydrateCandidates = (await listFilingsRecords({ ticker, limit: Math.min(Math.max(limit * 3, 40), STATEMENT_HYDRATION_MAX_FILINGS) })) .filter((filing): filing is Filing & { filing_type: '10-K' | '10-Q' } => { - return filing.filing_type === '10-K' || filing.filing_type === '10-Q'; + return isFinancialMetricsForm(filing.filing_type); }); - await setProjectionStage(task, 'sync.hydrate_statements', `Hydrating statement snapshots for ${hydrateCandidates.length} candidate filings`); + await setProjectionStage(task, 'sync.discover_assets', `Discovering taxonomy assets for ${hydrateCandidates.length} candidate filings`); for (const filing of hydrateCandidates) { - const existingSnapshot = await getFilingStatementSnapshotByFilingId(filing.id); + const existingSnapshot = await getFilingTaxonomySnapshotByFilingId(filing.id); const shouldRefresh = !existingSnapshot || Date.parse(existingSnapshot.updated_at) < Date.parse(filing.updated_at); @@ -638,7 +604,8 @@ async function processSyncFilings(task: Task) { } try { - const snapshot = await hydrateFilingStatementSnapshot({ + await setProjectionStage(task, 'sync.extract_taxonomy', `Extracting XBRL taxonomy for ${filing.accession_number}`); + const snapshot = await hydrateFilingTaxonomySnapshot({ filingId: filing.id, ticker: filing.ticker, cik: filing.cik, @@ -646,27 +613,50 @@ async function processSyncFilings(task: Task) { filingDate: filing.filing_date, filingType: filing.filing_type, filingUrl: filing.filing_url, - primaryDocument: filing.primary_document ?? null, - metrics: filing.metrics + primaryDocument: filing.primary_document ?? 
null }); - await upsertFilingStatementSnapshot(snapshot); - statementSnapshotsHydrated += 1; + await setProjectionStage(task, 'sync.normalize_taxonomy', `Materializing statements for ${filing.accession_number}`); + await setProjectionStage(task, 'sync.derive_metrics', `Deriving taxonomy metrics for ${filing.accession_number}`); + await setProjectionStage(task, 'sync.validate_pdf_metrics', `Validating metrics via PDF + LLM for ${filing.accession_number}`); + await setProjectionStage(task, 'sync.persist_taxonomy', `Persisting taxonomy snapshot for ${filing.accession_number}`); + + await upsertFilingTaxonomySnapshot(snapshot); + await updateFilingMetricsById(filing.id, snapshot.derived_metrics); + taxonomySnapshotsHydrated += 1; } catch (error) { - await upsertFilingStatementSnapshot({ + const now = new Date().toISOString(); + await upsertFilingTaxonomySnapshot({ filing_id: filing.id, ticker: filing.ticker, filing_date: filing.filing_date, filing_type: filing.filing_type, - period_end: filing.filing_date, - statement_bundle: null, - standardized_bundle: null, - dimension_bundle: null, parse_status: 'failed', - parse_error: error instanceof Error ? error.message : 'Statement hydration failed', - source: 'companyfacts_fallback' + parse_error: error instanceof Error ? error.message : 'Taxonomy hydration failed', + source: 'legacy_html_fallback', + periods: [], + statement_rows: { + income: [], + balance: [], + cash_flow: [], + equity: [], + comprehensive_income: [] + }, + derived_metrics: filing.metrics ?? 
null, + validation_result: { + status: 'error', + checks: [], + validatedAt: now + }, + facts_count: 0, + concepts_count: 0, + dimensions_count: 0, + assets: [], + concepts: [], + facts: [], + metric_validations: [] }); - statementSnapshotsFailed += 1; + taxonomySnapshotsFailed += 1; } await Bun.sleep(STATEMENT_HYDRATION_DELAY_MS); @@ -679,8 +669,8 @@ async function processSyncFilings(task: Task) { fetched: filings.length, inserted: saveResult.inserted, updated: saveResult.updated, - statementSnapshotsHydrated, - statementSnapshotsFailed + taxonomySnapshotsHydrated, + taxonomySnapshotsFailed }; } diff --git a/lib/server/taxonomy/asset-discovery.test.ts b/lib/server/taxonomy/asset-discovery.test.ts new file mode 100644 index 0000000..3dc83be --- /dev/null +++ b/lib/server/taxonomy/asset-discovery.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it } from 'bun:test'; +import { discoverFilingAssets } from '@/lib/server/taxonomy/asset-discovery'; + +describe('taxonomy asset discovery', () => { + it('classifies assets and selects ranked instance/pdf candidates', async () => { + const fetchImpl = (async () => { + return new Response(JSON.stringify({ + directory: { + item: [ + { name: 'abc_htm.xml', size: '900000' }, + { name: 'abc_pre.xml', size: '250000' }, + { name: 'abc_lab.xml', size: '120000' }, + { name: '10k_financial_statements.pdf', size: '400000' }, + { name: 'annual_report.pdf', size: '300000' }, + { name: 'quarter_statement.pdf', size: '200000' }, + { name: 'exhibit99.pdf', size: '500000' } + ] + } + }), { + status: 200, + headers: { + 'content-type': 'application/json' + } + }); + }) as unknown as typeof fetch; + + const result = await discoverFilingAssets({ + cik: '0000123456', + accessionNumber: '0000123456-26-000001', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/abc.htm', + primaryDocument: 'abc.htm', + fetchImpl + }); + + 
expect(result.directoryUrl).toBe('https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/'); + + const selectedInstance = result.assets.find((asset) => asset.asset_type === 'instance' && asset.is_selected); + expect(selectedInstance?.name).toBe('abc_htm.xml'); + + const selectedPdfs = result.assets + .filter((asset) => asset.asset_type === 'pdf' && asset.is_selected) + .map((asset) => asset.name); + expect(selectedPdfs.length).toBe(3); + expect(selectedPdfs).toContain('10k_financial_statements.pdf'); + expect(selectedPdfs).toContain('annual_report.pdf'); + expect(selectedPdfs).toContain('quarter_statement.pdf'); + expect(selectedPdfs).not.toContain('exhibit99.pdf'); + }); + + it('falls back to filing url when SEC directory assets are unavailable', async () => { + const fetchImpl = (async () => { + return new Response('not found', { status: 404 }); + }) as unknown as typeof fetch; + + const result = await discoverFilingAssets({ + cik: '0000123456', + accessionNumber: '0000123456-26-000001', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/abc.xml', + primaryDocument: 'abc.xml', + fetchImpl + }); + + expect(result.assets.length).toBe(1); + expect(result.assets[0]).toEqual({ + asset_type: 'instance', + name: 'abc.xml', + url: 'https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/abc.xml', + size_bytes: null, + score: 6, + is_selected: true + }); + }); +}); diff --git a/lib/server/taxonomy/asset-discovery.ts b/lib/server/taxonomy/asset-discovery.ts new file mode 100644 index 0000000..9d1fec4 --- /dev/null +++ b/lib/server/taxonomy/asset-discovery.ts @@ -0,0 +1,283 @@ +import type { TaxonomyAsset } from '@/lib/server/taxonomy/types'; + +type FilingAssetDiscoveryInput = { + cik: string; + accessionNumber: string; + filingUrl: string | null; + primaryDocument: string | null; + fetchImpl?: typeof fetch; +}; + +type FilingDirectoryJson = { + directory?: { + item?: Array<{ + name?: string; + type?: string; + size?: 
string | number; + }>; + }; +}; + +function envUserAgent() { + return process.env.SEC_USER_AGENT || 'Fiscal Clone '; +} + +function compactAccessionNumber(value: string) { + return value.replace(/-/g, ''); +} + +function normalizeCikForPath(value: string) { + const digits = value.replace(/\D/g, ''); + if (!digits) { + return null; + } + + const numeric = Number(digits); + if (!Number.isFinite(numeric)) { + return null; + } + + return String(numeric); +} + +function resolveFilingDirectoryUrl(input: { + filingUrl: string | null; + cik: string; + accessionNumber: string; +}) { + const direct = input.filingUrl?.trim(); + if (direct) { + const lastSlash = direct.lastIndexOf('/'); + if (lastSlash > 'https://'.length) { + return direct.slice(0, lastSlash + 1); + } + } + + const cikPath = normalizeCikForPath(input.cik); + const accessionPath = compactAccessionNumber(input.accessionNumber); + if (!cikPath || !accessionPath) { + return null; + } + + return `https://www.sec.gov/Archives/edgar/data/${cikPath}/${accessionPath}/`; +} + +function classifyAssetType(name: string): TaxonomyAsset['asset_type'] { + const lower = name.toLowerCase(); + + if (lower.endsWith('.pdf')) { + return 'pdf'; + } + + if (lower.endsWith('.xsd')) { + return 'schema'; + } + + if (lower.endsWith('.xml')) { + if (/(_|-)pre\.xml$/.test(lower) || /presentation/.test(lower)) { + return 'presentation'; + } + + if (/(_|-)lab\.xml$/.test(lower) || /label/.test(lower)) { + return 'label'; + } + + if (/(_|-)cal\.xml$/.test(lower) || /calculation/.test(lower)) { + return 'calculation'; + } + + if (/(_|-)def\.xml$/.test(lower) || /definition/.test(lower)) { + return 'definition'; + } + + return 'instance'; + } + + return 'other'; +} + +function scorePdf(name: string, sizeBytes: number | null) { + const lower = name.toLowerCase(); + let score = 0; + + if (/financial|statement|annual|quarter|10k|10q/.test(lower)) { + score += 8; + } + + if (/exhibit|ex-\d+/.test(lower)) { + score -= 2; + } + + if (sizeBytes && 
sizeBytes > 100_000) { + score += 1; + } + + return score; +} + +function scoreInstance(name: string, primaryDocument: string | null) { + const lower = name.toLowerCase(); + let score = 1; + + if (/_htm\.xml$/.test(lower)) { + score += 4; + } + + if (/_ins\.xml$/.test(lower)) { + score += 4; + } + + const basePrimary = (primaryDocument ?? '').replace(/\.[a-z0-9]+$/i, '').toLowerCase(); + if (basePrimary && lower.includes(basePrimary)) { + score += 5; + } + + if (/cal|def|lab|pre/.test(lower)) { + score -= 3; + } + + return score; +} + +function parseSize(raw: unknown) { + if (typeof raw === 'number') { + return Number.isFinite(raw) ? raw : null; + } + + if (typeof raw === 'string') { + const parsed = Number(raw); + return Number.isFinite(parsed) ? parsed : null; + } + + return null; +} + +async function fetchJson(url: string, fetchImpl: typeof fetch): Promise { + const response = await fetchImpl(url, { + headers: { + 'User-Agent': envUserAgent(), + Accept: 'application/json' + }, + cache: 'no-store' + }); + + if (!response.ok) { + throw new Error(`SEC request failed (${response.status})`); + } + + return await response.json() as T; +} + +export async function discoverFilingAssets(input: FilingAssetDiscoveryInput): Promise<{ + directoryUrl: string | null; + assets: TaxonomyAsset[]; +}> { + const fetchImpl = input.fetchImpl ?? fetch; + const directoryUrl = resolveFilingDirectoryUrl({ + filingUrl: input.filingUrl, + cik: input.cik, + accessionNumber: input.accessionNumber + }); + + if (!directoryUrl) { + return { + directoryUrl: null, + assets: [] + }; + } + + let payload: FilingDirectoryJson | null = null; + try { + payload = await fetchJson(`${directoryUrl}index.json`, fetchImpl); + } catch { + payload = null; + } + + const discovered: TaxonomyAsset[] = []; + for (const item of payload?.directory?.item ?? []) { + const name = (item.name ?? 
'').trim(); + if (!name) { + continue; + } + + const url = `${directoryUrl}${name.replace(/^\/+/, '')}`; + const asset_type = classifyAssetType(name); + const size_bytes = parseSize(item.size); + + discovered.push({ + asset_type, + name, + url, + size_bytes, + score: null, + is_selected: false + }); + } + + if (discovered.length === 0 && input.filingUrl) { + const fallbackName = input.primaryDocument ?? input.filingUrl.split('/').pop() ?? 'primary_document'; + discovered.push({ + asset_type: fallbackName.toLowerCase().endsWith('.xml') ? 'instance' : 'other', + name: fallbackName, + url: input.filingUrl, + size_bytes: null, + score: null, + is_selected: true + }); + } + + const instanceCandidates = discovered + .filter((asset) => asset.asset_type === 'instance') + .map((asset) => ({ + asset, + score: scoreInstance(asset.name, input.primaryDocument) + })) + .sort((a, b) => b.score - a.score); + + const selectedInstanceUrl = instanceCandidates[0]?.asset.url ?? null; + + const selectedPdfUrls = discovered + .filter((asset) => asset.asset_type === 'pdf') + .map((asset) => ({ + asset, + score: scorePdf(asset.name, asset.size_bytes) + })) + .sort((a, b) => b.score - a.score) + .slice(0, 3) + .map((entry) => entry.asset.url); + + const assets = discovered.map((asset) => { + if (asset.asset_type === 'instance') { + const score = scoreInstance(asset.name, input.primaryDocument); + return { + ...asset, + score, + is_selected: asset.url === selectedInstanceUrl + }; + } + + if (asset.asset_type === 'pdf') { + const score = scorePdf(asset.name, asset.size_bytes); + return { + ...asset, + score, + is_selected: selectedPdfUrls.includes(asset.url) + }; + } + + return { + ...asset, + score: null, + is_selected: asset.asset_type === 'presentation' + || asset.asset_type === 'label' + || asset.asset_type === 'calculation' + || asset.asset_type === 'definition' + || asset.asset_type === 'schema' + }; + }); + + return { + directoryUrl, + assets + }; +} diff --git 
a/lib/server/taxonomy/engine.ts b/lib/server/taxonomy/engine.ts new file mode 100644 index 0000000..e27e2b2 --- /dev/null +++ b/lib/server/taxonomy/engine.ts @@ -0,0 +1,185 @@ +import type { FinancialStatementKind } from '@/lib/types'; +import { discoverFilingAssets } from '@/lib/server/taxonomy/asset-discovery'; +import { parseLabelLinkbase, parsePresentationLinkbase } from '@/lib/server/taxonomy/linkbase-parser'; +import { deriveTaxonomyMetrics } from '@/lib/server/taxonomy/metrics'; +import { materializeTaxonomyStatements } from '@/lib/server/taxonomy/materialize'; +import { validateMetricsWithPdfLlm } from '@/lib/server/taxonomy/pdf-validation'; +import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types'; +import { parseXbrlInstance } from '@/lib/server/taxonomy/xbrl-parser'; + +function createStatementRecord(factory: () => T): Record { + return { + income: factory(), + balance: factory(), + cash_flow: factory(), + equity: factory(), + comprehensive_income: factory() + }; +} + +function envUserAgent() { + return process.env.SEC_USER_AGENT || 'Fiscal Clone '; +} + +async function fetchText(url: string, fetchImpl: typeof fetch) { + const response = await fetchImpl(url, { + headers: { + 'User-Agent': envUserAgent(), + Accept: 'text/xml, text/plain, text/html;q=0.8, */*;q=0.5' + }, + cache: 'no-store' + }); + + if (!response.ok) { + throw new Error(`SEC request failed (${response.status})`); + } + + return await response.text(); +} + +export async function hydrateFilingTaxonomySnapshot( + input: TaxonomyHydrationInput, + options?: { + fetchImpl?: typeof fetch; + } +): Promise { + const fetchImpl = options?.fetchImpl ?? 
fetch; + + const discovered = await discoverFilingAssets({ + cik: input.cik, + accessionNumber: input.accessionNumber, + filingUrl: input.filingUrl, + primaryDocument: input.primaryDocument, + fetchImpl + }); + + const emptyResult: TaxonomyHydrationResult = { + filing_id: input.filingId, + ticker: input.ticker.trim().toUpperCase(), + filing_date: input.filingDate, + filing_type: input.filingType, + parse_status: 'failed', + parse_error: 'No XBRL instance found', + source: 'legacy_html_fallback', + periods: [], + statement_rows: createStatementRecord(() => []), + derived_metrics: null, + validation_result: { + status: 'not_run', + checks: [], + validatedAt: null + }, + facts_count: 0, + concepts_count: 0, + dimensions_count: 0, + assets: discovered.assets, + concepts: [], + facts: [], + metric_validations: [] + }; + + const selectedInstance = discovered.assets.find((asset) => asset.asset_type === 'instance' && asset.is_selected) + ?? discovered.assets.find((asset) => asset.asset_type === 'instance') + ?? null; + + if (!selectedInstance) { + return emptyResult; + } + + let parseError: string | null = null; + let source: TaxonomyHydrationResult['source'] = 'xbrl_instance'; + + let instanceText = ''; + try { + instanceText = await fetchText(selectedInstance.url, fetchImpl); + } catch (error) { + parseError = error instanceof Error ? 
error.message : 'Unable to fetch instance file'; + return { + ...emptyResult, + parse_error: parseError + }; + } + + const parsedInstance = parseXbrlInstance(instanceText, selectedInstance.name); + + const labelByConcept = new Map(); + const presentation: ReturnType = []; + + for (const asset of discovered.assets) { + if (!asset.is_selected) { + continue; + } + + if (asset.asset_type !== 'presentation' && asset.asset_type !== 'label') { + continue; + } + + try { + const content = await fetchText(asset.url, fetchImpl); + if (asset.asset_type === 'presentation') { + const parsed = parsePresentationLinkbase(content); + if (parsed.length > 0) { + source = 'xbrl_instance_with_linkbase'; + } + + presentation.push(...parsed); + } else if (asset.asset_type === 'label') { + const parsed = parseLabelLinkbase(content); + for (const [conceptKey, label] of parsed.entries()) { + if (!labelByConcept.has(conceptKey)) { + labelByConcept.set(conceptKey, label); + } + } + } + } catch (error) { + parseError = parseError ?? (error instanceof Error ? error.message : 'Failed to parse taxonomy linkbase'); + } + } + + const materialized = materializeTaxonomyStatements({ + filingId: input.filingId, + accessionNumber: input.accessionNumber, + filingDate: input.filingDate, + filingType: input.filingType, + facts: parsedInstance.facts, + presentation, + labelByConcept + }); + + const derivedMetrics = deriveTaxonomyMetrics(parsedInstance.facts); + const llmValidation = await validateMetricsWithPdfLlm({ + metrics: derivedMetrics, + assets: discovered.assets, + fetchImpl + }); + + const hasRows = (Object.values(materialized.statement_rows).reduce((total, rows) => total + rows.length, 0)) > 0; + const hasFacts = materialized.facts.length > 0; + + const parseStatus: TaxonomyHydrationResult['parse_status'] = hasRows && hasFacts + ? 'ready' + : hasFacts + ? 
'partial' + : 'failed'; + + return { + filing_id: input.filingId, + ticker: input.ticker.trim().toUpperCase(), + filing_date: input.filingDate, + filing_type: input.filingType, + parse_status: parseStatus, + parse_error: parseStatus === 'failed' ? (parseError ?? 'No XBRL facts extracted') : parseError, + source, + periods: materialized.periods, + statement_rows: materialized.statement_rows, + derived_metrics: derivedMetrics, + validation_result: llmValidation.validation_result, + facts_count: materialized.facts.length, + concepts_count: materialized.concepts.length, + dimensions_count: materialized.dimensionsCount, + assets: discovered.assets, + concepts: materialized.concepts, + facts: materialized.facts, + metric_validations: llmValidation.metric_validations + }; +} diff --git a/lib/server/taxonomy/linkbase-parser.test.ts b/lib/server/taxonomy/linkbase-parser.test.ts new file mode 100644 index 0000000..a7b8d45 --- /dev/null +++ b/lib/server/taxonomy/linkbase-parser.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, it } from 'bun:test'; +import { + classifyStatementRole, + parseLabelLinkbase, + parsePresentationLinkbase +} from '@/lib/server/taxonomy/linkbase-parser'; + +const SAMPLE_LABEL_LINKBASE = ` + + + + Rev. 
+ Revenues + + + + +`; + +const SAMPLE_PRESENTATION_LINKBASE = ` + + + + + + + + + +`; + +describe('linkbase parser', () => { + it('builds preferred labels from label linkbase', () => { + const labels = parseLabelLinkbase(SAMPLE_LABEL_LINKBASE); + expect(labels.get('http://fasb.org/us-gaap/2024#Revenues')).toBe('Revenues'); + }); + + it('builds role trees with depth/order/parent metadata', () => { + const rows = parsePresentationLinkbase(SAMPLE_PRESENTATION_LINKBASE); + expect(rows.length).toBe(3); + + const root = rows.find((row) => row.qname === 'us-gaap:StatementLineItems'); + const revenue = rows.find((row) => row.qname === 'us-gaap:Revenues'); + const cogs = rows.find((row) => row.qname === 'us-gaap:CostOfGoodsSold'); + + expect(root?.depth).toBe(0); + expect(root?.parentConceptKey).toBeNull(); + expect(revenue?.depth).toBe(1); + expect(cogs?.depth).toBe(1); + expect(revenue?.parentConceptKey).toBe(root?.conceptKey ?? null); + expect(revenue?.order).toBeLessThan(cogs?.order ?? Number.POSITIVE_INFINITY); + }); + + it('classifies statement roles into canonical statement kinds', () => { + expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfOperations')).toBe('income'); + expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfFinancialPosition')).toBe('balance'); + expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfCashFlows')).toBe('cash_flow'); + }); +}); diff --git a/lib/server/taxonomy/linkbase-parser.ts b/lib/server/taxonomy/linkbase-parser.ts new file mode 100644 index 0000000..d0c7b7d --- /dev/null +++ b/lib/server/taxonomy/linkbase-parser.ts @@ -0,0 +1,310 @@ +import type { FinancialStatementKind } from '@/lib/types'; +import type { TaxonomyNamespaceMap, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types'; + +function decodeXmlEntities(value: string) { + return value + .replace(/&/gi, '&') + .replace(/</gi, '<') + .replace(/>/gi, '>') + .replace(/"/gi, '"') + .replace(/'/gi, "'") + 
.replace(/ | /gi, ' ') + .trim(); +} + +function parseNamespaceMap(raw: string): TaxonomyNamespaceMap { + const map: TaxonomyNamespaceMap = {}; + const rootStart = raw.match(/<[^>]*linkbase[^>]*>/i)?.[0] ?? raw.slice(0, 1200); + + for (const match of rootStart.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) { + const prefix = (match[1] ?? '').trim(); + const uri = (match[2] ?? '').trim(); + if (!prefix || !uri) { + continue; + } + + map[prefix] = uri; + } + + return map; +} + +function qnameFromHref(href: string) { + const fragment = href.includes('#') ? href.slice(href.indexOf('#') + 1) : href; + if (!fragment) { + return null; + } + + const cleaned = fragment.trim().replace(/^loc_+/i, ''); + if (!cleaned) { + return null; + } + + if (cleaned.includes(':')) { + return cleaned; + } + + if (cleaned.includes('_')) { + const idx = cleaned.indexOf('_'); + return `${cleaned.slice(0, idx)}:${cleaned.slice(idx + 1)}`; + } + + return null; +} + +function conceptFromQName(qname: string, namespaces: TaxonomyNamespaceMap) { + const [prefix, ...rest] = qname.split(':'); + const localName = rest.join(':'); + if (!prefix || !localName) { + return null; + } + + const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`; + + return { + qname, + namespaceUri, + localName, + conceptKey: `${namespaceUri}#${localName}` + }; +} + +function labelPriority(role: string | null) { + const normalized = (role ?? 
'').toLowerCase(); + if (!normalized) { + return 0; + } + + if (normalized.endsWith('/label')) { + return 4; + } + + if (normalized.endsWith('/terselabel')) { + return 3; + } + + if (normalized.endsWith('/verboselabel')) { + return 2; + } + + return 1; +} + +export function classifyStatementRole(roleUri: string): FinancialStatementKind | null { + const normalized = roleUri.toLowerCase(); + + if (/cash\s*flow|statementsof?cashflows|netcash/.test(normalized)) { + return 'cash_flow'; + } + + if (/shareholders?|stockholders?|equity|retainedearnings/.test(normalized)) { + return 'equity'; + } + + if (/comprehensive\s*income/.test(normalized)) { + return 'comprehensive_income'; + } + + if (/balance\s*sheet|financial\s*position|assets?andliabilities/.test(normalized)) { + return 'balance'; + } + + if (/operations|income\s*statement|statementsofincome|profit/.test(normalized)) { + return 'income'; + } + + return null; +} + +export function parseLabelLinkbase(raw: string): Map { + const namespaces = parseNamespaceMap(raw); + const preferredLabelByConcept = new Map(); + + const linkPattern = /<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?labelLink>/gi; + for (const linkMatch of raw.matchAll(linkPattern)) { + const block = linkMatch[1] ?? ''; + const locByLabel = new Map(); + const resourceByLabel = new Map(); + + for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) { + const attrs = locMatch[1] ?? ''; + const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? 
''; + if (!label || !href) { + continue; + } + + const qname = qnameFromHref(href); + if (!qname) { + continue; + } + + const concept = conceptFromQName(qname, namespaces); + if (!concept) { + continue; + } + + locByLabel.set(label, concept.conceptKey); + } + + for (const resourceMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?label\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?label>/gi)) { + const attrs = resourceMatch[1] ?? ''; + const body = decodeXmlEntities(resourceMatch[2] ?? '').replace(/\s+/g, ' ').trim(); + if (!body) { + continue; + } + + const resourceLabel = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + const role = attrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? null; + if (!resourceLabel) { + continue; + } + + resourceByLabel.set(resourceLabel, { + text: body, + role + }); + } + + for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)\/?>/gi)) { + const attrs = arcMatch[1] ?? ''; + const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? 
''; + if (!from || !to) { + continue; + } + + const conceptKey = locByLabel.get(from); + const resource = resourceByLabel.get(to); + if (!conceptKey || !resource) { + continue; + } + + const priority = labelPriority(resource.role); + const current = preferredLabelByConcept.get(conceptKey); + if (!current || priority > current.priority) { + preferredLabelByConcept.set(conceptKey, { + text: resource.text, + priority + }); + } + } + } + + return new Map( + [...preferredLabelByConcept.entries()].map(([conceptKey, value]) => [conceptKey, value.text]) + ); +} + +export function parsePresentationLinkbase(raw: string): TaxonomyPresentationConcept[] { + const namespaces = parseNamespaceMap(raw); + const rows: TaxonomyPresentationConcept[] = []; + + const linkPattern = /<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?presentationLink>/gi; + for (const linkMatch of raw.matchAll(linkPattern)) { + const linkAttrs = linkMatch[1] ?? ''; + const block = linkMatch[2] ?? ''; + const roleUri = linkAttrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + if (!roleUri) { + continue; + } + + const locByLabel = new Map(); + for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) { + const attrs = locMatch[1] ?? ''; + const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? 
''; + if (!label || !href) { + continue; + } + + const qname = qnameFromHref(href); + if (!qname) { + continue; + } + + const concept = conceptFromQName(qname, namespaces); + if (!concept) { + continue; + } + + locByLabel.set(label, { + conceptKey: concept.conceptKey, + qname: concept.qname, + isAbstract: /abstract/i.test(concept.localName) + }); + } + + const childrenByLabel = new Map>(); + const incoming = new Set(); + const allReferenced = new Set(); + + for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)\/?>/gi)) { + const attrs = arcMatch[1] ?? ''; + const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + const orderRaw = attrs.match(/\border=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + const order = Number.parseFloat(orderRaw); + + if (!from || !to || !locByLabel.has(from) || !locByLabel.has(to)) { + continue; + } + + const group = childrenByLabel.get(from) ?? []; + group.push({ label: to, order: Number.isFinite(order) ? order : group.length + 1 }); + childrenByLabel.set(from, group); + + incoming.add(to); + allReferenced.add(from); + allReferenced.add(to); + } + + const roots = [...allReferenced].filter((label) => !incoming.has(label)); + const visited = new Set(); + + function dfs(label: string, depth: number, parentLabel: string | null, baseOrder: number) { + const node = locByLabel.get(label); + if (!node) { + return; + } + + const pathKey = `${parentLabel ?? 'root'}::${label}::${depth}`; + if (visited.has(pathKey)) { + return; + } + visited.add(pathKey); + + const parentConceptKey = parentLabel ? (locByLabel.get(parentLabel)?.conceptKey ?? null) : null; + rows.push({ + conceptKey: node.conceptKey, + qname: node.qname, + roleUri, + order: baseOrder, + depth, + parentConceptKey, + isAbstract: node.isAbstract + }); + + const children = [...(childrenByLabel.get(label) ?? 
[])].sort((left, right) => left.order - right.order); + for (let i = 0; i < children.length; i += 1) { + const child = children[i]; + if (!child) { + continue; + } + + dfs(child.label, depth + 1, label, baseOrder + (i + 1) / 1000); + } + } + + for (let i = 0; i < roots.length; i += 1) { + const root = roots[i]; + if (!root) { + continue; + } + + dfs(root, 0, null, i + 1); + } + } + + return rows; +} diff --git a/lib/server/taxonomy/materialize.ts b/lib/server/taxonomy/materialize.ts new file mode 100644 index 0000000..c21d9d9 --- /dev/null +++ b/lib/server/taxonomy/materialize.ts @@ -0,0 +1,374 @@ +import type { Filing, FinancialStatementKind, TaxonomyStatementRow } from '@/lib/types'; +import type { TaxonomyConcept, TaxonomyFact, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types'; +import type { FilingTaxonomyPeriod } from '@/lib/server/repos/filing-taxonomy'; +import { classifyStatementRole } from '@/lib/server/taxonomy/linkbase-parser'; +import { conceptStatementFallback } from '@/lib/server/taxonomy/xbrl-parser'; + +function compactAccessionNumber(value: string) { + return value.replace(/-/g, ''); +} + +function isUsGaapNamespace(namespaceUri: string) { + return /fasb\.org\/us-gaap/i.test(namespaceUri) || /us-gaap/i.test(namespaceUri); +} + +function splitConceptKey(conceptKey: string) { + const index = conceptKey.lastIndexOf('#'); + if (index < 0) { + return { + namespaceUri: 'urn:unknown', + localName: conceptKey + }; + } + + return { + namespaceUri: conceptKey.slice(0, index), + localName: conceptKey.slice(index + 1) + }; +} + +function localNameToLabel(localName: string) { + return localName + .replace(/([a-z0-9])([A-Z])/g, '$1 $2') + .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2') + .replace(/_/g, ' ') + .trim(); +} + +function createStatementRecord(factory: () => T): Record { + return { + income: factory(), + balance: factory(), + cash_flow: factory(), + equity: factory(), + comprehensive_income: factory() + }; +} + +function 
periodSignature(fact: TaxonomyFact) { + const start = fact.periodStart ?? ''; + const end = fact.periodEnd ?? ''; + const instant = fact.periodInstant ?? ''; + return `start:${start}|end:${end}|instant:${instant}`; +} + +function periodDate(fact: TaxonomyFact, fallbackDate: string) { + return fact.periodEnd ?? fact.periodInstant ?? fallbackDate; +} + +function parseEpoch(value: string | null) { + if (!value) { + return Number.NaN; + } + + return Date.parse(value); +} + +function sortPeriods(periods: FilingTaxonomyPeriod[]) { + return [...periods].sort((left, right) => { + const leftDate = parseEpoch(left.periodEnd ?? left.filingDate); + const rightDate = parseEpoch(right.periodEnd ?? right.filingDate); + + if (Number.isFinite(leftDate) && Number.isFinite(rightDate) && leftDate !== rightDate) { + return leftDate - rightDate; + } + + return left.id.localeCompare(right.id); + }); +} + +function pickPreferredFact(facts: T[]) { + if (facts.length === 0) { + return null; + } + + const ordered = [...facts].sort((left, right) => { + const leftScore = left.isDimensionless ? 1 : 0; + const rightScore = right.isDimensionless ? 1 : 0; + if (leftScore !== rightScore) { + return rightScore - leftScore; + } + + const leftDate = parseEpoch(left.periodEnd ?? left.periodInstant); + const rightDate = parseEpoch(right.periodEnd ?? right.periodInstant); + if (Number.isFinite(leftDate) && Number.isFinite(rightDate) && leftDate !== rightDate) { + return rightDate - leftDate; + } + + return Math.abs(right.value) - Math.abs(left.value); + }); + + return ordered[0] ?? 
null; +} + +export function materializeTaxonomyStatements(input: { + filingId: number; + accessionNumber: string; + filingDate: string; + filingType: '10-K' | '10-Q'; + facts: TaxonomyFact[]; + presentation: TaxonomyPresentationConcept[]; + labelByConcept: Map; +}) { + const periodBySignature = new Map(); + const compactAccession = compactAccessionNumber(input.accessionNumber); + + for (const fact of input.facts) { + const signature = periodSignature(fact); + if (periodBySignature.has(signature)) { + continue; + } + + const date = periodDate(fact, input.filingDate); + const id = `${date}-${compactAccession}-${periodBySignature.size + 1}`; + + periodBySignature.set(signature, { + id, + filingId: input.filingId, + accessionNumber: input.accessionNumber, + filingDate: input.filingDate, + periodStart: fact.periodStart, + periodEnd: fact.periodEnd ?? fact.periodInstant ?? input.filingDate, + filingType: input.filingType, + periodLabel: fact.periodInstant && !fact.periodStart + ? 'Instant' + : fact.periodStart && fact.periodEnd + ? `${fact.periodStart} to ${fact.periodEnd}` + : 'Filing Period' + }); + } + + const periods = sortPeriods([...periodBySignature.values()]); + const periodIdBySignature = new Map( + [...periodBySignature.entries()].map(([signature, period]) => [signature, period.id]) + ); + + const presentationByConcept = new Map(); + for (const node of input.presentation) { + const existing = presentationByConcept.get(node.conceptKey); + if (existing) { + existing.push(node); + } else { + presentationByConcept.set(node.conceptKey, [node]); + } + } + + const enrichedFacts = input.facts.map((fact, index) => { + const nodes = presentationByConcept.get(fact.conceptKey) ?? []; + const bestNode = nodes[0] ?? null; + const statementKind = bestNode + ? classifyStatementRole(bestNode.roleUri) + : conceptStatementFallback(fact.localName); + + return { + ...fact, + __sourceFactId: index + 1, + statement_kind: statementKind, + role_uri: bestNode?.roleUri ?? 
null + }; + }); + + const rowsByStatement = createStatementRecord(() => []); + const conceptByKey = new Map(); + const groupedByStatement = createStatementRecord>(() => new Map()); + + for (const fact of enrichedFacts) { + if (!fact.statement_kind) { + continue; + } + + const group = groupedByStatement[fact.statement_kind].get(fact.conceptKey); + if (group) { + group.push(fact); + } else { + groupedByStatement[fact.statement_kind].set(fact.conceptKey, [fact]); + } + } + + for (const statement of Object.keys(rowsByStatement) as FinancialStatementKind[]) { + const conceptKeys = new Set(); + + for (const node of input.presentation) { + if (classifyStatementRole(node.roleUri) !== statement) { + continue; + } + + conceptKeys.add(node.conceptKey); + } + + for (const conceptKey of groupedByStatement[statement].keys()) { + conceptKeys.add(conceptKey); + } + + const orderedConcepts = [...conceptKeys] + .map((conceptKey) => { + const presentationNodes = input.presentation.filter( + (node) => node.conceptKey === conceptKey && classifyStatementRole(node.roleUri) === statement + ); + const presentationOrder = presentationNodes.length > 0 + ? Math.min(...presentationNodes.map((node) => node.order)) + : Number.MAX_SAFE_INTEGER; + const presentationDepth = presentationNodes.length > 0 + ? Math.min(...presentationNodes.map((node) => node.depth)) + : 0; + const roleUri = presentationNodes[0]?.roleUri ?? null; + const parentConceptKey = presentationNodes[0]?.parentConceptKey ?? null; + return { + conceptKey, + presentationOrder, + presentationDepth, + roleUri, + parentConceptKey + }; + }) + .sort((left, right) => { + if (left.presentationOrder !== right.presentationOrder) { + return left.presentationOrder - right.presentationOrder; + } + + return left.conceptKey.localeCompare(right.conceptKey); + }); + + for (const orderedConcept of orderedConcepts) { + const facts = groupedByStatement[statement].get(orderedConcept.conceptKey) ?? 
[]; + const { namespaceUri, localName } = splitConceptKey(orderedConcept.conceptKey); + const qname = facts[0]?.qname ?? `unknown:${localName}`; + const label = input.labelByConcept.get(orderedConcept.conceptKey) ?? localNameToLabel(localName); + const values: Record = {}; + const units: Record = {}; + + const factGroups = new Map(); + for (const fact of facts) { + const signature = periodSignature(fact); + const group = factGroups.get(signature); + if (group) { + group.push(fact); + } else { + factGroups.set(signature, [fact]); + } + } + + const sourceFactIds: number[] = []; + let hasDimensions = false; + for (const [signature, group] of factGroups.entries()) { + const periodId = periodIdBySignature.get(signature); + if (!periodId) { + continue; + } + + const preferred = pickPreferredFact(group); + if (!preferred) { + continue; + } + + values[periodId] = preferred.value; + units[periodId] = preferred.unit; + const sourceFactId = (preferred as { __sourceFactId?: number }).__sourceFactId; + if (typeof sourceFactId === 'number') { + sourceFactIds.push(sourceFactId); + } + + if (group.some((entry) => !entry.isDimensionless)) { + hasDimensions = true; + } + } + + if (Object.keys(values).length === 0) { + continue; + } + + const row: TaxonomyStatementRow = { + key: orderedConcept.conceptKey, + label, + conceptKey: orderedConcept.conceptKey, + qname, + namespaceUri, + localName, + isExtension: !isUsGaapNamespace(namespaceUri), + statement, + roleUri: orderedConcept.roleUri, + order: Number.isFinite(orderedConcept.presentationOrder) + ? 
orderedConcept.presentationOrder + : rowsByStatement[statement].length + 1, + depth: orderedConcept.presentationDepth, + parentKey: orderedConcept.parentConceptKey, + values, + units, + hasDimensions, + sourceFactIds + }; + + rowsByStatement[statement].push(row); + + if (!conceptByKey.has(orderedConcept.conceptKey)) { + conceptByKey.set(orderedConcept.conceptKey, { + concept_key: orderedConcept.conceptKey, + qname, + namespace_uri: namespaceUri, + local_name: localName, + label, + is_extension: !isUsGaapNamespace(namespaceUri), + statement_kind: statement, + role_uri: orderedConcept.roleUri, + presentation_order: row.order, + presentation_depth: row.depth, + parent_concept_key: row.parentKey, + is_abstract: /abstract/i.test(localName) + }); + } + } + } + + for (const fact of enrichedFacts) { + if (conceptByKey.has(fact.conceptKey)) { + continue; + } + + conceptByKey.set(fact.conceptKey, { + concept_key: fact.conceptKey, + qname: fact.qname, + namespace_uri: fact.namespaceUri, + local_name: fact.localName, + label: input.labelByConcept.get(fact.conceptKey) ?? 
localNameToLabel(fact.localName), + is_extension: !isUsGaapNamespace(fact.namespaceUri), + statement_kind: fact.statement_kind, + role_uri: fact.role_uri, + presentation_order: null, + presentation_depth: null, + parent_concept_key: null, + is_abstract: /abstract/i.test(fact.localName) + }); + } + + const concepts = [...conceptByKey.values()]; + const factRows = enrichedFacts.map((fact) => ({ + concept_key: fact.conceptKey, + qname: fact.qname, + namespace_uri: fact.namespaceUri, + local_name: fact.localName, + statement_kind: fact.statement_kind, + role_uri: fact.role_uri, + context_id: fact.contextId, + unit: fact.unit, + decimals: fact.decimals, + value_num: fact.value, + period_start: fact.periodStart, + period_end: fact.periodEnd, + period_instant: fact.periodInstant, + dimensions: fact.dimensions, + is_dimensionless: fact.isDimensionless, + source_file: fact.sourceFile, + })); + + const dimensionsCount = enrichedFacts.reduce((total, fact) => { + return total + fact.dimensions.length; + }, 0); + + return { + periods, + statement_rows: rowsByStatement, + concepts, + facts: factRows, + dimensionsCount + }; +} diff --git a/lib/server/taxonomy/metrics.test.ts b/lib/server/taxonomy/metrics.test.ts new file mode 100644 index 0000000..e8a166c --- /dev/null +++ b/lib/server/taxonomy/metrics.test.ts @@ -0,0 +1,55 @@ +import { describe, expect, it } from 'bun:test'; +import type { TaxonomyFact } from '@/lib/server/taxonomy/types'; +import { deriveTaxonomyMetrics } from '@/lib/server/taxonomy/metrics'; + +function fact(localName: string, value: number, overrides?: Partial): TaxonomyFact { + return { + conceptKey: `http://fasb.org/us-gaap/2024#${localName}`, + qname: `us-gaap:${localName}`, + namespaceUri: 'http://fasb.org/us-gaap/2024', + localName, + contextId: 'c1', + unit: 'iso4217:USD', + decimals: '-6', + value, + periodStart: '2025-01-01', + periodEnd: '2025-12-31', + periodInstant: null, + dimensions: [], + isDimensionless: true, + sourceFile: 'abc_htm.xml', + 
...overrides + }; +} + +describe('taxonomy metric derivation', () => { + it('applies concept priority for canonical metrics and debt component fallback', () => { + const metrics = deriveTaxonomyMetrics([ + fact('SalesRevenueNet', 500), + fact('Revenues', 450), + fact('NetIncomeLoss', 40), + fact('Assets', 1000), + fact('CashAndCashEquivalentsAtCarryingValue', 80), + fact('DebtCurrent', 15), + fact('LongTermDebtNoncurrent', 35) + ]); + + expect(metrics).toEqual({ + revenue: 450, + netIncome: 40, + totalAssets: 1000, + cash: 80, + debt: 50 + }); + }); + + it('uses direct debt concept before computed debt fallback when available', () => { + const metrics = deriveTaxonomyMetrics([ + fact('DebtCurrent', 15), + fact('LongTermDebtNoncurrent', 35), + fact('LongTermDebtAndCapitalLeaseObligations', 90) + ]); + + expect(metrics.debt).toBe(90); + }); +}); diff --git a/lib/server/taxonomy/metrics.ts b/lib/server/taxonomy/metrics.ts new file mode 100644 index 0000000..780c2bd --- /dev/null +++ b/lib/server/taxonomy/metrics.ts @@ -0,0 +1,106 @@ +import type { Filing } from '@/lib/types'; +import type { TaxonomyFact } from '@/lib/server/taxonomy/types'; + +const METRIC_LOCAL_NAME_PRIORITY = { + revenue: [ + 'Revenues', + 'SalesRevenueNet', + 'RevenueFromContractWithCustomerExcludingAssessedTax', + 'TotalRevenuesAndOtherIncome' + ], + netIncome: ['NetIncomeLoss', 'ProfitLoss'], + totalAssets: ['Assets'], + cash: [ + 'CashAndCashEquivalentsAtCarryingValue', + 'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents' + ], + debtDirect: [ + 'DebtAndFinanceLeaseLiabilities', + 'Debt', + 'LongTermDebtAndCapitalLeaseObligations' + ], + debtCurrent: [ + 'DebtCurrent', + 'ShortTermBorrowings', + 'LongTermDebtCurrent' + ], + debtNonCurrent: [ + 'LongTermDebtNoncurrent', + 'LongTermDebt', + 'DebtNoncurrent' + ] +} as const; + +function normalizeDateToEpoch(value: string | null) { + if (!value) { + return Number.NaN; + } + + return Date.parse(value); +} + +function sameLocalName(left: 
string, right: string) { + return left.toLowerCase() === right.toLowerCase(); +} + +function pickPreferredFact(facts: TaxonomyFact[]) { + const ordered = [...facts].sort((left, right) => { + const leftDimensionScore = left.isDimensionless ? 1 : 0; + const rightDimensionScore = right.isDimensionless ? 1 : 0; + if (leftDimensionScore !== rightDimensionScore) { + return rightDimensionScore - leftDimensionScore; + } + + const leftDate = normalizeDateToEpoch(left.periodEnd ?? left.periodInstant); + const rightDate = normalizeDateToEpoch(right.periodEnd ?? right.periodInstant); + if (Number.isFinite(leftDate) && Number.isFinite(rightDate) && leftDate !== rightDate) { + return rightDate - leftDate; + } + + return Math.abs(right.value) - Math.abs(left.value); + }); + + return ordered[0] ?? null; +} + +function pickBestFact(facts: TaxonomyFact[], localNames: readonly string[]) { + for (const localName of localNames) { + const matches = facts.filter((fact) => sameLocalName(fact.localName, localName)); + if (matches.length === 0) { + continue; + } + + return pickPreferredFact(matches); + } + + return null; +} + +function sumIfBoth(left: number | null, right: number | null) { + if (left === null || right === null) { + return null; + } + + return left + right; +} + +export function deriveTaxonomyMetrics(facts: TaxonomyFact[]): NonNullable { + const revenue = pickBestFact(facts, METRIC_LOCAL_NAME_PRIORITY.revenue)?.value ?? null; + const netIncome = pickBestFact(facts, METRIC_LOCAL_NAME_PRIORITY.netIncome)?.value ?? null; + const totalAssets = pickBestFact(facts, METRIC_LOCAL_NAME_PRIORITY.totalAssets)?.value ?? null; + const cash = pickBestFact(facts, METRIC_LOCAL_NAME_PRIORITY.cash)?.value ?? null; + + const directDebt = pickBestFact(facts, METRIC_LOCAL_NAME_PRIORITY.debtDirect)?.value ?? null; + const debt = directDebt ?? sumIfBoth( + pickBestFact(facts, METRIC_LOCAL_NAME_PRIORITY.debtCurrent)?.value ?? 
null, + pickBestFact(facts, METRIC_LOCAL_NAME_PRIORITY.debtNonCurrent)?.value ?? null + ); + + return { + revenue, + netIncome, + totalAssets, + cash, + debt + }; +} diff --git a/lib/server/taxonomy/pdf-validation.test.ts b/lib/server/taxonomy/pdf-validation.test.ts new file mode 100644 index 0000000..2dd56e1 --- /dev/null +++ b/lib/server/taxonomy/pdf-validation.test.ts @@ -0,0 +1,49 @@ +import { describe, expect, it } from 'bun:test'; +import { __pdfValidationInternals } from '@/lib/server/taxonomy/pdf-validation'; + +describe('pdf metric validation internals', () => { + it('parses fenced json payloads and rejects invalid payloads', () => { + const parsed = __pdfValidationInternals.parseValidationPayload([ + '```json', + '{"revenue":{"value":1000,"pages":[3]},"cash":{"value":200,"pages":["4"]}}', + '```' + ].join('\n')); + + expect(parsed).not.toBeNull(); + expect(parsed?.revenue?.value).toBe(1000); + expect(parsed?.cash?.pages).toEqual(['4']); + expect(__pdfValidationInternals.parseValidationPayload('not-json')).toBeNull(); + }); + + it('compares taxonomy vs llm values with fixed tolerance rules', () => { + expect(__pdfValidationInternals.diffStatus(1000, 1004)).toEqual({ + status: 'matched', + absoluteDiff: 4, + relativeDiff: 0.004 + }); + + expect(__pdfValidationInternals.diffStatus(1000, 1007)).toEqual({ + status: 'mismatch', + absoluteDiff: 7, + relativeDiff: 0.007 + }); + + expect(__pdfValidationInternals.diffStatus(0.5, 1.2)).toEqual({ + status: 'matched', + absoluteDiff: 0.7, + relativeDiff: 0.7 + }); + + expect(__pdfValidationInternals.diffStatus(null, 1)).toEqual({ + status: 'mismatch', + absoluteDiff: null, + relativeDiff: null + }); + + expect(__pdfValidationInternals.diffStatus(null, null)).toEqual({ + status: 'not_run', + absoluteDiff: null, + relativeDiff: null + }); + }); +}); diff --git a/lib/server/taxonomy/pdf-validation.ts b/lib/server/taxonomy/pdf-validation.ts new file mode 100644 index 0000000..291f921 --- /dev/null +++ 
b/lib/server/taxonomy/pdf-validation.ts @@ -0,0 +1,336 @@ +import { execFile } from 'node:child_process'; +import { mkdtemp, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { promisify } from 'node:util'; +import type { Filing, MetricValidationResult } from '@/lib/types'; +import { runAiAnalysis } from '@/lib/server/ai'; +import type { TaxonomyAsset, TaxonomyMetricValidationCheck } from '@/lib/server/taxonomy/types'; + +const execFileAsync = promisify(execFile); + +const METRIC_KEYS: Array> = [ + 'revenue', + 'netIncome', + 'totalAssets', + 'cash', + 'debt' +]; + +function extractJsonCandidate(raw: string) { + const fencedJson = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1]; + const candidate = fencedJson ?? (() => { + const start = raw.indexOf('{'); + const end = raw.lastIndexOf('}'); + return start >= 0 && end > start ? raw.slice(start, end + 1) : null; + })(); + + return candidate; +} + +function parseValidationPayload(raw: string) { + const candidate = extractJsonCandidate(raw); + if (!candidate) { + return null; + } + + try { + return JSON.parse(candidate) as Record; + }>; + } catch { + return null; + } +} + +function asNumber(value: unknown) { + if (typeof value === 'number') { + return Number.isFinite(value) ? value : null; + } + + if (typeof value === 'string') { + const parsed = Number(value.replace(/[,\s]/g, '')); + return Number.isFinite(parsed) ? parsed : null; + } + + return null; +} + +function asPageNumbers(raw: unknown): number[] { + if (!Array.isArray(raw)) { + return []; + } + + return raw + .map((entry) => { + if (typeof entry === 'number' && Number.isFinite(entry)) { + return Math.trunc(entry); + } + + if (typeof entry === 'string') { + const parsed = Number(entry); + return Number.isFinite(parsed) ? 
Math.trunc(parsed) : Number.NaN; + } + + return Number.NaN; + }) + .filter((entry) => Number.isFinite(entry) && entry > 0); +} + +function diffStatus(taxonomyValue: number | null, llmValue: number | null) { + if (taxonomyValue === null && llmValue === null) { + return { + status: 'not_run' as const, + absoluteDiff: null, + relativeDiff: null + }; + } + + if (taxonomyValue === null || llmValue === null) { + return { + status: 'mismatch' as const, + absoluteDiff: null, + relativeDiff: null + }; + } + + const absoluteDiff = Math.abs(taxonomyValue - llmValue); + const denominator = Math.max(Math.abs(taxonomyValue), 1); + const relativeDiff = absoluteDiff / denominator; + const tolerance = Math.max(1, Math.abs(taxonomyValue) * 0.005); + + return { + status: absoluteDiff <= tolerance ? 'matched' as const : 'mismatch' as const, + absoluteDiff, + relativeDiff + }; +} + +async function extractPdfText(url: string, fetchImpl: typeof fetch) { + const response = await fetchImpl(url, { + headers: { + Accept: 'application/pdf, */*;q=0.8' + }, + cache: 'no-store' + }); + + if (!response.ok) { + throw new Error(`PDF request failed (${response.status})`); + } + + const contentType = response.headers.get('content-type') ?? 
''; + if (!/pdf/i.test(contentType) && !/\.pdf$/i.test(url)) { + throw new Error(`Asset is not a PDF (${contentType || 'unknown content-type'})`); + } + + const bytes = new Uint8Array(await response.arrayBuffer()); + const tempRoot = await mkdtemp(join(tmpdir(), 'fiscal-pdf-')); + const pdfPath = join(tempRoot, 'source.pdf'); + + try { + await writeFile(pdfPath, bytes); + const { stdout } = await execFileAsync('pdftotext', ['-layout', '-enc', 'UTF-8', pdfPath, '-'], { + maxBuffer: 16 * 1024 * 1024 + }); + + const text = stdout.trim(); + if (!text) { + return null; + } + + return text; + } finally { + await rm(tempRoot, { recursive: true, force: true }); + } +} + +function validationPrompt(metrics: Filing['metrics'], pdfText: string) { + const textSlice = pdfText.slice(0, 80_000); + + return [ + 'Extract numeric financial metrics from the provided financial statement PDF text.', + `Taxonomy baseline metrics: ${JSON.stringify(metrics ?? {})}`, + 'Return ONLY JSON with keys revenue, netIncome, totalAssets, cash, debt.', + 'Each key must map to: {"value": number|null, "pages": [number]}.', + 'Use null when a metric is not found.', + 'PDF text follows:', + textSlice + ].join('\n\n'); +} + +function providerModelOrNull(value: string | undefined | null) { + const normalized = value?.trim(); + return normalized && normalized.length > 0 ? normalized : null; +} + +export async function validateMetricsWithPdfLlm(input: { + metrics: Filing['metrics']; + assets: TaxonomyAsset[]; + fetchImpl?: typeof fetch; +}): Promise<{ + validation_result: MetricValidationResult | null; + metric_validations: TaxonomyMetricValidationCheck[]; +}> { + const taxonomyMetrics = input.metrics ?? 
{ + revenue: null, + netIncome: null, + totalAssets: null, + cash: null, + debt: null + }; + + const selectedPdf = input.assets.find((asset) => asset.asset_type === 'pdf' && asset.is_selected); + if (!selectedPdf) { + return { + validation_result: { + status: 'not_run', + checks: [], + validatedAt: null + }, + metric_validations: [] + }; + } + + const fetchImpl = input.fetchImpl ?? fetch; + let pdfText: string | null = null; + try { + pdfText = await extractPdfText(selectedPdf.url, fetchImpl); + } catch (error) { + const message = error instanceof Error ? error.message : 'PDF extraction failed'; + + const checks: TaxonomyMetricValidationCheck[] = METRIC_KEYS.map((metricKey) => ({ + metric_key: metricKey, + taxonomy_value: taxonomyMetrics[metricKey], + llm_value: null, + absolute_diff: null, + relative_diff: null, + status: 'error', + evidence_pages: [], + pdf_url: selectedPdf.url, + provider: null, + model: null, + error: message + })); + + return { + validation_result: { + status: 'error', + checks: checks.map((check) => ({ + metricKey: check.metric_key, + taxonomyValue: check.taxonomy_value, + llmValue: check.llm_value, + absoluteDiff: check.absolute_diff, + relativeDiff: check.relative_diff, + status: check.status, + evidencePages: check.evidence_pages, + pdfUrl: check.pdf_url, + provider: check.provider, + model: check.model, + error: check.error + })), + validatedAt: new Date().toISOString() + }, + metric_validations: checks + }; + } + + if (!pdfText) { + return { + validation_result: { + status: 'not_run', + checks: [], + validatedAt: new Date().toISOString() + }, + metric_validations: [] + }; + } + + let parsed: Record }> | null = null; + let provider: string | null = null; + let model: string | null = null; + let modelError: string | null = null; + + try { + const aiResult = await runAiAnalysis(validationPrompt(taxonomyMetrics, pdfText), undefined, { + workload: 'extraction' + }); + + provider = providerModelOrNull(aiResult.provider); + model = 
providerModelOrNull(aiResult.model); + parsed = parseValidationPayload(aiResult.text); + if (!parsed) { + modelError = 'LLM response did not contain valid JSON payload'; + } + } catch (error) { + modelError = error instanceof Error ? error.message : 'LLM validation failed'; + } + + const validations: TaxonomyMetricValidationCheck[] = METRIC_KEYS.map((metricKey) => { + const taxonomyValue = taxonomyMetrics[metricKey] ?? null; + + if (!parsed) { + return { + metric_key: metricKey, + taxonomy_value: taxonomyValue, + llm_value: null, + absolute_diff: null, + relative_diff: null, + status: modelError ? 'error' : 'not_run', + evidence_pages: [], + pdf_url: selectedPdf.url, + provider, + model, + error: modelError + }; + } + + const entry = parsed[metricKey as string] ?? {}; + const llmValue = asNumber(entry.value); + const pages = asPageNumbers(entry.pages); + const diff = diffStatus(taxonomyValue, llmValue); + + return { + metric_key: metricKey, + taxonomy_value: taxonomyValue, + llm_value: llmValue, + absolute_diff: diff.absoluteDiff, + relative_diff: diff.relativeDiff, + status: diff.status, + evidence_pages: pages, + pdf_url: selectedPdf.url, + provider, + model, + error: null + }; + }); + + const hasError = validations.some((entry) => entry.status === 'error'); + const hasMismatch = validations.some((entry) => entry.status === 'mismatch'); + + return { + validation_result: { + status: hasError ? 'error' : hasMismatch ? 
'mismatch' : 'matched', + checks: validations.map((check) => ({ + metricKey: check.metric_key, + taxonomyValue: check.taxonomy_value, + llmValue: check.llm_value, + absoluteDiff: check.absolute_diff, + relativeDiff: check.relative_diff, + status: check.status, + evidencePages: check.evidence_pages, + pdfUrl: check.pdf_url, + provider: check.provider, + model: check.model, + error: check.error + })), + validatedAt: new Date().toISOString() + }, + metric_validations: validations + }; +} + +export const __pdfValidationInternals = { + parseValidationPayload, + diffStatus +}; diff --git a/lib/server/taxonomy/types.ts b/lib/server/taxonomy/types.ts new file mode 100644 index 0000000..555a6e4 --- /dev/null +++ b/lib/server/taxonomy/types.ts @@ -0,0 +1,136 @@ +import type { Filing, FinancialStatementKind, MetricValidationResult, TaxonomyStatementRow } from '@/lib/types'; +import type { + FilingTaxonomyAssetType, + FilingTaxonomyParseStatus, + FilingTaxonomyPeriod, + FilingTaxonomySource +} from '@/lib/server/repos/filing-taxonomy'; + +export type TaxonomyAsset = { + asset_type: FilingTaxonomyAssetType; + name: string; + url: string; + size_bytes: number | null; + score: number | null; + is_selected: boolean; +}; + +export type TaxonomyNamespaceMap = Record; + +export type TaxonomyContext = { + id: string; + periodStart: string | null; + periodEnd: string | null; + periodInstant: string | null; + dimensions: Array<{ axis: string; member: string }>; +}; + +export type TaxonomyUnit = { + id: string; + measure: string | null; +}; + +export type TaxonomyFact = { + conceptKey: string; + qname: string; + namespaceUri: string; + localName: string; + contextId: string; + unit: string | null; + decimals: string | null; + value: number; + periodStart: string | null; + periodEnd: string | null; + periodInstant: string | null; + dimensions: Array<{ axis: string; member: string }>; + isDimensionless: boolean; + sourceFile: string | null; +}; + +export type TaxonomyPresentationConcept = 
{ + conceptKey: string; + qname: string; + roleUri: string; + order: number; + depth: number; + parentConceptKey: string | null; + isAbstract: boolean; +}; + +export type TaxonomyConcept = { + concept_key: string; + qname: string; + namespace_uri: string; + local_name: string; + label: string | null; + is_extension: boolean; + statement_kind: FinancialStatementKind | null; + role_uri: string | null; + presentation_order: number | null; + presentation_depth: number | null; + parent_concept_key: string | null; + is_abstract: boolean; +}; + +export type TaxonomyMetricValidationCheck = { + metric_key: keyof NonNullable; + taxonomy_value: number | null; + llm_value: number | null; + absolute_diff: number | null; + relative_diff: number | null; + status: 'not_run' | 'matched' | 'mismatch' | 'error'; + evidence_pages: number[]; + pdf_url: string | null; + provider: string | null; + model: string | null; + error: string | null; +}; + +export type TaxonomyHydrationInput = { + filingId: number; + ticker: string; + cik: string; + accessionNumber: string; + filingDate: string; + filingType: '10-K' | '10-Q'; + filingUrl: string | null; + primaryDocument: string | null; +}; + +export type TaxonomyHydrationResult = { + filing_id: number; + ticker: string; + filing_date: string; + filing_type: '10-K' | '10-Q'; + parse_status: FilingTaxonomyParseStatus; + parse_error: string | null; + source: FilingTaxonomySource; + periods: FilingTaxonomyPeriod[]; + statement_rows: Record; + derived_metrics: Filing['metrics']; + validation_result: MetricValidationResult | null; + facts_count: number; + concepts_count: number; + dimensions_count: number; + assets: TaxonomyAsset[]; + concepts: TaxonomyConcept[]; + facts: Array<{ + concept_key: string; + qname: string; + namespace_uri: string; + local_name: string; + statement_kind: FinancialStatementKind | null; + role_uri: string | null; + context_id: string; + unit: string | null; + decimals: string | null; + value_num: number; + period_start: 
string | null; + period_end: string | null; + period_instant: string | null; + dimensions: Array<{ axis: string; member: string }>; + is_dimensionless: boolean; + source_file: string | null; + }>; + metric_validations: TaxonomyMetricValidationCheck[]; +}; diff --git a/lib/server/taxonomy/xbrl-parser.test.ts b/lib/server/taxonomy/xbrl-parser.test.ts new file mode 100644 index 0000000..4b66575 --- /dev/null +++ b/lib/server/taxonomy/xbrl-parser.test.ts @@ -0,0 +1,60 @@ +import { describe, expect, it } from 'bun:test'; +import { parseXbrlInstance } from '@/lib/server/taxonomy/xbrl-parser'; + +const SAMPLE_XBRL = ` + + + + 2025-01-01 + 2025-12-31 + + + + + + us-gaap:ConsolidatedGroupMember + + + + 2025-12-31 + + + + iso4217:USD + + 1,234 + 5,678 + Acme Corp + +`; + +describe('xbrl instance parser', () => { + it('parses contexts, units, numeric facts, dimensions, and concept keys', () => { + const parsed = parseXbrlInstance(SAMPLE_XBRL, 'abc_htm.xml'); + + expect(parsed.contexts.c1?.periodStart).toBe('2025-01-01'); + expect(parsed.contexts.c1?.periodEnd).toBe('2025-12-31'); + expect(parsed.contexts.c2?.periodInstant).toBe('2025-12-31'); + expect(parsed.contexts.c2?.dimensions.length).toBe(1); + expect(parsed.units.u1?.measure).toBe('iso4217:USD'); + + expect(parsed.facts.length).toBe(2); + const revenueFact = parsed.facts.find((fact) => fact.localName === 'Revenues'); + const assetsFact = parsed.facts.find((fact) => fact.localName === 'Assets'); + + expect(revenueFact?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Revenues'); + expect(revenueFact?.isDimensionless).toBe(true); + expect(revenueFact?.value).toBe(1234); + expect(revenueFact?.sourceFile).toBe('abc_htm.xml'); + + expect(assetsFact?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Assets'); + expect(assetsFact?.isDimensionless).toBe(false); + expect(assetsFact?.dimensions[0]).toEqual({ + axis: 'us-gaap:StatementBusinessSegmentsAxis', + member: 'us-gaap:ConsolidatedGroupMember' + }); + }); +}); diff --git 
a/lib/server/taxonomy/xbrl-parser.ts b/lib/server/taxonomy/xbrl-parser.ts new file mode 100644 index 0000000..f42ec93 --- /dev/null +++ b/lib/server/taxonomy/xbrl-parser.ts @@ -0,0 +1,264 @@ +import type { FinancialStatementKind } from '@/lib/types'; +import type { TaxonomyContext, TaxonomyFact, TaxonomyNamespaceMap, TaxonomyUnit } from '@/lib/server/taxonomy/types'; + +function decodeXmlEntities(value: string) { + return value + .replace(/&/gi, '&') + .replace(/</gi, '<') + .replace(/>/gi, '>') + .replace(/"/gi, '"') + .replace(/'/gi, "'") + .replace(/ | /gi, ' ') + .replace(/&#x([0-9a-f]+);/gi, (_match, hex) => { + const parsed = Number.parseInt(hex, 16); + if (!Number.isFinite(parsed)) { + return ' '; + } + + try { + return String.fromCodePoint(parsed); + } catch { + return ' '; + } + }) + .replace(/&#([0-9]+);/g, (_match, numeric) => { + const parsed = Number.parseInt(numeric, 10); + if (!Number.isFinite(parsed)) { + return ' '; + } + + try { + return String.fromCodePoint(parsed); + } catch { + return ' '; + } + }); +} + +function parseNumber(value: string) { + const trimmed = value.trim(); + if (!trimmed) { + return null; + } + + if (/^--+$/.test(trimmed)) { + return null; + } + + const negative = trimmed.startsWith('(') && trimmed.endsWith(')'); + const normalized = trimmed + .replace(/<[^>]+>/g, ' ') + .replace(/[,$\s]/g, '') + .replace(/[()]/g, '') + .replace(/\u2212/g, '-'); + + if (!normalized) { + return null; + } + + const parsed = Number.parseFloat(normalized); + if (!Number.isFinite(parsed)) { + return null; + } + + return negative ? -Math.abs(parsed) : parsed; +} + +function parseNamespaceMapFromDocument(raw: string): TaxonomyNamespaceMap { + const map: TaxonomyNamespaceMap = {}; + const rootStart = raw.match(/<[^>]*xbrl[^>]*>/i)?.[0] ?? raw.slice(0, 1200); + + for (const match of rootStart.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) { + const prefix = (match[1] ?? '').trim(); + const uri = (match[2] ?? 
'').trim(); + + if (!prefix || !uri) { + continue; + } + + map[prefix] = uri; + } + + return map; +} + +function parseContexts(raw: string): Record { + const contexts: Record = {}; + const contextPattern = /<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?context>/gi; + + for (const match of raw.matchAll(contextPattern)) { + const contextId = (match[1] ?? '').trim(); + const block = match[2] ?? ''; + if (!contextId) { + continue; + } + + const periodStart = block.match(/<(?:[a-z0-9_\-]+:)?startDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?startDate>/i)?.[1]?.trim() ?? null; + const periodEnd = block.match(/<(?:[a-z0-9_\-]+:)?endDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?endDate>/i)?.[1]?.trim() ?? null; + const periodInstant = block.match(/<(?:[a-z0-9_\-]+:)?instant>([^<]+)<\/(?:[a-z0-9_\-]+:)?instant>/i)?.[1]?.trim() ?? null; + + const dimensions: Array<{ axis: string; member: string }> = []; + const dimPattern = /<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>([^<]+)<\/(?:[a-z0-9_\-]+:)?explicitMember>/gi; + for (const dimMatch of block.matchAll(dimPattern)) { + const axis = decodeXmlEntities((dimMatch[1] ?? '').trim()); + const member = decodeXmlEntities((dimMatch[2] ?? '').trim()); + if (!axis || !member) { + continue; + } + + dimensions.push({ axis, member }); + } + + contexts[contextId] = { + id: contextId, + periodStart, + periodEnd, + periodInstant, + dimensions + }; + } + + return contexts; +} + +function parseUnits(raw: string): Record { + const units: Record = {}; + const unitPattern = /<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?unit>/gi; + + for (const match of raw.matchAll(unitPattern)) { + const id = (match[1] ?? '').trim(); + const block = match[2] ?? ''; + if (!id) { + continue; + } + + const measures = [...block.matchAll(/<(?:[a-z0-9_\-]+:)?measure>([^<]+)<\/(?:[a-z0-9_\-]+:)?measure>/gi)] + .map((entry) => decodeXmlEntities((entry[1] ?? 
'').trim())) + .filter(Boolean); + + let measure: string | null = null; + if (measures.length === 1) { + measure = measures[0] ?? null; + } else if (measures.length > 1) { + measure = measures.join('/'); + } + + units[id] = { + id, + measure + }; + } + + return units; +} + +function classifyStatementKind(localName: string): FinancialStatementKind | null { + const normalized = localName.toLowerCase(); + + if (/cash|operatingactivities|investingactivities|financingactivities/.test(normalized)) { + return 'cash_flow'; + } + + if (/equity|retainedearnings|additionalpaidincapital/.test(normalized)) { + return 'equity'; + } + + if (/comprehensiveincome/.test(normalized)) { + return 'comprehensive_income'; + } + + if (/asset|liabilit|debt/.test(normalized)) { + return 'balance'; + } + + if (/revenue|income|profit|expense|costof/.test(normalized)) { + return 'income'; + } + + return null; +} + +function isXbrlInfrastructurePrefix(prefix: string) { + const normalized = prefix.toLowerCase(); + return normalized === 'xbrli' + || normalized === 'xlink' + || normalized === 'link' + || normalized === 'xbrldi' + || normalized === 'xbrldt'; +} + +function localNameToKey(namespaceUri: string, localName: string) { + return `${namespaceUri}#${localName}`; +} + +export function parseXbrlInstance( + raw: string, + sourceFile: string | null +): { + namespaces: TaxonomyNamespaceMap; + contexts: Record; + units: Record; + facts: TaxonomyFact[]; +} { + const namespaces = parseNamespaceMapFromDocument(raw); + const contexts = parseContexts(raw); + const units = parseUnits(raw); + const facts: TaxonomyFact[] = []; + + const factPattern = /<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>([\s\S]*?)<\/\1:\2>/g; + + for (const match of raw.matchAll(factPattern)) { + const prefix = (match[1] ?? '').trim(); + const localName = (match[2] ?? '').trim(); + const attrs = match[3] ?? ''; + const body = decodeXmlEntities((match[4] ?? 
'').trim()); + + if (!prefix || !localName || isXbrlInfrastructurePrefix(prefix)) { + continue; + } + + const contextId = attrs.match(/\bcontextRef=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; + if (!contextId) { + continue; + } + + const value = parseNumber(body); + if (value === null) { + continue; + } + + const unitRef = attrs.match(/\bunitRef=["']([^"']+)["']/i)?.[1]?.trim() ?? null; + const decimals = attrs.match(/\bdecimals=["']([^"']+)["']/i)?.[1]?.trim() ?? null; + + const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`; + const context = contexts[contextId]; + + facts.push({ + conceptKey: localNameToKey(namespaceUri, localName), + qname: `${prefix}:${localName}`, + namespaceUri, + localName, + contextId, + unit: unitRef && units[unitRef]?.measure ? units[unitRef]?.measure ?? unitRef : unitRef, + decimals, + value, + periodStart: context?.periodStart ?? null, + periodEnd: context?.periodEnd ?? null, + periodInstant: context?.periodInstant ?? null, + dimensions: context?.dimensions ?? [], + isDimensionless: (context?.dimensions.length ?? 
0) === 0, + sourceFile, + }); + } + + return { + namespaces, + contexts, + units, + facts + }; +} + +export function conceptStatementFallback(localName: string) { + return classifyStatementKind(localName); +} diff --git a/lib/types.ts b/lib/types.ts index 2ae64b2..1117dd7 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -98,6 +98,12 @@ export type TaskStage = | 'completed' | 'failed' | 'sync.fetch_filings' + | 'sync.discover_assets' + | 'sync.extract_taxonomy' + | 'sync.normalize_taxonomy' + | 'sync.derive_metrics' + | 'sync.validate_pdf_metrics' + | 'sync.persist_taxonomy' | 'sync.fetch_metrics' | 'sync.persist_filings' | 'sync.hydrate_statements' @@ -169,7 +175,6 @@ export type CompanyFinancialPoint = { debt: number | null; }; -export type FinancialStatementMode = 'standardized' | 'filing_faithful'; export type FinancialStatementKind = 'income' | 'balance' | 'cash_flow' | 'equity' | 'comprehensive_income'; export type FinancialHistoryWindow = '10y' | 'all'; @@ -178,11 +183,79 @@ export type FinancialStatementPeriod = { filingId: number; accessionNumber: string; filingDate: string; + periodStart: string | null; periodEnd: string | null; filingType: Extract; periodLabel: string; }; +export type TaxonomyDimensionMember = { + axis: string; + member: string; +}; + +export type TaxonomyStatementRow = { + key: string; + label: string; + conceptKey: string; + qname: string; + namespaceUri: string; + localName: string; + isExtension: boolean; + statement: FinancialStatementKind; + roleUri: string | null; + order: number; + depth: number; + parentKey: string | null; + values: Record; + units: Record; + hasDimensions: boolean; + sourceFactIds: number[]; +}; + +export type TaxonomyFactRow = { + id: number; + snapshotId: number; + filingId: number; + filingDate: string; + statement: FinancialStatementKind | null; + roleUri: string | null; + conceptKey: string; + qname: string; + namespaceUri: string; + localName: string; + value: number; + contextId: string; + unit: string | 
null; + decimals: string | null; + periodStart: string | null; + periodEnd: string | null; + periodInstant: string | null; + dimensions: TaxonomyDimensionMember[]; + isDimensionless: boolean; + sourceFile: string | null; +}; + +export type MetricValidationCheck = { + metricKey: keyof NonNullable; + taxonomyValue: number | null; + llmValue: number | null; + absoluteDiff: number | null; + relativeDiff: number | null; + status: 'not_run' | 'matched' | 'mismatch' | 'error'; + evidencePages: number[]; + pdfUrl: string | null; + provider: string | null; + model: string | null; + error: string | null; +}; + +export type MetricValidationResult = { + status: 'not_run' | 'matched' | 'mismatch' | 'error'; + checks: MetricValidationCheck[]; + validatedAt: string | null; +}; + export type StandardizedStatementRow = { key: string; label: string; @@ -220,16 +293,20 @@ export type CompanyFinancialStatementsResponse = { companyName: string; cik: string | null; }; - mode: FinancialStatementMode; statement: FinancialStatementKind; window: FinancialHistoryWindow; periods: FinancialStatementPeriod[]; - rows: StandardizedStatementRow[] | FilingFaithfulStatementRow[]; + rows: TaxonomyStatementRow[]; nextCursor: string | null; + facts: { + rows: TaxonomyFactRow[]; + nextCursor: string | null; + } | null; coverage: { filings: number; rows: number; dimensions: number; + facts: number; }; dataSourceStatus: { enabled: boolean; @@ -239,6 +316,10 @@ export type CompanyFinancialStatementsResponse = { pendingFilings: number; queuedSync: boolean; }; + metrics: { + taxonomy: Filing['metrics']; + validation: MetricValidationResult | null; + }; dimensionBreakdown: Record | null; }; diff --git a/package.json b/package.json index 9c4e16c..eeef021 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "workflow:setup": "workflow-postgres-setup", "backfill:filing-metrics": "bun run scripts/backfill-filing-metrics.ts", "backfill:filing-statements": "bun run scripts/backfill-filing-statements.ts", 
+ "backfill:taxonomy-snapshots": "bun run scripts/backfill-taxonomy-snapshots.ts", "db:generate": "bun x drizzle-kit generate", "db:migrate": "bun x drizzle-kit migrate", "test:e2e:workflow": "RUN_TASK_WORKFLOW_E2E=1 bun test lib/server/api/task-workflow-hybrid.e2e.test.ts" diff --git a/scripts/backfill-taxonomy-snapshots.ts b/scripts/backfill-taxonomy-snapshots.ts new file mode 100644 index 0000000..b110e6e --- /dev/null +++ b/scripts/backfill-taxonomy-snapshots.ts @@ -0,0 +1,227 @@ +import { hydrateFilingTaxonomySnapshot } from '@/lib/server/taxonomy/engine'; +import { listFilingsRecords, updateFilingMetricsById } from '@/lib/server/repos/filings'; +import { + getFilingTaxonomySnapshotByFilingId, + upsertFilingTaxonomySnapshot +} from '@/lib/server/repos/filing-taxonomy'; + +type ScriptOptions = { + apply: boolean; + ticker: string | null; + window: '10y' | 'all'; + limit: number | null; + refresh: boolean; +}; + +type ScriptSummary = { + scanned: number; + wouldWrite: number; + written: number; + skippedFresh: number; + failed: number; +}; + +type FilingRow = { + id: number; + ticker: string; + cik: string; + accessionNumber: string; + filingDate: string; + filingType: '10-K' | '10-Q'; + filingUrl: string | null; + primaryDocument: string | null; + updatedAt: string; +}; + +const REQUEST_DELAY_MS = 120; + +function parseOptions(argv: string[]): ScriptOptions { + const options: ScriptOptions = { + apply: false, + ticker: null, + window: '10y', + limit: null, + refresh: false + }; + + for (const arg of argv) { + if (arg === '--apply') { + options.apply = true; + continue; + } + + if (arg === '--refresh') { + options.refresh = true; + continue; + } + + if (arg.startsWith('--ticker=')) { + const value = arg.slice('--ticker='.length).trim().toUpperCase(); + options.ticker = value.length > 0 ? value : null; + continue; + } + + if (arg.startsWith('--window=')) { + const value = arg.slice('--window='.length).trim().toLowerCase(); + options.window = value === 'all' ? 
'all' : '10y'; + continue; + } + + if (arg.startsWith('--limit=')) { + const parsed = Number.parseInt(arg.slice('--limit='.length), 10); + if (Number.isFinite(parsed) && parsed > 0) { + options.limit = parsed; + } + continue; + } + + if (arg === '--help' || arg === '-h') { + console.log('Backfill filing taxonomy snapshots (Financial Statements V3).'); + console.log(''); + console.log('Usage:'); + console.log(' bun run scripts/backfill-taxonomy-snapshots.ts [--apply] [--ticker=SYMBOL] [--window=10y|all] [--limit=N] [--refresh]'); + process.exit(0); + } + } + + return options; +} + +function tenYearsAgoIso() { + const date = new Date(); + date.setUTCFullYear(date.getUTCFullYear() - 10); + return date.toISOString().slice(0, 10); +} + +async function loadFilings(options: ScriptOptions): Promise { + const rows: FilingRow[] = []; + let cursor = 0; + + while (true) { + const page = await listFilingsRecords({ + ticker: options.ticker ?? undefined, + limit: 250 + }); + + const normalizedPage = page + .filter((filing): filing is typeof filing & { filing_type: '10-K' | '10-Q' } => { + return filing.filing_type === '10-K' || filing.filing_type === '10-Q'; + }) + .filter((filing) => { + if (options.window === 'all') { + return true; + } + + return filing.filing_date >= tenYearsAgoIso(); + }) + .slice(cursor); + + if (normalizedPage.length === 0) { + break; + } + + for (const filing of normalizedPage) { + rows.push({ + id: filing.id, + ticker: filing.ticker, + cik: filing.cik, + accessionNumber: filing.accession_number, + filingDate: filing.filing_date, + filingType: filing.filing_type, + filingUrl: filing.filing_url, + primaryDocument: filing.primary_document ?? 
null, + updatedAt: filing.updated_at + }); + + if (options.limit && rows.length >= options.limit) { + return rows; + } + } + + cursor += normalizedPage.length; + } + + return rows; +} + +async function runBackfill(options: ScriptOptions): Promise { + const rows = await loadFilings(options); + const summary: ScriptSummary = { + scanned: 0, + wouldWrite: 0, + written: 0, + skippedFresh: 0, + failed: 0 + }; + + console.log(`[backfill-taxonomy-snapshots] mode=${options.apply ? 'apply' : 'dry-run'} window=${options.window} filings=${rows.length}`); + if (options.ticker) { + console.log(`[backfill-taxonomy-snapshots] ticker=${options.ticker}`); + } + + for (const [index, row] of rows.entries()) { + summary.scanned += 1; + console.log(`[backfill-taxonomy-snapshots] [${index + 1}/${rows.length}] ${row.ticker} ${row.filingType} ${row.filingDate} ${row.accessionNumber}`); + + const existing = await getFilingTaxonomySnapshotByFilingId(row.id); + const isFresh = existing && Date.parse(existing.updated_at) >= Date.parse(row.updatedAt); + + if (isFresh && !options.refresh) { + summary.skippedFresh += 1; + continue; + } + + try { + const snapshot = await hydrateFilingTaxonomySnapshot({ + filingId: row.id, + ticker: row.ticker, + cik: row.cik, + accessionNumber: row.accessionNumber, + filingDate: row.filingDate, + filingType: row.filingType, + filingUrl: row.filingUrl, + primaryDocument: row.primaryDocument + }); + + summary.wouldWrite += 1; + + if (options.apply) { + await upsertFilingTaxonomySnapshot(snapshot); + await updateFilingMetricsById(row.id, snapshot.derived_metrics); + summary.written += 1; + } + } catch (error) { + summary.failed += 1; + const reason = error instanceof Error ? 
error.message : 'unknown error'; + console.error(`[backfill-taxonomy-snapshots] failed for ${row.accessionNumber}: ${reason}`); + } + + await Bun.sleep(REQUEST_DELAY_MS); + } + + return summary; +} + +async function main() { + const options = parseOptions(process.argv.slice(2)); + const startedAt = Date.now(); + + try { + const summary = await runBackfill(options); + const durationSec = ((Date.now() - startedAt) / 1000).toFixed(1); + + console.log('[backfill-taxonomy-snapshots] summary'); + console.log(` scanned=${summary.scanned}`); + console.log(` wouldWrite=${summary.wouldWrite}`); + console.log(` written=${summary.written}`); + console.log(` skippedFresh=${summary.skippedFresh}`); + console.log(` failed=${summary.failed}`); + console.log(` durationSec=${durationSec}`); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error(`[backfill-taxonomy-snapshots] fatal: ${message}`); + process.exitCode = 1; + } +} + +void main();