feat(taxonomy): add rust sidecar compact surface pipeline

This commit is contained in:
2026-03-12 15:23:10 -04:00
parent f2c25fb9c6
commit 58061af006
84 changed files with 19350 additions and 265 deletions

View File

@@ -0,0 +1,320 @@
import type {
DetailFinancialRow,
FinancialStatementKind,
FinancialStatementPeriod,
NormalizationSummary,
StructuredKpiRow,
SurfaceDetailMap,
SurfaceFinancialRow,
TaxonomyFactRow,
TaxonomyStatementRow
} from '@/lib/types';
import { buildStandardizedRows } from '@/lib/server/financials/standardize';
/** Statement kinds for which this module declares a compact surface layout (the keys of SURFACE_DEFINITIONS). */
type CompactStatement = Extract<FinancialStatementKind, 'income' | 'balance' | 'cash_flow'>;
/**
 * Declarative description of one compact surface row.
 *
 * buildCompactHydrationModel resolves a row's per-period value from exactly one source,
 * checked in this order: `rowKey`, then `formula`, then the sum of `componentKeys`.
 */
type SurfaceDefinition = {
// Key of the emitted surface row; also the key under which its detail rows are stored.
key: string;
// Display label copied onto the emitted surface row.
label: string;
// Copied onto the emitted row as both `category` and `templateSection`.
category: string;
// Ordering value copied onto the emitted surface row.
order: number;
// Unit kind copied onto the emitted surface row.
unit: SurfaceFinancialRow['unit'];
// Key of a single standardized row whose values are used as-is.
rowKey?: string;
// Keys of standardized rows that are summed per period; these rows are also emitted as detail rows.
componentKeys?: string[];
// Derived value: `left - right` per period, null when either operand is null.
formula?: {
kind: 'subtract';
// Standardized row key of the minuend.
left: string;
// Standardized row key of the subtrahend.
right: string;
};
};
// Template holding an empty surface-row list for every statement kind.
// buildCompactHydrationModel deep-copies it with structuredClone before pushing rows,
// so this shared object itself is not mutated there.
const EMPTY_SURFACE_ROWS: Record<FinancialStatementKind, SurfaceFinancialRow[]> = {
income: [],
balance: [],
cash_flow: [],
equity: [],
comprehensive_income: []
};
// Template holding an empty detail map for every statement kind.
// buildCompactHydrationModel deep-copies it with structuredClone and then replaces the
// entries for the statements it processes.
const EMPTY_DETAIL_ROWS: Record<FinancialStatementKind, SurfaceDetailMap> = {
income: {},
balance: {},
cash_flow: {},
equity: {},
comprehensive_income: {}
};
// Compact surface layout per statement. Each entry names the surface row and the
// standardized row(s) it is resolved from: a direct `rowKey`, a `formula`, or a sum of
// `componentKeys`. Entries within a statement are listed in ascending `order`.
const SURFACE_DEFINITIONS: Record<CompactStatement, SurfaceDefinition[]> = {
income: [
{ key: 'revenue', label: 'Revenue', category: 'surface', order: 10, unit: 'currency', rowKey: 'revenue' },
{ key: 'cost_of_revenue', label: 'Cost of Revenue', category: 'surface', order: 20, unit: 'currency', rowKey: 'cost_of_revenue' },
{ key: 'gross_profit', label: 'Gross Profit', category: 'surface', order: 30, unit: 'currency', rowKey: 'gross_profit' },
// Aggregated row: the sum of whichever listed component rows are present.
{
key: 'operating_expenses',
label: 'Operating Expenses',
category: 'surface',
order: 40,
unit: 'currency',
componentKeys: ['selling_general_and_administrative', 'research_and_development', 'depreciation_and_amortization']
},
{ key: 'operating_income', label: 'Operating Income', category: 'surface', order: 50, unit: 'currency', rowKey: 'operating_income' },
// Derived row: pretax income minus operating income.
{
key: 'interest_and_other',
label: 'Interest and Other',
category: 'surface',
order: 60,
unit: 'currency',
formula: {
kind: 'subtract',
left: 'pretax_income',
right: 'operating_income'
}
},
{ key: 'pretax_income', label: 'Pretax Income', category: 'surface', order: 70, unit: 'currency', rowKey: 'pretax_income' },
// Surface key differs from the standardized row key it reads ('income_tax_expense').
{ key: 'income_taxes', label: 'Income Taxes', category: 'surface', order: 80, unit: 'currency', rowKey: 'income_tax_expense' },
{ key: 'net_income', label: 'Net Income', category: 'surface', order: 90, unit: 'currency', rowKey: 'net_income' }
],
balance: [
{ key: 'cash_and_equivalents', label: 'Cash and Equivalents', category: 'surface', order: 10, unit: 'currency', rowKey: 'cash_and_equivalents' },
{ key: 'receivables', label: 'Receivables', category: 'surface', order: 20, unit: 'currency', rowKey: 'accounts_receivable' },
{ key: 'inventory', label: 'Inventory', category: 'surface', order: 30, unit: 'currency', rowKey: 'inventory' },
{ key: 'current_assets', label: 'Current Assets', category: 'surface', order: 40, unit: 'currency', rowKey: 'current_assets' },
{ key: 'ppe', label: 'Property, Plant & Equipment', category: 'surface', order: 50, unit: 'currency', rowKey: 'property_plant_equipment' },
// Aggregated row: goodwill plus intangible assets.
{
key: 'goodwill_and_intangibles',
label: 'Goodwill and Intangibles',
category: 'surface',
order: 60,
unit: 'currency',
componentKeys: ['goodwill', 'intangible_assets']
},
{ key: 'total_assets', label: 'Total Assets', category: 'surface', order: 70, unit: 'currency', rowKey: 'total_assets' },
{ key: 'current_liabilities', label: 'Current Liabilities', category: 'surface', order: 80, unit: 'currency', rowKey: 'current_liabilities' },
{ key: 'debt', label: 'Debt', category: 'surface', order: 90, unit: 'currency', rowKey: 'total_debt' },
{ key: 'total_liabilities', label: 'Total Liabilities', category: 'surface', order: 100, unit: 'currency', rowKey: 'total_liabilities' },
{ key: 'shareholders_equity', label: 'Shareholders Equity', category: 'surface', order: 110, unit: 'currency', rowKey: 'total_equity' }
],
cash_flow: [
{ key: 'operating_cash_flow', label: 'Operating Cash Flow', category: 'surface', order: 10, unit: 'currency', rowKey: 'operating_cash_flow' },
{ key: 'capital_expenditures', label: 'Capital Expenditures', category: 'surface', order: 20, unit: 'currency', rowKey: 'capital_expenditures' },
{ key: 'acquisitions', label: 'Acquisitions', category: 'surface', order: 30, unit: 'currency', rowKey: 'acquisitions' },
{ key: 'investing_cash_flow', label: 'Investing Cash Flow', category: 'surface', order: 40, unit: 'currency', rowKey: 'investing_cash_flow' },
{ key: 'financing_cash_flow', label: 'Financing Cash Flow', category: 'surface', order: 50, unit: 'currency', rowKey: 'financing_cash_flow' },
{ key: 'free_cash_flow', label: 'Free Cash Flow', category: 'surface', order: 60, unit: 'currency', rowKey: 'free_cash_flow' }
]
};
/**
 * Reports whether a row carries at least one non-null period value.
 *
 * @param row - Any object exposing a period-id → value map.
 * @returns `true` as soon as one value other than `null` is found, otherwise `false`
 *          (including when the map is empty).
 */
function rowHasAnyValue(row: { values: Record<string, number | null> }) {
  for (const cell of Object.values(row.values)) {
    if (cell !== null) {
      return true;
    }
  }
  return false;
}
/**
 * Adds up the non-null entries of a list.
 *
 * @param values - Numbers to add; `null` entries are skipped.
 * @returns The total, or `null` when the list is empty or contains only `null`.
 */
function sumValues(values: Array<number | null>) {
  const present = values.filter((entry) => entry !== null);
  if (present.length === 0) {
    return null;
  }

  let total = 0;
  for (const entry of present) {
    total += entry ?? 0;
  }
  return total;
}
/**
 * Looks up one period's value on a row identified by key.
 *
 * @param rowByKey - Rows indexed by their key.
 * @param rowKey - Key of the row to read.
 * @param periodId - Period whose value is wanted.
 * @returns The stored value, or `null` when the row is missing or has no value for the period.
 */
function valueForPeriod(
  rowByKey: Map<string, SurfaceFinancialRow>,
  rowKey: string,
  periodId: string
) {
  const match = rowByKey.get(rowKey);
  if (match == null) {
    return null;
  }
  const cell = match.values[periodId];
  return cell ?? null;
}
/**
 * Finds the largest magnitude among a row's period values.
 *
 * @param values - Period-id → value map; `null` entries count as 0.
 * @returns The greatest absolute value, or 0 when the map is empty or all-null.
 */
function maxAbsValue(values: Record<string, number | null>) {
  let peak = 0;
  for (const cell of Object.values(values)) {
    peak = Math.max(peak, Math.abs(cell ?? 0));
  }
  return peak;
}
/**
 * Chooses the unit string reported on a detail row.
 *
 * When a faithful taxonomy row is available its own units win: the first non-nullish entry
 * of `faithfulRow.units` is returned (previously only the very first entry was inspected, so
 * a row whose first period had no unit reported `null` even if later periods carried one).
 * If the faithful row has no usable unit at all the result is `null`.
 *
 * Without a faithful row the unit is derived from the surface row's unit kind:
 * `currency` → `'USD'`, `shares` → `'shares'`, `percent` → `'pure'`, anything else → `null`.
 *
 * @param row - Surface row supplying the fallback unit kind.
 * @param faithfulRow - Matching taxonomy row, if any.
 * @returns The unit string, or `null` when none can be determined.
 */
function detailUnit(row: SurfaceFinancialRow, faithfulRow: TaxonomyStatementRow | undefined) {
  if (faithfulRow) {
    const reportedUnit = Object.values(faithfulRow.units).find(
      (unit) => unit !== null && unit !== undefined
    );
    return reportedUnit ?? null;
  }

  switch (row.unit) {
    case 'currency':
      return 'USD';
    case 'shares':
      return 'shares';
    case 'percent':
      return 'pure';
    default:
      return null;
  }
}
/**
 * Converts a standardized row into a detail row attached to `parentSurfaceKey`.
 *
 * Concept metadata (concept key, qname, namespace, local name, extension and dimension
 * flags) is taken from the matching faithful taxonomy row when one is found; otherwise it
 * is derived from the row's first source concept, falling back to the row key.
 *
 * @param input.row - Standardized row to convert; its values and fact ids are copied.
 * @param input.parentSurfaceKey - Surface row the detail belongs to; `'unmapped'` sets `residualFlag`.
 * @param input.faithfulRowByKey - Faithful taxonomy rows indexed by key.
 * @returns The assembled detail row.
 */
function buildDetailRow(input: {
  row: SurfaceFinancialRow;
  parentSurfaceKey: string;
  faithfulRowByKey: Map<string, TaxonomyStatementRow>;
}): DetailFinancialRow {
  const { row, parentSurfaceKey, faithfulRowByKey } = input;

  // Prefer a source key that resolves to a faithful row, then the first source key, then the row key.
  const resolvableKey = row.sourceRowKeys.find((candidate) => faithfulRowByKey.has(candidate));
  const sourceRowKey = resolvableKey ?? row.sourceRowKeys[0] ?? row.key;
  const faithfulRow = sourceRowKey ? faithfulRowByKey.get(sourceRowKey) : undefined;

  // Split "prefix:local" so a namespace and local name can be derived when no faithful row exists.
  const qname = faithfulRow?.qname ?? row.sourceConcepts[0] ?? row.key;
  const segments = qname.split(':');
  const prefix = segments[0];
  const remainder = segments.slice(1);
  const isPrefixed = remainder.length > 0;
  const derivedLocalName = isPrefixed ? remainder.join(':') : qname;
  const derivedNamespace = prefix && isPrefixed ? `urn:unknown:${prefix}` : 'urn:surface';

  return {
    key: row.key,
    parentSurfaceKey,
    label: row.label,
    conceptKey: faithfulRow?.conceptKey ?? sourceRowKey,
    qname,
    namespaceUri: faithfulRow?.namespaceUri ?? derivedNamespace,
    localName: faithfulRow?.localName ?? derivedLocalName,
    unit: detailUnit(row, faithfulRow),
    values: { ...row.values },
    sourceFactIds: [...row.sourceFactIds],
    isExtension: faithfulRow?.isExtension ?? false,
    dimensionsSummary: faithfulRow?.hasDimensions ? ['has_dimensions'] : [],
    residualFlag: parentSurfaceKey === 'unmapped'
  };
}
/**
 * Computes the scale against which unmapped rows are judged for a statement.
 *
 * The baseline is the largest absolute period value of an anchor row: `total_assets` for
 * the balance sheet, `revenue` for every other statement. A missing anchor row yields 0.
 *
 * NOTE(review): for `cash_flow` the anchor is `revenue`; whether the row map passed in for
 * that statement actually contains a `revenue` row is not visible here — confirm.
 *
 * @param statement - Statement being processed.
 * @param rowByKey - That statement's rows indexed by key.
 * @returns The anchor row's largest absolute value, or 0.
 */
function baselineForStatement(statement: CompactStatement, rowByKey: Map<string, SurfaceFinancialRow>) {
  let anchorKey = 'revenue';
  if (statement === 'balance') {
    anchorKey = 'total_assets';
  }

  const anchorRow = rowByKey.get(anchorKey);
  const anchorValues = anchorRow?.values ?? {};
  return maxAbsValue(anchorValues);
}
/**
 * Returns the magnitude at or above which an unmapped row counts as material.
 *
 * The threshold is a share of the baseline with a fixed floor:
 * balance sheet — 0.5% of baseline, at least 5,000,000;
 * other statements — 1% of baseline, at least 1,000,000.
 *
 * @param statement - Statement being processed.
 * @param baseline - Scale value for the statement.
 * @returns The materiality threshold.
 */
function materialityThreshold(statement: CompactStatement, baseline: number) {
  const isBalanceSheet = statement === 'balance';
  const floor = isBalanceSheet ? 5_000_000 : 1_000_000;
  const share = isBalanceSheet ? 0.005 : 0.01;
  return Math.max(floor, baseline * share);
}
/** Shape of one entry of the SURFACE_DEFINITIONS table, derived from the table itself. */
type CompactSurfaceDefinition = (typeof SURFACE_DEFINITIONS)[CompactStatement][number];

/**
 * Selects the standardized rows a definition draws provenance from: the single `rowKey`
 * row when set, otherwise every present `componentKeys` row. Missing rows are dropped.
 */
function collectContributingRows(
  definition: CompactSurfaceDefinition,
  rowByKey: Map<string, SurfaceFinancialRow>
): SurfaceFinancialRow[] {
  const wantedKeys = definition.rowKey ? [definition.rowKey] : (definition.componentKeys ?? []);
  return wantedKeys
    .map((key) => rowByKey.get(key))
    .filter((row): row is SurfaceFinancialRow => row !== undefined);
}

/**
 * Resolves one period's value for a definition: direct `rowKey` lookup first, then the
 * `formula` (null when either operand is null), then the sum of the contributing rows.
 */
function resolveSurfaceValue(
  definition: CompactSurfaceDefinition,
  rowByKey: Map<string, SurfaceFinancialRow>,
  contributingRows: SurfaceFinancialRow[],
  periodId: string
): number | null {
  if (definition.rowKey) {
    return valueForPeriod(rowByKey, definition.rowKey, periodId);
  }
  if (definition.formula) {
    const left = valueForPeriod(rowByKey, definition.formula.left, periodId);
    const right = valueForPeriod(rowByKey, definition.formula.right, periodId);
    return left === null || right === null ? null : left - right;
  }
  return sumValues(contributingRows.map((row) => row.values[periodId] ?? null));
}

/** Returns the de-duplicated union of `pick(row)` over all rows, in first-seen order. */
function uniqueAcrossRows<T>(rows: SurfaceFinancialRow[], pick: (row: SurfaceFinancialRow) => T[]): T[] {
  return [...new Set(rows.flatMap(pick))];
}

/**
 * Builds the compact surface model for the income, balance and cash-flow statements.
 *
 * For each statement the faithful rows and the facts tagged with that statement are run
 * through `buildStandardizedRows`; every SURFACE_DEFINITIONS entry that has at least one
 * non-null period value becomes a surface row. Definitions with `componentKeys` also emit
 * one detail row per contributing row that has a value. Standardized rows whose key starts
 * with `other:` and that have a value are emitted as detail rows under `unmapped`, and
 * those whose largest absolute value reaches the statement's materiality threshold are
 * counted as material.
 *
 * Statement kinds without a SURFACE_DEFINITIONS entry keep their empty defaults.
 *
 * NOTE(review): formula-derived rows carry empty `sourceConcepts`, `sourceRowKeys` and
 * `sourceFactIds` because provenance is only gathered from `rowKey`/`componentKeys` rows —
 * confirm this is intended.
 *
 * @param input.periods - Periods for which values are resolved.
 * @param input.faithfulRows - Faithful taxonomy rows per statement kind.
 * @param input.facts - Taxonomy facts; filtered per statement by `fact.statement`.
 * @param input.kpiRows - Optional KPI rows; only their count is used, for the summary.
 * @returns Surface rows and detail rows per statement kind, plus a normalization summary
 *          (its `warnings` list is always empty here).
 */
export function buildCompactHydrationModel(input: {
  periods: FinancialStatementPeriod[];
  faithfulRows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
  facts: TaxonomyFactRow[];
  kpiRows?: StructuredKpiRow[];
}) {
  // Deep copies, so the shared EMPTY_* templates are never mutated.
  const surfaceRows = structuredClone(EMPTY_SURFACE_ROWS);
  const detailRows = structuredClone(EMPTY_DETAIL_ROWS);
  let surfaceRowCount = 0;
  let detailRowCount = 0;
  let unmappedRowCount = 0;
  let materialUnmappedRowCount = 0;

  for (const statement of Object.keys(SURFACE_DEFINITIONS) as CompactStatement[]) {
    const faithfulRows = input.faithfulRows[statement] ?? [];
    const facts = input.facts.filter((fact) => fact.statement === statement);
    const fullRows = buildStandardizedRows({
      rows: faithfulRows,
      statement,
      periods: input.periods,
      facts
    });
    const rowByKey = new Map(fullRows.map((row) => [row.key, row]));
    const faithfulRowByKey = new Map(faithfulRows.map((row) => [row.key, row]));
    const statementDetails: SurfaceDetailMap = {};

    for (const definition of SURFACE_DEFINITIONS[statement]) {
      const contributingRows = collectContributingRows(definition, rowByKey);
      const values = Object.fromEntries(
        input.periods.map((period) => [
          period.id,
          resolveSurfaceValue(definition, rowByKey, contributingRows, period.id)
        ])
      ) satisfies Record<string, number | null>;

      // A surface row with no data in any period is omitted entirely.
      if (!rowHasAnyValue({ values })) {
        continue;
      }

      // Provenance is the sorted, de-duplicated union over the contributing rows.
      const sourceConcepts = uniqueAcrossRows(contributingRows, (row) => row.sourceConcepts)
        .sort((left, right) => left.localeCompare(right));
      const sourceRowKeys = uniqueAcrossRows(contributingRows, (row) => row.sourceRowKeys)
        .sort((left, right) => left.localeCompare(right));
      const sourceFactIds = uniqueAcrossRows(contributingRows, (row) => row.sourceFactIds)
        .sort((left, right) => left - right);
      const hasDimensions = contributingRows.some((row) => row.hasDimensions);

      // Per-period resolved source keys are only known for direct `rowKey` definitions.
      const resolvedSourceRowKeys = Object.fromEntries(input.periods.map((period) => [
        period.id,
        definition.rowKey
          ? rowByKey.get(definition.rowKey)?.resolvedSourceRowKeys[period.id] ?? null
          : null
      ]));

      // Only aggregated (componentKeys) definitions expose their inputs as detail rows.
      const rowsForDetail = definition.componentKeys ? contributingRows : [];
      const details = rowsForDetail
        .filter((row) => rowHasAnyValue(row))
        .map((row) => buildDetailRow({
          row,
          parentSurfaceKey: definition.key,
          faithfulRowByKey
        }));

      statementDetails[definition.key] = details;
      detailRowCount += details.length;

      surfaceRows[statement].push({
        key: definition.key,
        label: definition.label,
        category: definition.category,
        templateSection: definition.category,
        order: definition.order,
        unit: definition.unit,
        values,
        sourceConcepts,
        sourceRowKeys,
        sourceFactIds,
        formulaKey: definition.formula ? definition.key : null,
        hasDimensions,
        resolvedSourceRowKeys,
        statement,
        detailCount: details.length
      });
      surfaceRowCount += 1;
    }

    const baseline = baselineForStatement(statement, rowByKey);
    const threshold = materialityThreshold(statement, baseline);

    // Standardized rows keyed "other:*" are reported as unmapped residuals.
    const residualRows = fullRows
      .filter((row) => row.key.startsWith('other:'))
      .filter((row) => rowHasAnyValue(row))
      .map((row) => buildDetailRow({
        row,
        parentSurfaceKey: 'unmapped',
        faithfulRowByKey
      }));

    if (residualRows.length > 0) {
      statementDetails.unmapped = residualRows;
      detailRowCount += residualRows.length;
      unmappedRowCount += residualRows.length;
      materialUnmappedRowCount += residualRows.filter((row) => maxAbsValue(row.values) >= threshold).length;
    }

    detailRows[statement] = statementDetails;
  }

  const normalizationSummary: NormalizationSummary = {
    surfaceRowCount,
    detailRowCount,
    kpiRowCount: input.kpiRows?.length ?? 0,
    unmappedRowCount,
    materialUnmappedRowCount,
    warnings: []
  };

  return {
    surfaceRows,
    detailRows,
    normalizationSummary
  };
}