refactor(taxonomy): remove legacy parser and add rollout checks

This commit is contained in:
2026-03-12 15:25:06 -04:00
parent 58061af006
commit c274f4d55b
11 changed files with 1344 additions and 697 deletions

View File

@@ -0,0 +1,681 @@
import { chromium } from '@playwright/test';
import type { FinancialStatementKind } from '@/lib/types';
import { hydrateFilingTaxonomySnapshot } from '@/lib/server/taxonomy/engine';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
type ComparisonTarget = {
statement: Extract<FinancialStatementKind, 'income' | 'balance'>;
surfaceKey: string;
fiscalAiLabels: string[];
allowNotMeaningful?: boolean;
notMeaningfulWarnings?: string[];
};
type CompanyCase = {
ticker: string;
exchangeTicker: string;
cik: string;
form: '10-K';
comparisons: ComparisonTarget[];
};
type SecRecentFilings = {
accessionNumber: string[];
primaryDocument: string[];
filingDate: string[];
form: string[];
};
type SecSubmission = {
filings?: {
recent?: SecRecentFilings;
};
};
type FiscalAiTableRow = {
label: string;
normalizedLabel: string;
valueText: string;
value: number | null;
};
type FiscalAiTable = {
columnLabel: string;
rows: FiscalAiTableRow[];
};
type ComparisonRow = {
statement: Extract<FinancialStatementKind, 'income' | 'balance'>;
surfaceKey: string;
fiscalAiLabel: string | null;
fiscalAiValueM: number | null;
ourValueM: number | null;
absDiffM: number | null;
relDiff: number | null;
status: 'pass' | 'fail' | 'missing_reference' | 'missing_ours' | 'not_meaningful';
};
type ResultPeriod = TaxonomyHydrationResult['periods'][number] & {
period_start?: string | null;
period_end?: string | null;
};
const SEC_USER_AGENT = 'fiscal-clone/compare-fiscal-ai (contact: local-dev)';
const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36';
const VALUE_TOLERANCE_M = 1;
const RELATIVE_TOLERANCE = 0.005;
const CASES: CompanyCase[] = [
{
ticker: 'MSFT',
exchangeTicker: 'NasdaqGS-MSFT',
cik: '0000789019',
form: '10-K',
comparisons: [
{ statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Revenues'] },
{
statement: 'income',
surfaceKey: 'gross_profit',
fiscalAiLabels: ['Gross Profit'],
allowNotMeaningful: true,
notMeaningfulWarnings: ['gross_profit_not_meaningful_broker_pack']
},
{ statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Operating Expense'] },
{ statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Operating Profit', 'Operating Income'] },
{ statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Provision for Income Taxes', 'Income Tax Expense', 'Income Taxes'] },
{
statement: 'income',
surfaceKey: 'net_income',
fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income']
},
]
},
{
ticker: 'JPM',
exchangeTicker: 'NYSE-JPM',
cik: '0000019617',
form: '10-K',
comparisons: [
{ statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Net Revenues', 'Total Revenues'] },
{
statement: 'income',
surfaceKey: 'gross_profit',
fiscalAiLabels: ['Gross Profit'],
allowNotMeaningful: true,
notMeaningfulWarnings: ['gross_profit_not_meaningful_bank_pack']
},
{ statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Total Operating Expenses', 'Non-Interest Expense'] },
{ statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Pre-Tax Income', 'Operating Income', 'Operating Profit'] },
{ statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax Expense'] },
{
statement: 'income',
surfaceKey: 'net_income',
fiscalAiLabels: ['Net Income to Common', 'Net Income Attributable to Common Shareholders', 'Net Income']
},
]
},
{
ticker: 'AIG',
exchangeTicker: 'NYSE-AIG',
cik: '0000005272',
form: '10-K',
comparisons: [
{ statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Revenues', 'Revenue'] },
{
statement: 'income',
surfaceKey: 'gross_profit',
fiscalAiLabels: ['Gross Profit'],
allowNotMeaningful: true,
notMeaningfulWarnings: ['gross_profit_not_meaningful_insurance_pack']
},
{ statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Insurance Benefits & Claims'] },
{ statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Operating Income', 'Operating Profit'] },
{ statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax Expense'] },
{
statement: 'income',
surfaceKey: 'net_income',
fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income']
},
]
},
{
ticker: 'O',
exchangeTicker: 'NYSE-O',
cik: '0000726728',
form: '10-K',
comparisons: [
{ statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Property Revenue', 'Rental Revenue', 'Total Revenues'] },
{ statement: 'income', surfaceKey: 'gross_profit', fiscalAiLabels: ['Gross Profit', 'Property Operating Profit'] },
{ statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'General and Administrative'] },
{ statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Operating Profit', 'Operating Income'] },
{ statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax Expense'] },
{
statement: 'income',
surfaceKey: 'net_income',
fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income']
}
]
},
{
ticker: 'BLK',
exchangeTicker: 'NYSE-BLK',
cik: '0002012383',
form: '10-K',
comparisons: [
{ statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Revenues', 'Investment advisory and administration fees', 'Advisory and other revenue'] },
{
statement: 'income',
surfaceKey: 'gross_profit',
fiscalAiLabels: ['Gross Profit'],
allowNotMeaningful: true,
notMeaningfulWarnings: ['gross_profit_not_meaningful_broker_pack']
},
{ statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Operating Expense'] },
{ statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Operating Income', 'Operating Profit'] },
{ statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax Expense'] },
{
statement: 'income',
surfaceKey: 'net_income',
fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Net Income Applicable to BlackRock, Inc.', 'Net Income']
}
]
}
];
function parseTickerFilter(argv: string[]) {
for (const arg of argv) {
if (arg === '--help' || arg === '-h') {
console.log('Compare live Fiscal.ai standardized statement rows against local sidecar output.');
console.log('');
console.log('Usage:');
console.log(' bun run scripts/compare-fiscal-ai-statements.ts');
console.log(' bun run scripts/compare-fiscal-ai-statements.ts --ticker=MSFT');
process.exit(0);
}
if (arg.startsWith('--ticker=')) {
const value = arg.slice('--ticker='.length).trim().toUpperCase();
return value.length > 0 ? value : null;
}
}
return null;
}
function normalizeLabel(value: string) {
return value
.toLowerCase()
.replace(/&/g, ' and ')
.replace(/[^a-z0-9]+/g, ' ')
.trim()
.replace(/\s+/g, ' ');
}
function parseDisplayedNumber(value: string) {
const trimmed = value.trim();
if (!trimmed || /^[-–—]+$/.test(trimmed) || /pricing/i.test(trimmed)) {
return null;
}
const negative = trimmed.startsWith('(') && trimmed.endsWith(')');
const normalized = trimmed
.replace(/,/g, '')
.replace(/[%$]/g, '')
.replace(/[()]/g, '')
.trim();
if (!normalized) {
return null;
}
const parsed = Number.parseFloat(normalized);
if (!Number.isFinite(parsed)) {
return null;
}
return negative ? -Math.abs(parsed) : parsed;
}
function roundMillions(value: number | null) {
if (value === null || !Number.isFinite(value)) {
return null;
}
return Math.round(value / 1_000_000);
}
function absoluteDiff(left: number | null, right: number | null) {
if (left === null || right === null) {
return null;
}
return Math.abs(left - right);
}
function relativeDiff(left: number | null, right: number | null) {
if (left === null || right === null) {
return null;
}
const baseline = Math.max(Math.abs(right), 1);
return Math.abs(left - right) / baseline;
}
function periodStart(period: ResultPeriod) {
return period.periodStart ?? period.period_start ?? null;
}
function periodEnd(period: ResultPeriod) {
return period.periodEnd ?? period.period_end ?? null;
}
function chooseDurationPeriodId(result: TaxonomyHydrationResult) {
const annualPeriods = result.periods
.filter((period): period is ResultPeriod => Boolean(periodStart(period as ResultPeriod) && periodEnd(period as ResultPeriod)))
.map((period) => {
const durationDays = Math.round(
(Date.parse(periodEnd(period) as string) - Date.parse(periodStart(period) as string)) / (1000 * 60 * 60 * 24)
);
return { period, durationDays };
})
.filter((entry) => entry.durationDays >= 300)
.sort((left, right) => {
return Date.parse(periodEnd(right.period) as string) - Date.parse(periodEnd(left.period) as string);
});
return annualPeriods[0]?.period.id ?? null;
}
function chooseInstantPeriodId(result: TaxonomyHydrationResult) {
const instantPeriods = result.periods
.filter((period): period is ResultPeriod => !periodStart(period as ResultPeriod) && Boolean(periodEnd(period as ResultPeriod)))
.sort((left, right) => Date.parse(periodEnd(right) as string) - Date.parse(periodEnd(left) as string));
return instantPeriods[0]?.id ?? null;
}
function findSurfaceValue(
result: TaxonomyHydrationResult,
statement: Extract<FinancialStatementKind, 'income' | 'balance'>,
surfaceKey: string
) {
const rows = result.surface_rows[statement] ?? [];
const row = rows.find((entry) => entry.key === surfaceKey) ?? null;
if (!row) {
return { row: null, value: null };
}
const periodId = statement === 'balance'
? chooseInstantPeriodId(result)
: chooseDurationPeriodId(result);
if (periodId) {
const directValue = row.values[periodId];
if (directValue !== null && directValue !== undefined) {
return { row, value: directValue };
}
}
const periodById = new Map(
result.periods.map((period) => [period.id, period as ResultPeriod])
);
const fallback = Object.entries(row.values)
.filter((entry): entry is [string, number] => entry[1] !== null)
.sort((left, right) => {
const leftPeriod = periodById.get(left[0]);
const rightPeriod = periodById.get(right[0]);
const leftDate = leftPeriod ? Date.parse(periodEnd(leftPeriod) ?? '') : Number.NaN;
const rightDate = rightPeriod ? Date.parse(periodEnd(rightPeriod) ?? '') : Number.NaN;
if (Number.isFinite(leftDate) && Number.isFinite(rightDate) && leftDate !== rightDate) {
return rightDate - leftDate;
}
return right[0].localeCompare(left[0]);
})[0];
return {
row,
value: fallback?.[1] ?? null
};
}
function rowResolutionMethod(row: Record<string, unknown> | null) {
if (!row) {
return null;
}
return (row.resolutionMethod ?? row.resolution_method ?? null) as string | null;
}
function rowWarningCodes(row: Record<string, unknown> | null) {
if (!row) {
return [] as string[];
}
const value = row.warningCodes ?? row.warning_codes ?? [];
return Array.isArray(value) ? value.filter((entry): entry is string => typeof entry === 'string') : [];
}
function buildSecFilingUrl(cik: string, accessionNumber: string) {
return `https://www.sec.gov/Archives/edgar/data/${Number.parseInt(cik, 10)}/${accessionNumber.replace(/-/g, '')}/`;
}
async function fetchLatestAnnualFiling(company: CompanyCase): Promise<TaxonomyHydrationInput> {
const cik = company.cik.padStart(10, '0');
const response = await fetch(`https://data.sec.gov/submissions/CIK${cik}.json`, {
headers: {
'user-agent': SEC_USER_AGENT,
accept: 'application/json'
}
});
if (!response.ok) {
throw new Error(`SEC submissions fetch failed for ${company.ticker}: ${response.status}`);
}
const payload = await response.json() as SecSubmission;
const recent = payload.filings?.recent;
if (!recent) {
throw new Error(`SEC submissions payload missing recent filings for ${company.ticker}`);
}
for (let index = 0; index < recent.form.length; index += 1) {
if (recent.form[index] !== company.form) {
continue;
}
const accessionNumber = recent.accessionNumber[index];
const filingDate = recent.filingDate[index];
const primaryDocument = recent.primaryDocument[index];
if (!accessionNumber || !filingDate || !primaryDocument) {
continue;
}
return {
filingId: index + 1,
ticker: company.ticker,
cik: company.cik,
accessionNumber,
filingDate,
filingType: company.form,
filingUrl: buildSecFilingUrl(company.cik, accessionNumber),
primaryDocument
};
}
throw new Error(`No ${company.form} found in SEC recent filings for ${company.ticker}`);
}
async function scrapeFiscalAiTable(
page: import('@playwright/test').Page,
exchangeTicker: string,
statement: 'income' | 'balance'
): Promise<FiscalAiTable> {
const pagePath = statement === 'income' ? 'income-statement' : 'balance-sheet';
const url = `https://fiscal.ai/company/${exchangeTicker}/financials/${pagePath}/annual/?templateType=standardized`;
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120_000 });
await page.waitForSelector('table', { timeout: 120_000 });
await page.waitForTimeout(2_500);
return await page.evaluate(() => {
function normalizeLabel(value: string) {
return value
.toLowerCase()
.replace(/&/g, ' and ')
.replace(/[^a-z0-9]+/g, ' ')
.trim()
.replace(/\s+/g, ' ');
}
function parseDisplayedNumber(value: string) {
const trimmed = value.trim();
if (!trimmed || /^[-–—]+$/.test(trimmed) || /pricing/i.test(trimmed)) {
return null;
}
const negative = trimmed.startsWith('(') && trimmed.endsWith(')');
const normalized = trimmed
.replace(/,/g, '')
.replace(/[%$]/g, '')
.replace(/[()]/g, '')
.trim();
if (!normalized) {
return null;
}
const parsed = Number.parseFloat(normalized);
return Number.isFinite(parsed) ? (negative ? -Math.abs(parsed) : parsed) : null;
}
const table = document.querySelector('table');
if (!table) {
throw new Error('Fiscal.ai table not found');
}
const headerCells = Array.from(table.querySelectorAll('tr:first-child th, tr:first-child td'))
.map((cell) => cell.textContent?.trim() ?? '')
.filter((value) => value.length > 0);
const annualColumnIndex = headerCells.findIndex((value, index) => index > 0 && value !== 'LTM');
if (annualColumnIndex < 0) {
throw new Error(`Could not locate latest annual column in headers: ${headerCells.join(' | ')}`);
}
const rows = Array.from(table.querySelectorAll('tr'))
.slice(1)
.map((row) => {
const cells = Array.from(row.querySelectorAll('td'));
if (cells.length <= annualColumnIndex) {
return null;
}
const label = cells[0]?.textContent?.trim() ?? '';
const valueText = cells[annualColumnIndex]?.textContent?.trim() ?? '';
if (!label) {
return null;
}
return {
label,
normalizedLabel: normalizeLabel(label),
valueText,
value: parseDisplayedNumber(valueText)
};
})
.filter((entry): entry is FiscalAiTableRow => entry !== null);
return {
columnLabel: headerCells[annualColumnIndex] ?? 'unknown',
rows
};
});
}
function findFiscalAiRow(rows: FiscalAiTableRow[], candidates: string[]) {
const normalizedCandidates = candidates.map(normalizeLabel);
const benignExtraTokens = new Set(['total', 'net']);
for (const candidate of normalizedCandidates) {
const exactMatch = rows.find((row) => row.normalizedLabel === candidate);
if (exactMatch) {
return exactMatch;
}
}
for (const candidate of normalizedCandidates) {
const candidateTokens = candidate.split(' ').filter((token) => token.length > 0);
const relaxedMatch = rows.find((row) => {
const rowTokens = row.normalizedLabel.split(' ').filter((token) => token.length > 0);
const sharedPrefix = row.normalizedLabel.startsWith(candidate) || candidate.startsWith(row.normalizedLabel);
if (!sharedPrefix) {
return false;
}
const longer = rowTokens.length >= candidateTokens.length ? rowTokens : candidateTokens;
const shorter = rowTokens.length >= candidateTokens.length ? candidateTokens : rowTokens;
const extraTokens = longer.filter((token) => !shorter.includes(token));
return extraTokens.length > 0 && extraTokens.every((token) => benignExtraTokens.has(token));
});
if (relaxedMatch) {
return relaxedMatch;
}
}
return null;
}
function compareRow(
target: ComparisonTarget,
result: TaxonomyHydrationResult,
fiscalAiTable: FiscalAiTable
): ComparisonRow {
const fiscalAiRow = findFiscalAiRow(fiscalAiTable.rows, target.fiscalAiLabels);
const fiscalAiValueM = fiscalAiRow?.value ?? null;
const ourSurface = findSurfaceValue(result, target.statement, target.surfaceKey);
const ourValueM = roundMillions(ourSurface.value);
const absDiffM = absoluteDiff(ourValueM, fiscalAiValueM);
const relDiffValue = relativeDiff(ourValueM, fiscalAiValueM);
const resolutionMethod = rowResolutionMethod(ourSurface.row as Record<string, unknown> | null);
const warningCodes = rowWarningCodes(ourSurface.row as Record<string, unknown> | null);
let status: ComparisonRow['status'];
if (
ourSurface.row &&
ourValueM === null &&
target.allowNotMeaningful &&
resolutionMethod === 'not_meaningful' &&
(target.notMeaningfulWarnings?.length ?? 0) > 0 &&
target.notMeaningfulWarnings!.some((warning) => warningCodes.includes(warning))
) {
status = 'not_meaningful';
} else if (!fiscalAiRow) {
status = 'missing_reference';
} else if (ourValueM === null) {
status = 'missing_ours';
} else if (
absDiffM !== null &&
relDiffValue !== null &&
(absDiffM <= VALUE_TOLERANCE_M || relDiffValue <= RELATIVE_TOLERANCE)
) {
status = 'pass';
} else {
status = 'fail';
}
return {
statement: target.statement,
surfaceKey: target.surfaceKey,
fiscalAiLabel: fiscalAiRow?.label ?? null,
fiscalAiValueM,
ourValueM,
absDiffM,
relDiff: relDiffValue,
status
};
}
async function compareCase(page: import('@playwright/test').Page, company: CompanyCase) {
const filing = await fetchLatestAnnualFiling(company);
const result = await hydrateFilingTaxonomySnapshot(filing);
if (result.parse_status !== 'ready') {
throw new Error(`${company.ticker} parse_status=${result.parse_status}${result.parse_error ? ` parse_error=${result.parse_error}` : ''}`);
}
const incomeTable = await scrapeFiscalAiTable(page, company.exchangeTicker, 'income');
const balanceTable = await scrapeFiscalAiTable(page, company.exchangeTicker, 'balance');
const rows = company.comparisons.map((target) => {
const table = target.statement === 'income' ? incomeTable : balanceTable;
return compareRow(target, result, table);
});
const failures = rows.filter((row) => row.status === 'fail' || row.status === 'missing_ours');
console.log(
`[compare-fiscal-ai] ${company.ticker} filing=${filing.accessionNumber} fiscal_pack=${result.fiscal_pack ?? 'null'} income_column="${incomeTable.columnLabel}" balance_column="${balanceTable.columnLabel}" pass=${rows.length - failures.length}/${rows.length}`
);
for (const row of rows) {
console.log(
[
' ',
row.status.toUpperCase(),
`${row.statement}.${row.surfaceKey}`,
`fiscal_label=${row.fiscalAiLabel ?? 'null'}`,
`ours_m=${row.ourValueM ?? 'null'}`,
`fiscal_m=${row.fiscalAiValueM ?? 'null'}`,
`abs_diff_m=${row.absDiffM ?? 'null'}`,
`rel_diff=${row.relDiff === null ? 'null' : row.relDiff.toFixed(4)}`
].join(' ')
);
}
return {
ticker: company.ticker,
filing,
fiscalPack: result.fiscal_pack,
rows,
failures
};
}
async function main() {
process.env.XBRL_ENGINE_TIMEOUT_MS = process.env.XBRL_ENGINE_TIMEOUT_MS ?? '180000';
const tickerFilter = parseTickerFilter(process.argv.slice(2));
const selectedCases = tickerFilter
? CASES.filter((entry) => entry.ticker === tickerFilter)
: CASES;
if (selectedCases.length === 0) {
console.error(`[compare-fiscal-ai] unknown ticker: ${tickerFilter}`);
process.exitCode = 1;
return;
}
const browser = await chromium.launch({ headless: false });
const page = await browser.newPage({
userAgent: BROWSER_USER_AGENT
});
const failures: Array<{ ticker: string; row: ComparisonRow }> = [];
try {
for (const company of selectedCases) {
const result = await compareCase(page, company);
for (const failure of result.failures) {
failures.push({
ticker: company.ticker,
row: failure
});
}
await Bun.sleep(150);
}
} finally {
await browser.close();
}
console.log(`[compare-fiscal-ai] completed cases=${selectedCases.length} failures=${failures.length}`);
if (failures.length === 0) {
return;
}
for (const failure of failures) {
console.error(
`[compare-fiscal-ai] ${failure.ticker} ${failure.row.statement}.${failure.row.surfaceKey} status=${failure.row.status} ours_m=${failure.row.ourValueM ?? 'null'} fiscal_m=${failure.row.fiscalAiValueM ?? 'null'} fiscal_label=${failure.row.fiscalAiLabel ?? 'null'}`
);
}
process.exitCode = 1;
}
void main().catch((error) => {
const message = error instanceof Error ? error.message : String(error);
console.error(`[compare-fiscal-ai] fatal: ${message}`);
process.exitCode = 1;
});