Expand backend financial statement parsers
This commit is contained in:
@@ -5,7 +5,7 @@ import { hydrateFilingTaxonomySnapshot } from '@/lib/server/taxonomy/engine';
|
||||
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
|
||||
|
||||
type ComparisonTarget = {
|
||||
statement: Extract<FinancialStatementKind, 'income' | 'balance'>;
|
||||
statement: Extract<FinancialStatementKind, 'income' | 'balance' | 'cash_flow'>;
|
||||
surfaceKey: string;
|
||||
fiscalAiLabels: string[];
|
||||
allowNotMeaningful?: boolean;
|
||||
@@ -46,7 +46,7 @@ type FiscalAiTable = {
|
||||
};
|
||||
|
||||
type ComparisonRow = {
|
||||
statement: Extract<FinancialStatementKind, 'income' | 'balance'>;
|
||||
statement: Extract<FinancialStatementKind, 'income' | 'balance' | 'cash_flow'>;
|
||||
surfaceKey: string;
|
||||
fiscalAiLabel: string | null;
|
||||
fiscalAiValueM: number | null;
|
||||
@@ -89,6 +89,11 @@ const CASES: CompanyCase[] = [
|
||||
surfaceKey: 'net_income',
|
||||
fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income']
|
||||
},
|
||||
{ statement: 'balance', surfaceKey: 'current_assets', fiscalAiLabels: ['Current Assets', 'Total Current Assets'] },
|
||||
{ statement: 'balance', surfaceKey: 'total_assets', fiscalAiLabels: ['Total Assets'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'operating_cash_flow', fiscalAiLabels: ['Cash from Operating Activities', 'Operating Cash Flow', 'Net Cash from Operations', 'Net Cash Provided by Operating'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'capital_expenditures', fiscalAiLabels: ['Capital Expenditures', 'Capital Expenditure'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'free_cash_flow', fiscalAiLabels: ['Free Cash Flow', 'Levered Free Cash Flow'] },
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -113,6 +118,11 @@ const CASES: CompanyCase[] = [
|
||||
surfaceKey: 'net_income',
|
||||
fiscalAiLabels: ['Net Income to Common', 'Net Income Attributable to Common Shareholders', 'Net Income']
|
||||
},
|
||||
{ statement: 'balance', surfaceKey: 'loans', fiscalAiLabels: ['Net Loans', 'Loans', 'Loans Receivable'] },
|
||||
{ statement: 'balance', surfaceKey: 'total_assets', fiscalAiLabels: ['Total Assets'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'operating_cash_flow', fiscalAiLabels: ['Cash from Operating Activities', 'Net Cash from Operating Activities', 'Net Cash Provided by Operating'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'investing_cash_flow', fiscalAiLabels: ['Cash from Investing Activities', 'Net Cash from Investing Activities', 'Net Cash Provided by Investing'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'financing_cash_flow', fiscalAiLabels: ['Cash from Financing Activities', 'Net Cash from Financing Activities', 'Net Cash Provided by Financing'] },
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -137,6 +147,18 @@ const CASES: CompanyCase[] = [
|
||||
surfaceKey: 'net_income',
|
||||
fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income']
|
||||
},
|
||||
{
|
||||
statement: 'balance',
|
||||
surfaceKey: 'deferred_acquisition_costs',
|
||||
fiscalAiLabels: [
|
||||
'Deferred Acquisition Costs',
|
||||
'Deferred Policy Acquisition Costs',
|
||||
'Deferred Policy Acquisition Costs and Value of Business Acquired'
|
||||
]
|
||||
},
|
||||
{ statement: 'balance', surfaceKey: 'total_assets', fiscalAiLabels: ['Total Assets'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'operating_cash_flow', fiscalAiLabels: ['Cash from Operating Activities', 'Operating Cash Flow', 'Net Cash from Operations', 'Net Cash Provided by Operating'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'free_cash_flow', fiscalAiLabels: ['Free Cash Flow', 'Levered Free Cash Flow'] },
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -154,7 +176,22 @@ const CASES: CompanyCase[] = [
|
||||
statement: 'income',
|
||||
surfaceKey: 'net_income',
|
||||
fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income']
|
||||
}
|
||||
},
|
||||
{
|
||||
statement: 'balance',
|
||||
surfaceKey: 'investment_property',
|
||||
fiscalAiLabels: [
|
||||
'Investment Property',
|
||||
'Investment Properties',
|
||||
'Real Estate Investment Property, Net',
|
||||
'Real Estate Investment Property, at Cost',
|
||||
'Total real estate held for investment, at cost'
|
||||
]
|
||||
},
|
||||
{ statement: 'balance', surfaceKey: 'total_assets', fiscalAiLabels: ['Total Assets'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'operating_cash_flow', fiscalAiLabels: ['Cash from Operating Activities', 'Operating Cash Flow', 'Net Cash from Operations', 'Net Cash Provided by Operating'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'capital_expenditures', fiscalAiLabels: ['Capital Expenditures', 'Capital Expenditure'] },
|
||||
{ statement: 'cash_flow', surfaceKey: 'free_cash_flow', fiscalAiLabels: ['Free Cash Flow', 'Levered Free Cash Flow'] }
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -184,6 +221,9 @@ const CASES: CompanyCase[] = [
|
||||
];
|
||||
|
||||
function parseTickerFilter(argv: string[]) {
|
||||
let ticker: string | null = null;
|
||||
let statement: Extract<FinancialStatementKind, 'income' | 'balance' | 'cash_flow'> | null = null;
|
||||
|
||||
for (const arg of argv) {
|
||||
if (arg === '--help' || arg === '-h') {
|
||||
console.log('Compare live Fiscal.ai standardized statement rows against local sidecar output.');
|
||||
@@ -191,16 +231,26 @@ function parseTickerFilter(argv: string[]) {
|
||||
console.log('Usage:');
|
||||
console.log(' bun run scripts/compare-fiscal-ai-statements.ts');
|
||||
console.log(' bun run scripts/compare-fiscal-ai-statements.ts --ticker=MSFT');
|
||||
console.log(' bun run scripts/compare-fiscal-ai-statements.ts --statement=balance');
|
||||
console.log(' bun run scripts/compare-fiscal-ai-statements.ts --statement=cash_flow');
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
if (arg.startsWith('--ticker=')) {
|
||||
const value = arg.slice('--ticker='.length).trim().toUpperCase();
|
||||
return value.length > 0 ? value : null;
|
||||
ticker = value.length > 0 ? value : null;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg.startsWith('--statement=')) {
|
||||
const value = arg.slice('--statement='.length).trim().toLowerCase().replace(/-/g, '_');
|
||||
if (value === 'income' || value === 'balance' || value === 'cash_flow') {
|
||||
statement = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
return { ticker, statement };
|
||||
}
|
||||
|
||||
function normalizeLabel(value: string) {
|
||||
@@ -295,10 +345,98 @@ function chooseInstantPeriodId(result: TaxonomyHydrationResult) {
|
||||
return instantPeriods[0]?.id ?? null;
|
||||
}
|
||||
|
||||
/**
 * Parses a statement column header such as `Jun '24` or `Dec 2023` into the
 * calendar month and year it names.
 *
 * Accepted shape: a month name (three-letter abbreviation or any longer prefix
 * of the full English name, optionally followed by a dot), whitespace, an
 * optional apostrophe, and a two- or four-digit year. Surrounding whitespace
 * is ignored and matching is case-insensitive.
 *
 * @param columnLabel Raw header text of the reference column.
 * @returns `{ month, year }` with `month` zero-based (0 = January) and `year`
 *   four-digit, or `null` when the label does not name a month and year
 *   (for example `LTM` or `unknown`).
 */
function parseColumnLabelPeriodEnd(columnLabel: string): { month: number; year: number } | null {
  // Years must be exactly two or four digits; a three-digit token is treated
  // as malformed rather than being read as a literal year such as 202.
  const match = columnLabel
    .trim()
    .match(/^([A-Za-z]{3,9})\.?\s+['\u2019]?(\d{4}|\d{2})$/);
  if (!match) {
    return null;
  }

  const monthToken = (match[1] ?? '').toLowerCase();
  const yearToken = match[2] ?? '';

  const monthNames = [
    'january',
    'february',
    'march',
    'april',
    'may',
    'june',
    'july',
    'august',
    'september',
    'october',
    'november',
    'december'
  ];
  // The token is at least three letters, and every three-letter prefix is
  // unique among month names, so at most one entry can match.
  const month = monthNames.findIndex((name) => name.startsWith(monthToken));
  if (month < 0) {
    return null;
  }

  const parsedYear = Number.parseInt(yearToken, 10);
  // Two-digit years are interpreted as 20xx.
  const year = yearToken.length === 2 ? 2000 + parsedYear : parsedYear;
  return { month, year };
}
|
||||
|
||||
/**
 * Picks the id of the period in `result.periods` that corresponds to the
 * reference table's column header.
 *
 * The header is parsed with `parseColumnLabelPeriodEnd`; a period qualifies
 * when its end date falls in the same UTC year and month and its shape fits
 * the statement kind (no start date for `balance`, a start date otherwise).
 * Among qualifying non-balance periods the longest one wins; remaining ties
 * go to the latest end date.
 *
 * @param result Hydration result whose `periods` are searched.
 * @param statement Statement kind the caller is comparing.
 * @param columnLabel Header text of the reference column.
 * @returns The chosen period id, or `null` when the label cannot be parsed or
 *   no period qualifies.
 */
function choosePeriodIdForColumnLabel(
|
||||
result: TaxonomyHydrationResult,
|
||||
statement: Extract<FinancialStatementKind, 'income' | 'balance' | 'cash_flow'>,
|
||||
columnLabel: string
|
||||
) {
|
||||
const parsed = parseColumnLabelPeriodEnd(columnLabel);
|
||||
if (!parsed) {
|
||||
return null;
|
||||
|
||||
}
|
||||
|
||||
const matchingPeriods = result.periods
|
||||
.filter((period): period is ResultPeriod => {
|
||||
// periodEnd / periodStart are helpers defined elsewhere in this file;
// presumably they return date strings or a falsy value - TODO confirm.
const end = periodEnd(period as ResultPeriod);
|
||||
if (!end) {
|
||||
return false;
|
||||
|
||||
}
|
||||
// NOTE(review): the UTC getters below match the label only if `end` is
// parsed as UTC (e.g. a date-only ISO string) - confirm the format.
const endDate = new Date(end);
|
||||
if (Number.isNaN(endDate.getTime())) {
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
// Balance rows need a period without a start date; income and cash-flow
// rows need one with a start date.
const periodMatchesStatement = statement === 'balance'
|
||||
? !periodStart(period as ResultPeriod)
|
||||
: Boolean(periodStart(period as ResultPeriod));
|
||||
if (!periodMatchesStatement) {
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
return endDate.getUTCFullYear() === parsed.year && endDate.getUTCMonth() === parsed.month;
|
||||
})
|
||||
.sort((left, right) => {
|
||||
if (statement !== 'balance') {
|
||||
const leftStart = periodStart(left);
|
||||
const rightStart = periodStart(right);
|
||||
// Duration in whole days; -1 marks a period without a start date.
const leftDuration = leftStart
|
||||
? Math.round((Date.parse(periodEnd(left) as string) - Date.parse(leftStart)) / (1000 * 60 * 60 * 24))
|
||||
: -1;
|
||||
const rightDuration = rightStart
|
||||
? Math.round((Date.parse(periodEnd(right) as string) - Date.parse(rightStart)) / (1000 * 60 * 60 * 24))
|
||||
: -1;
|
||||
|
||||
// Longer durations sort first.
if (leftDuration !== rightDuration) {
|
||||
return rightDuration - leftDuration;
|
||||
}
|
||||
}
|
||||
|
||||
// Tie-break (and the only ordering for balance): later end date first.
return Date.parse(periodEnd(right) as string) - Date.parse(periodEnd(left) as string);
|
||||
});
|
||||
|
||||
return matchingPeriods[0]?.id ?? null;
|
||||
}
|
||||
|
||||
function findSurfaceValue(
|
||||
result: TaxonomyHydrationResult,
|
||||
statement: Extract<FinancialStatementKind, 'income' | 'balance'>,
|
||||
surfaceKey: string
|
||||
statement: Extract<FinancialStatementKind, 'income' | 'balance' | 'cash_flow'>,
|
||||
surfaceKey: string,
|
||||
referenceColumnLabel?: string
|
||||
) {
|
||||
const rows = result.surface_rows[statement] ?? [];
|
||||
const row = rows.find((entry) => entry.key === surfaceKey) ?? null;
|
||||
@@ -306,9 +444,11 @@ function findSurfaceValue(
|
||||
return { row: null, value: null };
|
||||
}
|
||||
|
||||
const periodId = statement === 'balance'
|
||||
const periodId = (referenceColumnLabel
|
||||
? choosePeriodIdForColumnLabel(result, statement, referenceColumnLabel)
|
||||
: null) ?? (statement === 'balance'
|
||||
? chooseInstantPeriodId(result)
|
||||
: chooseDurationPeriodId(result);
|
||||
: chooseDurationPeriodId(result));
|
||||
|
||||
if (periodId) {
|
||||
const directValue = row.values[periodId];
|
||||
@@ -412,14 +552,24 @@ async function fetchLatestAnnualFiling(company: CompanyCase): Promise<TaxonomyHy
|
||||
async function scrapeFiscalAiTable(
|
||||
page: import('@playwright/test').Page,
|
||||
exchangeTicker: string,
|
||||
statement: 'income' | 'balance'
|
||||
statement: 'income' | 'balance' | 'cash_flow'
|
||||
): Promise<FiscalAiTable> {
|
||||
const pagePath = statement === 'income' ? 'income-statement' : 'balance-sheet';
|
||||
const pagePath = statement === 'income'
|
||||
? 'income-statement'
|
||||
: statement === 'balance'
|
||||
? 'balance-sheet'
|
||||
: 'cash-flow-statement';
|
||||
const url = `https://fiscal.ai/company/${exchangeTicker}/financials/${pagePath}/annual/?templateType=standardized`;
|
||||
|
||||
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120_000 });
|
||||
await page.waitForSelector('table', { timeout: 120_000 });
|
||||
await page.waitForTimeout(2_500);
|
||||
await page.evaluate(async () => {
|
||||
window.scrollTo(0, document.body.scrollHeight);
|
||||
await new Promise((resolve) => setTimeout(resolve, 750));
|
||||
window.scrollTo(0, 0);
|
||||
await new Promise((resolve) => setTimeout(resolve, 250));
|
||||
});
|
||||
|
||||
return await page.evaluate(() => {
|
||||
function normalizeLabel(value: string) {
|
||||
@@ -452,45 +602,52 @@ async function scrapeFiscalAiTable(
|
||||
return Number.isFinite(parsed) ? (negative ? -Math.abs(parsed) : parsed) : null;
|
||||
}
|
||||
|
||||
const table = document.querySelector('table');
|
||||
if (!table) {
|
||||
const tables = Array.from(document.querySelectorAll('table'));
|
||||
if (tables.length === 0) {
|
||||
throw new Error('Fiscal.ai table not found');
|
||||
}
|
||||
|
||||
const headerCells = Array.from(table.querySelectorAll('tr:first-child th, tr:first-child td'))
|
||||
.map((cell) => cell.textContent?.trim() ?? '')
|
||||
.filter((value) => value.length > 0);
|
||||
const rowsByLabel = new Map<string, FiscalAiTableRow>();
|
||||
let columnLabel = 'unknown';
|
||||
|
||||
const annualColumnIndex = headerCells.findIndex((value, index) => index > 0 && value !== 'LTM');
|
||||
if (annualColumnIndex < 0) {
|
||||
throw new Error(`Could not locate latest annual column in headers: ${headerCells.join(' | ')}`);
|
||||
}
|
||||
for (const table of tables) {
|
||||
const headerCells = Array.from(table.querySelectorAll('tr:first-child th, tr:first-child td'))
|
||||
.map((cell) => cell.textContent?.trim() ?? '')
|
||||
.filter((value) => value.length > 0);
|
||||
const annualColumnIndex = headerCells.findIndex((value, index) => index > 0 && value !== 'LTM');
|
||||
if (annualColumnIndex < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const rows = Array.from(table.querySelectorAll('tr'))
|
||||
.slice(1)
|
||||
.map((row) => {
|
||||
if (columnLabel === 'unknown') {
|
||||
columnLabel = headerCells[annualColumnIndex] ?? 'unknown';
|
||||
}
|
||||
|
||||
for (const row of Array.from(table.querySelectorAll('tr')).slice(1)) {
|
||||
const cells = Array.from(row.querySelectorAll('td'));
|
||||
if (cells.length <= annualColumnIndex) {
|
||||
return null;
|
||||
continue;
|
||||
}
|
||||
|
||||
const label = cells[0]?.textContent?.trim() ?? '';
|
||||
const valueText = cells[annualColumnIndex]?.textContent?.trim() ?? '';
|
||||
if (!label) {
|
||||
return null;
|
||||
continue;
|
||||
}
|
||||
|
||||
return {
|
||||
rowsByLabel.set(label, {
|
||||
label,
|
||||
normalizedLabel: normalizeLabel(label),
|
||||
valueText,
|
||||
value: parseDisplayedNumber(valueText)
|
||||
};
|
||||
})
|
||||
.filter((entry): entry is FiscalAiTableRow => entry !== null);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const rows = Array.from(rowsByLabel.values());
|
||||
|
||||
return {
|
||||
columnLabel: headerCells[annualColumnIndex] ?? 'unknown',
|
||||
columnLabel,
|
||||
rows
|
||||
};
|
||||
});
|
||||
@@ -536,7 +693,7 @@ function compareRow(
|
||||
): ComparisonRow {
|
||||
const fiscalAiRow = findFiscalAiRow(fiscalAiTable.rows, target.fiscalAiLabels);
|
||||
const fiscalAiValueM = fiscalAiRow?.value ?? null;
|
||||
const ourSurface = findSurfaceValue(result, target.statement, target.surfaceKey);
|
||||
const ourSurface = findSurfaceValue(result, target.statement, target.surfaceKey, fiscalAiTable.columnLabel);
|
||||
const ourValueM = roundMillions(ourSurface.value);
|
||||
const absDiffM = absoluteDiff(ourValueM, fiscalAiValueM);
|
||||
const relDiffValue = relativeDiff(ourValueM, fiscalAiValueM);
|
||||
@@ -587,17 +744,34 @@ async function compareCase(page: import('@playwright/test').Page, company: Compa
|
||||
throw new Error(`${company.ticker} parse_status=${result.parse_status}${result.parse_error ? ` parse_error=${result.parse_error}` : ''}`);
|
||||
}
|
||||
|
||||
const incomeTable = await scrapeFiscalAiTable(page, company.exchangeTicker, 'income');
|
||||
const balanceTable = await scrapeFiscalAiTable(page, company.exchangeTicker, 'balance');
|
||||
const statementKinds = new Set(company.comparisons.map((target) => target.statement));
|
||||
const incomeTable = statementKinds.has('income')
|
||||
? await scrapeFiscalAiTable(page, company.exchangeTicker, 'income')
|
||||
: null;
|
||||
const balanceTable = statementKinds.has('balance')
|
||||
? await scrapeFiscalAiTable(page, company.exchangeTicker, 'balance')
|
||||
: null;
|
||||
const cashFlowTable = statementKinds.has('cash_flow')
|
||||
? await scrapeFiscalAiTable(page, company.exchangeTicker, 'cash_flow')
|
||||
: null;
|
||||
const rows = company.comparisons.map((target) => {
|
||||
const table = target.statement === 'income' ? incomeTable : balanceTable;
|
||||
const table = target.statement === 'income'
|
||||
? incomeTable
|
||||
: target.statement === 'balance'
|
||||
? balanceTable
|
||||
: cashFlowTable;
|
||||
if (!table) {
|
||||
throw new Error(`Missing scraped table for ${target.statement}`);
|
||||
}
|
||||
return compareRow(target, result, table);
|
||||
});
|
||||
|
||||
const failures = rows.filter((row) => row.status === 'fail' || row.status === 'missing_ours');
|
||||
const failures = rows.filter(
|
||||
(row) => row.status === 'fail' || row.status === 'missing_ours' || row.status === 'missing_reference'
|
||||
);
|
||||
|
||||
console.log(
|
||||
`[compare-fiscal-ai] ${company.ticker} filing=${filing.accessionNumber} fiscal_pack=${result.fiscal_pack ?? 'null'} income_column="${incomeTable.columnLabel}" balance_column="${balanceTable.columnLabel}" pass=${rows.length - failures.length}/${rows.length}`
|
||||
`[compare-fiscal-ai] ${company.ticker} filing=${filing.accessionNumber} fiscal_pack=${result.fiscal_pack ?? 'null'} income_column="${incomeTable?.columnLabel ?? 'n/a'}" balance_column="${balanceTable?.columnLabel ?? 'n/a'}" cash_flow_column="${cashFlowTable?.columnLabel ?? 'n/a'}" pass=${rows.length - failures.length}/${rows.length}`
|
||||
);
|
||||
for (const row of rows) {
|
||||
console.log(
|
||||
@@ -625,18 +799,28 @@ async function compareCase(page: import('@playwright/test').Page, company: Compa
|
||||
|
||||
async function main() {
|
||||
process.env.XBRL_ENGINE_TIMEOUT_MS = process.env.XBRL_ENGINE_TIMEOUT_MS ?? '180000';
|
||||
const tickerFilter = parseTickerFilter(process.argv.slice(2));
|
||||
const selectedCases = tickerFilter
|
||||
? CASES.filter((entry) => entry.ticker === tickerFilter)
|
||||
: CASES;
|
||||
const filters = parseTickerFilter(process.argv.slice(2));
|
||||
const selectedCases = (filters.ticker
|
||||
? CASES.filter((entry) => entry.ticker === filters.ticker)
|
||||
: CASES
|
||||
)
|
||||
.map((entry) => ({
|
||||
...entry,
|
||||
comparisons: filters.statement
|
||||
? entry.comparisons.filter((target) => target.statement === filters.statement)
|
||||
: entry.comparisons
|
||||
}))
|
||||
.filter((entry) => entry.comparisons.length > 0);
|
||||
|
||||
if (selectedCases.length === 0) {
|
||||
console.error(`[compare-fiscal-ai] unknown ticker: ${tickerFilter}`);
|
||||
console.error(
|
||||
`[compare-fiscal-ai] no matching cases for ticker=${filters.ticker ?? 'all'} statement=${filters.statement ?? 'all'}`
|
||||
);
|
||||
process.exitCode = 1;
|
||||
return;
|
||||
}
|
||||
|
||||
const browser = await chromium.launch({ headless: false });
|
||||
const browser = await chromium.launch({ headless: true });
|
||||
const page = await browser.newPage({
|
||||
userAgent: BROWSER_USER_AGENT
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user