From c274f4d55b23cded6002427373dbaeaa90ed0558 Mon Sep 17 00:00:00 2001 From: francy51 Date: Thu, 12 Mar 2026 15:25:06 -0400 Subject: [PATCH] refactor(taxonomy): remove legacy parser and add rollout checks --- .gitea/workflows/taxonomy-sidecar.yml | 44 ++ .github/workflows/taxonomy-sidecar.yml | 44 ++ lib/server/taxonomy/engine.test.ts | 86 +++ lib/server/taxonomy/linkbase-parser.test.ts | 63 -- lib/server/taxonomy/linkbase-parser.ts | 310 --------- lib/server/taxonomy/xbrl-parser.test.ts | 60 -- lib/server/taxonomy/xbrl-parser.ts | 264 -------- package.json | 5 + scripts/compare-fiscal-ai-statements.ts | 681 ++++++++++++++++++++ scripts/report-taxonomy-health.ts | 200 ++++++ scripts/validate-taxonomy-packs.ts | 284 ++++++++ 11 files changed, 1344 insertions(+), 697 deletions(-) create mode 100644 .gitea/workflows/taxonomy-sidecar.yml create mode 100644 .github/workflows/taxonomy-sidecar.yml create mode 100644 lib/server/taxonomy/engine.test.ts delete mode 100644 lib/server/taxonomy/linkbase-parser.test.ts delete mode 100644 lib/server/taxonomy/linkbase-parser.ts delete mode 100644 lib/server/taxonomy/xbrl-parser.test.ts delete mode 100644 lib/server/taxonomy/xbrl-parser.ts create mode 100644 scripts/compare-fiscal-ai-statements.ts create mode 100644 scripts/report-taxonomy-health.ts create mode 100644 scripts/validate-taxonomy-packs.ts diff --git a/.gitea/workflows/taxonomy-sidecar.yml b/.gitea/workflows/taxonomy-sidecar.yml new file mode 100644 index 0000000..c5042c3 --- /dev/null +++ b/.gitea/workflows/taxonomy-sidecar.yml @@ -0,0 +1,44 @@ +name: Taxonomy Sidecar + +on: + pull_request: + branches: + - main + push: + branches: + - codex/** + +concurrency: + group: taxonomy-sidecar-${{ github.ref }} + cancel-in-progress: true + +jobs: + taxonomy-sidecar: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: "1.3.5" + + - name: Setup Rust + uses: 
dtolnay/rust-toolchain@stable + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Typecheck + run: bun x tsc --noEmit + + - name: Rust unit tests + run: cargo test --manifest-path rust/Cargo.toml -p fiscal-xbrl-core + + - name: Taxonomy tests + run: bun test lib/server/taxonomy/engine.test.ts lib/server/financial-taxonomy.test.ts + + - name: Build Rust sidecar + run: cargo build --manifest-path rust/Cargo.toml -p fiscal-xbrl-cli diff --git a/.github/workflows/taxonomy-sidecar.yml b/.github/workflows/taxonomy-sidecar.yml new file mode 100644 index 0000000..c5042c3 --- /dev/null +++ b/.github/workflows/taxonomy-sidecar.yml @@ -0,0 +1,44 @@ +name: Taxonomy Sidecar + +on: + pull_request: + branches: + - main + push: + branches: + - codex/** + +concurrency: + group: taxonomy-sidecar-${{ github.ref }} + cancel-in-progress: true + +jobs: + taxonomy-sidecar: + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: "1.3.5" + + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Typecheck + run: bun x tsc --noEmit + + - name: Rust unit tests + run: cargo test --manifest-path rust/Cargo.toml -p fiscal-xbrl-core + + - name: Taxonomy tests + run: bun test lib/server/taxonomy/engine.test.ts lib/server/financial-taxonomy.test.ts + + - name: Build Rust sidecar + run: cargo build --manifest-path rust/Cargo.toml -p fiscal-xbrl-cli diff --git a/lib/server/taxonomy/engine.test.ts b/lib/server/taxonomy/engine.test.ts new file mode 100644 index 0000000..785b8c9 --- /dev/null +++ b/lib/server/taxonomy/engine.test.ts @@ -0,0 +1,86 @@ +import { beforeEach, describe, expect, it, mock } from 'bun:test'; + +import type { FinancialStatementKind } from '@/lib/types'; +import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types'; + +function 
createStatementRecord(factory: () => T): Record { + return { + income: factory(), + balance: factory(), + cash_flow: factory(), + equity: factory(), + comprehensive_income: factory() + }; +} + +function createHydrationResult(): TaxonomyHydrationResult { + return { + filing_id: 1, + ticker: 'TEST', + filing_date: '2025-12-31', + filing_type: '10-K', + parse_status: 'ready', + parse_error: null, + source: 'xbrl_instance_with_linkbase', + parser_engine: 'fiscal-xbrl', + parser_version: '0.1.0', + taxonomy_regime: 'us-gaap', + fiscal_pack: 'core', + periods: [], + faithful_rows: createStatementRecord(() => []), + statement_rows: createStatementRecord(() => []), + surface_rows: createStatementRecord(() => []), + detail_rows: createStatementRecord(() => ({})), + kpi_rows: [], + contexts: [], + derived_metrics: null, + validation_result: null, + facts_count: 0, + concepts_count: 0, + dimensions_count: 0, + assets: [], + concepts: [], + facts: [], + metric_validations: [], + normalization_summary: { + surfaceRowCount: 0, + detailRowCount: 0, + kpiRowCount: 0, + unmappedRowCount: 0, + materialUnmappedRowCount: 0, + warnings: ['rust_warning'] + } + }; +} + +const mockHydrateFromSidecar = mock(async () => createHydrationResult()); +mock.module('@/lib/server/taxonomy/parser-client', () => ({ + hydrateFilingTaxonomySnapshotFromSidecar: mockHydrateFromSidecar +})); + +describe('taxonomy engine rust path', () => { + beforeEach(() => { + mockHydrateFromSidecar.mockClear(); + }); + + it('returns sidecar output directly from the Rust sidecar', async () => { + const { hydrateFilingTaxonomySnapshot } = await import('@/lib/server/taxonomy/engine'); + + const input: TaxonomyHydrationInput = { + filingId: 1, + ticker: 'TEST', + cik: '0000000001', + accessionNumber: '0000000001-25-000001', + filingDate: '2025-12-31', + filingType: '10-K', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/1/000000000125000001/', + primaryDocument: 'test-20251231.htm' + }; + + const result = await 
hydrateFilingTaxonomySnapshot(input); + + expect(mockHydrateFromSidecar).toHaveBeenCalledTimes(1); + expect(result.parser_engine).toBe('fiscal-xbrl'); + expect(result.normalization_summary.warnings).toEqual(['rust_warning']); + }); +}); diff --git a/lib/server/taxonomy/linkbase-parser.test.ts b/lib/server/taxonomy/linkbase-parser.test.ts deleted file mode 100644 index a7b8d45..0000000 --- a/lib/server/taxonomy/linkbase-parser.test.ts +++ /dev/null @@ -1,63 +0,0 @@ -import { describe, expect, it } from 'bun:test'; -import { - classifyStatementRole, - parseLabelLinkbase, - parsePresentationLinkbase -} from '@/lib/server/taxonomy/linkbase-parser'; - -const SAMPLE_LABEL_LINKBASE = ` - - - - Rev. - Revenues - - - - -`; - -const SAMPLE_PRESENTATION_LINKBASE = ` - - - - - - - - - -`; - -describe('linkbase parser', () => { - it('builds preferred labels from label linkbase', () => { - const labels = parseLabelLinkbase(SAMPLE_LABEL_LINKBASE); - expect(labels.get('http://fasb.org/us-gaap/2024#Revenues')).toBe('Revenues'); - }); - - it('builds role trees with depth/order/parent metadata', () => { - const rows = parsePresentationLinkbase(SAMPLE_PRESENTATION_LINKBASE); - expect(rows.length).toBe(3); - - const root = rows.find((row) => row.qname === 'us-gaap:StatementLineItems'); - const revenue = rows.find((row) => row.qname === 'us-gaap:Revenues'); - const cogs = rows.find((row) => row.qname === 'us-gaap:CostOfGoodsSold'); - - expect(root?.depth).toBe(0); - expect(root?.parentConceptKey).toBeNull(); - expect(revenue?.depth).toBe(1); - expect(cogs?.depth).toBe(1); - expect(revenue?.parentConceptKey).toBe(root?.conceptKey ?? null); - expect(revenue?.order).toBeLessThan(cogs?.order ?? 
Number.POSITIVE_INFINITY); - }); - - it('classifies statement roles into canonical statement kinds', () => { - expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfOperations')).toBe('income'); - expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfFinancialPosition')).toBe('balance'); - expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfCashFlows')).toBe('cash_flow'); - }); -}); diff --git a/lib/server/taxonomy/linkbase-parser.ts b/lib/server/taxonomy/linkbase-parser.ts deleted file mode 100644 index d0c7b7d..0000000 --- a/lib/server/taxonomy/linkbase-parser.ts +++ /dev/null @@ -1,310 +0,0 @@ -import type { FinancialStatementKind } from '@/lib/types'; -import type { TaxonomyNamespaceMap, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types'; - -function decodeXmlEntities(value: string) { - return value - .replace(/&/gi, '&') - .replace(/</gi, '<') - .replace(/>/gi, '>') - .replace(/"/gi, '"') - .replace(/'/gi, "'") - .replace(/ | /gi, ' ') - .trim(); -} - -function parseNamespaceMap(raw: string): TaxonomyNamespaceMap { - const map: TaxonomyNamespaceMap = {}; - const rootStart = raw.match(/<[^>]*linkbase[^>]*>/i)?.[0] ?? raw.slice(0, 1200); - - for (const match of rootStart.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) { - const prefix = (match[1] ?? '').trim(); - const uri = (match[2] ?? '').trim(); - if (!prefix || !uri) { - continue; - } - - map[prefix] = uri; - } - - return map; -} - -function qnameFromHref(href: string) { - const fragment = href.includes('#') ? 
href.slice(href.indexOf('#') + 1) : href; - if (!fragment) { - return null; - } - - const cleaned = fragment.trim().replace(/^loc_+/i, ''); - if (!cleaned) { - return null; - } - - if (cleaned.includes(':')) { - return cleaned; - } - - if (cleaned.includes('_')) { - const idx = cleaned.indexOf('_'); - return `${cleaned.slice(0, idx)}:${cleaned.slice(idx + 1)}`; - } - - return null; -} - -function conceptFromQName(qname: string, namespaces: TaxonomyNamespaceMap) { - const [prefix, ...rest] = qname.split(':'); - const localName = rest.join(':'); - if (!prefix || !localName) { - return null; - } - - const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`; - - return { - qname, - namespaceUri, - localName, - conceptKey: `${namespaceUri}#${localName}` - }; -} - -function labelPriority(role: string | null) { - const normalized = (role ?? '').toLowerCase(); - if (!normalized) { - return 0; - } - - if (normalized.endsWith('/label')) { - return 4; - } - - if (normalized.endsWith('/terselabel')) { - return 3; - } - - if (normalized.endsWith('/verboselabel')) { - return 2; - } - - return 1; -} - -export function classifyStatementRole(roleUri: string): FinancialStatementKind | null { - const normalized = roleUri.toLowerCase(); - - if (/cash\s*flow|statementsof?cashflows|netcash/.test(normalized)) { - return 'cash_flow'; - } - - if (/shareholders?|stockholders?|equity|retainedearnings/.test(normalized)) { - return 'equity'; - } - - if (/comprehensive\s*income/.test(normalized)) { - return 'comprehensive_income'; - } - - if (/balance\s*sheet|financial\s*position|assets?andliabilities/.test(normalized)) { - return 'balance'; - } - - if (/operations|income\s*statement|statementsofincome|profit/.test(normalized)) { - return 'income'; - } - - return null; -} - -export function parseLabelLinkbase(raw: string): Map { - const namespaces = parseNamespaceMap(raw); - const preferredLabelByConcept = new Map(); - - const linkPattern = 
/<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?labelLink>/gi; - for (const linkMatch of raw.matchAll(linkPattern)) { - const block = linkMatch[1] ?? ''; - const locByLabel = new Map(); - const resourceByLabel = new Map(); - - for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) { - const attrs = locMatch[1] ?? ''; - const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - if (!label || !href) { - continue; - } - - const qname = qnameFromHref(href); - if (!qname) { - continue; - } - - const concept = conceptFromQName(qname, namespaces); - if (!concept) { - continue; - } - - locByLabel.set(label, concept.conceptKey); - } - - for (const resourceMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?label\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?label>/gi)) { - const attrs = resourceMatch[1] ?? ''; - const body = decodeXmlEntities(resourceMatch[2] ?? '').replace(/\s+/g, ' ').trim(); - if (!body) { - continue; - } - - const resourceLabel = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - const role = attrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? null; - if (!resourceLabel) { - continue; - } - - resourceByLabel.set(resourceLabel, { - text: body, - role - }); - } - - for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)\/?>/gi)) { - const attrs = arcMatch[1] ?? ''; - const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? 
''; - if (!from || !to) { - continue; - } - - const conceptKey = locByLabel.get(from); - const resource = resourceByLabel.get(to); - if (!conceptKey || !resource) { - continue; - } - - const priority = labelPriority(resource.role); - const current = preferredLabelByConcept.get(conceptKey); - if (!current || priority > current.priority) { - preferredLabelByConcept.set(conceptKey, { - text: resource.text, - priority - }); - } - } - } - - return new Map( - [...preferredLabelByConcept.entries()].map(([conceptKey, value]) => [conceptKey, value.text]) - ); -} - -export function parsePresentationLinkbase(raw: string): TaxonomyPresentationConcept[] { - const namespaces = parseNamespaceMap(raw); - const rows: TaxonomyPresentationConcept[] = []; - - const linkPattern = /<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?presentationLink>/gi; - for (const linkMatch of raw.matchAll(linkPattern)) { - const linkAttrs = linkMatch[1] ?? ''; - const block = linkMatch[2] ?? ''; - const roleUri = linkAttrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - if (!roleUri) { - continue; - } - - const locByLabel = new Map(); - for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) { - const attrs = locMatch[1] ?? ''; - const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? 
''; - if (!label || !href) { - continue; - } - - const qname = qnameFromHref(href); - if (!qname) { - continue; - } - - const concept = conceptFromQName(qname, namespaces); - if (!concept) { - continue; - } - - locByLabel.set(label, { - conceptKey: concept.conceptKey, - qname: concept.qname, - isAbstract: /abstract/i.test(concept.localName) - }); - } - - const childrenByLabel = new Map>(); - const incoming = new Set(); - const allReferenced = new Set(); - - for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)\/?>/gi)) { - const attrs = arcMatch[1] ?? ''; - const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - const orderRaw = attrs.match(/\border=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - const order = Number.parseFloat(orderRaw); - - if (!from || !to || !locByLabel.has(from) || !locByLabel.has(to)) { - continue; - } - - const group = childrenByLabel.get(from) ?? []; - group.push({ label: to, order: Number.isFinite(order) ? order : group.length + 1 }); - childrenByLabel.set(from, group); - - incoming.add(to); - allReferenced.add(from); - allReferenced.add(to); - } - - const roots = [...allReferenced].filter((label) => !incoming.has(label)); - const visited = new Set(); - - function dfs(label: string, depth: number, parentLabel: string | null, baseOrder: number) { - const node = locByLabel.get(label); - if (!node) { - return; - } - - const pathKey = `${parentLabel ?? 'root'}::${label}::${depth}`; - if (visited.has(pathKey)) { - return; - } - visited.add(pathKey); - - const parentConceptKey = parentLabel ? (locByLabel.get(parentLabel)?.conceptKey ?? null) : null; - rows.push({ - conceptKey: node.conceptKey, - qname: node.qname, - roleUri, - order: baseOrder, - depth, - parentConceptKey, - isAbstract: node.isAbstract - }); - - const children = [...(childrenByLabel.get(label) ?? 
[])].sort((left, right) => left.order - right.order); - for (let i = 0; i < children.length; i += 1) { - const child = children[i]; - if (!child) { - continue; - } - - dfs(child.label, depth + 1, label, baseOrder + (i + 1) / 1000); - } - } - - for (let i = 0; i < roots.length; i += 1) { - const root = roots[i]; - if (!root) { - continue; - } - - dfs(root, 0, null, i + 1); - } - } - - return rows; -} diff --git a/lib/server/taxonomy/xbrl-parser.test.ts b/lib/server/taxonomy/xbrl-parser.test.ts deleted file mode 100644 index 4b66575..0000000 --- a/lib/server/taxonomy/xbrl-parser.test.ts +++ /dev/null @@ -1,60 +0,0 @@ -import { describe, expect, it } from 'bun:test'; -import { parseXbrlInstance } from '@/lib/server/taxonomy/xbrl-parser'; - -const SAMPLE_XBRL = ` - - - - 2025-01-01 - 2025-12-31 - - - - - - us-gaap:ConsolidatedGroupMember - - - - 2025-12-31 - - - - iso4217:USD - - 1,234 - 5,678 - Acme Corp - -`; - -describe('xbrl instance parser', () => { - it('parses contexts, units, numeric facts, dimensions, and concept keys', () => { - const parsed = parseXbrlInstance(SAMPLE_XBRL, 'abc_htm.xml'); - - expect(parsed.contexts.c1?.periodStart).toBe('2025-01-01'); - expect(parsed.contexts.c1?.periodEnd).toBe('2025-12-31'); - expect(parsed.contexts.c2?.periodInstant).toBe('2025-12-31'); - expect(parsed.contexts.c2?.dimensions.length).toBe(1); - expect(parsed.units.u1?.measure).toBe('iso4217:USD'); - - expect(parsed.facts.length).toBe(2); - const revenueFact = parsed.facts.find((fact) => fact.localName === 'Revenues'); - const assetsFact = parsed.facts.find((fact) => fact.localName === 'Assets'); - - expect(revenueFact?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Revenues'); - expect(revenueFact?.isDimensionless).toBe(true); - expect(revenueFact?.value).toBe(1234); - expect(revenueFact?.sourceFile).toBe('abc_htm.xml'); - - expect(assetsFact?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Assets'); - expect(assetsFact?.isDimensionless).toBe(false); - 
expect(assetsFact?.dimensions[0]).toEqual({ - axis: 'us-gaap:StatementBusinessSegmentsAxis', - member: 'us-gaap:ConsolidatedGroupMember' - }); - }); -}); diff --git a/lib/server/taxonomy/xbrl-parser.ts b/lib/server/taxonomy/xbrl-parser.ts deleted file mode 100644 index f42ec93..0000000 --- a/lib/server/taxonomy/xbrl-parser.ts +++ /dev/null @@ -1,264 +0,0 @@ -import type { FinancialStatementKind } from '@/lib/types'; -import type { TaxonomyContext, TaxonomyFact, TaxonomyNamespaceMap, TaxonomyUnit } from '@/lib/server/taxonomy/types'; - -function decodeXmlEntities(value: string) { - return value - .replace(/&/gi, '&') - .replace(/</gi, '<') - .replace(/>/gi, '>') - .replace(/"/gi, '"') - .replace(/'/gi, "'") - .replace(/ | /gi, ' ') - .replace(/&#x([0-9a-f]+);/gi, (_match, hex) => { - const parsed = Number.parseInt(hex, 16); - if (!Number.isFinite(parsed)) { - return ' '; - } - - try { - return String.fromCodePoint(parsed); - } catch { - return ' '; - } - }) - .replace(/&#([0-9]+);/g, (_match, numeric) => { - const parsed = Number.parseInt(numeric, 10); - if (!Number.isFinite(parsed)) { - return ' '; - } - - try { - return String.fromCodePoint(parsed); - } catch { - return ' '; - } - }); -} - -function parseNumber(value: string) { - const trimmed = value.trim(); - if (!trimmed) { - return null; - } - - if (/^--+$/.test(trimmed)) { - return null; - } - - const negative = trimmed.startsWith('(') && trimmed.endsWith(')'); - const normalized = trimmed - .replace(/<[^>]+>/g, ' ') - .replace(/[,$\s]/g, '') - .replace(/[()]/g, '') - .replace(/\u2212/g, '-'); - - if (!normalized) { - return null; - } - - const parsed = Number.parseFloat(normalized); - if (!Number.isFinite(parsed)) { - return null; - } - - return negative ? -Math.abs(parsed) : parsed; -} - -function parseNamespaceMapFromDocument(raw: string): TaxonomyNamespaceMap { - const map: TaxonomyNamespaceMap = {}; - const rootStart = raw.match(/<[^>]*xbrl[^>]*>/i)?.[0] ?? 
raw.slice(0, 1200); - - for (const match of rootStart.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) { - const prefix = (match[1] ?? '').trim(); - const uri = (match[2] ?? '').trim(); - - if (!prefix || !uri) { - continue; - } - - map[prefix] = uri; - } - - return map; -} - -function parseContexts(raw: string): Record { - const contexts: Record = {}; - const contextPattern = /<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?context>/gi; - - for (const match of raw.matchAll(contextPattern)) { - const contextId = (match[1] ?? '').trim(); - const block = match[2] ?? ''; - if (!contextId) { - continue; - } - - const periodStart = block.match(/<(?:[a-z0-9_\-]+:)?startDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?startDate>/i)?.[1]?.trim() ?? null; - const periodEnd = block.match(/<(?:[a-z0-9_\-]+:)?endDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?endDate>/i)?.[1]?.trim() ?? null; - const periodInstant = block.match(/<(?:[a-z0-9_\-]+:)?instant>([^<]+)<\/(?:[a-z0-9_\-]+:)?instant>/i)?.[1]?.trim() ?? null; - - const dimensions: Array<{ axis: string; member: string }> = []; - const dimPattern = /<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>([^<]+)<\/(?:[a-z0-9_\-]+:)?explicitMember>/gi; - for (const dimMatch of block.matchAll(dimPattern)) { - const axis = decodeXmlEntities((dimMatch[1] ?? '').trim()); - const member = decodeXmlEntities((dimMatch[2] ?? '').trim()); - if (!axis || !member) { - continue; - } - - dimensions.push({ axis, member }); - } - - contexts[contextId] = { - id: contextId, - periodStart, - periodEnd, - periodInstant, - dimensions - }; - } - - return contexts; -} - -function parseUnits(raw: string): Record { - const units: Record = {}; - const unitPattern = /<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?unit>/gi; - - for (const match of raw.matchAll(unitPattern)) { - const id = (match[1] ?? '').trim(); - const block = match[2] ?? 
''; - if (!id) { - continue; - } - - const measures = [...block.matchAll(/<(?:[a-z0-9_\-]+:)?measure>([^<]+)<\/(?:[a-z0-9_\-]+:)?measure>/gi)] - .map((entry) => decodeXmlEntities((entry[1] ?? '').trim())) - .filter(Boolean); - - let measure: string | null = null; - if (measures.length === 1) { - measure = measures[0] ?? null; - } else if (measures.length > 1) { - measure = measures.join('/'); - } - - units[id] = { - id, - measure - }; - } - - return units; -} - -function classifyStatementKind(localName: string): FinancialStatementKind | null { - const normalized = localName.toLowerCase(); - - if (/cash|operatingactivities|investingactivities|financingactivities/.test(normalized)) { - return 'cash_flow'; - } - - if (/equity|retainedearnings|additionalpaidincapital/.test(normalized)) { - return 'equity'; - } - - if (/comprehensiveincome/.test(normalized)) { - return 'comprehensive_income'; - } - - if (/asset|liabilit|debt/.test(normalized)) { - return 'balance'; - } - - if (/revenue|income|profit|expense|costof/.test(normalized)) { - return 'income'; - } - - return null; -} - -function isXbrlInfrastructurePrefix(prefix: string) { - const normalized = prefix.toLowerCase(); - return normalized === 'xbrli' - || normalized === 'xlink' - || normalized === 'link' - || normalized === 'xbrldi' - || normalized === 'xbrldt'; -} - -function localNameToKey(namespaceUri: string, localName: string) { - return `${namespaceUri}#${localName}`; -} - -export function parseXbrlInstance( - raw: string, - sourceFile: string | null -): { - namespaces: TaxonomyNamespaceMap; - contexts: Record; - units: Record; - facts: TaxonomyFact[]; -} { - const namespaces = parseNamespaceMapFromDocument(raw); - const contexts = parseContexts(raw); - const units = parseUnits(raw); - const facts: TaxonomyFact[] = []; - - const factPattern = /<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>([\s\S]*?)<\/\1:\2>/g; - - for (const match of raw.matchAll(factPattern)) { - const 
prefix = (match[1] ?? '').trim(); - const localName = (match[2] ?? '').trim(); - const attrs = match[3] ?? ''; - const body = decodeXmlEntities((match[4] ?? '').trim()); - - if (!prefix || !localName || isXbrlInfrastructurePrefix(prefix)) { - continue; - } - - const contextId = attrs.match(/\bcontextRef=["']([^"']+)["']/i)?.[1]?.trim() ?? ''; - if (!contextId) { - continue; - } - - const value = parseNumber(body); - if (value === null) { - continue; - } - - const unitRef = attrs.match(/\bunitRef=["']([^"']+)["']/i)?.[1]?.trim() ?? null; - const decimals = attrs.match(/\bdecimals=["']([^"']+)["']/i)?.[1]?.trim() ?? null; - - const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`; - const context = contexts[contextId]; - - facts.push({ - conceptKey: localNameToKey(namespaceUri, localName), - qname: `${prefix}:${localName}`, - namespaceUri, - localName, - contextId, - unit: unitRef && units[unitRef]?.measure ? units[unitRef]?.measure ?? unitRef : unitRef, - decimals, - value, - periodStart: context?.periodStart ?? null, - periodEnd: context?.periodEnd ?? null, - periodInstant: context?.periodInstant ?? null, - dimensions: context?.dimensions ?? [], - isDimensionless: (context?.dimensions.length ?? 
0) === 0, - sourceFile, - }); - } - - return { - namespaces, - contexts, - units, - facts - }; -} - -export function conceptStatementFallback(localName: string) { - return classifyStatementKind(localName); -} diff --git a/package.json b/package.json index 3092af1..a017507 100644 --- a/package.json +++ b/package.json @@ -6,8 +6,11 @@ "scripts": { "dev": "bun run scripts/dev.ts", "dev:next": "bun --bun next dev --turbopack", + "build:sidecar": "cargo build --manifest-path rust/Cargo.toml --release --bin fiscal-xbrl", "build": "bun --bun next build --turbopack", "bootstrap:prod": "bun run scripts/bootstrap-production.ts", + "check:sidecar": "cargo check --manifest-path rust/Cargo.toml", + "validate:taxonomy-packs": "bun run scripts/validate-taxonomy-packs.ts", "start": "bun --bun next start", "lint": "bun x tsc --noEmit", "e2e:prepare": "bun run scripts/e2e-prepare.ts", @@ -17,6 +20,8 @@ "backfill:filing-statements": "bun run scripts/backfill-filing-statements.ts", "backfill:search-index": "bun run scripts/backfill-search-index.ts", "backfill:taxonomy-snapshots": "bun run scripts/backfill-taxonomy-snapshots.ts", + "compare:fiscal-ai": "bun run scripts/compare-fiscal-ai-statements.ts", + "report:taxonomy-health": "bun run scripts/report-taxonomy-health.ts", "db:generate": "bun x drizzle-kit generate", "db:migrate": "bun x drizzle-kit migrate", "test:e2e": "bun run scripts/e2e-run.ts", diff --git a/scripts/compare-fiscal-ai-statements.ts b/scripts/compare-fiscal-ai-statements.ts new file mode 100644 index 0000000..a6f0283 --- /dev/null +++ b/scripts/compare-fiscal-ai-statements.ts @@ -0,0 +1,681 @@ +import { chromium } from '@playwright/test'; + +import type { FinancialStatementKind } from '@/lib/types'; +import { hydrateFilingTaxonomySnapshot } from '@/lib/server/taxonomy/engine'; +import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types'; + +type ComparisonTarget = { + statement: Extract; + surfaceKey: string; + fiscalAiLabels: 
string[]; + allowNotMeaningful?: boolean; + notMeaningfulWarnings?: string[]; +}; + +type CompanyCase = { + ticker: string; + exchangeTicker: string; + cik: string; + form: '10-K'; + comparisons: ComparisonTarget[]; +}; + +type SecRecentFilings = { + accessionNumber: string[]; + primaryDocument: string[]; + filingDate: string[]; + form: string[]; +}; + +type SecSubmission = { + filings?: { + recent?: SecRecentFilings; + }; +}; + +type FiscalAiTableRow = { + label: string; + normalizedLabel: string; + valueText: string; + value: number | null; +}; + +type FiscalAiTable = { + columnLabel: string; + rows: FiscalAiTableRow[]; +}; + +type ComparisonRow = { + statement: Extract; + surfaceKey: string; + fiscalAiLabel: string | null; + fiscalAiValueM: number | null; + ourValueM: number | null; + absDiffM: number | null; + relDiff: number | null; + status: 'pass' | 'fail' | 'missing_reference' | 'missing_ours' | 'not_meaningful'; +}; + +type ResultPeriod = TaxonomyHydrationResult['periods'][number] & { + period_start?: string | null; + period_end?: string | null; +}; + +const SEC_USER_AGENT = 'fiscal-clone/compare-fiscal-ai (contact: local-dev)'; +const BROWSER_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36'; +const VALUE_TOLERANCE_M = 1; +const RELATIVE_TOLERANCE = 0.005; + +const CASES: CompanyCase[] = [ + { + ticker: 'MSFT', + exchangeTicker: 'NasdaqGS-MSFT', + cik: '0000789019', + form: '10-K', + comparisons: [ + { statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Revenues'] }, + { + statement: 'income', + surfaceKey: 'gross_profit', + fiscalAiLabels: ['Gross Profit'], + allowNotMeaningful: true, + notMeaningfulWarnings: ['gross_profit_not_meaningful_broker_pack'] + }, + { statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Operating Expense'] }, + { statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: 
['Operating Profit', 'Operating Income'] }, + { statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Provision for Income Taxes', 'Income Tax Expense', 'Income Taxes'] }, + { + statement: 'income', + surfaceKey: 'net_income', + fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income'] + }, + ] + }, + { + ticker: 'JPM', + exchangeTicker: 'NYSE-JPM', + cik: '0000019617', + form: '10-K', + comparisons: [ + { statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Net Revenues', 'Total Revenues'] }, + { + statement: 'income', + surfaceKey: 'gross_profit', + fiscalAiLabels: ['Gross Profit'], + allowNotMeaningful: true, + notMeaningfulWarnings: ['gross_profit_not_meaningful_bank_pack'] + }, + { statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Total Operating Expenses', 'Non-Interest Expense'] }, + { statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Pre-Tax Income', 'Operating Income', 'Operating Profit'] }, + { statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax Expense'] }, + { + statement: 'income', + surfaceKey: 'net_income', + fiscalAiLabels: ['Net Income to Common', 'Net Income Attributable to Common Shareholders', 'Net Income'] + }, + ] + }, + { + ticker: 'AIG', + exchangeTicker: 'NYSE-AIG', + cik: '0000005272', + form: '10-K', + comparisons: [ + { statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Revenues', 'Revenue'] }, + { + statement: 'income', + surfaceKey: 'gross_profit', + fiscalAiLabels: ['Gross Profit'], + allowNotMeaningful: true, + notMeaningfulWarnings: ['gross_profit_not_meaningful_insurance_pack'] + }, + { statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Insurance Benefits & Claims'] }, + { statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Operating Income', 'Operating 
Profit'] }, + { statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax Expense'] }, + { + statement: 'income', + surfaceKey: 'net_income', + fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income'] + }, + ] + }, + { + ticker: 'O', + exchangeTicker: 'NYSE-O', + cik: '0000726728', + form: '10-K', + comparisons: [ + { statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Property Revenue', 'Rental Revenue', 'Total Revenues'] }, + { statement: 'income', surfaceKey: 'gross_profit', fiscalAiLabels: ['Gross Profit', 'Property Operating Profit'] }, + { statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'General and Administrative'] }, + { statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Operating Profit', 'Operating Income'] }, + { statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax Expense'] }, + { + statement: 'income', + surfaceKey: 'net_income', + fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Consolidated Net Income', 'Net Income'] + } + ] + }, + { + ticker: 'BLK', + exchangeTicker: 'NYSE-BLK', + cik: '0002012383', + form: '10-K', + comparisons: [ + { statement: 'income', surfaceKey: 'revenue', fiscalAiLabels: ['Total Revenues', 'Investment advisory and administration fees', 'Advisory and other revenue'] }, + { + statement: 'income', + surfaceKey: 'gross_profit', + fiscalAiLabels: ['Gross Profit'], + allowNotMeaningful: true, + notMeaningfulWarnings: ['gross_profit_not_meaningful_broker_pack'] + }, + { statement: 'income', surfaceKey: 'operating_expenses', fiscalAiLabels: ['Operating Expenses', 'Operating Expense'] }, + { statement: 'income', surfaceKey: 'operating_income', fiscalAiLabels: ['Operating Income', 'Operating Profit'] }, + { statement: 'income', surfaceKey: 'income_tax_expense', fiscalAiLabels: ['Income Taxes', 'Income Tax 
Expense'] }, + { + statement: 'income', + surfaceKey: 'net_income', + fiscalAiLabels: ['Net Income Attributable to Common Shareholders', 'Net Income Applicable to BlackRock, Inc.', 'Net Income'] + } + ] + } +]; + +function parseTickerFilter(argv: string[]) { + for (const arg of argv) { + if (arg === '--help' || arg === '-h') { + console.log('Compare live Fiscal.ai standardized statement rows against local sidecar output.'); + console.log(''); + console.log('Usage:'); + console.log(' bun run scripts/compare-fiscal-ai-statements.ts'); + console.log(' bun run scripts/compare-fiscal-ai-statements.ts --ticker=MSFT'); + process.exit(0); + } + + if (arg.startsWith('--ticker=')) { + const value = arg.slice('--ticker='.length).trim().toUpperCase(); + return value.length > 0 ? value : null; + } + } + + return null; +} + +function normalizeLabel(value: string) { + return value + .toLowerCase() + .replace(/&/g, ' and ') + .replace(/[^a-z0-9]+/g, ' ') + .trim() + .replace(/\s+/g, ' '); +} + +function parseDisplayedNumber(value: string) { + const trimmed = value.trim(); + if (!trimmed || /^[-–—]+$/.test(trimmed) || /pricing/i.test(trimmed)) { + return null; + } + + const negative = trimmed.startsWith('(') && trimmed.endsWith(')'); + const normalized = trimmed + .replace(/,/g, '') + .replace(/[%$]/g, '') + .replace(/[()]/g, '') + .trim(); + + if (!normalized) { + return null; + } + + const parsed = Number.parseFloat(normalized); + if (!Number.isFinite(parsed)) { + return null; + } + + return negative ? 
-Math.abs(parsed) : parsed; +} + +function roundMillions(value: number | null) { + if (value === null || !Number.isFinite(value)) { + return null; + } + + return Math.round(value / 1_000_000); +} + +function absoluteDiff(left: number | null, right: number | null) { + if (left === null || right === null) { + return null; + } + + return Math.abs(left - right); +} + +function relativeDiff(left: number | null, right: number | null) { + if (left === null || right === null) { + return null; + } + + const baseline = Math.max(Math.abs(right), 1); + return Math.abs(left - right) / baseline; +} + +function periodStart(period: ResultPeriod) { + return period.periodStart ?? period.period_start ?? null; +} + +function periodEnd(period: ResultPeriod) { + return period.periodEnd ?? period.period_end ?? null; +} + +function chooseDurationPeriodId(result: TaxonomyHydrationResult) { + const annualPeriods = result.periods + .filter((period): period is ResultPeriod => Boolean(periodStart(period as ResultPeriod) && periodEnd(period as ResultPeriod))) + .map((period) => { + const durationDays = Math.round( + (Date.parse(periodEnd(period) as string) - Date.parse(periodStart(period) as string)) / (1000 * 60 * 60 * 24) + ); + return { period, durationDays }; + }) + .filter((entry) => entry.durationDays >= 300) + .sort((left, right) => { + return Date.parse(periodEnd(right.period) as string) - Date.parse(periodEnd(left.period) as string); + }); + + return annualPeriods[0]?.period.id ?? null; +} + +function chooseInstantPeriodId(result: TaxonomyHydrationResult) { + const instantPeriods = result.periods + .filter((period): period is ResultPeriod => !periodStart(period as ResultPeriod) && Boolean(periodEnd(period as ResultPeriod))) + .sort((left, right) => Date.parse(periodEnd(right) as string) - Date.parse(periodEnd(left) as string)); + + return instantPeriods[0]?.id ?? 
null; +} + +function findSurfaceValue( + result: TaxonomyHydrationResult, + statement: Extract, + surfaceKey: string +) { + const rows = result.surface_rows[statement] ?? []; + const row = rows.find((entry) => entry.key === surfaceKey) ?? null; + if (!row) { + return { row: null, value: null }; + } + + const periodId = statement === 'balance' + ? chooseInstantPeriodId(result) + : chooseDurationPeriodId(result); + + if (periodId) { + const directValue = row.values[periodId]; + if (directValue !== null && directValue !== undefined) { + return { row, value: directValue }; + } + } + + const periodById = new Map( + result.periods.map((period) => [period.id, period as ResultPeriod]) + ); + + const fallback = Object.entries(row.values) + .filter((entry): entry is [string, number] => entry[1] !== null) + .sort((left, right) => { + const leftPeriod = periodById.get(left[0]); + const rightPeriod = periodById.get(right[0]); + const leftDate = leftPeriod ? Date.parse(periodEnd(leftPeriod) ?? '') : Number.NaN; + const rightDate = rightPeriod ? Date.parse(periodEnd(rightPeriod) ?? '') : Number.NaN; + + if (Number.isFinite(leftDate) && Number.isFinite(rightDate) && leftDate !== rightDate) { + return rightDate - leftDate; + } + + return right[0].localeCompare(left[0]); + })[0]; + + return { + row, + value: fallback?.[1] ?? null + }; +} + +function rowResolutionMethod(row: Record | null) { + if (!row) { + return null; + } + + return (row.resolutionMethod ?? row.resolution_method ?? null) as string | null; +} + +function rowWarningCodes(row: Record | null) { + if (!row) { + return [] as string[]; + } + + const value = row.warningCodes ?? row.warning_codes ?? []; + return Array.isArray(value) ? 
value.filter((entry): entry is string => typeof entry === 'string') : []; +} + +function buildSecFilingUrl(cik: string, accessionNumber: string) { + return `https://www.sec.gov/Archives/edgar/data/${Number.parseInt(cik, 10)}/${accessionNumber.replace(/-/g, '')}/`; +} + +async function fetchLatestAnnualFiling(company: CompanyCase): Promise { + const cik = company.cik.padStart(10, '0'); + const response = await fetch(`https://data.sec.gov/submissions/CIK${cik}.json`, { + headers: { + 'user-agent': SEC_USER_AGENT, + accept: 'application/json' + } + }); + + if (!response.ok) { + throw new Error(`SEC submissions fetch failed for ${company.ticker}: ${response.status}`); + } + + const payload = await response.json() as SecSubmission; + const recent = payload.filings?.recent; + if (!recent) { + throw new Error(`SEC submissions payload missing recent filings for ${company.ticker}`); + } + + for (let index = 0; index < recent.form.length; index += 1) { + if (recent.form[index] !== company.form) { + continue; + } + + const accessionNumber = recent.accessionNumber[index]; + const filingDate = recent.filingDate[index]; + const primaryDocument = recent.primaryDocument[index]; + if (!accessionNumber || !filingDate || !primaryDocument) { + continue; + } + + return { + filingId: index + 1, + ticker: company.ticker, + cik: company.cik, + accessionNumber, + filingDate, + filingType: company.form, + filingUrl: buildSecFilingUrl(company.cik, accessionNumber), + primaryDocument + }; + } + + throw new Error(`No ${company.form} found in SEC recent filings for ${company.ticker}`); +} + +async function scrapeFiscalAiTable( + page: import('@playwright/test').Page, + exchangeTicker: string, + statement: 'income' | 'balance' +): Promise { + const pagePath = statement === 'income' ? 
'income-statement' : 'balance-sheet'; + const url = `https://fiscal.ai/company/${exchangeTicker}/financials/${pagePath}/annual/?templateType=standardized`; + + await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 120_000 }); + await page.waitForSelector('table', { timeout: 120_000 }); + await page.waitForTimeout(2_500); + + return await page.evaluate(() => { + function normalizeLabel(value: string) { + return value + .toLowerCase() + .replace(/&/g, ' and ') + .replace(/[^a-z0-9]+/g, ' ') + .trim() + .replace(/\s+/g, ' '); + } + + function parseDisplayedNumber(value: string) { + const trimmed = value.trim(); + if (!trimmed || /^[-–—]+$/.test(trimmed) || /pricing/i.test(trimmed)) { + return null; + } + + const negative = trimmed.startsWith('(') && trimmed.endsWith(')'); + const normalized = trimmed + .replace(/,/g, '') + .replace(/[%$]/g, '') + .replace(/[()]/g, '') + .trim(); + + if (!normalized) { + return null; + } + + const parsed = Number.parseFloat(normalized); + return Number.isFinite(parsed) ? (negative ? -Math.abs(parsed) : parsed) : null; + } + + const table = document.querySelector('table'); + if (!table) { + throw new Error('Fiscal.ai table not found'); + } + + const headerCells = Array.from(table.querySelectorAll('tr:first-child th, tr:first-child td')) + .map((cell) => cell.textContent?.trim() ?? '') + .filter((value) => value.length > 0); + + const annualColumnIndex = headerCells.findIndex((value, index) => index > 0 && value !== 'LTM'); + if (annualColumnIndex < 0) { + throw new Error(`Could not locate latest annual column in headers: ${headerCells.join(' | ')}`); + } + + const rows = Array.from(table.querySelectorAll('tr')) + .slice(1) + .map((row) => { + const cells = Array.from(row.querySelectorAll('td')); + if (cells.length <= annualColumnIndex) { + return null; + } + + const label = cells[0]?.textContent?.trim() ?? ''; + const valueText = cells[annualColumnIndex]?.textContent?.trim() ?? 
''; + if (!label) { + return null; + } + + return { + label, + normalizedLabel: normalizeLabel(label), + valueText, + value: parseDisplayedNumber(valueText) + }; + }) + .filter((entry): entry is FiscalAiTableRow => entry !== null); + + return { + columnLabel: headerCells[annualColumnIndex] ?? 'unknown', + rows + }; + }); +} + +function findFiscalAiRow(rows: FiscalAiTableRow[], candidates: string[]) { + const normalizedCandidates = candidates.map(normalizeLabel); + const benignExtraTokens = new Set(['total', 'net']); + + for (const candidate of normalizedCandidates) { + const exactMatch = rows.find((row) => row.normalizedLabel === candidate); + if (exactMatch) { + return exactMatch; + } + } + + for (const candidate of normalizedCandidates) { + const candidateTokens = candidate.split(' ').filter((token) => token.length > 0); + const relaxedMatch = rows.find((row) => { + const rowTokens = row.normalizedLabel.split(' ').filter((token) => token.length > 0); + const sharedPrefix = row.normalizedLabel.startsWith(candidate) || candidate.startsWith(row.normalizedLabel); + if (!sharedPrefix) { + return false; + } + + const longer = rowTokens.length >= candidateTokens.length ? rowTokens : candidateTokens; + const shorter = rowTokens.length >= candidateTokens.length ? candidateTokens : rowTokens; + const extraTokens = longer.filter((token) => !shorter.includes(token)); + return extraTokens.length > 0 && extraTokens.every((token) => benignExtraTokens.has(token)); + }); + if (relaxedMatch) { + return relaxedMatch; + } + } + + return null; +} + +function compareRow( + target: ComparisonTarget, + result: TaxonomyHydrationResult, + fiscalAiTable: FiscalAiTable +): ComparisonRow { + const fiscalAiRow = findFiscalAiRow(fiscalAiTable.rows, target.fiscalAiLabels); + const fiscalAiValueM = fiscalAiRow?.value ?? 
null; + const ourSurface = findSurfaceValue(result, target.statement, target.surfaceKey); + const ourValueM = roundMillions(ourSurface.value); + const absDiffM = absoluteDiff(ourValueM, fiscalAiValueM); + const relDiffValue = relativeDiff(ourValueM, fiscalAiValueM); + const resolutionMethod = rowResolutionMethod(ourSurface.row as Record | null); + const warningCodes = rowWarningCodes(ourSurface.row as Record | null); + + let status: ComparisonRow['status']; + if ( + ourSurface.row && + ourValueM === null && + target.allowNotMeaningful && + resolutionMethod === 'not_meaningful' && + (target.notMeaningfulWarnings?.length ?? 0) > 0 && + target.notMeaningfulWarnings!.some((warning) => warningCodes.includes(warning)) + ) { + status = 'not_meaningful'; + } else if (!fiscalAiRow) { + status = 'missing_reference'; + } else if (ourValueM === null) { + status = 'missing_ours'; + } else if ( + absDiffM !== null && + relDiffValue !== null && + (absDiffM <= VALUE_TOLERANCE_M || relDiffValue <= RELATIVE_TOLERANCE) + ) { + status = 'pass'; + } else { + status = 'fail'; + } + + return { + statement: target.statement, + surfaceKey: target.surfaceKey, + fiscalAiLabel: fiscalAiRow?.label ?? null, + fiscalAiValueM, + ourValueM, + absDiffM, + relDiff: relDiffValue, + status + }; +} + +async function compareCase(page: import('@playwright/test').Page, company: CompanyCase) { + const filing = await fetchLatestAnnualFiling(company); + const result = await hydrateFilingTaxonomySnapshot(filing); + + if (result.parse_status !== 'ready') { + throw new Error(`${company.ticker} parse_status=${result.parse_status}${result.parse_error ? ` parse_error=${result.parse_error}` : ''}`); + } + + const incomeTable = await scrapeFiscalAiTable(page, company.exchangeTicker, 'income'); + const balanceTable = await scrapeFiscalAiTable(page, company.exchangeTicker, 'balance'); + const rows = company.comparisons.map((target) => { + const table = target.statement === 'income' ? 
incomeTable : balanceTable; + return compareRow(target, result, table); + }); + + const failures = rows.filter((row) => row.status === 'fail' || row.status === 'missing_ours'); + + console.log( + `[compare-fiscal-ai] ${company.ticker} filing=${filing.accessionNumber} fiscal_pack=${result.fiscal_pack ?? 'null'} income_column="${incomeTable.columnLabel}" balance_column="${balanceTable.columnLabel}" pass=${rows.length - failures.length}/${rows.length}` + ); + for (const row of rows) { + console.log( + [ + ' ', + row.status.toUpperCase(), + `${row.statement}.${row.surfaceKey}`, + `fiscal_label=${row.fiscalAiLabel ?? 'null'}`, + `ours_m=${row.ourValueM ?? 'null'}`, + `fiscal_m=${row.fiscalAiValueM ?? 'null'}`, + `abs_diff_m=${row.absDiffM ?? 'null'}`, + `rel_diff=${row.relDiff === null ? 'null' : row.relDiff.toFixed(4)}` + ].join(' ') + ); + } + + return { + ticker: company.ticker, + filing, + fiscalPack: result.fiscal_pack, + rows, + failures + }; +} + +async function main() { + process.env.XBRL_ENGINE_TIMEOUT_MS = process.env.XBRL_ENGINE_TIMEOUT_MS ?? '180000'; + const tickerFilter = parseTickerFilter(process.argv.slice(2)); + const selectedCases = tickerFilter + ? 
CASES.filter((entry) => entry.ticker === tickerFilter) + : CASES; + + if (selectedCases.length === 0) { + console.error(`[compare-fiscal-ai] unknown ticker: ${tickerFilter}`); + process.exitCode = 1; + return; + } + + const browser = await chromium.launch({ headless: false }); + const page = await browser.newPage({ + userAgent: BROWSER_USER_AGENT + }); + + const failures: Array<{ ticker: string; row: ComparisonRow }> = []; + + try { + for (const company of selectedCases) { + const result = await compareCase(page, company); + for (const failure of result.failures) { + failures.push({ + ticker: company.ticker, + row: failure + }); + } + + await Bun.sleep(150); + } + } finally { + await browser.close(); + } + + console.log(`[compare-fiscal-ai] completed cases=${selectedCases.length} failures=${failures.length}`); + + if (failures.length === 0) { + return; + } + + for (const failure of failures) { + console.error( + `[compare-fiscal-ai] ${failure.ticker} ${failure.row.statement}.${failure.row.surfaceKey} status=${failure.row.status} ours_m=${failure.row.ourValueM ?? 'null'} fiscal_m=${failure.row.fiscalAiValueM ?? 'null'} fiscal_label=${failure.row.fiscalAiLabel ?? 'null'}` + ); + } + + process.exitCode = 1; +} + +void main().catch((error) => { + const message = error instanceof Error ? 
error.message : String(error); + console.error(`[compare-fiscal-ai] fatal: ${message}`); + process.exitCode = 1; +}); diff --git a/scripts/report-taxonomy-health.ts b/scripts/report-taxonomy-health.ts new file mode 100644 index 0000000..e154605 --- /dev/null +++ b/scripts/report-taxonomy-health.ts @@ -0,0 +1,200 @@ +import { and, desc, eq, gte, lte } from 'drizzle-orm'; + +import { db } from '@/lib/server/db'; +import { filingTaxonomySnapshot } from '@/lib/server/db/schema'; + +type ScriptOptions = { + ticker: string | null; + from: string | null; + to: string | null; + sampleLimit: number; +}; + +type SnapshotRow = { + filing_id: number; + ticker: string; + filing_date: string; + filing_type: '10-K' | '10-Q'; + parse_status: string; + parse_error: string | null; + parser_engine: string; + parser_version: string; + fiscal_pack: string | null; + normalization_summary: { + warnings?: string[]; + } | null; + updated_at: string; +}; + +function parseOptions(argv: string[]): ScriptOptions { + const options: ScriptOptions = { + ticker: null, + from: null, + to: null, + sampleLimit: 5 + }; + + for (const arg of argv) { + if (arg === '--help' || arg === '-h') { + console.log('Report taxonomy snapshot health from the local database.'); + console.log(''); + console.log('Usage:'); + console.log(' bun run scripts/report-taxonomy-health.ts [--ticker=SYMBOL] [--from=YYYY-MM-DD] [--to=YYYY-MM-DD] [--sample-limit=N]'); + process.exit(0); + } + + if (arg.startsWith('--ticker=')) { + const value = arg.slice('--ticker='.length).trim().toUpperCase(); + options.ticker = value.length > 0 ? value : null; + continue; + } + + if (arg.startsWith('--from=')) { + const value = arg.slice('--from='.length).trim(); + options.from = value.length > 0 ? value : null; + continue; + } + + if (arg.startsWith('--to=')) { + const value = arg.slice('--to='.length).trim(); + options.to = value.length > 0 ? 
value : null; + continue; + } + + if (arg.startsWith('--sample-limit=')) { + const parsed = Number.parseInt(arg.slice('--sample-limit='.length), 10); + if (Number.isFinite(parsed) && parsed > 0) { + options.sampleLimit = parsed; + } + } + } + + return options; +} + +function incrementCount(map: Map, key: string) { + map.set(key, (map.get(key) ?? 0) + 1); +} + +function printCountMap(label: string, counts: Map) { + console.log(`[report-taxonomy-health] ${label}`); + if (counts.size === 0) { + console.log(' (none)'); + return; + } + + for (const [key, count] of [...counts.entries()].sort((left, right) => right[1] - left[1] || left[0].localeCompare(right[0]))) { + console.log(` ${key}=${count}`); + } +} + +function printSamples(label: string, rows: SnapshotRow[]) { + console.log(`[report-taxonomy-health] ${label}`); + if (rows.length === 0) { + console.log(' (none)'); + return; + } + + for (const row of rows) { + const warnings = row.normalization_summary?.warnings ?? []; + console.log( + ` ${row.ticker} ${row.filing_type} ${row.filing_date} filing_id=${row.filing_id} status=${row.parse_status} parser=${row.parser_engine} pack=${row.fiscal_pack ?? 'null'} warnings=${warnings.join(',') || '-'} error=${row.parse_error ?? '-'}` + ); + } +} + +async function loadRows(options: ScriptOptions): Promise { + const conditions = []; + + if (options.ticker) { + conditions.push(eq(filingTaxonomySnapshot.ticker, options.ticker)); + } + + if (options.from) { + conditions.push(gte(filingTaxonomySnapshot.filing_date, options.from)); + } + + if (options.to) { + conditions.push(lte(filingTaxonomySnapshot.filing_date, options.to)); + } + + const whereClause = conditions.length > 0 ? 
and(...conditions) : undefined; + + const baseQuery = db.select({ + filing_id: filingTaxonomySnapshot.filing_id, + ticker: filingTaxonomySnapshot.ticker, + filing_date: filingTaxonomySnapshot.filing_date, + filing_type: filingTaxonomySnapshot.filing_type, + parse_status: filingTaxonomySnapshot.parse_status, + parse_error: filingTaxonomySnapshot.parse_error, + parser_engine: filingTaxonomySnapshot.parser_engine, + parser_version: filingTaxonomySnapshot.parser_version, + fiscal_pack: filingTaxonomySnapshot.fiscal_pack, + normalization_summary: filingTaxonomySnapshot.normalization_summary, + updated_at: filingTaxonomySnapshot.updated_at + }).from(filingTaxonomySnapshot).orderBy(desc(filingTaxonomySnapshot.updated_at)); + + if (whereClause) { + return await baseQuery.where(whereClause); + } + + return await baseQuery; +} + +async function main() { + const options = parseOptions(process.argv.slice(2)); + const rows = await loadRows(options); + + const statusCounts = new Map(); + const parserCounts = new Map(); + const packCounts = new Map(); + const warningCounts = new Map(); + const parserVersionCounts = new Map(); + + for (const row of rows) { + incrementCount(statusCounts, row.parse_status); + incrementCount(parserCounts, row.parser_engine); + incrementCount(parserVersionCounts, row.parser_version); + incrementCount(packCounts, row.fiscal_pack ?? 'null'); + + for (const warning of row.normalization_summary?.warnings ?? []) { + incrementCount(warningCounts, warning); + } + } + + const failedRows = rows + .filter((row) => row.parse_status === 'failed') + .slice(0, options.sampleLimit); + const warningRows = rows + .filter((row) => (row.normalization_summary?.warnings?.length ?? 0) > 0) + .slice(0, options.sampleLimit); + + const legacyCount = rows.filter((row) => row.parser_engine === 'legacy-ts').length; + const deferredCount = rows.filter((row) => (row.normalization_summary?.warnings ?? 
[]).includes('surface_rows_deferred_to_typescript')).length; + const fallbackCount = rows.filter((row) => (row.normalization_summary?.warnings ?? []).includes('ts_compact_surface_fallback_used')).length; + + console.log(`[report-taxonomy-health] snapshots=${rows.length}`); + if (options.ticker) { + console.log(`[report-taxonomy-health] ticker=${options.ticker}`); + } + if (options.from || options.to) { + console.log(`[report-taxonomy-health] range=${options.from ?? 'min'}..${options.to ?? 'max'}`); + } + console.log(`[report-taxonomy-health] legacy_ts=${legacyCount}`); + console.log(`[report-taxonomy-health] deferred_to_typescript=${deferredCount}`); + console.log(`[report-taxonomy-health] ts_compact_surface_fallback=${fallbackCount}`); + + printCountMap('parse_status', statusCounts); + printCountMap('parser_engine', parserCounts); + printCountMap('parser_version', parserVersionCounts); + printCountMap('fiscal_pack', packCounts); + printCountMap('warnings', warningCounts); + printSamples('failed_samples', failedRows); + printSamples('warning_samples', warningRows); +} + +void main().catch((error) => { + const message = error instanceof Error ? 
error.message : String(error); + console.error(`[report-taxonomy-health] fatal: ${message}`); + process.exitCode = 1; +}); diff --git a/scripts/validate-taxonomy-packs.ts b/scripts/validate-taxonomy-packs.ts new file mode 100644 index 0000000..970cd8d --- /dev/null +++ b/scripts/validate-taxonomy-packs.ts @@ -0,0 +1,284 @@ +import type { FinancialStatementKind } from '@/lib/types'; +import { hydrateFilingTaxonomySnapshot } from '@/lib/server/taxonomy/engine'; +import type { TaxonomyHydrationInput } from '@/lib/server/taxonomy/types'; + +type ValidationCase = { + name: string; + expectedPack: string; + input: TaxonomyHydrationInput; + requiredSurfaceKeys: Partial>; + requiredKpiKeys?: string[]; +}; + +type ValidationFailure = { + name: string; + issues: string[]; +}; + +const UNIVERSAL_INCOME_KEYS = [ + 'revenue', + 'gross_profit', + 'operating_expenses', + 'operating_income', + 'income_tax_expense', + 'net_income' +] as const; + +const EXPENSE_BREAKDOWN_KEYS = [ + 'selling_general_and_administrative', + 'research_and_development', + 'other_operating_expense' +] as const; + +const CORPUS: ValidationCase[] = [ + { + name: 'core-msft-2026-01-28', + expectedPack: 'core', + input: { + filingId: 1, + ticker: 'MSFT', + cik: '0000789019', + accessionNumber: '0001193125-26-027207', + filingDate: '2026-01-28', + filingType: '10-Q', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/789019/000119312526027207/', + primaryDocument: 'msft-20251231.htm' + }, + requiredSurfaceKeys: { + income: [...UNIVERSAL_INCOME_KEYS, ...EXPENSE_BREAKDOWN_KEYS], + balance: ['total_assets'] + } + }, + { + name: 'bank-jpm-2026-02-13', + expectedPack: 'bank_lender', + input: { + filingId: 2, + ticker: 'JPM', + cik: '0000019617', + accessionNumber: '0001628280-26-008131', + filingDate: '2026-02-13', + filingType: '10-K', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/19617/000162828026008131/', + primaryDocument: 'jpm-20251231.htm' + }, + requiredSurfaceKeys: { + income: 
[...UNIVERSAL_INCOME_KEYS, ...EXPENSE_BREAKDOWN_KEYS, 'net_interest_income', 'noninterest_income'], + balance: ['loans', 'deposits'] + }, + requiredKpiKeys: ['net_interest_margin'] + }, + { + name: 'insurance-aig-2026-02-12', + expectedPack: 'insurance', + input: { + filingId: 3, + ticker: 'AIG', + cik: '0000005272', + accessionNumber: '0000005272-26-000023', + filingDate: '2026-02-12', + filingType: '10-K', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/5272/000000527226000023/', + primaryDocument: 'aig-20251231.htm' + }, + requiredSurfaceKeys: { + income: [...UNIVERSAL_INCOME_KEYS, ...EXPENSE_BREAKDOWN_KEYS, 'premiums', 'claims_and_benefits'], + balance: ['policy_liabilities'] + }, + requiredKpiKeys: ['combined_ratio'] + }, + { + name: 'reit-o-2026-02-25', + expectedPack: 'reit_real_estate', + input: { + filingId: 4, + ticker: 'O', + cik: '0000726728', + accessionNumber: '0000726728-26-000011', + filingDate: '2026-02-25', + filingType: '10-K', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/726728/000072672826000011/', + primaryDocument: 'o-20251231.htm' + }, + requiredSurfaceKeys: { + income: [...UNIVERSAL_INCOME_KEYS, ...EXPENSE_BREAKDOWN_KEYS, 'rental_revenue'], + balance: ['investment_property', 'total_assets'] + }, + requiredKpiKeys: ['property_count'] + }, + { + name: 'broker-blk-2026-02-25', + expectedPack: 'broker_asset_manager', + input: { + filingId: 5, + ticker: 'BLK', + cik: '0002012383', + accessionNumber: '0001193125-26-071966', + filingDate: '2026-02-25', + filingType: '10-K', + filingUrl: 'https://www.sec.gov/Archives/edgar/data/2012383/000119312526071966/', + primaryDocument: 'blk-20251231.htm' + }, + requiredSurfaceKeys: { + income: [...UNIVERSAL_INCOME_KEYS, ...EXPENSE_BREAKDOWN_KEYS, 'fee_revenue'], + balance: ['total_assets', 'total_liabilities'] + }, + requiredKpiKeys: ['aum', 'fee_paying_aum'] + } +]; + +const FALLBACK_WARNINGS = new Set([ + 'surface_rows_deferred_to_typescript', + 'ts_compact_surface_fallback_used' +]); + 
/**
 * Reads the optional `--case=<name>` filter from the CLI arguments.
 *
 * Arguments are scanned in order. `--help` / `-h` prints usage and exits the
 * process with status 0. The first `--case=` argument ends the scan: its
 * trimmed value is returned, or `null` when that value is empty.
 *
 * @param argv Command-line arguments (callers pass `process.argv.slice(2)`).
 * @returns The requested case name, or `null` when no usable filter was given.
 */
function parseCaseFilter(argv: string[]): string | null {
  for (const arg of argv) {
    if (arg === '--help' || arg === '-h') {
      console.log('Validate live SEC representative filings for each active taxonomy pack.');
      console.log('');
      console.log('Usage:');
      console.log(' bun run scripts/validate-taxonomy-packs.ts');
      console.log(' bun run scripts/validate-taxonomy-packs.ts --case=bank-jpm-2026-02-13');
      process.exit(0);
    }

    if (arg.startsWith('--case=')) {
      const value = arg.slice('--case='.length).trim();
      return value.length > 0 ? value : null;
    }
  }

  return null;
}

/**
 * Lists the surface-row keys a hydration result contains for one statement.
 *
 * @param result Resolved value of `hydrateFilingTaxonomySnapshot`.
 * @param statement Statement whose surface rows are inspected.
 * @returns The `key` of every surface row for `statement`; an empty array when
 *   the result has no entry for that statement.
 */
function keysForStatement(
  result: Awaited<ReturnType<typeof hydrateFilingTaxonomySnapshot>>,
  statement: FinancialStatementKind
): string[] {
  return (result.surface_rows[statement] ?? []).map((row) => row.key);
}

/**
 * Hydrates one corpus filing and checks the result against the case's
 * expectations.
 *
 * Checks performed, each adding one entry to the issue list when violated:
 * - `parse_status` is `'ready'`
 * - `fiscal_pack` equals `testCase.expectedPack`
 * - at least one statement has surface rows
 * - no warning listed in `FALLBACK_WARNINGS` was reported
 * - every key in `requiredSurfaceKeys` is present for its statement
 * - every key in `requiredKpiKeys` is present among the KPI rows
 *
 * A one-line summary (status, pack, keys, warnings, duration) is always logged.
 *
 * @param testCase Corpus entry describing the filing and what it must produce.
 * @returns `null` when every check passes, otherwise the case name with the
 *   collected issues.
 */
async function validateCase(testCase: ValidationCase): Promise<ValidationFailure | null> {
  const startedAt = Date.now();
  const result = await hydrateFilingTaxonomySnapshot(testCase.input);
  const issues: string[] = [];
  const warnings = result.normalization_summary.warnings ?? [];
  const kpiKeys = result.kpi_rows.map((row) => row.key);

  if (result.parse_status !== 'ready') {
    issues.push(`parse_status=${result.parse_status}${result.parse_error ? ` parse_error=${result.parse_error}` : ''}`);
  }

  if (result.fiscal_pack !== testCase.expectedPack) {
    issues.push(`fiscal_pack=${result.fiscal_pack ?? 'null'} expected=${testCase.expectedPack}`);
  }

  // Only `.length` is read below, so the element type of the rows is irrelevant here.
  if ((Object.values(result.surface_rows) as Array<Array<unknown>>).every((rows) => rows.length === 0)) {
    issues.push('surface_rows are empty');
  }

  // Only the first matching fallback warning is reported.
  const fallbackWarning = warnings.find((warning) => FALLBACK_WARNINGS.has(warning));
  if (fallbackWarning) {
    issues.push(`unexpected fallback warning=${fallbackWarning}`);
  }

  for (const [statement, requiredKeys] of Object.entries(testCase.requiredSurfaceKeys) as Array<
    [FinancialStatementKind, string[]]
  >) {
    const actualKeys = new Set(keysForStatement(result, statement));
    for (const requiredKey of requiredKeys) {
      if (!actualKeys.has(requiredKey)) {
        issues.push(`${statement} missing surface key=${requiredKey}`);
      }
    }
  }

  for (const requiredKpiKey of testCase.requiredKpiKeys ?? []) {
    if (!kpiKeys.includes(requiredKpiKey)) {
      issues.push(`missing kpi key=${requiredKpiKey}`);
    }
  }

  const durationMs = Date.now() - startedAt;
  const incomeKeys = keysForStatement(result, 'income');
  const balanceKeys = keysForStatement(result, 'balance');
  console.log(
    [
      `[validate-taxonomy-packs] ${testCase.name}`,
      `status=${issues.length === 0 ? 'pass' : 'fail'}`,
      `parse=${result.parse_status}`,
      `pack=${result.fiscal_pack ?? 'null'}`,
      `income=${incomeKeys.join(',') || '-'}`,
      `balance=${balanceKeys.join(',') || '-'}`,
      `kpis=${kpiKeys.join(',') || '-'}`,
      `warnings=${warnings.join(',') || '-'}`,
      `durationMs=${durationMs}`
    ].join(' ')
  );

  if (issues.length === 0) {
    return null;
  }

  return {
    name: testCase.name,
    issues
  };
}

/**
 * Script entry point: validates every corpus case, or the single case selected
 * with `--case=`, one after another.
 *
 * An exception thrown while validating a case is recorded as that case's only
 * issue, so the remaining cases still run. The process exit code is set to 1
 * when the requested case is unknown or when any case reports issues.
 */
async function main(): Promise<void> {
  // Applies the default only when the variable is unset; an explicit value is kept.
  process.env.XBRL_ENGINE_TIMEOUT_MS = process.env.XBRL_ENGINE_TIMEOUT_MS ?? '180000';

  const requestedCase = parseCaseFilter(process.argv.slice(2));
  const selectedCases = requestedCase
    ? CORPUS.filter((testCase) => testCase.name === requestedCase)
    : CORPUS;

  if (selectedCases.length === 0) {
    console.error(`[validate-taxonomy-packs] unknown case: ${requestedCase}`);
    process.exitCode = 1;
    return;
  }

  const failures: ValidationFailure[] = [];
  const startedAt = Date.now();

  for (const testCase of selectedCases) {
    try {
      const failure = await validateCase(testCase);
      if (failure) {
        failures.push(failure);
      }
    } catch (error) {
      failures.push({
        name: testCase.name,
        issues: [error instanceof Error ? error.message : String(error)]
      });
    }

    // NOTE(review): presumably paces the live requests between cases — confirm.
    await Bun.sleep(150);
  }

  console.log(
    `[validate-taxonomy-packs] completed cases=${selectedCases.length} failures=${failures.length} durationSec=${(
      (Date.now() - startedAt) /
      1000
    ).toFixed(1)}`
  );

  if (failures.length === 0) {
    return;
  }

  for (const failure of failures) {
    console.error(`[validate-taxonomy-packs] ${failure.name}`);
    for (const issue of failure.issues) {
      console.error(` - ${issue}`);
    }
  }

  process.exitCode = 1;
}

// Report anything that escapes main() with the script's log prefix and a
// failing exit code instead of leaving the rejection unhandled.
void main().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`[validate-taxonomy-packs] fatal: ${message}`);
  process.exitCode = 1;
});