refactor(taxonomy): remove legacy parser and add rollout checks

This commit is contained in:
2026-03-12 15:25:06 -04:00
parent 58061af006
commit c274f4d55b
11 changed files with 1344 additions and 697 deletions

View File

@@ -0,0 +1,86 @@
import { beforeEach, describe, expect, it, mock } from 'bun:test';
import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
/**
 * Builds a record holding one factory-produced value for each of the five
 * statement keys (`income`, `balance`, `cash_flow`, `equity`,
 * `comprehensive_income`).
 *
 * The factory is invoked once per key, so each key receives its own value
 * instead of a shared reference.
 *
 * @param factory Producer called once for every statement key.
 * @returns The populated record.
 */
function createStatementRecord<T>(factory: () => T): Record<FinancialStatementKind, T> {
  // Evaluate in the same order the keys appear in the returned object.
  const income = factory();
  const balance = factory();
  const cashFlow = factory();
  const equity = factory();
  const comprehensiveIncome = factory();

  return {
    income,
    balance,
    cash_flow: cashFlow,
    equity,
    comprehensive_income: comprehensiveIncome
  };
}
/**
 * Creates the hydration fixture that the mocked sidecar resolves with.
 *
 * Every collection is empty and every counter is zero; the only
 * distinguishing payload is the single `rust_warning` entry in the
 * normalization summary, which the test asserts on.
 */
function createHydrationResult(): TaxonomyHydrationResult {
  const normalizationSummary: TaxonomyHydrationResult['normalization_summary'] = {
    surfaceRowCount: 0,
    detailRowCount: 0,
    kpiRowCount: 0,
    unmappedRowCount: 0,
    materialUnmappedRowCount: 0,
    warnings: ['rust_warning']
  };

  const fixture: TaxonomyHydrationResult = {
    // Filing identity.
    filing_id: 1,
    ticker: 'TEST',
    filing_date: '2025-12-31',
    filing_type: '10-K',

    // Parser outcome and provenance.
    parse_status: 'ready',
    parse_error: null,
    source: 'xbrl_instance_with_linkbase',
    parser_engine: 'fiscal-xbrl',
    parser_version: '0.1.0',
    taxonomy_regime: 'us-gaap',
    fiscal_pack: 'core',

    // Statement payloads, all empty.
    periods: [],
    faithful_rows: createStatementRecord(() => []),
    statement_rows: createStatementRecord(() => []),
    surface_rows: createStatementRecord(() => []),
    detail_rows: createStatementRecord(() => ({})),
    kpi_rows: [],
    contexts: [],
    derived_metrics: null,
    validation_result: null,

    // Counters and raw collections.
    facts_count: 0,
    concepts_count: 0,
    dimensions_count: 0,
    assets: [],
    concepts: [],
    facts: [],
    metric_validations: [],

    normalization_summary: normalizationSummary
  };

  return fixture;
}
// Mock standing in for the sidecar hydration call; each invocation resolves with a freshly built fixture.
const mockHydrateFromSidecar = mock(async () => createHydrationResult());
// Register a replacement for the parser-client module whose hydrateFilingTaxonomySnapshotFromSidecar
// export is the mock above.
mock.module('@/lib/server/taxonomy/parser-client', () => ({
hydrateFilingTaxonomySnapshotFromSidecar: mockHydrateFromSidecar
}));
// Verifies that the engine entry point hands back what the (mocked) sidecar produced.
describe('taxonomy engine rust path', () => {
beforeEach(() => {
// Reset recorded calls so the call-count assertion below only sees this test's invocation.
mockHydrateFromSidecar.mockClear();
});
it('returns sidecar output directly from the Rust sidecar', async () => {
// The engine is imported dynamically inside the test, i.e. after mock.module has been registered above.
const { hydrateFilingTaxonomySnapshot } = await import('@/lib/server/taxonomy/engine');
const input: TaxonomyHydrationInput = {
filingId: 1,
ticker: 'TEST',
cik: '0000000001',
accessionNumber: '0000000001-25-000001',
filingDate: '2025-12-31',
filingType: '10-K',
filingUrl: 'https://www.sec.gov/Archives/edgar/data/1/000000000125000001/',
primaryDocument: 'test-20251231.htm'
};
const result = await hydrateFilingTaxonomySnapshot(input);
// Exactly one sidecar call, and the fixture's engine name and warning come through unchanged.
expect(mockHydrateFromSidecar).toHaveBeenCalledTimes(1);
expect(result.parser_engine).toBe('fiscal-xbrl');
expect(result.normalization_summary.warnings).toEqual(['rust_warning']);
});
});

View File

@@ -1,63 +0,0 @@
import { describe, expect, it } from 'bun:test';
import {
classifyStatementRole,
parseLabelLinkbase,
parsePresentationLinkbase
} from '@/lib/server/taxonomy/linkbase-parser';
// Label linkbase fixture: one locator for us-gaap_Revenues with a terse label ("Rev.") and a
// standard label ("Revenues"), each connected to the locator by its own labelArc.
const SAMPLE_LABEL_LINKBASE = `
<link:linkbase xmlns:link="http://www.xbrl.org/2003/linkbase"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:us-gaap="http://fasb.org/us-gaap/2024">
<link:labelLink xlink:type="extended">
<link:loc xlink:type="locator" xlink:label="loc_rev" xlink:href="test.xsd#us-gaap_Revenues" />
<link:label xlink:type="resource" xlink:label="lab_terse" xlink:role="http://www.xbrl.org/2003/role/terseLabel">Rev.</link:label>
<link:label xlink:type="resource" xlink:label="lab_label" xlink:role="http://www.xbrl.org/2003/role/label">Revenues</link:label>
<link:labelArc xlink:type="arc" xlink:from="loc_rev" xlink:to="lab_terse" />
<link:labelArc xlink:type="arc" xlink:from="loc_rev" xlink:to="lab_label" />
</link:labelLink>
</link:linkbase>
`;
// Presentation linkbase fixture: a single StatementOfOperations role whose root locator
// (StatementLineItems) has two ordered children, Revenues (order 1) and CostOfGoodsSold (order 2).
const SAMPLE_PRESENTATION_LINKBASE = `
<link:linkbase xmlns:link="http://www.xbrl.org/2003/linkbase"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:us-gaap="http://fasb.org/us-gaap/2024">
<link:presentationLink xlink:type="extended" xlink:role="http://www.xbrl.org/2003/role/StatementOfOperations">
<link:loc xlink:type="locator" xlink:label="root" xlink:href="test.xsd#us-gaap_StatementLineItems" />
<link:loc xlink:type="locator" xlink:label="rev" xlink:href="test.xsd#us-gaap_Revenues" />
<link:loc xlink:type="locator" xlink:label="cogs" xlink:href="test.xsd#us-gaap_CostOfGoodsSold" />
<link:presentationArc xlink:type="arc" xlink:from="root" xlink:to="rev" order="1" />
<link:presentationArc xlink:type="arc" xlink:from="root" xlink:to="cogs" order="2" />
</link:presentationLink>
</link:linkbase>
`;
// Exercises the label parser, the presentation parser and the role classifier
// against the two inline linkbase fixtures.
describe('linkbase parser', () => {
  it('builds preferred labels from label linkbase', () => {
    const labelsByConcept = parseLabelLinkbase(SAMPLE_LABEL_LINKBASE);
    const revenueLabel = labelsByConcept.get('http://fasb.org/us-gaap/2024#Revenues');

    // The standard label outranks the terse "Rev." label.
    expect(revenueLabel).toBe('Revenues');
  });

  it('builds role trees with depth/order/parent metadata', () => {
    const concepts = parsePresentationLinkbase(SAMPLE_PRESENTATION_LINKBASE);
    const byQName = (qname: string) => concepts.find((entry) => entry.qname === qname);

    expect(concepts.length).toBe(3);

    const lineItems = byQName('us-gaap:StatementLineItems');
    const revenues = byQName('us-gaap:Revenues');
    const costOfGoods = byQName('us-gaap:CostOfGoodsSold');

    expect(lineItems?.depth).toBe(0);
    expect(lineItems?.parentConceptKey).toBeNull();
    expect(revenues?.depth).toBe(1);
    expect(costOfGoods?.depth).toBe(1);
    expect(revenues?.parentConceptKey).toBe(lineItems?.conceptKey ?? null);
    expect(revenues?.order).toBeLessThan(costOfGoods?.order ?? Number.POSITIVE_INFINITY);
  });

  it('classifies statement roles into canonical statement kinds', () => {
    const incomeKind = classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfOperations');
    const balanceKind = classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfFinancialPosition');
    const cashFlowKind = classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfCashFlows');

    expect(incomeKind).toBe('income');
    expect(balanceKind).toBe('balance');
    expect(cashFlowKind).toBe('cash_flow');
  });
});

View File

@@ -1,310 +0,0 @@
import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyNamespaceMap, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types';
/**
 * Decodes the small set of XML entities this parser cares about and trims the
 * result.
 *
 * Decoding happens in a single pass so that already-escaped text is not decoded
 * twice: `&amp;lt;` yields the literal text `&lt;`, not `<`. Non-breaking spaces
 * (`&nbsp;` / `&#160;`) become regular spaces. Entities outside the handled set
 * are left untouched.
 *
 * @param value Raw text taken from the linkbase document.
 * @returns The decoded, trimmed text.
 */
function decodeXmlEntities(value: string) {
  const replacements = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['#39', "'"],
    ['#160', ' '],
    ['nbsp', ' ']
  ]);

  return value
    .replace(
      /&(amp|lt|gt|quot|apos|#39|#160|nbsp);/gi,
      (match: string, name: string) => replacements.get(name.toLowerCase()) ?? match
    )
    .trim();
}
/**
 * Collects `xmlns:prefix="uri"` declarations from a linkbase document.
 *
 * Only the first tag whose text contains "linkbase" is scanned; when no such
 * tag exists, the first 1200 characters of the document are scanned instead.
 *
 * @param raw Linkbase document text.
 * @returns Map of namespace prefix to namespace URI.
 */
function parseNamespaceMap(raw: string): TaxonomyNamespaceMap {
  const openingTag = raw.match(/<[^>]*linkbase[^>]*>/i);
  const scanTarget = openingTag?.[0] ?? raw.slice(0, 1200);
  const declarations = scanTarget.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g);

  const namespaces: TaxonomyNamespaceMap = {};
  for (const [, rawPrefix = '', rawUri = ''] of declarations) {
    const prefix = rawPrefix.trim();
    const uri = rawUri.trim();
    // Skip declarations whose prefix or URI is blank after trimming.
    if (prefix && uri) {
      namespaces[prefix] = uri;
    }
  }
  return namespaces;
}
/**
 * Derives a `prefix:localName` QName from a locator href.
 *
 * The text after the first `#` (or the whole href when there is none) is
 * trimmed and stripped of a leading `loc_` marker. A value already containing
 * `:` is returned as is; otherwise the first `_` is treated as the
 * prefix/local-name separator.
 *
 * @param href Value of an `xlink:href` attribute.
 * @returns The QName, or `null` when none can be derived.
 */
function qnameFromHref(href: string) {
  const hashAt = href.indexOf('#');
  const fragment = hashAt === -1 ? href : href.slice(hashAt + 1);
  const candidate = fragment.trim().replace(/^loc_+/i, '');

  if (candidate === '') {
    return null;
  }
  if (candidate.includes(':')) {
    return candidate;
  }

  const underscoreAt = candidate.indexOf('_');
  return underscoreAt === -1
    ? null
    : `${candidate.slice(0, underscoreAt)}:${candidate.slice(underscoreAt + 1)}`;
}
/**
 * Resolves a QName into its namespace URI, local name and concept key.
 *
 * The prefix is everything before the first `:`; the local name is everything
 * after it. Prefixes missing from `namespaces` resolve to
 * `urn:unknown:<prefix>`.
 *
 * @param qname QName in `prefix:localName` form.
 * @param namespaces Prefix-to-URI map used to resolve the prefix.
 * @returns The resolved concept, or `null` when prefix or local name is empty.
 */
function conceptFromQName(qname: string, namespaces: TaxonomyNamespaceMap) {
  const colonAt = qname.indexOf(':');
  // No colon means there is no local name; a leading colon means no prefix.
  if (colonAt <= 0) {
    return null;
  }

  const prefix = qname.slice(0, colonAt);
  const localName = qname.slice(colonAt + 1);
  if (localName === '') {
    return null;
  }

  const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`;
  const conceptKey = `${namespaceUri}#${localName}`;
  return { qname, namespaceUri, localName, conceptKey };
}
/**
 * Ranks a label role so the most useful label wins when several exist.
 *
 * Standard label (4) > terse label (3) > verbose label (2) > any other role
 * (1) > missing role (0). Matching is case-insensitive on the role suffix.
 *
 * @param role Value of the label's `xlink:role`, or `null` when absent.
 * @returns The priority; higher is preferred.
 */
function labelPriority(role: string | null) {
  const roleUri = (role ?? '').toLowerCase();
  if (roleUri === '') {
    return 0;
  }

  const rankedSuffixes: Array<[string, number]> = [
    ['/label', 4],
    ['/terselabel', 3],
    ['/verboselabel', 2]
  ];
  for (const [suffix, rank] of rankedSuffixes) {
    if (roleUri.endsWith(suffix)) {
      return rank;
    }
  }
  return 1;
}
/**
 * Maps a presentation role URI onto a canonical statement kind using
 * case-insensitive keyword matching.
 *
 * Checks run in a fixed order (cash flow, equity, comprehensive income,
 * balance sheet, income statement) and the first match wins, so a role naming
 * both "income" and "comprehensive income" is classified as comprehensive
 * income.
 *
 * Both the plural ("StatementsOfIncome") and the singular ("StatementOfIncome")
 * role spellings are recognised as income statements.
 *
 * @param roleUri Role URI taken from a presentation link.
 * @returns The statement kind, or `null` when no keyword matches.
 */
export function classifyStatementRole(roleUri: string): FinancialStatementKind | null {
  const normalized = roleUri.toLowerCase();

  if (/cash\s*flow|statementsof?cashflows|netcash/.test(normalized)) {
    return 'cash_flow';
  }
  if (/shareholders?|stockholders?|equity|retainedearnings/.test(normalized)) {
    return 'equity';
  }
  if (/comprehensive\s*income/.test(normalized)) {
    return 'comprehensive_income';
  }
  if (/balance\s*sheet|financial\s*position|assets?andliabilities/.test(normalized)) {
    return 'balance';
  }
  // `statements?ofincome` accepts the singular form as well as the plural one.
  if (/operations|income\s*statement|statements?ofincome|profit/.test(normalized)) {
    return 'income';
  }
  return null;
}
/**
 * Extracts the preferred human-readable label for each concept in a label
 * linkbase.
 *
 * For every `labelLink` block the locators, label resources and label arcs are
 * collected; each arc then offers its target labels to the concept behind its
 * source locator. The label whose role ranks highest (see `labelPriority`)
 * wins; on a tie the label seen first is kept.
 *
 * Several label resources may share one `xlink:label`. All of them are kept
 * and considered when an arc targets that label, so the priority ranking
 * decides the winner rather than document order.
 *
 * @param raw Label linkbase document text.
 * @returns Map from concept key (`namespaceUri#localName`) to label text.
 */
export function parseLabelLinkbase(raw: string): Map<string, string> {
  const namespaces = parseNamespaceMap(raw);
  const preferredLabelByConcept = new Map<string, { text: string; priority: number }>();
  const linkPattern = /<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?labelLink>/gi;

  for (const linkMatch of raw.matchAll(linkPattern)) {
    const block = linkMatch[1] ?? '';
    const locByLabel = new Map<string, string>();
    const resourcesByLabel = new Map<string, Array<{ text: string; role: string | null }>>();

    // Locators: xlink:label -> concept key.
    for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) {
      const attrs = locMatch[1] ?? '';
      const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!label || !href) {
        continue;
      }

      const qname = qnameFromHref(href);
      if (!qname) {
        continue;
      }

      const concept = conceptFromQName(qname, namespaces);
      if (!concept) {
        continue;
      }

      locByLabel.set(label, concept.conceptKey);
    }

    // Label resources: xlink:label -> every resource declared under that label.
    for (const resourceMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?label\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?label>/gi)) {
      const attrs = resourceMatch[1] ?? '';
      const body = decodeXmlEntities(resourceMatch[2] ?? '').replace(/\s+/g, ' ').trim();
      if (!body) {
        continue;
      }

      const resourceLabel = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const role = attrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? null;
      if (!resourceLabel) {
        continue;
      }

      const siblings = resourcesByLabel.get(resourceLabel) ?? [];
      siblings.push({ text: body, role });
      resourcesByLabel.set(resourceLabel, siblings);
    }

    // Arcs: offer each target resource to the source concept, keeping the best-ranked one.
    for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)\/?>/gi)) {
      const attrs = arcMatch[1] ?? '';
      const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!from || !to) {
        continue;
      }

      const conceptKey = locByLabel.get(from);
      const resources = resourcesByLabel.get(to);
      if (!conceptKey || !resources) {
        continue;
      }

      for (const resource of resources) {
        const priority = labelPriority(resource.role);
        const current = preferredLabelByConcept.get(conceptKey);
        if (!current || priority > current.priority) {
          preferredLabelByConcept.set(conceptKey, {
            text: resource.text,
            priority
          });
        }
      }
    }
  }

  const labels = new Map<string, string>();
  for (const [conceptKey, preferred] of preferredLabelByConcept) {
    labels.set(conceptKey, preferred.text);
  }
  return labels;
}
/**
 * Flattens every `presentationLink` in a linkbase document into rows, one per
 * concept occurrence, produced by a depth-first walk from each root.
 *
 * Within a link, locators are resolved to concepts, arcs are grouped by their
 * source label, and roots are the referenced labels that no arc points to.
 * Children are visited in ascending arc `order`; an arc without a numeric
 * order falls back to its position among its siblings.
 *
 * Each row's `order` is the parent's order plus `(childIndex + 1) / 1000`, with
 * roots numbered 1, 2, ... in the order they were discovered.
 *
 * A label already on the current walk path is never re-entered, so a directed
 * cycle in the arcs (for example `a -> b -> a`) ends the walk at that point
 * instead of recursing until the stack overflows.
 *
 * @param raw Presentation linkbase document text.
 * @returns Rows for all presentation links; links without `xlink:role` are skipped.
 */
export function parsePresentationLinkbase(raw: string): TaxonomyPresentationConcept[] {
  const namespaces = parseNamespaceMap(raw);
  const rows: TaxonomyPresentationConcept[] = [];
  const linkPattern = /<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?presentationLink>/gi;

  for (const linkMatch of raw.matchAll(linkPattern)) {
    const linkAttrs = linkMatch[1] ?? '';
    const block = linkMatch[2] ?? '';
    const roleUri = linkAttrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
    if (!roleUri) {
      continue;
    }

    // Locators: xlink:label -> resolved concept.
    const locByLabel = new Map<string, { conceptKey: string; qname: string; isAbstract: boolean }>();
    for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) {
      const attrs = locMatch[1] ?? '';
      const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!label || !href) {
        continue;
      }

      const qname = qnameFromHref(href);
      if (!qname) {
        continue;
      }

      const concept = conceptFromQName(qname, namespaces);
      if (!concept) {
        continue;
      }

      locByLabel.set(label, {
        conceptKey: concept.conceptKey,
        qname: concept.qname,
        // Abstractness is inferred from the local name containing "abstract".
        isAbstract: /abstract/i.test(concept.localName)
      });
    }

    // Arcs: group children by parent label and remember which labels have a parent.
    const childrenByLabel = new Map<string, Array<{ label: string; order: number }>>();
    const incoming = new Set<string>();
    const allReferenced = new Set<string>();
    for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)\/?>/gi)) {
      const attrs = arcMatch[1] ?? '';
      const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const orderRaw = attrs.match(/\border=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const order = Number.parseFloat(orderRaw);
      if (!from || !to || !locByLabel.has(from) || !locByLabel.has(to)) {
        continue;
      }

      const group = childrenByLabel.get(from) ?? [];
      group.push({ label: to, order: Number.isFinite(order) ? order : group.length + 1 });
      childrenByLabel.set(from, group);
      incoming.add(to);
      allReferenced.add(from);
      allReferenced.add(to);
    }

    const roots = [...allReferenced].filter((label) => !incoming.has(label));
    // Dedupes repeated (parent, label, depth) visits, e.g. from duplicated arcs.
    const visited = new Set<string>();
    // Labels on the current recursion path; guards against directed cycles.
    const activePath = new Set<string>();

    function dfs(label: string, depth: number, parentLabel: string | null, baseOrder: number) {
      const node = locByLabel.get(label);
      if (!node) {
        return;
      }

      if (activePath.has(label)) {
        return;
      }

      const pathKey = `${parentLabel ?? 'root'}::${label}::${depth}`;
      if (visited.has(pathKey)) {
        return;
      }
      visited.add(pathKey);

      const parentConceptKey = parentLabel ? (locByLabel.get(parentLabel)?.conceptKey ?? null) : null;
      rows.push({
        conceptKey: node.conceptKey,
        qname: node.qname,
        roleUri,
        order: baseOrder,
        depth,
        parentConceptKey,
        isAbstract: node.isAbstract
      });

      activePath.add(label);
      const children = [...(childrenByLabel.get(label) ?? [])].sort((left, right) => left.order - right.order);
      for (let i = 0; i < children.length; i += 1) {
        const child = children[i];
        if (!child) {
          continue;
        }
        dfs(child.label, depth + 1, label, baseOrder + (i + 1) / 1000);
      }
      activePath.delete(label);
    }

    for (let i = 0; i < roots.length; i += 1) {
      const root = roots[i];
      if (!root) {
        continue;
      }
      dfs(root, 0, null, i + 1);
    }
  }

  return rows;
}

View File

@@ -1,60 +0,0 @@
import { describe, expect, it } from 'bun:test';
import { parseXbrlInstance } from '@/lib/server/taxonomy/xbrl-parser';
// XBRL instance fixture: a duration context (c1), an instant context with one explicit
// dimension member (c2), a USD unit (u1), two numeric facts (Revenues on c1, Assets on c2)
// and one text fact (EntityRegistrantName).
const SAMPLE_XBRL = `
<xbrli:xbrl xmlns:xbrli="http://www.xbrl.org/2003/instance"
xmlns:xbrldi="http://xbrl.org/2006/xbrldi"
xmlns:us-gaap="http://fasb.org/us-gaap/2024"
xmlns:dei="http://xbrl.sec.gov/dei/2024">
<xbrli:context id="c1">
<xbrli:period>
<xbrli:startDate>2025-01-01</xbrli:startDate>
<xbrli:endDate>2025-12-31</xbrli:endDate>
</xbrli:period>
</xbrli:context>
<xbrli:context id="c2">
<xbrli:entity>
<xbrli:segment>
<xbrldi:explicitMember dimension="us-gaap:StatementBusinessSegmentsAxis">us-gaap:ConsolidatedGroupMember</xbrldi:explicitMember>
</xbrli:segment>
</xbrli:entity>
<xbrli:period>
<xbrli:instant>2025-12-31</xbrli:instant>
</xbrli:period>
</xbrli:context>
<xbrli:unit id="u1">
<xbrli:measure>iso4217:USD</xbrli:measure>
</xbrli:unit>
<us-gaap:Revenues contextRef="c1" unitRef="u1" decimals="-6">1,234</us-gaap:Revenues>
<us-gaap:Assets contextRef="c2" unitRef="u1" decimals="-6">5,678</us-gaap:Assets>
<dei:EntityRegistrantName contextRef="c1">Acme Corp</dei:EntityRegistrantName>
</xbrli:xbrl>
`;
// End-to-end check of the instance parser against the inline XBRL fixture.
describe('xbrl instance parser', () => {
  it('parses contexts, units, numeric facts, dimensions, and concept keys', () => {
    const { contexts, units, facts } = parseXbrlInstance(SAMPLE_XBRL, 'abc_htm.xml');
    const factNamed = (localName: string) => facts.find((entry) => entry.localName === localName);

    // Contexts: duration on c1, instant plus one dimension on c2.
    expect(contexts.c1?.periodStart).toBe('2025-01-01');
    expect(contexts.c1?.periodEnd).toBe('2025-12-31');
    expect(contexts.c2?.periodInstant).toBe('2025-12-31');
    expect(contexts.c2?.dimensions.length).toBe(1);

    expect(units.u1?.measure).toBe('iso4217:USD');

    // Only the two numeric facts are kept; the registrant name is text.
    expect(facts.length).toBe(2);

    const revenues = factNamed('Revenues');
    const assets = factNamed('Assets');

    expect(revenues?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Revenues');
    expect(revenues?.isDimensionless).toBe(true);
    expect(revenues?.value).toBe(1234);
    expect(revenues?.sourceFile).toBe('abc_htm.xml');

    expect(assets?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Assets');
    expect(assets?.isDimensionless).toBe(false);
    expect(assets?.dimensions[0]).toEqual({
      axis: 'us-gaap:StatementBusinessSegmentsAxis',
      member: 'us-gaap:ConsolidatedGroupMember'
    });
  });
});

View File

@@ -1,264 +0,0 @@
import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyContext, TaxonomyFact, TaxonomyNamespaceMap, TaxonomyUnit } from '@/lib/server/taxonomy/types';
/**
 * Decodes XML entities and numeric character references in a single pass.
 *
 * Handles the predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`), `&nbsp;` / `&#160;` (both become a regular space) and decimal or
 * hexadecimal character references. A reference naming an invalid code point
 * becomes a single space; unrecognised named entities are left untouched.
 *
 * Because each `&...;` sequence is decoded exactly once, already-escaped text
 * is not decoded twice: `&amp;lt;` yields the literal text `&lt;`, not `<`.
 *
 * @param value Raw text taken from the instance document.
 * @returns The decoded text (not trimmed).
 */
function decodeXmlEntities(value: string) {
  // A Map rather than an object literal so names such as "constructor" cannot hit the prototype.
  const literals = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['#39', "'"],
    ['#160', ' '],
    ['nbsp', ' ']
  ]);

  return value.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (match: string, body: string) => {
    const key = body.toLowerCase();

    const literal = literals.get(key);
    if (literal !== undefined) {
      return literal;
    }
    if (!key.startsWith('#')) {
      // Unknown named entity: keep the original text.
      return match;
    }

    const isHex = key.startsWith('#x');
    const parsed = Number.parseInt(key.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    if (!Number.isFinite(parsed)) {
      return ' ';
    }
    try {
      return String.fromCodePoint(parsed);
    } catch {
      // fromCodePoint throws a RangeError for values outside the Unicode range.
      return ' ';
    }
  });
}
/**
 * Parses the text of a fact into a number, or returns `null` when the text is
 * not numeric.
 *
 * Accepted decorations: thousands separators, `$`, whitespace, embedded markup
 * tags, the Unicode minus sign, and accounting-style parentheses (a value fully
 * wrapped in `(...)` is negative). A run of two or more dashes is a
 * placeholder and yields `null`.
 *
 * After the decorations are stripped, the whole remainder must be a number.
 * Text that merely starts with digits, such as the date `2025-12-31` or the
 * form type `10-K`, is rejected rather than truncated to its leading digits.
 *
 * @param value Fact text.
 * @returns The finite numeric value, or `null`.
 */
function parseNumber(value: string) {
  const trimmed = value.trim();
  if (!trimmed || /^--+$/.test(trimmed)) {
    return null;
  }

  const negative = trimmed.startsWith('(') && trimmed.endsWith(')');
  const normalized = trimmed
    .replace(/<[^>]+>/g, ' ')
    .replace(/[,$\s]/g, '')
    .replace(/[()]/g, '')
    .replace(/\u2212/g, '-');

  // Require the entire string to be numeric; parseFloat alone would accept "2025-12-31" as 2025.
  if (!/^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i.test(normalized)) {
    return null;
  }

  const parsed = Number(normalized);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return negative ? -Math.abs(parsed) : parsed;
}
/**
 * Collects `xmlns:prefix="uri"` declarations from an XBRL instance document.
 *
 * Only the first tag whose text contains "xbrl" is scanned; when no such tag
 * exists, the first 1200 characters of the document are scanned instead.
 *
 * @param raw Instance document text.
 * @returns Map of namespace prefix to namespace URI.
 */
function parseNamespaceMapFromDocument(raw: string): TaxonomyNamespaceMap {
  const rootTag = raw.match(/<[^>]*xbrl[^>]*>/i)?.[0];
  const haystack = rootTag ?? raw.slice(0, 1200);

  const prefixToUri: TaxonomyNamespaceMap = {};
  for (const declaration of haystack.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) {
    const [, rawPrefix = '', rawUri = ''] = declaration;
    const prefix = rawPrefix.trim();
    const uri = rawUri.trim();
    if (prefix === '' || uri === '') {
      continue;
    }
    prefixToUri[prefix] = uri;
  }
  return prefixToUri;
}
/**
 * Extracts every `context` element that carries an `id` attribute.
 *
 * For each context the first `startDate`, `endDate` and `instant` values are
 * captured (each `null` when absent), together with all `explicitMember`
 * dimension/member pairs found in the context body.
 *
 * @param raw Instance document text.
 * @returns Contexts keyed by their `id`; a later duplicate id replaces an earlier one.
 */
function parseContexts(raw: string): Record<string, TaxonomyContext> {
  const contextPattern = /<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?context>/gi;
  const startDatePattern = /<(?:[a-z0-9_\-]+:)?startDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?startDate>/i;
  const endDatePattern = /<(?:[a-z0-9_\-]+:)?endDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?endDate>/i;
  const instantPattern = /<(?:[a-z0-9_\-]+:)?instant>([^<]+)<\/(?:[a-z0-9_\-]+:)?instant>/i;
  const dimPattern = /<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>([^<]+)<\/(?:[a-z0-9_\-]+:)?explicitMember>/gi;

  const byId: Record<string, TaxonomyContext> = {};

  for (const [, rawId = '', body = ''] of raw.matchAll(contextPattern)) {
    const id = rawId.trim();
    if (id === '') {
      continue;
    }

    const dimensions: Array<{ axis: string; member: string }> = [];
    for (const [, rawAxis = '', rawMember = ''] of body.matchAll(dimPattern)) {
      const axis = decodeXmlEntities(rawAxis.trim());
      const member = decodeXmlEntities(rawMember.trim());
      if (axis && member) {
        dimensions.push({ axis, member });
      }
    }

    byId[id] = {
      id,
      periodStart: body.match(startDatePattern)?.[1]?.trim() ?? null,
      periodEnd: body.match(endDatePattern)?.[1]?.trim() ?? null,
      periodInstant: body.match(instantPattern)?.[1]?.trim() ?? null,
      dimensions
    };
  }

  return byId;
}
/**
 * Extracts every `unit` element that carries an `id` attribute.
 *
 * A unit's measure is the text of its `measure` child; when a unit contains
 * several measures they are joined with `/` in document order. A unit without
 * any non-empty measure gets `measure: null`.
 *
 * @param raw Instance document text.
 * @returns Units keyed by their `id`.
 */
function parseUnits(raw: string): Record<string, TaxonomyUnit> {
  const unitPattern = /<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?unit>/gi;
  const byId: Record<string, TaxonomyUnit> = {};

  for (const [, rawId = '', body = ''] of raw.matchAll(unitPattern)) {
    const id = rawId.trim();
    if (id === '') {
      continue;
    }

    const measures: string[] = [];
    for (const [, rawMeasure = ''] of body.matchAll(/<(?:[a-z0-9_\-]+:)?measure>([^<]+)<\/(?:[a-z0-9_\-]+:)?measure>/gi)) {
      const decoded = decodeXmlEntities(rawMeasure.trim());
      if (decoded) {
        measures.push(decoded);
      }
    }

    // Joining a single measure returns it unchanged, so one expression covers both cases.
    const measure: string | null = measures.length === 0 ? null : measures.join('/');
    byId[id] = { id, measure };
  }

  return byId;
}
/**
 * Guesses which statement a concept belongs to from keywords in its local
 * name (case-insensitive).
 *
 * Rules are tried in order and the first match wins: cash flow, equity,
 * comprehensive income, balance sheet, income statement.
 *
 * @param localName Concept local name.
 * @returns The guessed statement kind, or `null` when no keyword matches.
 */
function classifyStatementKind(localName: string): FinancialStatementKind | null {
  const lowered = localName.toLowerCase();

  const rules: Array<[RegExp, FinancialStatementKind]> = [
    [/cash|operatingactivities|investingactivities|financingactivities/, 'cash_flow'],
    [/equity|retainedearnings|additionalpaidincapital/, 'equity'],
    [/comprehensiveincome/, 'comprehensive_income'],
    [/asset|liabilit|debt/, 'balance'],
    [/revenue|income|profit|expense|costof/, 'income']
  ];

  for (const [pattern, kind] of rules) {
    if (pattern.test(lowered)) {
      return kind;
    }
  }
  return null;
}
/**
 * Tells whether a namespace prefix is one of the XBRL plumbing prefixes
 * (`xbrli`, `xlink`, `link`, `xbrldi`, `xbrldt`), compared case-insensitively.
 *
 * @param prefix Namespace prefix of an element.
 * @returns `true` when the prefix is one of the listed prefixes.
 */
function isXbrlInfrastructurePrefix(prefix: string) {
  const infrastructurePrefixes = ['xbrli', 'xlink', 'link', 'xbrldi', 'xbrldt'];
  return infrastructurePrefixes.includes(prefix.toLowerCase());
}
/**
 * Builds a concept key of the form `namespaceUri#localName`.
 *
 * @param namespaceUri Namespace URI of the concept.
 * @param localName Local name of the concept.
 * @returns The combined key.
 */
function localNameToKey(namespaceUri: string, localName: string) {
  return [namespaceUri, localName].join('#');
}
/**
 * Parses an XBRL instance document into namespaces, contexts, units and
 * numeric facts.
 *
 * A fact is any prefixed element with a `contextRef` attribute whose prefix is
 * not an XBRL infrastructure prefix and whose body parses as a number;
 * everything else is skipped. Each fact is enriched with the period and
 * dimensions of its context and, when the referenced unit has a measure, with
 * that measure in place of the raw `unitRef`.
 *
 * @param raw Instance document text.
 * @param sourceFile Name recorded on every fact as its origin, or `null`.
 * @returns The namespace map, contexts, units and extracted facts.
 */
export function parseXbrlInstance(
  raw: string,
  sourceFile: string | null
): {
  namespaces: TaxonomyNamespaceMap;
  contexts: Record<string, TaxonomyContext>;
  units: Record<string, TaxonomyUnit>;
  facts: TaxonomyFact[];
} {
  const namespaces = parseNamespaceMapFromDocument(raw);
  const contexts = parseContexts(raw);
  const units = parseUnits(raw);

  // Reads the first capture group of an attribute pattern, trimmed; undefined when absent.
  const readAttribute = (attrs: string, pattern: RegExp) => attrs.match(pattern)?.[1]?.trim();

  const factPattern = /<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>([\s\S]*?)<\/\1:\2>/g;
  const facts: TaxonomyFact[] = [];

  for (const [, rawPrefix = '', rawLocalName = '', attrs = '', rawBody = ''] of raw.matchAll(factPattern)) {
    const prefix = rawPrefix.trim();
    const localName = rawLocalName.trim();
    if (prefix === '' || localName === '' || isXbrlInfrastructurePrefix(prefix)) {
      continue;
    }

    const contextId = readAttribute(attrs, /\bcontextRef=["']([^"']+)["']/i) ?? '';
    if (contextId === '') {
      continue;
    }

    // Non-numeric facts are dropped here.
    const value = parseNumber(decodeXmlEntities(rawBody.trim()));
    if (value === null) {
      continue;
    }

    const unitRef = readAttribute(attrs, /\bunitRef=["']([^"']+)["']/i) ?? null;
    const decimals = readAttribute(attrs, /\bdecimals=["']([^"']+)["']/i) ?? null;
    const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`;
    const context = contexts[contextId];
    // Prefer the unit's measure; fall back to the raw reference when the unit is unknown or has none.
    const unit = unitRef ? (units[unitRef]?.measure || unitRef) : unitRef;

    facts.push({
      conceptKey: localNameToKey(namespaceUri, localName),
      qname: `${prefix}:${localName}`,
      namespaceUri,
      localName,
      contextId,
      unit,
      decimals,
      value,
      periodStart: context?.periodStart ?? null,
      periodEnd: context?.periodEnd ?? null,
      periodInstant: context?.periodInstant ?? null,
      dimensions: context?.dimensions ?? [],
      isDimensionless: (context?.dimensions.length ?? 0) === 0,
      sourceFile,
    });
  }

  return { namespaces, contexts, units, facts };
}
/**
 * Public entry point for the keyword-based statement guess; delegates to
 * `classifyStatementKind` and returns its result unchanged.
 *
 * @param localName Concept local name.
 * @returns Whatever `classifyStatementKind` returns for the name.
 */
export function conceptStatementFallback(localName: string) {
  const statementKind = classifyStatementKind(localName);
  return statementKind;
}