import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyNamespaceMap, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types';

/** Named entities that are decoded; `nbsp` deliberately becomes a regular space. */
const XML_NAMED_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' ']
]);

/** Matches one named (`&amp;`), decimal (`&#39;`) or hexadecimal (`&#x27;`) reference. */
const XML_ENTITY_PATTERN = /&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi;

// Attribute extractors shared by both linkbase parsers.
const XLINK_LABEL_ATTR = /\bxlink:label=["']([^"']+)["']/i;
const XLINK_HREF_ATTR = /\bxlink:href=["']([^"']+)["']/i;
const XLINK_ROLE_ATTR = /\bxlink:role=["']([^"']+)["']/i;
const XLINK_FROM_ATTR = /\bxlink:from=["']([^"']+)["']/i;
const XLINK_TO_ATTR = /\bxlink:to=["']([^"']+)["']/i;
const ORDER_ATTR = /\border=["']([^"']+)["']/i;

// Element matchers. They are only ever used through `matchAll`, which works on
// a private copy of the regex, so sharing these global-flag instances is safe.
const LOC_PATTERN = /<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi;
const LABEL_LINK_PATTERN = /<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?labelLink>/gi;
const LABEL_RESOURCE_PATTERN = /<(?:[a-z0-9_\-]+:)?label\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?label>/gi;
const LABEL_ARC_PATTERN = /<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)\/?>/gi;
const PRESENTATION_LINK_PATTERN =
  /<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?presentationLink>/gi;
const PRESENTATION_ARC_PATTERN = /<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)\/?>/gi;

/**
 * Ordered role classifiers: the first pattern that matches the lower-cased
 * role URI wins, so e.g. cash-flow and equity are tested before income.
 */
const STATEMENT_ROLE_PATTERNS: ReadonlyArray<readonly [RegExp, FinancialStatementKind]> = [
  [/cash\s*flow|statementsof?cashflows|netcash/, 'cash_flow'],
  [/shareholders?|stockholders?|equity|retainedearnings/, 'equity'],
  [/comprehensive\s*income/, 'comprehensive_income'],
  [/balance\s*sheet|financial\s*position|assets?andliabilities/, 'balance'],
  [/operations|income\s*statement|statementsofincome|profit/, 'income']
];

/**
 * Decodes the XML entities found in label text and trims the result.
 *
 * Handles the five predefined XML entities, `&nbsp;`, and decimal/hexadecimal
 * character references. Decoding happens in a single pass, so the output of
 * one replacement is never decoded again (`&amp;lt;` yields `&lt;`, not `<`).
 * Unknown or out-of-range references are left untouched.
 *
 * @param value Raw text taken from between XML tags.
 * @returns The decoded, trimmed text.
 */
function decodeXmlEntities(value: string) {
  return value
    .replace(XML_ENTITY_PATTERN, (entity: string, body: string) => {
      const key = body.toLowerCase();
      if (!key.startsWith('#')) {
        return XML_NAMED_ENTITIES.get(key) ?? entity;
      }

      const codePoint = key.startsWith('#x')
        ? Number.parseInt(key.slice(2), 16)
        : Number.parseInt(key.slice(1), 10);
      if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff) {
        return entity;
      }

      // A non-breaking space is normalised to a plain space, like `&nbsp;`.
      return codePoint === 160 ? ' ' : String.fromCodePoint(codePoint);
    })
    .trim();
}

/**
 * Returns the trimmed first capture group of `pattern` within `attrs`,
 * or an empty string when the attribute is absent.
 */
function readAttribute(attrs: string, pattern: RegExp): string {
  return attrs.match(pattern)?.[1]?.trim() ?? '';
}

/**
 * Collects `xmlns:prefix="uri"` declarations from the linkbase root element.
 *
 * The root is taken to be the first tag whose text contains `linkbase`; when
 * no such tag exists the first 1200 characters of the document are scanned.
 *
 * @param raw Full linkbase document text.
 * @returns Map of namespace prefix to namespace URI.
 */
function parseNamespaceMap(raw: string): TaxonomyNamespaceMap {
  const map: TaxonomyNamespaceMap = {};
  const rootStart = raw.match(/<[^>]*linkbase[^>]*>/i)?.[0] ?? raw.slice(0, 1200);

  for (const match of rootStart.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) {
    const prefix = (match[1] ?? '').trim();
    const uri = (match[2] ?? '').trim();
    if (prefix && uri) {
      map[prefix] = uri;
    }
  }

  return map;
}

/**
 * Derives a `prefix:localName` QName from a locator `xlink:href`.
 *
 * Uses the part after the first `#` (or the whole href when there is none),
 * strips a leading `loc_` marker, and then either returns the value as-is when
 * it already contains `:` or turns its first `_` into `:`.
 *
 * @param href Value of an `xlink:href` attribute.
 * @returns The QName, or `null` when none can be derived.
 */
function qnameFromHref(href: string) {
  const hashIndex = href.indexOf('#');
  const fragment = hashIndex >= 0 ? href.slice(hashIndex + 1) : href;
  if (!fragment) {
    return null;
  }

  const cleaned = fragment.trim().replace(/^loc_+/i, '');
  if (!cleaned) {
    return null;
  }

  if (cleaned.includes(':')) {
    return cleaned;
  }

  const separatorIndex = cleaned.indexOf('_');
  if (separatorIndex >= 0) {
    return `${cleaned.slice(0, separatorIndex)}:${cleaned.slice(separatorIndex + 1)}`;
  }

  return null;
}

/**
 * Resolves a QName against the namespace map.
 *
 * @param qname `prefix:localName` string.
 * @param namespaces Prefix-to-URI map from the linkbase root.
 * @returns The concept identity, keyed as `namespaceUri#localName`; prefixes
 *   missing from `namespaces` resolve to `urn:unknown:<prefix>`. Returns
 *   `null` when either the prefix or the local name is empty.
 */
function conceptFromQName(qname: string, namespaces: TaxonomyNamespaceMap) {
  const [prefix, ...rest] = qname.split(':');
  const localName = rest.join(':');
  if (!prefix || !localName) {
    return null;
  }

  const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`;
  return {
    qname,
    namespaceUri,
    localName,
    conceptKey: `${namespaceUri}#${localName}`
  };
}

/**
 * Ranks a label role; a higher number is preferred.
 *
 * `/label` = 4, `/terseLabel` = 3, `/verboseLabel` = 2, any other role = 1,
 * missing or empty role = 0. Matching is case-insensitive on the role suffix.
 */
function labelPriority(role: string | null) {
  const normalized = (role ?? '').toLowerCase();
  if (!normalized) {
    return 0;
  }
  if (normalized.endsWith('/label')) {
    return 4;
  }
  if (normalized.endsWith('/terselabel')) {
    return 3;
  }
  if (normalized.endsWith('/verboselabel')) {
    return 2;
  }
  return 1;
}

/**
 * Classifies a presentation role URI as one of the financial statement kinds
 * by keyword matching on the lower-cased URI.
 *
 * @param roleUri Role URI of an extended link.
 * @returns The statement kind of the first matching pattern, or `null` when
 *   no pattern matches.
 */
export function classifyStatementRole(roleUri: string): FinancialStatementKind | null {
  const normalized = roleUri.toLowerCase();
  for (const [pattern, kind] of STATEMENT_ROLE_PATTERNS) {
    if (pattern.test(normalized)) {
      return kind;
    }
  }
  return null;
}

/**
 * Extracts the preferred human-readable label for every concept in a label
 * linkbase.
 *
 * Inside each `labelLink`, locators are resolved to concept keys, label
 * resources are collected by `xlink:label`, and `labelArc` elements join the
 * two. When a concept has several candidate labels the one with the highest
 * {@link labelPriority} wins; on ties the earliest one is kept.
 *
 * @param raw Full label linkbase document text.
 * @returns Map of concept key (`namespaceUri#localName`) to label text.
 */
export function parseLabelLinkbase(raw: string): Map<string, string> {
  const namespaces = parseNamespaceMap(raw);
  const preferredLabelByConcept = new Map<string, { text: string; priority: number }>();

  for (const linkMatch of raw.matchAll(LABEL_LINK_PATTERN)) {
    const block = linkMatch[1] ?? '';
    const conceptKeyByLocLabel = new Map<string, string>();
    const bestResourceByLabel = new Map<string, { text: string; priority: number }>();

    for (const locMatch of block.matchAll(LOC_PATTERN)) {
      const attrs = locMatch[1] ?? '';
      const label = readAttribute(attrs, XLINK_LABEL_ATTR);
      const href = readAttribute(attrs, XLINK_HREF_ATTR);
      if (!label || !href) {
        continue;
      }

      const qname = qnameFromHref(href);
      const concept = qname ? conceptFromQName(qname, namespaces) : null;
      if (concept) {
        conceptKeyByLocLabel.set(label, concept.conceptKey);
      }
    }

    for (const resourceMatch of block.matchAll(LABEL_RESOURCE_PATTERN)) {
      const attrs = resourceMatch[1] ?? '';
      const text = decodeXmlEntities(resourceMatch[2] ?? '').replace(/\s+/g, ' ').trim();
      if (!text) {
        continue;
      }

      const resourceLabel = readAttribute(attrs, XLINK_LABEL_ATTR);
      if (!resourceLabel) {
        continue;
      }

      // Several resources may share one xlink:label (standard, terse,
      // documentation, ...). Keep the best-ranked one instead of letting the
      // last resource in document order overwrite the others.
      const priority = labelPriority(readAttribute(attrs, XLINK_ROLE_ATTR) || null);
      const existing = bestResourceByLabel.get(resourceLabel);
      if (!existing || priority > existing.priority) {
        bestResourceByLabel.set(resourceLabel, { text, priority });
      }
    }

    for (const arcMatch of block.matchAll(LABEL_ARC_PATTERN)) {
      const attrs = arcMatch[1] ?? '';
      const from = readAttribute(attrs, XLINK_FROM_ATTR);
      const to = readAttribute(attrs, XLINK_TO_ATTR);
      if (!from || !to) {
        continue;
      }

      const conceptKey = conceptKeyByLocLabel.get(from);
      const resource = bestResourceByLabel.get(to);
      if (!conceptKey || !resource) {
        continue;
      }

      const current = preferredLabelByConcept.get(conceptKey);
      if (!current || resource.priority > current.priority) {
        preferredLabelByConcept.set(conceptKey, resource);
      }
    }
  }

  const labels = new Map<string, string>();
  for (const [conceptKey, preferred] of preferredLabelByConcept) {
    labels.set(conceptKey, preferred.text);
  }
  return labels;
}

/**
 * Flattens every `presentationLink` of a presentation linkbase into rows in
 * depth-first order.
 *
 * For each link that has an `xlink:role`, locators are resolved to concepts,
 * `presentationArc` elements build the parent/child graph (arcs whose ends are
 * not known locators are ignored), and the tree is walked from every label
 * that is never an arc target. Children are visited in ascending `order`.
 *
 * Each row carries the role URI, depth (roots are 0), the parent's concept key
 * and an `order` value: roots get 1, 2, 3, ... and each child gets its
 * parent's order plus `(position) / 1000`.
 * NOTE(review): with that formula a grandchild's `order` can equal that of one
 * of its parent's siblings, so `order` alone is not a unique sort key; the
 * array order of the returned rows is the depth-first order.
 *
 * `isAbstract` is a heuristic: it is true when the local name contains
 * "abstract" (case-insensitive).
 *
 * @param raw Full presentation linkbase document text.
 * @returns One row per visited (parent, concept, depth) position.
 */
export function parsePresentationLinkbase(raw: string): TaxonomyPresentationConcept[] {
  const namespaces = parseNamespaceMap(raw);
  const rows: TaxonomyPresentationConcept[] = [];

  for (const linkMatch of raw.matchAll(PRESENTATION_LINK_PATTERN)) {
    const linkAttrs = linkMatch[1] ?? '';
    const block = linkMatch[2] ?? '';
    const roleUri = readAttribute(linkAttrs, XLINK_ROLE_ATTR);
    if (!roleUri) {
      continue;
    }

    const locByLabel = new Map<string, { conceptKey: string; qname: string; isAbstract: boolean }>();
    for (const locMatch of block.matchAll(LOC_PATTERN)) {
      const attrs = locMatch[1] ?? '';
      const label = readAttribute(attrs, XLINK_LABEL_ATTR);
      const href = readAttribute(attrs, XLINK_HREF_ATTR);
      if (!label || !href) {
        continue;
      }

      const qname = qnameFromHref(href);
      const concept = qname ? conceptFromQName(qname, namespaces) : null;
      if (!concept) {
        continue;
      }

      locByLabel.set(label, {
        conceptKey: concept.conceptKey,
        qname: concept.qname,
        isAbstract: /abstract/i.test(concept.localName)
      });
    }

    const childrenByLabel = new Map<string, Array<{ label: string; order: number }>>();
    const incoming = new Set<string>();
    const allReferenced = new Set<string>();

    for (const arcMatch of block.matchAll(PRESENTATION_ARC_PATTERN)) {
      const attrs = arcMatch[1] ?? '';
      const from = readAttribute(attrs, XLINK_FROM_ATTR);
      const to = readAttribute(attrs, XLINK_TO_ATTR);
      if (!from || !to || !locByLabel.has(from) || !locByLabel.has(to)) {
        continue;
      }

      const order = Number.parseFloat(readAttribute(attrs, ORDER_ATTR));
      const group = childrenByLabel.get(from) ?? [];
      // Arcs without a usable `order` fall back to their 1-based position.
      group.push({ label: to, order: Number.isFinite(order) ? order : group.length + 1 });
      childrenByLabel.set(from, group);

      incoming.add(to);
      allReferenced.add(from);
      allReferenced.add(to);
    }

    const roots = [...allReferenced].filter((label) => !incoming.has(label));
    const visited = new Set<string>();
    // Labels on the current recursion path. The `visited` key contains the
    // depth and therefore never repeats along a cycle, so cycles have to be
    // detected separately or the walk would recurse without end.
    const activePath = new Set<string>();

    const walk = (label: string, depth: number, parentLabel: string | null, baseOrder: number): void => {
      const node = locByLabel.get(label);
      if (!node || activePath.has(label)) {
        return;
      }

      // Skips duplicate arcs that lead to the same position in the tree.
      const pathKey = `${parentLabel ?? 'root'}::${label}::${depth}`;
      if (visited.has(pathKey)) {
        return;
      }
      visited.add(pathKey);

      const parentConceptKey = parentLabel ? (locByLabel.get(parentLabel)?.conceptKey ?? null) : null;
      rows.push({
        conceptKey: node.conceptKey,
        qname: node.qname,
        roleUri,
        order: baseOrder,
        depth,
        parentConceptKey,
        isAbstract: node.isAbstract
      });

      const children = [...(childrenByLabel.get(label) ?? [])].sort((left, right) => left.order - right.order);

      activePath.add(label);
      children.forEach((child, index) => {
        walk(child.label, depth + 1, label, baseOrder + (index + 1) / 1000);
      });
      activePath.delete(label);
    };

    roots.forEach((root, index) => {
      walk(root, 0, null, index + 1);
    });
  }

  return rows;
}