Files
Neon-Desk/lib/server/taxonomy/linkbase-parser.ts
2026-03-06 14:40:43 -05:00

311 lines
8.9 KiB
TypeScript

import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyNamespaceMap, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types';
/**
 * Decodes XML character and entity references in `value` and trims the result.
 *
 * Handles the five predefined XML entities (`amp`, `lt`, `gt`, `quot`, `apos`),
 * `&nbsp;`, and decimal / hexadecimal numeric character references. Entity
 * names are matched case-insensitively. Non-breaking spaces (`&nbsp;`,
 * `&#160;`, `&#xA0;`) become a regular space.
 *
 * Decoding happens in a single pass, so text produced by one reference is never
 * re-interpreted as the start of another (`&amp;lt;` yields `&lt;`, not `<`).
 * Unknown names and out-of-range code points are left untouched.
 *
 * @param value Raw text that may contain entity references.
 * @returns The decoded text with leading/trailing whitespace removed.
 */
function decodeXmlEntities(value: string): string {
  const decoded = value.replace(
    /&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi,
    (reference: string, body: string): string => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws on values outside the Unicode range, so
        // leave such references as literal text instead of crashing.
        if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff) {
          return reference;
        }
        return codePoint === 0xa0 ? ' ' : String.fromCodePoint(codePoint);
      }

      switch (body.toLowerCase()) {
        case 'amp':
          return '&';
        case 'lt':
          return '<';
        case 'gt':
          return '>';
        case 'quot':
          return '"';
        case 'apos':
          return "'";
        case 'nbsp':
          return ' ';
        default:
          return reference;
      }
    }
  );

  return decoded.trim();
}
/**
 * Collects `xmlns:prefix="uri"` declarations into a prefix -> URI map.
 *
 * Only the first tag whose text contains "linkbase" is scanned; when no such
 * tag exists, the first 1200 characters of the document are scanned instead.
 * Declarations whose URI is blank after trimming are skipped, and a later
 * declaration of the same prefix replaces an earlier one.
 *
 * @param raw Full linkbase document text.
 * @returns Map of namespace prefix to namespace URI.
 */
function parseNamespaceMap(raw: string): TaxonomyNamespaceMap {
  const rootTag = raw.match(/<[^>]*linkbase[^>]*>/i);
  const searchArea = rootTag?.[0] ?? raw.slice(0, 1200);
  const declarations: TaxonomyNamespaceMap = {};
  const declarationPattern = /xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g;

  for (const [, rawPrefix, rawUri] of searchArea.matchAll(declarationPattern)) {
    const prefix = (rawPrefix ?? '').trim();
    const uri = (rawUri ?? '').trim();
    if (prefix && uri) {
      declarations[prefix] = uri;
    }
  }

  return declarations;
}
/**
 * Derives a `prefix:localName` QName from a locator href.
 *
 * Uses the part after the first `#` (or the whole href when there is no `#`),
 * trims it, and strips a leading `loc_` marker. A value that already contains
 * `:` is returned as-is; otherwise the first `_` is treated as the separator
 * between prefix and local name.
 *
 * @param href Value of an `xlink:href` attribute.
 * @returns The QName, or `null` when none can be derived.
 */
function qnameFromHref(href: string) {
  const hashAt = href.indexOf('#');
  const fragment = hashAt === -1 ? href : href.slice(hashAt + 1);
  const candidate = fragment.trim().replace(/^loc_+/i, '');

  if (candidate === '') {
    return null;
  }
  if (candidate.includes(':')) {
    return candidate;
  }

  const underscoreAt = candidate.indexOf('_');
  if (underscoreAt === -1) {
    return null;
  }
  const prefixPart = candidate.slice(0, underscoreAt);
  const localPart = candidate.slice(underscoreAt + 1);
  return `${prefixPart}:${localPart}`;
}
/**
 * Resolves a `prefix:localName` QName into a namespace-qualified concept.
 *
 * The prefix is everything before the first `:`; the local name is everything
 * after it. Prefixes that are not declared in `namespaces` resolve to the
 * placeholder URI `urn:unknown:<prefix>`.
 *
 * @param qname QName of the form `prefix:localName`.
 * @param namespaces Prefix -> namespace URI map.
 * @returns The concept descriptor, or `null` when the prefix or local name is empty.
 */
function conceptFromQName(qname: string, namespaces: TaxonomyNamespaceMap) {
  const [prefix, ...rest] = qname.split(':');
  const localName = rest.join(':');
  if (!prefix || !localName) {
    return null;
  }

  // Only honour prefixes declared on the map itself. A plain index lookup would
  // also return inherited Object.prototype members (e.g. "constructor",
  // "toString") and yield a non-string namespace URI.
  const declaredUri = Object.prototype.hasOwnProperty.call(namespaces, prefix)
    ? namespaces[prefix]
    : undefined;
  const namespaceUri = declaredUri ?? `urn:unknown:${prefix}`;

  return {
    qname,
    namespaceUri,
    localName,
    conceptKey: `${namespaceUri}#${localName}`
  };
}
/**
 * Ranks a label role so that competing labels for a concept can be compared.
 *
 * The role is matched case-insensitively by suffix: `/label` ranks 4,
 * `/terselabel` 3, `/verboselabel` 2, any other non-empty role 1, and a
 * missing or empty role 0.
 *
 * @param role Value of the label's `xlink:role` attribute, or `null` if absent.
 * @returns The rank; higher is preferred.
 */
function labelPriority(role: string | null) {
  if (!role) {
    return 0;
  }

  const lowered = role.toLowerCase();
  const suffixRanks = [
    ['/label', 4],
    ['/terselabel', 3],
    ['/verboselabel', 2]
  ] as const;

  for (const [suffix, rank] of suffixRanks) {
    if (lowered.endsWith(suffix)) {
      return rank;
    }
  }
  return 1;
}
/**
 * Guesses which financial statement a presentation role URI belongs to.
 *
 * The lower-cased URI is tested against keyword patterns in a fixed order and
 * the first match wins: cash flow, equity, comprehensive income, balance
 * sheet, then income statement. The order matters when a URI contains keywords
 * from more than one group.
 *
 * @param roleUri Role URI taken from a presentation link.
 * @returns The matching statement kind, or `null` when no pattern matches.
 */
export function classifyStatementRole(roleUri: string): FinancialStatementKind | null {
  const haystack = roleUri.toLowerCase();

  // Ordered from highest to lowest precedence.
  const rules: ReadonlyArray<readonly [RegExp, FinancialStatementKind]> = [
    [/cash\s*flow|statementsof?cashflows|netcash/, 'cash_flow'],
    [/shareholders?|stockholders?|equity|retainedearnings/, 'equity'],
    [/comprehensive\s*income/, 'comprehensive_income'],
    [/balance\s*sheet|financial\s*position|assets?andliabilities/, 'balance'],
    [/operations|income\s*statement|statementsofincome|profit/, 'income']
  ];

  const hit = rules.find(([pattern]) => pattern.test(haystack));
  return hit ? hit[1] : null;
}
/**
 * Extracts one preferred label per concept from a label linkbase document.
 *
 * For every `labelLink` block this:
 *   1. maps each `loc` element's `xlink:label` to a concept key derived from
 *      its `xlink:href`;
 *   2. collects `label` resources by their `xlink:label`, keeping every
 *      resource that shares a label, since one arc addresses all of them;
 *   3. follows each `labelArc` from a locator to its label resources and keeps
 *      the text whose role ranks highest according to `labelPriority`.
 *
 * When two candidates rank equally, the one encountered first is kept.
 * Parsing is regex-based and tolerant: elements with missing attributes or
 * unresolvable hrefs are skipped rather than reported.
 *
 * @param raw Full text of the label linkbase XML.
 * @returns Map from concept key (`namespaceUri#localName`) to label text.
 */
export function parseLabelLinkbase(raw: string): Map<string, string> {
  const namespaces = parseNamespaceMap(raw);
  const preferredLabelByConcept = new Map<string, { text: string; priority: number }>();
  const linkPattern = /<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?labelLink>/gi;

  for (const linkMatch of raw.matchAll(linkPattern)) {
    const block = linkMatch[1] ?? '';
    const locByLabel = new Map<string, string>();
    // Several label resources (standard, terse, verbose, ...) may share one
    // xlink:label, so keep them all instead of letting the last one win.
    const resourcesByLabel = new Map<string, Array<{ text: string; role: string | null }>>();

    for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) {
      const attrs = locMatch[1] ?? '';
      const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!label || !href) {
        continue;
      }

      const qname = qnameFromHref(href);
      if (!qname) {
        continue;
      }

      const concept = conceptFromQName(qname, namespaces);
      if (!concept) {
        continue;
      }

      locByLabel.set(label, concept.conceptKey);
    }

    for (const resourceMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?label\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?label>/gi)) {
      const attrs = resourceMatch[1] ?? '';
      // Collapse internal whitespace so multi-line labels become a single line.
      const body = decodeXmlEntities(resourceMatch[2] ?? '').replace(/\s+/g, ' ').trim();
      if (!body) {
        continue;
      }

      const resourceLabel = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const role = attrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? null;
      if (!resourceLabel) {
        continue;
      }

      const group = resourcesByLabel.get(resourceLabel) ?? [];
      group.push({ text: body, role });
      resourcesByLabel.set(resourceLabel, group);
    }

    for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)\/?>/gi)) {
      const attrs = arcMatch[1] ?? '';
      const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!from || !to) {
        continue;
      }

      const conceptKey = locByLabel.get(from);
      const resources = resourcesByLabel.get(to);
      if (!conceptKey || !resources) {
        continue;
      }

      // The arc reaches every resource carrying the `to` label; rank them all.
      for (const resource of resources) {
        const priority = labelPriority(resource.role);
        const current = preferredLabelByConcept.get(conceptKey);
        if (!current || priority > current.priority) {
          preferredLabelByConcept.set(conceptKey, {
            text: resource.text,
            priority
          });
        }
      }
    }
  }

  const labels = new Map<string, string>();
  for (const [conceptKey, preferred] of preferredLabelByConcept) {
    labels.set(conceptKey, preferred.text);
  }
  return labels;
}
/**
 * Flattens the presentation trees of a presentation linkbase into ordered rows.
 *
 * For every `presentationLink` block that carries an `xlink:role`, this:
 *   1. maps each `loc` element's `xlink:label` to its concept (key, QName and
 *      an `isAbstract` flag set when the local name contains "abstract");
 *   2. builds a parent -> children adjacency list from `presentationArc`
 *      elements whose two ends both resolve to known locators;
 *   3. walks depth-first from every label that has no incoming arc, emitting
 *      one row per visited node in traversal order.
 *
 * Row `order` is `i + 1` for the i-th root; a child at sorted position `i`
 * gets its parent's order plus `(i + 1) / 1000`. Arcs without a numeric
 * `order` attribute are placed by their position among the parent's children.
 *
 * Arcs that would revisit a label already on the current root-to-node path are
 * ignored, so a cyclic linkbase cannot cause unbounded recursion. Components
 * in which every label has an incoming arc have no root and produce no rows.
 *
 * @param raw Full text of the presentation linkbase XML.
 * @returns Presentation rows for all roles, in document and traversal order.
 */
export function parsePresentationLinkbase(raw: string): TaxonomyPresentationConcept[] {
  const namespaces = parseNamespaceMap(raw);
  const rows: TaxonomyPresentationConcept[] = [];
  const linkPattern = /<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?presentationLink>/gi;

  for (const linkMatch of raw.matchAll(linkPattern)) {
    const linkAttrs = linkMatch[1] ?? '';
    const block = linkMatch[2] ?? '';
    const roleUri = linkAttrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
    if (!roleUri) {
      continue;
    }

    const locByLabel = new Map<string, { conceptKey: string; qname: string; isAbstract: boolean }>();
    for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) {
      const attrs = locMatch[1] ?? '';
      const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!label || !href) {
        continue;
      }

      const qname = qnameFromHref(href);
      if (!qname) {
        continue;
      }

      const concept = conceptFromQName(qname, namespaces);
      if (!concept) {
        continue;
      }

      locByLabel.set(label, {
        conceptKey: concept.conceptKey,
        qname: concept.qname,
        isAbstract: /abstract/i.test(concept.localName)
      });
    }

    const childrenByLabel = new Map<string, Array<{ label: string; order: number }>>();
    const incoming = new Set<string>();
    const allReferenced = new Set<string>();
    for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)\/?>/gi)) {
      const attrs = arcMatch[1] ?? '';
      const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const orderRaw = attrs.match(/\border=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const order = Number.parseFloat(orderRaw);
      if (!from || !to || !locByLabel.has(from) || !locByLabel.has(to)) {
        continue;
      }

      const group = childrenByLabel.get(from) ?? [];
      // Fall back to insertion position when the arc has no usable order.
      group.push({ label: to, order: Number.isFinite(order) ? order : group.length + 1 });
      childrenByLabel.set(from, group);
      incoming.add(to);
      allReferenced.add(from);
      allReferenced.add(to);
    }

    // Roots are labels that appear in arcs but are never an arc target.
    const roots = [...allReferenced].filter((label) => !incoming.has(label));
    // De-duplicates repeated arcs: same parent, child and depth is emitted once.
    const visited = new Set<string>();
    // Labels on the current root-to-node path, used to break directed cycles.
    const activePath = new Set<string>();

    function dfs(label: string, depth: number, parentLabel: string | null, baseOrder: number) {
      const node = locByLabel.get(label);
      if (!node) {
        return;
      }

      // `visited` keys include the depth, so they never repeat along a cycle;
      // without this check a cyclic arc set would recurse until stack overflow.
      if (activePath.has(label)) {
        return;
      }

      const pathKey = `${parentLabel ?? 'root'}::${label}::${depth}`;
      if (visited.has(pathKey)) {
        return;
      }
      visited.add(pathKey);

      const parentConceptKey = parentLabel ? (locByLabel.get(parentLabel)?.conceptKey ?? null) : null;
      rows.push({
        conceptKey: node.conceptKey,
        qname: node.qname,
        roleUri,
        order: baseOrder,
        depth,
        parentConceptKey,
        isAbstract: node.isAbstract
      });

      const children = [...(childrenByLabel.get(label) ?? [])].sort((left, right) => left.order - right.order);
      activePath.add(label);
      for (let i = 0; i < children.length; i += 1) {
        const child = children[i];
        if (!child) {
          continue;
        }

        dfs(child.label, depth + 1, label, baseOrder + (i + 1) / 1000);
      }
      activePath.delete(label);
    }

    for (let i = 0; i < roots.length; i += 1) {
      const root = roots[i];
      if (!root) {
        continue;
      }

      dfs(root, 0, null, i + 1);
    }
  }

  return rows;
}