Files
Neon-Desk/lib/server/taxonomy/xbrl-parser.ts
2026-03-06 14:40:43 -05:00

265 lines
7.3 KiB
TypeScript

import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyContext, TaxonomyFact, TaxonomyNamespaceMap, TaxonomyUnit } from '@/lib/server/taxonomy/types';
/**
 * Replacement text for the named entity references understood by
 * `decodeXmlEntities`. Keys are lower-case; lookups are case-insensitive.
 */
const XML_NAMED_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  // Non-breaking spaces are flattened to a plain space.
  ['nbsp', ' ']
]);

/**
 * Decodes XML/HTML entity and character references in `value`.
 *
 * Decoding happens in a single pass, so text produced by one reference is
 * never re-interpreted as another reference (`&amp;lt;` becomes `&lt;`, not `<`).
 *
 * - Named references: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`
 *   (case-insensitive). Unknown names are left untouched.
 * - Numeric references: decimal (`&#65;`) and hexadecimal (`&#x41;`).
 *   `&#160;` yields a plain space. References that do not denote a valid
 *   code point are replaced by a single space.
 *
 * @param value Text that may contain entity references.
 * @returns The text with recognised references replaced.
 */
function decodeXmlEntities(value: string) {
  return value.replace(
    /&(?:#x([0-9a-f]+)|#([0-9]+)|([a-z]+));/gi,
    (reference: string, hex?: string, decimal?: string, name?: string) => {
      if (name !== undefined) {
        return XML_NAMED_ENTITIES.get(name.toLowerCase()) ?? reference;
      }

      // Same normalisation as `&nbsp;`: a non-breaking space becomes a plain space.
      if (decimal === '160') {
        return ' ';
      }

      const codePoint = hex !== undefined
        ? Number.parseInt(hex, 16)
        : Number.parseInt(decimal ?? '', 10);

      // Extremely long digit runs parse to Infinity.
      if (!Number.isFinite(codePoint)) {
        return ' ';
      }

      try {
        return String.fromCodePoint(codePoint);
      } catch {
        // fromCodePoint throws a RangeError above U+10FFFF.
        return ' ';
      }
    }
  );
}
/**
 * Parses the text content of a fact into a finite number.
 *
 * Markup tags, whitespace, `,` and `$` are stripped before parsing. A value
 * wrapped in parentheses is treated as negative, and the Unicode minus sign
 * (U+2212) is accepted. The remaining text must be a complete decimal number
 * (optionally with an exponent); anything else, such as `10-K` or
 * `2023-12-31`, is rejected rather than parsed by its leading digits.
 *
 * @param value Raw fact text.
 * @returns The parsed number, or `null` when the text is empty, a run of
 *   dashes, or not a finite number.
 */
function parseNumber(value: string) {
  const trimmed = value.trim();
  // Empty bodies and "--" style placeholders carry no value.
  if (!trimmed || /^--+$/.test(trimmed)) {
    return null;
  }

  // Accounting notation: "(1,234)" means -1234.
  const negative = trimmed.startsWith('(') && trimmed.endsWith(')');

  const normalized = trimmed
    .replace(/<[^>]+>/g, ' ')
    .replace(/[,$\s]/g, '')
    .replace(/[()]/g, '')
    .replace(/\u2212/g, '-');

  // parseFloat accepts any numeric prefix ("10-K" -> 10), so require the
  // whole string to be numeric before converting.
  if (!/^[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:e[+-]?\d+)?$/i.test(normalized)) {
    return null;
  }

  const parsed = Number.parseFloat(normalized);
  if (!Number.isFinite(parsed)) {
    return null;
  }

  return negative ? -Math.abs(parsed) : parsed;
}
/**
 * Collects the `xmlns:prefix="uri"` declarations from the document's root tag.
 *
 * The root tag is taken to be the first start tag whose text contains "xbrl"
 * (case-insensitive). Comments, processing instructions, DOCTYPE declarations
 * and closing tags are skipped, so a leading `<!-- XBRL ... -->` comment
 * cannot be mistaken for the root element. When no such tag exists, the first
 * 1200 characters of the document are scanned instead.
 *
 * @param raw Full text of the XBRL document.
 * @returns Map from namespace prefix to namespace URI. The default
 *   (un-prefixed) `xmlns` declaration is not included.
 */
function parseNamespaceMapFromDocument(raw: string): TaxonomyNamespaceMap {
  const map: TaxonomyNamespaceMap = {};
  // (?![!?\/]) rejects "<!--", "<!DOCTYPE", "<?xml" and "</...".
  const rootStart = raw.match(/<(?![!?\/])[^>]*xbrl[^>]*>/i)?.[0] ?? raw.slice(0, 1200);

  for (const match of rootStart.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) {
    const prefix = (match[1] ?? '').trim();
    const uri = (match[2] ?? '').trim();
    if (!prefix || !uri) {
      continue;
    }

    map[prefix] = uri;
  }

  return map;
}
/**
 * Extracts every `<context>` element (with or without a namespace prefix)
 * that carries an `id` attribute.
 *
 * For each context the first `startDate`, `endDate` and `instant` text values
 * are recorded (or `null` when absent), together with all `explicitMember`
 * entries as `{ axis, member }` pairs, where `axis` is the `dimension`
 * attribute and `member` is the element text, both entity-decoded.
 *
 * @param raw Full text of the XBRL document.
 * @returns Contexts keyed by their trimmed `id`; a later context with the
 *   same id replaces an earlier one.
 */
function parseContexts(raw: string): Record<string, TaxonomyContext> {
  const contextElement = /<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?context>/gi;
  const startDateElement = /<(?:[a-z0-9_\-]+:)?startDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?startDate>/i;
  const endDateElement = /<(?:[a-z0-9_\-]+:)?endDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?endDate>/i;
  const instantElement = /<(?:[a-z0-9_\-]+:)?instant>([^<]+)<\/(?:[a-z0-9_\-]+:)?instant>/i;
  const explicitMemberElement = /<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>([^<]+)<\/(?:[a-z0-9_\-]+:)?explicitMember>/gi;

  // Trimmed text of the first element matched by `element`, or null.
  const firstText = (body: string, element: RegExp): string | null =>
    element.exec(body)?.[1]?.trim() ?? null;

  const result: Record<string, TaxonomyContext> = {};

  for (const [, rawId = '', body = ''] of raw.matchAll(contextElement)) {
    const id = rawId.trim();
    if (id === '') {
      continue;
    }

    const dimensions = Array.from(
      body.matchAll(explicitMemberElement),
      ([, rawAxis = '', rawMember = '']) => ({
        axis: decodeXmlEntities(rawAxis.trim()),
        member: decodeXmlEntities(rawMember.trim())
      })
    ).filter((dimension) => dimension.axis !== '' && dimension.member !== '');

    result[id] = {
      id,
      periodStart: firstText(body, startDateElement),
      periodEnd: firstText(body, endDateElement),
      periodInstant: firstText(body, instantElement),
      dimensions
    };
  }

  return result;
}
/**
 * Extracts every `<unit>` element (with or without a namespace prefix) that
 * carries an `id` attribute.
 *
 * The unit's `measure` is the entity-decoded text of its `<measure>` children:
 * the single value when there is one, the values joined with `/` when there
 * are several, and `null` when there are none.
 *
 * @param raw Full text of the XBRL document.
 * @returns Units keyed by their trimmed `id`; a later unit with the same id
 *   replaces an earlier one.
 */
function parseUnits(raw: string): Record<string, TaxonomyUnit> {
  const unitElement = /<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?unit>/gi;
  const measureElement = /<(?:[a-z0-9_\-]+:)?measure>([^<]+)<\/(?:[a-z0-9_\-]+:)?measure>/gi;

  const result: Record<string, TaxonomyUnit> = {};

  for (const [, rawId = '', body = ''] of raw.matchAll(unitElement)) {
    const id = rawId.trim();
    if (id === '') {
      continue;
    }

    const measures: string[] = [];
    for (const [, text = ''] of body.matchAll(measureElement)) {
      const decoded = decodeXmlEntities(text.trim());
      if (decoded) {
        measures.push(decoded);
      }
    }

    // Joining a one-element list yields that element unchanged.
    result[id] = {
      id,
      measure: measures.length > 0 ? measures.join('/') : null
    };
  }

  return result;
}
/**
 * Guesses the financial statement a concept belongs to from keywords in its
 * local name (matched case-insensitively).
 *
 * Rules are tried in order and the first match wins, so for example a name
 * containing both "equity" and "liabilit" is classified as `equity`.
 *
 * @param localName Concept local name, e.g. `Assets`.
 * @returns The matching statement kind, or `null` when no keyword matches.
 */
function classifyStatementKind(localName: string): FinancialStatementKind | null {
  const rules: ReadonlyArray<readonly [RegExp, FinancialStatementKind]> = [
    [/cash|operatingactivities|investingactivities|financingactivities/, 'cash_flow'],
    [/equity|retainedearnings|additionalpaidincapital/, 'equity'],
    [/comprehensiveincome/, 'comprehensive_income'],
    [/asset|liabilit|debt/, 'balance'],
    [/revenue|income|profit|expense|costof/, 'income']
  ];

  const needle = localName.toLowerCase();
  const hit = rules.find(([keywords]) => keywords.test(needle));
  return hit ? hit[1] : null;
}
/**
 * Tells whether `prefix` (compared case-insensitively) is one of the prefixes
 * this parser treats as XBRL plumbing rather than reportable concepts:
 * `xbrli`, `xlink`, `link`, `xbrldi`, `xbrldt`.
 *
 * @param prefix Namespace prefix of an element.
 * @returns `true` for an infrastructure prefix, otherwise `false`.
 */
function isXbrlInfrastructurePrefix(prefix: string) {
  switch (prefix.toLowerCase()) {
    case 'xbrli':
    case 'xlink':
    case 'link':
    case 'xbrldi':
    case 'xbrldt':
      return true;
    default:
      return false;
  }
}
/**
 * Builds a concept key by joining a namespace URI and a local name with `#`.
 *
 * @param namespaceUri Namespace URI of the concept.
 * @param localName Local name of the concept.
 * @returns `namespaceUri#localName`.
 */
function localNameToKey(namespaceUri: string, localName: string) {
  return namespaceUri + '#' + localName;
}
/**
 * Parses an XBRL instance document into namespaces, contexts, units and
 * numeric facts.
 *
 * A fact is any prefixed element with a `contextRef` attribute and a matching
 * closing tag whose prefix is not an XBRL infrastructure prefix and whose
 * entity-decoded body parses as a number. Period and dimension data are
 * copied from the referenced context when that context exists; the unit is
 * the referenced unit's measure when one is known, otherwise the raw
 * `unitRef` value.
 *
 * @param raw Full text of the XBRL document.
 * @param sourceFile Value stored on every fact as its `sourceFile`.
 * @returns The namespace map, contexts, units and extracted facts.
 */
export function parseXbrlInstance(
  raw: string,
  sourceFile: string | null
): {
  namespaces: TaxonomyNamespaceMap;
  contexts: Record<string, TaxonomyContext>;
  units: Record<string, TaxonomyUnit>;
  facts: TaxonomyFact[];
} {
  const factElement = /<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>([\s\S]*?)<\/\1:\2>/g;
  const contextRefAttribute = /\bcontextRef=["']([^"']+)["']/i;
  const unitRefAttribute = /\bunitRef=["']([^"']+)["']/i;
  const decimalsAttribute = /\bdecimals=["']([^"']+)["']/i;

  // Trimmed value of the attribute matched by `attribute`, or null.
  const attributeValue = (attributes: string, attribute: RegExp): string | null =>
    attribute.exec(attributes)?.[1]?.trim() ?? null;

  const namespaces = parseNamespaceMapFromDocument(raw);
  const contexts = parseContexts(raw);
  const units = parseUnits(raw);

  const facts: TaxonomyFact[] = [];

  for (const [, rawPrefix = '', rawLocalName = '', attributes = '', rawBody = ''] of raw.matchAll(factElement)) {
    const prefix = rawPrefix.trim();
    const localName = rawLocalName.trim();
    if (prefix === '' || localName === '' || isXbrlInfrastructurePrefix(prefix)) {
      continue;
    }

    const contextId = attributeValue(attributes, contextRefAttribute) ?? '';
    if (contextId === '') {
      continue;
    }

    // Only facts with a numeric body are kept.
    const value = parseNumber(decodeXmlEntities(rawBody.trim()));
    if (value === null) {
      continue;
    }

    const unitRef = attributeValue(attributes, unitRefAttribute);
    // Prefer the referenced unit's measure; fall back to the raw reference.
    const knownMeasure = unitRef ? units[unitRef]?.measure : undefined;
    const unit = knownMeasure ? knownMeasure : unitRef;

    const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`;
    const context = contexts[contextId];

    facts.push({
      conceptKey: localNameToKey(namespaceUri, localName),
      qname: `${prefix}:${localName}`,
      namespaceUri,
      localName,
      contextId,
      unit,
      decimals: attributeValue(attributes, decimalsAttribute),
      value,
      periodStart: context?.periodStart ?? null,
      periodEnd: context?.periodEnd ?? null,
      periodInstant: context?.periodInstant ?? null,
      dimensions: context?.dimensions ?? [],
      isDimensionless: (context?.dimensions.length ?? 0) === 0,
      sourceFile
    });
  }

  return { namespaces, contexts, units, facts };
}
/**
 * Public entry point for the keyword-based statement classification.
 *
 * @param localName Concept local name.
 * @returns Whatever `classifyStatementKind` returns for `localName`.
 */
export function conceptStatementFallback(localName: string) {
  const kind = classifyStatementKind(localName);
  return kind;
}