Run playwright UI tests
This commit is contained in:
264
lib/server/taxonomy/xbrl-parser.ts
Normal file
264
lib/server/taxonomy/xbrl-parser.ts
Normal file
@@ -0,0 +1,264 @@
|
||||
import type { FinancialStatementKind } from '@/lib/types';
|
||||
import type { TaxonomyContext, TaxonomyFact, TaxonomyNamespaceMap, TaxonomyUnit } from '@/lib/server/taxonomy/types';
|
||||
|
||||
/**
 * Decodes XML character and entity references in `value`.
 *
 * Handles the five predefined XML entities (`&amp;`, `&lt;`, `&gt;`,
 * `&quot;`, `&apos;`), `&nbsp;`, and numeric references in decimal
 * (`&#65;`) or hexadecimal (`&#x41;`) form. Entity names and the `x` marker
 * are matched case-insensitively.
 *
 * Decoding happens in a single left-to-right pass so that already-decoded
 * output is never decoded again: `&amp;lt;` yields the text `&lt;`, not `<`.
 *
 * @param value Raw text that may contain entity references.
 * @returns The text with every recognised reference replaced. Non-breaking
 *   spaces and numeric references outside the Unicode range become a plain
 *   space; unrecognised references are left untouched.
 */
function decodeXmlEntities(value: string) {
  const namedEntities: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    nbsp: ' '
  };

  return value.replace(
    /&(?:#x([0-9a-f]+)|#([0-9]+)|(amp|lt|gt|quot|apos|nbsp));/gi,
    (match: string, hex?: string, decimal?: string, name?: string) => {
      if (name !== undefined) {
        return namedEntities[name.toLowerCase()] ?? match;
      }

      const codePoint = hex !== undefined
        ? Number.parseInt(hex, 16)
        : Number.parseInt(decimal ?? '', 10);

      // String.fromCodePoint throws for anything outside 0..0x10FFFF, so
      // reject those up front and substitute a space instead.
      if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
        return ' ';
      }

      // Treat a numeric non-breaking space the same way as `&nbsp;`.
      if (codePoint === 0xa0) {
        return ' ';
      }

      return String.fromCodePoint(codePoint);
    }
  );
}
|
||||
|
||||
/**
 * Parses the text content of a fact into a finite number.
 *
 * Before parsing, markup tags, whitespace, thousands separators (`,`) and
 * `$` are removed, and the Unicode minus sign (U+2212) is normalised to `-`.
 * A value wrapped in parentheses, e.g. `(1,234)`, is treated as negative.
 *
 * The remaining text must be a complete decimal literal (optional sign,
 * digits, optional fraction, optional exponent). Text that merely starts
 * with digits, such as `10-K` or `2023-12-31`, is rejected rather than
 * truncated to its numeric prefix.
 *
 * @param value Raw (already entity-decoded) text of the fact.
 * @returns The parsed number, or `null` when the text is empty, is a dash
 *   placeholder (`--`), or is not a finite number.
 */
function parseNumber(value: string): number | null {
  const stripped = value
    .replace(/<[^>]+>/g, ' ')
    .replace(/[,$\s]/g, '')
    .replace(/\u2212/g, '-');

  // Empty cells and dash-only placeholders carry no value.
  if (!stripped || /^--+$/.test(stripped)) {
    return null;
  }

  // Accounting notation: decided after stripping so that surrounding markup
  // or a currency symbol does not hide the parentheses.
  const negative = stripped.startsWith('(') && stripped.endsWith(')');
  const literal = stripped.replace(/[()]/g, '');

  // Require the whole string to be numeric; Number.parseFloat would accept
  // any numeric prefix and silently ignore the rest.
  if (!/^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i.test(literal)) {
    return null;
  }

  const parsed = Number(literal);
  if (!Number.isFinite(parsed)) {
    return null;
  }

  return negative ? -Math.abs(parsed) : parsed;
}
|
||||
|
||||
/**
 * Collects the `xmlns:prefix="uri"` declarations from the document's root
 * start tag.
 *
 * The root tag is located in this order:
 *  1. a start tag whose element name is `xbrl` (optionally prefixed);
 *  2. the first start tag that mentions "xbrl" anywhere, skipping comments,
 *     doctype declarations and processing instructions (`<!…`, `<?…`), which
 *     can mention XBRL without declaring any namespaces;
 *  3. the first 1200 characters of the document.
 *
 * Only prefixed declarations are collected; a default `xmlns="…"`
 * declaration is ignored.
 *
 * @param raw Full text of the document.
 * @returns Map from namespace prefix to namespace URI.
 */
function parseNamespaceMapFromDocument(raw: string): TaxonomyNamespaceMap {
  const map: TaxonomyNamespaceMap = {};

  const rootStart = raw.match(/<(?:[a-z0-9_\-]+:)?xbrl\b[^>]*>/i)?.[0]
    ?? raw.match(/<(?![!?])[^>]*xbrl[^>]*>/i)?.[0]
    ?? raw.slice(0, 1200);

  for (const match of rootStart.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g)) {
    const prefix = (match[1] ?? '').trim();
    const uri = (match[2] ?? '').trim();

    if (!prefix || !uri) {
      continue;
    }

    map[prefix] = uri;
  }

  return map;
}
|
||||
|
||||
/**
 * Extracts every `<context id="…">…</context>` element (with or without a
 * namespace prefix) from the document.
 *
 * For each context the first `startDate`, `endDate` and `instant` values
 * found in its body are recorded (or `null` when absent), together with all
 * `explicitMember` dimensions that have both a non-empty `dimension`
 * attribute and non-empty text. When an id occurs more than once, the last
 * occurrence wins.
 *
 * @param raw Full text of the document.
 * @returns Contexts keyed by their trimmed `id`.
 */
function parseContexts(raw: string): Record<string, TaxonomyContext> {
  const contextPattern = /<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?context>/gi;
  const dimPattern = /<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>([^<]+)<\/(?:[a-z0-9_\-]+:)?explicitMember>/gi;

  // Returns the trimmed first capture of `pattern` within `text`, or null.
  const firstCapture = (text: string, pattern: RegExp): string | null =>
    text.match(pattern)?.[1]?.trim() ?? null;

  const byId: Record<string, TaxonomyContext> = {};

  for (const [, rawId = '', body = ''] of raw.matchAll(contextPattern)) {
    const id = rawId.trim();
    if (id.length === 0) {
      continue;
    }

    const dimensions = [...body.matchAll(dimPattern)].flatMap(([, rawAxis = '', rawMember = '']) => {
      const axis = decodeXmlEntities(rawAxis.trim());
      const member = decodeXmlEntities(rawMember.trim());
      return axis && member ? [{ axis, member }] : [];
    });

    byId[id] = {
      id,
      periodStart: firstCapture(body, /<(?:[a-z0-9_\-]+:)?startDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?startDate>/i),
      periodEnd: firstCapture(body, /<(?:[a-z0-9_\-]+:)?endDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?endDate>/i),
      periodInstant: firstCapture(body, /<(?:[a-z0-9_\-]+:)?instant>([^<]+)<\/(?:[a-z0-9_\-]+:)?instant>/i),
      dimensions
    };
  }

  return byId;
}
|
||||
|
||||
/**
 * Extracts every `<unit id="…">…</unit>` element (with or without a
 * namespace prefix) from the document.
 *
 * All non-empty `<measure>` values inside a unit are entity-decoded and
 * joined with `/` in document order; a unit without any measure gets
 * `measure: null`. When an id occurs more than once, the last occurrence
 * wins.
 *
 * @param raw Full text of the document.
 * @returns Units keyed by their trimmed `id`.
 */
function parseUnits(raw: string): Record<string, TaxonomyUnit> {
  const unitPattern = /<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?unit>/gi;
  const byId: Record<string, TaxonomyUnit> = {};

  for (const [, rawId = '', body = ''] of raw.matchAll(unitPattern)) {
    const id = rawId.trim();
    if (id.length === 0) {
      continue;
    }

    const measures: string[] = [];
    for (const [, text = ''] of body.matchAll(/<(?:[a-z0-9_\-]+:)?measure>([^<]+)<\/(?:[a-z0-9_\-]+:)?measure>/gi)) {
      const decoded = decodeXmlEntities(text.trim());
      if (decoded) {
        measures.push(decoded);
      }
    }

    // Joining a single measure returns it unchanged, so one expression
    // covers both the one-measure and the many-measure case.
    byId[id] = {
      id,
      measure: measures.length > 0 ? measures.join('/') : null
    };
  }

  return byId;
}
|
||||
|
||||
/**
 * Guesses which financial statement a concept belongs to from keywords in
 * its local name (case-insensitive).
 *
 * Rules are evaluated top to bottom and the first match wins, so the order
 * of the table is significant.
 *
 * @param localName Concept local name, e.g. `NetIncomeLoss`.
 * @returns The matched statement kind, or `null` when no keyword matches.
 */
function classifyStatementKind(localName: string): FinancialStatementKind | null {
  const rules: ReadonlyArray<readonly [RegExp, FinancialStatementKind]> = [
    [/cash|operatingactivities|investingactivities|financingactivities/, 'cash_flow'],
    [/equity|retainedearnings|additionalpaidincapital/, 'equity'],
    [/comprehensiveincome/, 'comprehensive_income'],
    [/asset|liabilit|debt/, 'balance'],
    [/revenue|income|profit|expense|costof/, 'income']
  ];

  const lowered = localName.toLowerCase();
  const hit = rules.find(([pattern]) => pattern.test(lowered));

  return hit ? hit[1] : null;
}
|
||||
|
||||
/**
 * Reports whether `prefix` is one of the namespace prefixes treated as XBRL
 * plumbing rather than reportable concepts (`xbrli`, `xlink`, `link`,
 * `xbrldi`, `xbrldt`). The comparison is case-insensitive.
 *
 * @param prefix Namespace prefix of an element.
 * @returns `true` for an infrastructure prefix, otherwise `false`.
 */
function isXbrlInfrastructurePrefix(prefix: string) {
  switch (prefix.toLowerCase()) {
    case 'xbrli':
    case 'xlink':
    case 'link':
    case 'xbrldi':
    case 'xbrldt':
      return true;
    default:
      return false;
  }
}
|
||||
|
||||
/**
 * Builds the concept key for a namespaced element by joining the namespace
 * URI and the local name with `#`.
 *
 * @param namespaceUri Namespace URI of the concept.
 * @param localName Local (unprefixed) name of the concept.
 * @returns `<namespaceUri>#<localName>`.
 */
function localNameToKey(namespaceUri: string, localName: string) {
  return [namespaceUri, localName].join('#');
}
|
||||
|
||||
/**
 * Parses an XBRL instance document into its namespaces, contexts, units and
 * numeric facts using regular expressions.
 *
 * A fact is any prefixed element that carries a `contextRef` attribute, has
 * an explicit closing tag, does not use an XBRL infrastructure prefix, and
 * whose text parses as a number. Self-closing elements (for example nil
 * facts written as `<p:X contextRef="…" xsi:nil="true"/>`) carry no value
 * and are not matched; in particular they are never paired with a later
 * closing tag of the same name.
 *
 * Each fact is enriched with the period and dimensions of the context it
 * references, when that context exists in the document. Its `unit` is the
 * measure of the referenced unit, falling back to the raw `unitRef` when
 * the unit is unknown or has no measure, and `null` when there is no
 * `unitRef`. Prefixes without a namespace declaration get the placeholder
 * URI `urn:unknown:<prefix>`.
 *
 * @param raw Full text of the instance document.
 * @param sourceFile Value copied verbatim onto every fact's `sourceFile`.
 * @returns The namespace map, contexts, units and extracted facts.
 */
export function parseXbrlInstance(
  raw: string,
  sourceFile: string | null
): {
  namespaces: TaxonomyNamespaceMap;
  contexts: Record<string, TaxonomyContext>;
  units: Record<string, TaxonomyUnit>;
  facts: TaxonomyFact[];
} {
  const namespaces = parseNamespaceMapFromDocument(raw);
  const contexts = parseContexts(raw);
  const units = parseUnits(raw);
  const facts: TaxonomyFact[] = [];

  // `(?<!\/)>` rejects self-closing start tags so that an empty element can
  // not swallow everything up to a later closing tag of the same name.
  const factPattern = /<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)(?<!\/)>([\s\S]*?)<\/\1:\2>/g;

  // Own-property lookup: ids come from the document, so a reference such as
  // "constructor" must not resolve to an inherited Object.prototype member.
  const ownEntry = <T>(table: Record<string, T>, key: string): T | undefined =>
    Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;

  for (const match of raw.matchAll(factPattern)) {
    const prefix = (match[1] ?? '').trim();
    const localName = (match[2] ?? '').trim();
    const attrs = match[3] ?? '';
    const body = decodeXmlEntities((match[4] ?? '').trim());

    if (!prefix || !localName || isXbrlInfrastructurePrefix(prefix)) {
      continue;
    }

    const contextId = attrs.match(/\bcontextRef=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
    if (!contextId) {
      continue;
    }

    // Only numeric facts are kept.
    const value = parseNumber(body);
    if (value === null) {
      continue;
    }

    const unitRef = attrs.match(/\bunitRef=["']([^"']+)["']/i)?.[1]?.trim() ?? null;
    const decimals = attrs.match(/\bdecimals=["']([^"']+)["']/i)?.[1]?.trim() ?? null;

    const namespaceUri = ownEntry(namespaces, prefix) ?? `urn:unknown:${prefix}`;
    const context = ownEntry(contexts, contextId);
    const unitMeasure = unitRef ? ownEntry(units, unitRef)?.measure : null;

    facts.push({
      conceptKey: localNameToKey(namespaceUri, localName),
      qname: `${prefix}:${localName}`,
      namespaceUri,
      localName,
      contextId,
      unit: unitMeasure || unitRef,
      decimals,
      value,
      periodStart: context?.periodStart ?? null,
      periodEnd: context?.periodEnd ?? null,
      periodInstant: context?.periodInstant ?? null,
      dimensions: context?.dimensions ?? [],
      isDimensionless: (context?.dimensions.length ?? 0) === 0,
      sourceFile,
    });
  }

  return {
    namespaces,
    contexts,
    units,
    facts
  };
}
|
||||
|
||||
/**
 * Public entry point for the keyword-based statement classification.
 *
 * @param localName Concept local name to classify.
 * @returns Whatever `classifyStatementKind` returns for `localName`.
 */
export function conceptStatementFallback(localName: string) {
  const statementKind = classifyStatementKind(localName);
  return statementKind;
}
|
||||
Reference in New Issue
Block a user