Run playwright UI tests

This commit is contained in:
2026-03-06 14:40:43 -05:00
parent 610fce8db3
commit 8e62c66677
37 changed files with 4430 additions and 643 deletions

View File

@@ -0,0 +1,73 @@
import { describe, expect, it } from 'bun:test';
import { discoverFilingAssets } from '@/lib/server/taxonomy/asset-discovery';
// Test suite for discoverFilingAssets. Every case injects a stubbed fetch
// implementation via `fetchImpl`, so the global fetch is never used.
describe('taxonomy asset discovery', () => {
// Directory listing available: linkbases, one instance candidate and four PDFs.
it('classifies assets and selects ranked instance/pdf candidates', async () => {
// Stub fetch that always answers 200 with a JSON directory listing.
const fetchImpl = (async () => {
return new Response(JSON.stringify({
directory: {
item: [
{ name: 'abc_htm.xml', size: '900000' },
{ name: 'abc_pre.xml', size: '250000' },
{ name: 'abc_lab.xml', size: '120000' },
{ name: '10k_financial_statements.pdf', size: '400000' },
{ name: 'annual_report.pdf', size: '300000' },
{ name: 'quarter_statement.pdf', size: '200000' },
{ name: 'exhibit99.pdf', size: '500000' }
]
}
}), {
status: 200,
headers: {
'content-type': 'application/json'
}
});
}) as unknown as typeof fetch;
const result = await discoverFilingAssets({
cik: '0000123456',
accessionNumber: '0000123456-26-000001',
filingUrl: 'https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/abc.htm',
primaryDocument: 'abc.htm',
fetchImpl
});
// The directory URL is the filing URL with its last path segment removed.
expect(result.directoryUrl).toBe('https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/');
const selectedInstance = result.assets.find((asset) => asset.asset_type === 'instance' && asset.is_selected);
expect(selectedInstance?.name).toBe('abc_htm.xml');
const selectedPdfs = result.assets
.filter((asset) => asset.asset_type === 'pdf' && asset.is_selected)
.map((asset) => asset.name);
// Exactly three PDFs are expected to be selected; the exhibit PDF is the one left out.
expect(selectedPdfs.length).toBe(3);
expect(selectedPdfs).toContain('10k_financial_statements.pdf');
expect(selectedPdfs).toContain('annual_report.pdf');
expect(selectedPdfs).toContain('quarter_statement.pdf');
expect(selectedPdfs).not.toContain('exhibit99.pdf');
});
// Directory listing unavailable: the stub answers 404 for every request.
it('falls back to filing url when SEC directory assets are unavailable', async () => {
const fetchImpl = (async () => {
return new Response('not found', { status: 404 });
}) as unknown as typeof fetch;
const result = await discoverFilingAssets({
cik: '0000123456',
accessionNumber: '0000123456-26-000001',
filingUrl: 'https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/abc.xml',
primaryDocument: 'abc.xml',
fetchImpl
});
// A single synthetic asset built from the filing URL / primary document is expected.
expect(result.assets.length).toBe(1);
expect(result.assets[0]).toEqual({
asset_type: 'instance',
name: 'abc.xml',
url: 'https://www.sec.gov/Archives/edgar/data/123456/000012345626000001/abc.xml',
size_bytes: null,
score: 6,
is_selected: true
});
});
});

View File

@@ -0,0 +1,283 @@
import type { TaxonomyAsset } from '@/lib/server/taxonomy/types';
/** Input accepted by discoverFilingAssets. */
type FilingAssetDiscoveryInput = {
/** Company CIK; only used to build the directory URL when `filingUrl` cannot be used. */
cik: string;
/** Accession number; dashes are removed when it is used as a path segment. */
accessionNumber: string;
/** URL of the filing document; its parent path is the preferred directory URL. */
filingUrl: string | null;
/** File name of the primary document; used when scoring instance candidates and as the fallback asset name. */
primaryDocument: string | null;
/** Optional fetch replacement (the global `fetch` is used when omitted). */
fetchImpl?: typeof fetch;
};
/**
 * Shape read from the `index.json` file of a filing directory.
 * Every level is optional because the payload is not validated before use.
 */
type FilingDirectoryJson = {
directory?: {
item?: Array<{
/** File name relative to the directory URL. */
name?: string;
type?: string;
/** File size; accepted as either a numeric string or a number. */
size?: string | number;
}>;
};
};
/**
 * Resolves the User-Agent header value for SEC requests.
 *
 * @returns The `SEC_USER_AGENT` environment variable when it holds a
 * non-empty value, otherwise the built-in default identifier.
 */
function envUserAgent() {
  const configured = process.env.SEC_USER_AGENT;
  if (configured) {
    return configured;
  }
  return 'Fiscal Clone <support@fiscal.local>';
}
/**
 * Removes every dash from an accession number so it can be used as a path
 * segment (e.g. `0000123456-26-000001` becomes `000012345626000001`).
 */
function compactAccessionNumber(value: string) {
  const segments = value.split('-');
  return segments.join('');
}
/**
 * Converts a CIK into the form used as a path segment: digits only, with
 * leading zeros removed.
 *
 * The zeros are stripped textually rather than through `Number()`, so long
 * digit strings are never rounded or rendered in exponent notation.
 *
 * @param value Raw CIK, possibly zero-padded or containing non-digit characters.
 * @returns The normalized digit string (`'0'` when every digit is zero), or
 * `null` when `value` contains no digits at all.
 */
function normalizeCikForPath(value: string) {
  const digits = value.replace(/\D/g, '');
  if (!digits) {
    return null;
  }
  const withoutLeadingZeros = digits.replace(/^0+/, '');
  // An all-zero input collapses to the empty string; keep a single zero.
  return withoutLeadingZeros || '0';
}
/**
 * Works out the directory URL that contains a filing's documents.
 *
 * The filing URL wins when it has a slash beyond the scheme prefix: everything
 * up to and including its last slash is returned. Otherwise the URL is built
 * from the normalized CIK and the dash-free accession number.
 *
 * @returns The directory URL (with trailing slash), or `null` when neither
 * source yields one.
 */
function resolveFilingDirectoryUrl(input: {
  filingUrl: string | null;
  cik: string;
  accessionNumber: string;
}) {
  const trimmedFilingUrl = input.filingUrl?.trim();
  if (trimmedFilingUrl) {
    const schemeLength = 'https://'.length;
    const slashIndex = trimmedFilingUrl.lastIndexOf('/');
    if (slashIndex > schemeLength) {
      return trimmedFilingUrl.slice(0, slashIndex + 1);
    }
  }

  const cikSegment = normalizeCikForPath(input.cik);
  const accessionSegment = compactAccessionNumber(input.accessionNumber);
  if (cikSegment && accessionSegment) {
    return `https://www.sec.gov/Archives/edgar/data/${cikSegment}/${accessionSegment}/`;
  }
  return null;
}
/**
 * Classifies a filing asset purely from its file name (case-insensitive).
 *
 * `.pdf` and `.xsd` map to `pdf` and `schema`. An `.xml` file is matched
 * against the linkbase rules in order (presentation, label, calculation,
 * definition) and is treated as an `instance` when none applies. Anything
 * else is `other`.
 */
function classifyAssetType(name: string): TaxonomyAsset['asset_type'] {
  const lower = name.toLowerCase();
  if (lower.endsWith('.pdf')) {
    return 'pdf';
  }
  if (lower.endsWith('.xsd')) {
    return 'schema';
  }
  if (!lower.endsWith('.xml')) {
    return 'other';
  }

  // Each rule: resulting type, conventional suffix pattern, keyword pattern.
  const linkbaseRules: Array<[TaxonomyAsset['asset_type'], RegExp, RegExp]> = [
    ['presentation', /(_|-)pre\.xml$/, /presentation/],
    ['label', /(_|-)lab\.xml$/, /label/],
    ['calculation', /(_|-)cal\.xml$/, /calculation/],
    ['definition', /(_|-)def\.xml$/, /definition/]
  ];
  for (const [assetType, suffixPattern, keywordPattern] of linkbaseRules) {
    if (suffixPattern.test(lower) || keywordPattern.test(lower)) {
      return assetType;
    }
  }
  return 'instance';
}
/**
 * Scores a PDF as a candidate financial document; higher is better.
 *
 * +8 when the name mentions a financial-report keyword, -2 when it looks like
 * an exhibit, +1 when the file is larger than 100,000 bytes.
 */
function scorePdf(name: string, sizeBytes: number | null) {
  const lower = name.toLowerCase();
  const keywordBonus = /financial|statement|annual|quarter|10k|10q/.test(lower) ? 8 : 0;
  const exhibitPenalty = /exhibit|ex-\d+/.test(lower) ? 2 : 0;
  const sizeBonus = sizeBytes && sizeBytes > 100_000 ? 1 : 0;
  return keywordBonus - exhibitPenalty + sizeBonus;
}
/**
 * Scores an XML file as a candidate XBRL instance document; higher is better.
 *
 * Starts at 1, adds 4 for an `_htm.xml` suffix, 4 for an `_ins.xml` suffix and
 * 5 when the name contains the primary document's base name. Subtracts 3 when
 * the name carries a linkbase marker (`cal`, `def`, `lab`, `pre`).
 *
 * The marker must be a whole token delimited by `_`, `-`, `.` or the ends of
 * the name. Matching it as a bare substring would penalize legitimate
 * instance names such as `calm-20230603_htm.xml`.
 *
 * @param name File name of the candidate.
 * @param primaryDocument File name of the filing's primary document, if known.
 */
function scoreInstance(name: string, primaryDocument: string | null) {
  const lower = name.toLowerCase();
  let score = 1;
  if (/_htm\.xml$/.test(lower)) {
    score += 4;
  }
  if (/_ins\.xml$/.test(lower)) {
    score += 4;
  }
  // Primary document name without its extension, e.g. `abc.htm` -> `abc`.
  const basePrimary = (primaryDocument ?? '').replace(/\.[a-z0-9]+$/i, '').toLowerCase();
  if (basePrimary && lower.includes(basePrimary)) {
    score += 5;
  }
  if (/(?:^|[_\-.])(?:cal|def|lab|pre)(?:[_\-.]|$)/.test(lower)) {
    score -= 3;
  }
  return score;
}
/**
 * Parses a file size taken from untyped JSON.
 *
 * @param raw A number or numeric string; any other type yields `null`.
 * @returns The finite numeric size, or `null` when the value is missing,
 * blank, or not a finite number.
 */
function parseSize(raw: unknown) {
  if (typeof raw === 'number') {
    return Number.isFinite(raw) ? raw : null;
  }
  if (typeof raw === 'string') {
    const trimmed = raw.trim();
    // Number('') is 0, which would report a blank size as an empty file.
    if (!trimmed) {
      return null;
    }
    const parsed = Number(trimmed);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
/**
 * Requests `url` through `fetchImpl` and returns the JSON body typed as `T`.
 *
 * The request carries the configured User-Agent, asks for JSON and bypasses
 * caches. The body is not validated against `T`.
 *
 * @throws Error `SEC request failed (<status>)` when the response is not ok.
 */
async function fetchJson<T>(url: string, fetchImpl: typeof fetch): Promise<T> {
  const headers = {
    'User-Agent': envUserAgent(),
    Accept: 'application/json'
  };
  const response = await fetchImpl(url, { headers, cache: 'no-store' });
  if (response.ok) {
    return await response.json() as T;
  }
  throw new Error(`SEC request failed (${response.status})`);
}
/**
 * Lists the assets of a filing directory, classifies them and marks which
 * ones should be used downstream.
 *
 * Steps:
 * 1. Resolve the directory URL; return an empty result when that fails.
 * 2. Fetch `<directoryUrl>index.json`. Any failure is treated as "no listing"
 *    on purpose, so discovery degrades to the fallback instead of throwing.
 * 3. Build one asset per listed file. The listing is unvalidated JSON, so a
 *    non-array `item` or an entry without a string `name` is ignored rather
 *    than allowed to throw.
 * 4. When nothing was listed and a filing URL exists, emit a single fallback
 *    asset for the filing document itself.
 * 5. Select the highest-scoring instance, the three highest-scoring PDFs and
 *    every linkbase/schema asset.
 *
 * @param input Filing identifiers plus an optional fetch replacement.
 * @returns The resolved directory URL and the scored assets in listing order.
 */
export async function discoverFilingAssets(input: FilingAssetDiscoveryInput): Promise<{
  directoryUrl: string | null;
  assets: TaxonomyAsset[];
}> {
  const fetchImpl = input.fetchImpl ?? fetch;
  const directoryUrl = resolveFilingDirectoryUrl({
    filingUrl: input.filingUrl,
    cik: input.cik,
    accessionNumber: input.accessionNumber
  });
  if (!directoryUrl) {
    return {
      directoryUrl: null,
      assets: []
    };
  }

  let payload: FilingDirectoryJson | null = null;
  try {
    payload = await fetchJson<FilingDirectoryJson>(`${directoryUrl}index.json`, fetchImpl);
  } catch {
    // Best effort: a missing or unreadable listing falls through to the fallback below.
    payload = null;
  }

  // The payload's static type is only a hint; check the runtime shape before iterating.
  const rawItems: unknown = payload?.directory?.item;
  const items: unknown[] = Array.isArray(rawItems) ? rawItems : [];

  const discovered: TaxonomyAsset[] = [];
  for (const item of items) {
    const entry = (typeof item === 'object' && item !== null ? item : {}) as {
      name?: unknown;
      size?: unknown;
    };
    const name = typeof entry.name === 'string' ? entry.name.trim() : '';
    if (!name) {
      continue;
    }
    discovered.push({
      asset_type: classifyAssetType(name),
      name,
      url: `${directoryUrl}${name.replace(/^\/+/, '')}`,
      size_bytes: parseSize(entry.size),
      score: null,
      is_selected: false
    });
  }

  if (discovered.length === 0 && input.filingUrl) {
    const fallbackName = input.primaryDocument ?? input.filingUrl.split('/').pop() ?? 'primary_document';
    discovered.push({
      asset_type: fallbackName.toLowerCase().endsWith('.xml') ? 'instance' : 'other',
      name: fallbackName,
      url: input.filingUrl,
      size_bytes: null,
      score: null,
      is_selected: true
    });
  }

  // Score each instance/PDF once; the same value drives ranking and the returned `score`.
  const scoreByAsset = new Map<TaxonomyAsset, number>();
  for (const asset of discovered) {
    if (asset.asset_type === 'instance') {
      scoreByAsset.set(asset, scoreInstance(asset.name, input.primaryDocument));
    } else if (asset.asset_type === 'pdf') {
      scoreByAsset.set(asset, scorePdf(asset.name, asset.size_bytes));
    }
  }
  const byScoreDescending = (left: TaxonomyAsset, right: TaxonomyAsset) =>
    (scoreByAsset.get(right) ?? 0) - (scoreByAsset.get(left) ?? 0);

  // `filter` returns a fresh array, so sorting it leaves `discovered` in listing order.
  const selectedInstanceUrl = discovered
    .filter((asset) => asset.asset_type === 'instance')
    .sort(byScoreDescending)[0]?.url ?? null;
  const selectedPdfUrls = new Set(
    discovered
      .filter((asset) => asset.asset_type === 'pdf')
      .sort(byScoreDescending)
      .slice(0, 3)
      .map((asset) => asset.url)
  );

  const assets = discovered.map((asset) => {
    if (asset.asset_type === 'instance') {
      return {
        ...asset,
        score: scoreByAsset.get(asset) ?? null,
        is_selected: asset.url === selectedInstanceUrl
      };
    }
    if (asset.asset_type === 'pdf') {
      return {
        ...asset,
        score: scoreByAsset.get(asset) ?? null,
        is_selected: selectedPdfUrls.has(asset.url)
      };
    }
    // Linkbases and schemas are always selected; `other` assets never are.
    return {
      ...asset,
      score: null,
      is_selected: asset.asset_type === 'presentation'
        || asset.asset_type === 'label'
        || asset.asset_type === 'calculation'
        || asset.asset_type === 'definition'
        || asset.asset_type === 'schema'
    };
  });

  return {
    directoryUrl,
    assets
  };
}

View File

@@ -0,0 +1,185 @@
import type { FinancialStatementKind } from '@/lib/types';
import { discoverFilingAssets } from '@/lib/server/taxonomy/asset-discovery';
import { parseLabelLinkbase, parsePresentationLinkbase } from '@/lib/server/taxonomy/linkbase-parser';
import { deriveTaxonomyMetrics } from '@/lib/server/taxonomy/metrics';
import { materializeTaxonomyStatements } from '@/lib/server/taxonomy/materialize';
import { validateMetricsWithPdfLlm } from '@/lib/server/taxonomy/pdf-validation';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
import { parseXbrlInstance } from '@/lib/server/taxonomy/xbrl-parser';
/**
 * Builds a record with one entry per financial statement kind.
 *
 * `factory` is invoked once per kind, in the order income, balance, cash_flow,
 * equity, comprehensive_income, so every entry receives its own value.
 */
function createStatementRecord<T>(factory: () => T): Record<FinancialStatementKind, T> {
  const income = factory();
  const balance = factory();
  const cashFlow = factory();
  const equity = factory();
  const comprehensiveIncome = factory();
  return {
    income,
    balance,
    cash_flow: cashFlow,
    equity,
    comprehensive_income: comprehensiveIncome
  };
}
/**
 * User-Agent header value for SEC requests: the `SEC_USER_AGENT` environment
 * variable when set to a non-empty string, else the default identifier.
 */
function envUserAgent() {
  const fromEnvironment = process.env.SEC_USER_AGENT;
  return fromEnvironment ? fromEnvironment : 'Fiscal Clone <support@fiscal.local>';
}
/**
 * Requests `url` through `fetchImpl` and returns the response body as text.
 *
 * The request carries the configured User-Agent, prefers XML/plain text and
 * bypasses caches.
 *
 * @throws Error `SEC request failed (<status>)` when the response is not ok.
 */
async function fetchText(url: string, fetchImpl: typeof fetch) {
  const headers = {
    'User-Agent': envUserAgent(),
    Accept: 'text/xml, text/plain, text/html;q=0.8, */*;q=0.5'
  };
  const response = await fetchImpl(url, { headers, cache: 'no-store' });
  if (response.ok) {
    return await response.text();
  }
  throw new Error(`SEC request failed (${response.status})`);
}
/**
 * Builds a taxonomy snapshot for one filing.
 *
 * Flow: discover the filing's assets, fetch and parse the selected XBRL
 * instance, fetch the selected presentation and label linkbases, materialize
 * statements, derive metrics and run the PDF-based metric validation.
 *
 * @param input Filing identifiers and metadata.
 * @param options Optional fetch replacement (the global `fetch` is used when omitted).
 * @returns A result whose `parse_status` is `ready` (rows and facts),
 * `partial` (facts only) or `failed`. When no instance asset exists or the
 * instance cannot be fetched, a `failed` result with empty data is returned.
 */
export async function hydrateFilingTaxonomySnapshot(
input: TaxonomyHydrationInput,
options?: {
fetchImpl?: typeof fetch;
}
): Promise<TaxonomyHydrationResult> {
const fetchImpl = options?.fetchImpl ?? fetch;
const discovered = await discoverFilingAssets({
cik: input.cik,
accessionNumber: input.accessionNumber,
filingUrl: input.filingUrl,
primaryDocument: input.primaryDocument,
fetchImpl
});
// Template for the early-exit paths; it still reports the discovered assets.
const emptyResult: TaxonomyHydrationResult = {
filing_id: input.filingId,
ticker: input.ticker.trim().toUpperCase(),
filing_date: input.filingDate,
filing_type: input.filingType,
parse_status: 'failed',
parse_error: 'No XBRL instance found',
source: 'legacy_html_fallback',
periods: [],
statement_rows: createStatementRecord(() => []),
derived_metrics: null,
validation_result: {
status: 'not_run',
checks: [],
validatedAt: null
},
facts_count: 0,
concepts_count: 0,
dimensions_count: 0,
assets: discovered.assets,
concepts: [],
facts: [],
metric_validations: []
};
// Prefer the instance marked as selected; otherwise take the first instance asset.
const selectedInstance = discovered.assets.find((asset) => asset.asset_type === 'instance' && asset.is_selected)
?? discovered.assets.find((asset) => asset.asset_type === 'instance')
?? null;
if (!selectedInstance) {
return emptyResult;
}
let parseError: string | null = null;
let source: TaxonomyHydrationResult['source'] = 'xbrl_instance';
let instanceText = '';
try {
instanceText = await fetchText(selectedInstance.url, fetchImpl);
} catch (error) {
// Without the instance nothing can be parsed: return the failure template with the fetch error.
parseError = error instanceof Error ? error.message : 'Unable to fetch instance file';
return {
...emptyResult,
parse_error: parseError
};
}
const parsedInstance = parseXbrlInstance(instanceText, selectedInstance.name);
const labelByConcept = new Map<string, string>();
const presentation: ReturnType<typeof parsePresentationLinkbase> = [];
// Linkbases are fetched one at a time, in asset order; only selected presentation/label assets are used.
for (const asset of discovered.assets) {
if (!asset.is_selected) {
continue;
}
if (asset.asset_type !== 'presentation' && asset.asset_type !== 'label') {
continue;
}
try {
const content = await fetchText(asset.url, fetchImpl);
if (asset.asset_type === 'presentation') {
const parsed = parsePresentationLinkbase(content);
if (parsed.length > 0) {
source = 'xbrl_instance_with_linkbase';
}
presentation.push(...parsed);
} else if (asset.asset_type === 'label') {
const parsed = parseLabelLinkbase(content);
// The first label seen for a concept wins; later linkbases do not override it.
for (const [conceptKey, label] of parsed.entries()) {
if (!labelByConcept.has(conceptKey)) {
labelByConcept.set(conceptKey, label);
}
}
}
} catch (error) {
// A failing linkbase is not fatal: remember the first error and continue with the rest.
parseError = parseError ?? (error instanceof Error ? error.message : 'Failed to parse taxonomy linkbase');
}
}
const materialized = materializeTaxonomyStatements({
filingId: input.filingId,
accessionNumber: input.accessionNumber,
filingDate: input.filingDate,
filingType: input.filingType,
facts: parsedInstance.facts,
presentation,
labelByConcept
});
const derivedMetrics = deriveTaxonomyMetrics(parsedInstance.facts);
const llmValidation = await validateMetricsWithPdfLlm({
metrics: derivedMetrics,
assets: discovered.assets,
fetchImpl
});
// ready: statement rows and facts; partial: facts without rows; failed: no facts.
const hasRows = (Object.values(materialized.statement_rows).reduce((total, rows) => total + rows.length, 0)) > 0;
const hasFacts = materialized.facts.length > 0;
const parseStatus: TaxonomyHydrationResult['parse_status'] = hasRows && hasFacts
? 'ready'
: hasFacts
? 'partial'
: 'failed';
return {
filing_id: input.filingId,
ticker: input.ticker.trim().toUpperCase(),
filing_date: input.filingDate,
filing_type: input.filingType,
parse_status: parseStatus,
// A failed parse always carries a message; otherwise any recorded linkbase error is passed through.
parse_error: parseStatus === 'failed' ? (parseError ?? 'No XBRL facts extracted') : parseError,
source,
periods: materialized.periods,
statement_rows: materialized.statement_rows,
derived_metrics: derivedMetrics,
validation_result: llmValidation.validation_result,
facts_count: materialized.facts.length,
concepts_count: materialized.concepts.length,
dimensions_count: materialized.dimensionsCount,
assets: discovered.assets,
concepts: materialized.concepts,
facts: materialized.facts,
metric_validations: llmValidation.metric_validations
};
}

View File

@@ -0,0 +1,63 @@
import { describe, expect, it } from 'bun:test';
import {
classifyStatementRole,
parseLabelLinkbase,
parsePresentationLinkbase
} from '@/lib/server/taxonomy/linkbase-parser';
// Label linkbase fixture: one locator (us-gaap_Revenues) with a terse label
// ("Rev.") and a standard label ("Revenues"), each connected by its own arc.
const SAMPLE_LABEL_LINKBASE = `
<link:linkbase xmlns:link="http://www.xbrl.org/2003/linkbase"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:us-gaap="http://fasb.org/us-gaap/2024">
<link:labelLink xlink:type="extended">
<link:loc xlink:type="locator" xlink:label="loc_rev" xlink:href="test.xsd#us-gaap_Revenues" />
<link:label xlink:type="resource" xlink:label="lab_terse" xlink:role="http://www.xbrl.org/2003/role/terseLabel">Rev.</link:label>
<link:label xlink:type="resource" xlink:label="lab_label" xlink:role="http://www.xbrl.org/2003/role/label">Revenues</link:label>
<link:labelArc xlink:type="arc" xlink:from="loc_rev" xlink:to="lab_terse" />
<link:labelArc xlink:type="arc" xlink:from="loc_rev" xlink:to="lab_label" />
</link:labelLink>
</link:linkbase>
`;
// Presentation linkbase fixture: a single StatementOfOperations role with one
// root (StatementLineItems) and two children, Revenues (order 1) and
// CostOfGoodsSold (order 2).
const SAMPLE_PRESENTATION_LINKBASE = `
<link:linkbase xmlns:link="http://www.xbrl.org/2003/linkbase"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:us-gaap="http://fasb.org/us-gaap/2024">
<link:presentationLink xlink:type="extended" xlink:role="http://www.xbrl.org/2003/role/StatementOfOperations">
<link:loc xlink:type="locator" xlink:label="root" xlink:href="test.xsd#us-gaap_StatementLineItems" />
<link:loc xlink:type="locator" xlink:label="rev" xlink:href="test.xsd#us-gaap_Revenues" />
<link:loc xlink:type="locator" xlink:label="cogs" xlink:href="test.xsd#us-gaap_CostOfGoodsSold" />
<link:presentationArc xlink:type="arc" xlink:from="root" xlink:to="rev" order="1" />
<link:presentationArc xlink:type="arc" xlink:from="root" xlink:to="cogs" order="2" />
</link:presentationLink>
</link:linkbase>
`;
// Test suite for the linkbase parser, driven by the two inline XML fixtures above.
describe('linkbase parser', () => {
// The standard label must win over the terse label attached to the same concept.
it('builds preferred labels from label linkbase', () => {
const labels = parseLabelLinkbase(SAMPLE_LABEL_LINKBASE);
// Concept keys have the form `<namespace URI>#<local name>`.
expect(labels.get('http://fasb.org/us-gaap/2024#Revenues')).toBe('Revenues');
});
// One row per concept: the root at depth 0 and both children at depth 1.
it('builds role trees with depth/order/parent metadata', () => {
const rows = parsePresentationLinkbase(SAMPLE_PRESENTATION_LINKBASE);
expect(rows.length).toBe(3);
const root = rows.find((row) => row.qname === 'us-gaap:StatementLineItems');
const revenue = rows.find((row) => row.qname === 'us-gaap:Revenues');
const cogs = rows.find((row) => row.qname === 'us-gaap:CostOfGoodsSold');
expect(root?.depth).toBe(0);
expect(root?.parentConceptKey).toBeNull();
expect(revenue?.depth).toBe(1);
expect(cogs?.depth).toBe(1);
expect(revenue?.parentConceptKey).toBe(root?.conceptKey ?? null);
// Only the relative order is asserted, not the absolute order values.
expect(revenue?.order).toBeLessThan(cogs?.order ?? Number.POSITIVE_INFINITY);
});
it('classifies statement roles into canonical statement kinds', () => {
expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfOperations')).toBe('income');
expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfFinancialPosition')).toBe('balance');
expect(classifyStatementRole('http://www.xbrl.org/2003/role/StatementOfCashFlows')).toBe('cash_flow');
});
});

View File

@@ -0,0 +1,310 @@
import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyNamespaceMap, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types';
/** Named entities recognized by decodeXmlEntities (matched case-insensitively). */
const XML_NAMED_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: ' '
};

/**
 * Decodes XML entities in `value` and trims the result.
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded again: `&amp;lt;` yields the literal `&lt;`, not `<`.
 * Handles the named entities above plus decimal (`&#8217;`) and hexadecimal
 * (`&#x2019;`) character references. A non-breaking space (named or numeric)
 * becomes a regular space. Unknown or out-of-range references are left as-is.
 */
function decodeXmlEntities(value: string) {
  return value
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match: string, body: string) => {
      const reference = body.toLowerCase();
      if (!reference.startsWith('#')) {
        return XML_NAMED_ENTITIES[reference] ?? match;
      }
      const codePoint = reference.startsWith('#x')
        ? Number.parseInt(reference.slice(2), 16)
        : Number.parseInt(reference.slice(1), 10);
      if (codePoint === 160) {
        return ' ';
      }
      if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff) {
        return match;
      }
      return String.fromCodePoint(codePoint);
    })
    .trim();
}
/**
 * Collects `xmlns:prefix="uri"` declarations into a prefix-to-URI map.
 *
 * Only the first tag whose text contains `linkbase` is scanned. When no such
 * tag exists, the first 1200 characters of `raw` are scanned instead.
 * A prefix declared more than once keeps its last URI.
 */
function parseNamespaceMap(raw: string): TaxonomyNamespaceMap {
  const rootTag = /<[^>]*linkbase[^>]*>/i.exec(raw);
  const searchArea = rootTag?.[0] ?? raw.slice(0, 1200);

  const namespaces: TaxonomyNamespaceMap = {};
  const declarationPattern = /xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g;
  for (const [, rawPrefix = '', rawUri = ''] of searchArea.matchAll(declarationPattern)) {
    const prefix = rawPrefix.trim();
    const uri = rawUri.trim();
    if (prefix && uri) {
      namespaces[prefix] = uri;
    }
  }
  return namespaces;
}
/**
 * Derives a `prefix:localName` QName from a locator href.
 *
 * The text after the first `#` (or the whole href when there is none) is
 * trimmed and stripped of a leading `loc_` marker. A value that already
 * contains `:` is returned unchanged; otherwise the first `_` is taken as the
 * prefix separator.
 *
 * @returns The QName, or `null` when nothing usable remains.
 */
function qnameFromHref(href: string) {
  const hashIndex = href.indexOf('#');
  const fragment = hashIndex >= 0 ? href.slice(hashIndex + 1) : href;
  const cleaned = fragment.trim().replace(/^loc_+/i, '');
  if (!cleaned) {
    return null;
  }
  if (cleaned.includes(':')) {
    return cleaned;
  }
  const separatorIndex = cleaned.indexOf('_');
  if (separatorIndex < 0) {
    return null;
  }
  const prefix = cleaned.slice(0, separatorIndex);
  const localName = cleaned.slice(separatorIndex + 1);
  return `${prefix}:${localName}`;
}
/**
 * Expands a QName into its concept identity.
 *
 * The text before the first `:` is the prefix and everything after it is the
 * local name. A prefix missing from `namespaces` maps to the placeholder URI
 * `urn:unknown:<prefix>`.
 *
 * @returns The concept parts with `conceptKey` = `<namespace URI>#<local name>`,
 * or `null` when the prefix or the local name is empty.
 */
function conceptFromQName(qname: string, namespaces: TaxonomyNamespaceMap) {
  const colonIndex = qname.indexOf(':');
  const prefix = colonIndex < 0 ? qname : qname.slice(0, colonIndex);
  const localName = colonIndex < 0 ? '' : qname.slice(colonIndex + 1);
  if (!prefix || !localName) {
    return null;
  }
  const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`;
  const conceptKey = `${namespaceUri}#${localName}`;
  return { qname, namespaceUri, localName, conceptKey };
}
/**
 * Ranks a label role; a higher rank wins when several labels compete.
 *
 * 4 = standard label, 3 = terse label, 2 = verbose label, 1 = any other
 * role, 0 = no role given. The role is compared case-insensitively by suffix.
 */
function labelPriority(role: string | null) {
  const normalized = (role ?? '').toLowerCase();
  if (normalized === '') {
    return 0;
  }
  const rankedSuffixes: Array<[string, number]> = [
    ['/label', 4],
    ['/terselabel', 3],
    ['/verboselabel', 2]
  ];
  for (const [suffix, priority] of rankedSuffixes) {
    if (normalized.endsWith(suffix)) {
      return priority;
    }
  }
  return 1;
}
/**
 * Maps a presentation role URI onto a canonical statement kind.
 *
 * The URI is matched case-insensitively against keyword patterns. The checks
 * run in a fixed order and the first hit wins: cash flow, equity,
 * comprehensive income, balance sheet, income statement.
 *
 * The income pattern accepts singular and plural forms (`StatementOfIncome`,
 * `StatementsOfIncome`) as well as `StatementsOfEarnings`.
 *
 * @param roleUri Role URI of a presentation link.
 * @returns The statement kind, or `null` when no pattern matches.
 */
export function classifyStatementRole(roleUri: string): FinancialStatementKind | null {
  const normalized = roleUri.toLowerCase();
  if (/cash\s*flow|statementsof?cashflows|netcash/.test(normalized)) {
    return 'cash_flow';
  }
  // Checked before the income patterns so "retained earnings" roles land here.
  if (/shareholders?|stockholders?|equity|retainedearnings/.test(normalized)) {
    return 'equity';
  }
  if (/comprehensive\s*income/.test(normalized)) {
    return 'comprehensive_income';
  }
  if (/balance\s*sheet|financial\s*position|assets?andliabilities/.test(normalized)) {
    return 'balance';
  }
  if (/operations|income\s*statement|statements?of(?:income|earnings)|profit/.test(normalized)) {
    return 'income';
  }
  return null;
}
/**
 * Extracts the preferred human-readable label for each concept from a label
 * linkbase.
 *
 * Each `labelLink` block is scanned for locators (concepts), label resources
 * (texts with a role) and arcs connecting the two. Several label resources
 * commonly share one `xlink:label`; an arc pointing at that label addresses
 * all of them, so every resource is kept and ranked. When a concept has more
 * than one label, the one whose role ranks highest in `labelPriority` wins;
 * on equal rank the first one encountered is kept.
 *
 * Parsing is regex-based and tolerant: malformed or incomplete elements are
 * skipped.
 *
 * @param raw Label linkbase XML.
 * @returns Map from concept key (`<namespace URI>#<local name>`) to label text.
 */
export function parseLabelLinkbase(raw: string): Map<string, string> {
  const namespaces = parseNamespaceMap(raw);
  const preferredLabelByConcept = new Map<string, { text: string; priority: number }>();
  const linkPattern = /<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?labelLink>/gi;

  for (const linkMatch of raw.matchAll(linkPattern)) {
    const block = linkMatch[1] ?? '';
    const locByLabel = new Map<string, string>();
    const resourcesByLabel = new Map<string, Array<{ text: string; role: string | null }>>();

    // Locators: xlink:label -> concept key.
    for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) {
      const attrs = locMatch[1] ?? '';
      const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!label || !href) {
        continue;
      }
      const qname = qnameFromHref(href);
      if (!qname) {
        continue;
      }
      const concept = conceptFromQName(qname, namespaces);
      if (!concept) {
        continue;
      }
      locByLabel.set(label, concept.conceptKey);
    }

    // Label resources: xlink:label -> every resource carrying that label.
    for (const resourceMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?label\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?label>/gi)) {
      const attrs = resourceMatch[1] ?? '';
      const body = decodeXmlEntities(resourceMatch[2] ?? '').replace(/\s+/g, ' ').trim();
      if (!body) {
        continue;
      }
      const resourceLabel = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const role = attrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? null;
      if (!resourceLabel) {
        continue;
      }
      const resource = { text: body, role };
      const siblings = resourcesByLabel.get(resourceLabel);
      if (siblings) {
        siblings.push(resource);
      } else {
        resourcesByLabel.set(resourceLabel, [resource]);
      }
    }

    // Arcs: connect a concept to every resource sharing the target label.
    for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)\/?>/gi)) {
      const attrs = arcMatch[1] ?? '';
      const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!from || !to) {
        continue;
      }
      const conceptKey = locByLabel.get(from);
      if (!conceptKey) {
        continue;
      }
      for (const resource of resourcesByLabel.get(to) ?? []) {
        const priority = labelPriority(resource.role);
        const current = preferredLabelByConcept.get(conceptKey);
        if (!current || priority > current.priority) {
          preferredLabelByConcept.set(conceptKey, {
            text: resource.text,
            priority
          });
        }
      }
    }
  }

  const labelByConcept = new Map<string, string>();
  for (const [conceptKey, preferred] of preferredLabelByConcept) {
    labelByConcept.set(conceptKey, preferred.text);
  }
  return labelByConcept;
}
/**
 * Flattens the presentation trees of a presentation linkbase into rows.
 *
 * For every `presentationLink` that carries a role, locators and arcs are
 * collected and each tree is walked depth-first from its roots (labels that
 * are never an arc target). Children are visited in ascending `order`
 * attribute; an arc without a numeric `order` takes its position among its
 * siblings.
 *
 * `order` on the emitted rows is the pre-order sequence number within the
 * link, starting at 1. Sorting a link's rows by `order` therefore reproduces
 * the tree layout exactly, with every node directly followed by its subtree.
 *
 * Malformed input is tolerated: arcs referencing unknown locators are
 * ignored, a repeated parent/child arc at the same depth is emitted once, and
 * an arc leading back to one of its own ancestors (a cycle) is not followed.
 * A link in which every referenced label is also an arc target has no root
 * and yields no rows.
 *
 * @param raw Presentation linkbase XML.
 * @returns One row per visited tree node, across all links, in visit order.
 */
export function parsePresentationLinkbase(raw: string): TaxonomyPresentationConcept[] {
  const namespaces = parseNamespaceMap(raw);
  const rows: TaxonomyPresentationConcept[] = [];
  const linkPattern = /<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?presentationLink>/gi;

  for (const linkMatch of raw.matchAll(linkPattern)) {
    const linkAttrs = linkMatch[1] ?? '';
    const block = linkMatch[2] ?? '';
    const roleUri = linkAttrs.match(/\bxlink:role=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
    if (!roleUri) {
      continue;
    }

    // Locators: xlink:label -> concept identity.
    const locByLabel = new Map<string, { conceptKey: string; qname: string; isAbstract: boolean }>();
    for (const locMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?loc\b([^>]*)\/?>/gi)) {
      const attrs = locMatch[1] ?? '';
      const label = attrs.match(/\bxlink:label=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const href = attrs.match(/\bxlink:href=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      if (!label || !href) {
        continue;
      }
      const qname = qnameFromHref(href);
      if (!qname) {
        continue;
      }
      const concept = conceptFromQName(qname, namespaces);
      if (!concept) {
        continue;
      }
      locByLabel.set(label, {
        conceptKey: concept.conceptKey,
        qname: concept.qname,
        // Heuristic: abstract concepts are recognized by their local name only.
        isAbstract: /abstract/i.test(concept.localName)
      });
    }

    // Arcs: parent label -> ordered children, plus the set of arc targets.
    const childrenByLabel = new Map<string, Array<{ label: string; order: number }>>();
    const incoming = new Set<string>();
    const allReferenced = new Set<string>();
    for (const arcMatch of block.matchAll(/<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)\/?>/gi)) {
      const attrs = arcMatch[1] ?? '';
      const from = attrs.match(/\bxlink:from=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const to = attrs.match(/\bxlink:to=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const orderRaw = attrs.match(/\border=["']([^"']+)["']/i)?.[1]?.trim() ?? '';
      const order = Number.parseFloat(orderRaw);
      if (!from || !to || !locByLabel.has(from) || !locByLabel.has(to)) {
        continue;
      }
      const group = childrenByLabel.get(from) ?? [];
      group.push({ label: to, order: Number.isFinite(order) ? order : group.length + 1 });
      childrenByLabel.set(from, group);
      incoming.add(to);
      allReferenced.add(from);
      allReferenced.add(to);
    }

    const roots = [...allReferenced].filter((label) => !incoming.has(label));
    const visited = new Set<string>();
    // Labels on the current root-to-node path; used to stop at cycles.
    const ancestors = new Set<string>();
    let nextOrder = 1;

    const visit = (label: string, depth: number, parentLabel: string | null): void => {
      const node = locByLabel.get(label);
      if (!node || ancestors.has(label)) {
        return;
      }
      const pathKey = `${parentLabel ?? 'root'}::${label}::${depth}`;
      if (visited.has(pathKey)) {
        return;
      }
      visited.add(pathKey);

      const parentConceptKey = parentLabel ? (locByLabel.get(parentLabel)?.conceptKey ?? null) : null;
      rows.push({
        conceptKey: node.conceptKey,
        qname: node.qname,
        roleUri,
        order: nextOrder,
        depth,
        parentConceptKey,
        isAbstract: node.isAbstract
      });
      nextOrder += 1;

      ancestors.add(label);
      const children = [...(childrenByLabel.get(label) ?? [])].sort((left, right) => left.order - right.order);
      for (const child of children) {
        visit(child.label, depth + 1, label);
      }
      ancestors.delete(label);
    };

    for (const root of roots) {
      visit(root, 0, null);
    }
  }

  return rows;
}

View File

@@ -0,0 +1,374 @@
import type { Filing, FinancialStatementKind, TaxonomyStatementRow } from '@/lib/types';
import type { TaxonomyConcept, TaxonomyFact, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types';
import type { FilingTaxonomyPeriod } from '@/lib/server/repos/filing-taxonomy';
import { classifyStatementRole } from '@/lib/server/taxonomy/linkbase-parser';
import { conceptStatementFallback } from '@/lib/server/taxonomy/xbrl-parser';
/** Returns `value` with every `-` character removed. */
function compactAccessionNumber(value: string) {
  return Array.from(value)
    .filter((character) => character !== '-')
    .join('');
}
/**
 * Tells whether a namespace URI refers to the US GAAP taxonomy, judged by a
 * case-insensitive match on `fasb.org/us-gaap` or `us-gaap`.
 */
function isUsGaapNamespace(namespaceUri: string) {
  const usGaapPatterns = [/fasb\.org\/us-gaap/i, /us-gaap/i];
  return usGaapPatterns.some((pattern) => pattern.test(namespaceUri));
}
/**
 * Splits a concept key of the form `<namespace URI>#<local name>` at its last
 * `#`. A key without `#` is treated as a bare local name in the placeholder
 * namespace `urn:unknown`.
 */
function splitConceptKey(conceptKey: string) {
  const hashIndex = conceptKey.lastIndexOf('#');
  return hashIndex >= 0
    ? {
      namespaceUri: conceptKey.slice(0, hashIndex),
      localName: conceptKey.slice(hashIndex + 1)
    }
    : {
      namespaceUri: 'urn:unknown',
      localName: conceptKey
    };
}
/**
 * Turns a concept's local name into a readable label by inserting spaces at
 * camel-case boundaries and replacing underscores with spaces
 * (e.g. `NetIncomeLoss` becomes `Net Income Loss`).
 */
function localNameToLabel(localName: string) {
  // Applied in order: lower/digit-to-upper boundary, acronym-to-word boundary, underscores.
  const rewrites: Array<[RegExp, string]> = [
    [/([a-z0-9])([A-Z])/g, '$1 $2'],
    [/([A-Z]+)([A-Z][a-z])/g, '$1 $2'],
    [/_/g, ' ']
  ];
  let label = localName;
  for (const [pattern, replacement] of rewrites) {
    label = label.replace(pattern, replacement);
  }
  return label.trim();
}
/**
 * Creates a per-statement-kind record, calling `factory` once for each kind
 * (income, balance, cash_flow, equity, comprehensive_income, in that order)
 * so no two entries share a value.
 */
function createStatementRecord<T>(factory: () => T): Record<FinancialStatementKind, T> {
  const incomeValue = factory();
  const balanceValue = factory();
  const cashFlowValue = factory();
  const equityValue = factory();
  const comprehensiveIncomeValue = factory();
  const record: Record<FinancialStatementKind, T> = {
    income: incomeValue,
    balance: balanceValue,
    cash_flow: cashFlowValue,
    equity: equityValue,
    comprehensive_income: comprehensiveIncomeValue
  };
  return record;
}
/**
 * Builds a string that identifies a fact's reporting period from its start,
 * end and instant dates. Missing dates contribute an empty value, so facts
 * with the same three dates share a signature.
 */
function periodSignature(fact: TaxonomyFact) {
  const { periodStart, periodEnd, periodInstant } = fact;
  const parts = [
    'start:' + (periodStart ?? ''),
    'end:' + (periodEnd ?? ''),
    'instant:' + (periodInstant ?? '')
  ];
  return parts.join('|');
}
/**
 * Picks the date that represents a fact's period: its end date, else its
 * instant date, else `fallbackDate`.
 */
function periodDate(fact: TaxonomyFact, fallbackDate: string) {
  if (fact.periodEnd != null) {
    return fact.periodEnd;
  }
  if (fact.periodInstant != null) {
    return fact.periodInstant;
  }
  return fallbackDate;
}
/**
 * Converts a date string to epoch milliseconds via `Date.parse`.
 *
 * @returns `NaN` for `null`, an empty string, or an unparseable date.
 */
function parseEpoch(value: string | null) {
  return value ? Date.parse(value) : Number.NaN;
}
/**
 * Returns a sorted copy of `periods`; the input array is left untouched.
 *
 * Periods are ordered by ascending date, taken from `periodEnd` or, when that
 * is absent, `filingDate`. Whenever the two dates are equal or either one
 * cannot be parsed, the period ids are compared instead.
 */
function sortPeriods(periods: FilingTaxonomyPeriod[]) {
  const epochOf = (period: FilingTaxonomyPeriod) => parseEpoch(period.periodEnd ?? period.filingDate);

  const sorted = periods.slice();
  sorted.sort((left, right) => {
    const leftEpoch = epochOf(left);
    const rightEpoch = epochOf(right);
    const bothParseable = Number.isFinite(leftEpoch) && Number.isFinite(rightEpoch);
    if (bothParseable && leftEpoch !== rightEpoch) {
      return leftEpoch - rightEpoch;
    }
    return left.id.localeCompare(right.id);
  });
  return sorted;
}
/**
 * Chooses the single fact that should represent a group of competing facts.
 *
 * Ranking, most significant first:
 * 1. dimensionless facts before dimensional ones;
 * 2. the later date (`periodEnd`, else `periodInstant`) when both dates parse
 *    and differ;
 * 3. the larger absolute value.
 *
 * @returns The top-ranked fact, or `null` for an empty list. The input array
 * is not modified.
 */
function pickPreferredFact<T extends TaxonomyFact>(facts: T[]) {
  if (facts.length === 0) {
    return null;
  }

  const dimensionRank = (fact: T) => (fact.isDimensionless ? 1 : 0);
  const epochOf = (fact: T) => parseEpoch(fact.periodEnd ?? fact.periodInstant);

  const ranked = facts.slice().sort((left, right) => {
    const rankDelta = dimensionRank(right) - dimensionRank(left);
    if (rankDelta !== 0) {
      return rankDelta;
    }
    const leftEpoch = epochOf(left);
    const rightEpoch = epochOf(right);
    const bothParseable = Number.isFinite(leftEpoch) && Number.isFinite(rightEpoch);
    if (bothParseable && leftEpoch !== rightEpoch) {
      return rightEpoch - leftEpoch;
    }
    return Math.abs(right.value) - Math.abs(left.value);
  });
  return ranked[0] ?? null;
}
export function materializeTaxonomyStatements(input: {
filingId: number;
accessionNumber: string;
filingDate: string;
filingType: '10-K' | '10-Q';
facts: TaxonomyFact[];
presentation: TaxonomyPresentationConcept[];
labelByConcept: Map<string, string>;
}) {
const periodBySignature = new Map<string, FilingTaxonomyPeriod>();
const compactAccession = compactAccessionNumber(input.accessionNumber);
for (const fact of input.facts) {
const signature = periodSignature(fact);
if (periodBySignature.has(signature)) {
continue;
}
const date = periodDate(fact, input.filingDate);
const id = `${date}-${compactAccession}-${periodBySignature.size + 1}`;
periodBySignature.set(signature, {
id,
filingId: input.filingId,
accessionNumber: input.accessionNumber,
filingDate: input.filingDate,
periodStart: fact.periodStart,
periodEnd: fact.periodEnd ?? fact.periodInstant ?? input.filingDate,
filingType: input.filingType,
periodLabel: fact.periodInstant && !fact.periodStart
? 'Instant'
: fact.periodStart && fact.periodEnd
? `${fact.periodStart} to ${fact.periodEnd}`
: 'Filing Period'
});
}
const periods = sortPeriods([...periodBySignature.values()]);
const periodIdBySignature = new Map<string, string>(
[...periodBySignature.entries()].map(([signature, period]) => [signature, period.id])
);
const presentationByConcept = new Map<string, TaxonomyPresentationConcept[]>();
for (const node of input.presentation) {
const existing = presentationByConcept.get(node.conceptKey);
if (existing) {
existing.push(node);
} else {
presentationByConcept.set(node.conceptKey, [node]);
}
}
const enrichedFacts = input.facts.map((fact, index) => {
const nodes = presentationByConcept.get(fact.conceptKey) ?? [];
const bestNode = nodes[0] ?? null;
const statementKind = bestNode
? classifyStatementRole(bestNode.roleUri)
: conceptStatementFallback(fact.localName);
return {
...fact,
__sourceFactId: index + 1,
statement_kind: statementKind,
role_uri: bestNode?.roleUri ?? null
};
});
const rowsByStatement = createStatementRecord<TaxonomyStatementRow[]>(() => []);
const conceptByKey = new Map<string, TaxonomyConcept>();
const groupedByStatement = createStatementRecord<Map<string, typeof enrichedFacts>>(() => new Map());
for (const fact of enrichedFacts) {
if (!fact.statement_kind) {
continue;
}
const group = groupedByStatement[fact.statement_kind].get(fact.conceptKey);
if (group) {
group.push(fact);
} else {
groupedByStatement[fact.statement_kind].set(fact.conceptKey, [fact]);
}
}
for (const statement of Object.keys(rowsByStatement) as FinancialStatementKind[]) {
const conceptKeys = new Set<string>();
for (const node of input.presentation) {
if (classifyStatementRole(node.roleUri) !== statement) {
continue;
}
conceptKeys.add(node.conceptKey);
}
for (const conceptKey of groupedByStatement[statement].keys()) {
conceptKeys.add(conceptKey);
}
const orderedConcepts = [...conceptKeys]
.map((conceptKey) => {
const presentationNodes = input.presentation.filter(
(node) => node.conceptKey === conceptKey && classifyStatementRole(node.roleUri) === statement
);
const presentationOrder = presentationNodes.length > 0
? Math.min(...presentationNodes.map((node) => node.order))
: Number.MAX_SAFE_INTEGER;
const presentationDepth = presentationNodes.length > 0
? Math.min(...presentationNodes.map((node) => node.depth))
: 0;
const roleUri = presentationNodes[0]?.roleUri ?? null;
const parentConceptKey = presentationNodes[0]?.parentConceptKey ?? null;
return {
conceptKey,
presentationOrder,
presentationDepth,
roleUri,
parentConceptKey
};
})
.sort((left, right) => {
if (left.presentationOrder !== right.presentationOrder) {
return left.presentationOrder - right.presentationOrder;
}
return left.conceptKey.localeCompare(right.conceptKey);
});
for (const orderedConcept of orderedConcepts) {
const facts = groupedByStatement[statement].get(orderedConcept.conceptKey) ?? [];
const { namespaceUri, localName } = splitConceptKey(orderedConcept.conceptKey);
const qname = facts[0]?.qname ?? `unknown:${localName}`;
const label = input.labelByConcept.get(orderedConcept.conceptKey) ?? localNameToLabel(localName);
const values: Record<string, number | null> = {};
const units: Record<string, string | null> = {};
const factGroups = new Map<string, typeof facts>();
for (const fact of facts) {
const signature = periodSignature(fact);
const group = factGroups.get(signature);
if (group) {
group.push(fact);
} else {
factGroups.set(signature, [fact]);
}
}
const sourceFactIds: number[] = [];
let hasDimensions = false;
for (const [signature, group] of factGroups.entries()) {
const periodId = periodIdBySignature.get(signature);
if (!periodId) {
continue;
}
const preferred = pickPreferredFact(group);
if (!preferred) {
continue;
}
values[periodId] = preferred.value;
units[periodId] = preferred.unit;
const sourceFactId = (preferred as { __sourceFactId?: number }).__sourceFactId;
if (typeof sourceFactId === 'number') {
sourceFactIds.push(sourceFactId);
}
if (group.some((entry) => !entry.isDimensionless)) {
hasDimensions = true;
}
}
if (Object.keys(values).length === 0) {
continue;
}
const row: TaxonomyStatementRow = {
key: orderedConcept.conceptKey,
label,
conceptKey: orderedConcept.conceptKey,
qname,
namespaceUri,
localName,
isExtension: !isUsGaapNamespace(namespaceUri),
statement,
roleUri: orderedConcept.roleUri,
order: Number.isFinite(orderedConcept.presentationOrder)
? orderedConcept.presentationOrder
: rowsByStatement[statement].length + 1,
depth: orderedConcept.presentationDepth,
parentKey: orderedConcept.parentConceptKey,
values,
units,
hasDimensions,
sourceFactIds
};
rowsByStatement[statement].push(row);
if (!conceptByKey.has(orderedConcept.conceptKey)) {
conceptByKey.set(orderedConcept.conceptKey, {
concept_key: orderedConcept.conceptKey,
qname,
namespace_uri: namespaceUri,
local_name: localName,
label,
is_extension: !isUsGaapNamespace(namespaceUri),
statement_kind: statement,
role_uri: orderedConcept.roleUri,
presentation_order: row.order,
presentation_depth: row.depth,
parent_concept_key: row.parentKey,
is_abstract: /abstract/i.test(localName)
});
}
}
}
for (const fact of enrichedFacts) {
if (conceptByKey.has(fact.conceptKey)) {
continue;
}
conceptByKey.set(fact.conceptKey, {
concept_key: fact.conceptKey,
qname: fact.qname,
namespace_uri: fact.namespaceUri,
local_name: fact.localName,
label: input.labelByConcept.get(fact.conceptKey) ?? localNameToLabel(fact.localName),
is_extension: !isUsGaapNamespace(fact.namespaceUri),
statement_kind: fact.statement_kind,
role_uri: fact.role_uri,
presentation_order: null,
presentation_depth: null,
parent_concept_key: null,
is_abstract: /abstract/i.test(fact.localName)
});
}
const concepts = [...conceptByKey.values()];
const factRows = enrichedFacts.map((fact) => ({
concept_key: fact.conceptKey,
qname: fact.qname,
namespace_uri: fact.namespaceUri,
local_name: fact.localName,
statement_kind: fact.statement_kind,
role_uri: fact.role_uri,
context_id: fact.contextId,
unit: fact.unit,
decimals: fact.decimals,
value_num: fact.value,
period_start: fact.periodStart,
period_end: fact.periodEnd,
period_instant: fact.periodInstant,
dimensions: fact.dimensions,
is_dimensionless: fact.isDimensionless,
source_file: fact.sourceFile,
}));
const dimensionsCount = enrichedFacts.reduce((total, fact) => {
return total + fact.dimensions.length;
}, 0);
return {
periods,
statement_rows: rowsByStatement,
concepts,
facts: factRows,
dimensionsCount
};
}

View File

@@ -0,0 +1,55 @@
import { describe, expect, it } from 'bun:test';
import type { TaxonomyFact } from '@/lib/server/taxonomy/types';
import { deriveTaxonomyMetrics } from '@/lib/server/taxonomy/metrics';
/**
 * Test fixture factory: builds a us-gaap (2024) fact for `localName` with the
 * given numeric value. Every other field gets a fixed default (duration
 * context `c1` covering calendar 2025, USD, no dimensions) unless replaced
 * through `overrides`.
 */
function fact(localName: string, value: number, overrides?: Partial<TaxonomyFact>): TaxonomyFact {
  const namespaceUri = 'http://fasb.org/us-gaap/2024';
  const defaults: TaxonomyFact = {
    conceptKey: `${namespaceUri}#${localName}`,
    qname: `us-gaap:${localName}`,
    namespaceUri,
    localName,
    contextId: 'c1',
    unit: 'iso4217:USD',
    decimals: '-6',
    value,
    periodStart: '2025-01-01',
    periodEnd: '2025-12-31',
    periodInstant: null,
    dimensions: [],
    isDimensionless: true,
    sourceFile: 'abc_htm.xml'
  };
  // Overrides are spread last so callers can replace any default field.
  return { ...defaults, ...overrides };
}
describe('taxonomy metric derivation', () => {
  it('applies concept priority for canonical metrics and debt component fallback', () => {
    // Both revenue concepts are present; only current + non-current debt
    // components are supplied, so debt has to come from their sum.
    const facts = [
      fact('SalesRevenueNet', 500),
      fact('Revenues', 450),
      fact('NetIncomeLoss', 40),
      fact('Assets', 1000),
      fact('CashAndCashEquivalentsAtCarryingValue', 80),
      fact('DebtCurrent', 15),
      fact('LongTermDebtNoncurrent', 35)
    ];
    const expected = {
      revenue: 450,
      netIncome: 40,
      totalAssets: 1000,
      cash: 80,
      debt: 50
    };

    expect(deriveTaxonomyMetrics(facts)).toEqual(expected);
  });

  it('uses direct debt concept before computed debt fallback when available', () => {
    // A direct debt concept is present alongside the components.
    const { debt } = deriveTaxonomyMetrics([
      fact('DebtCurrent', 15),
      fact('LongTermDebtNoncurrent', 35),
      fact('LongTermDebtAndCapitalLeaseObligations', 90)
    ]);

    expect(debt).toBe(90);
  });
});

View File

@@ -0,0 +1,106 @@
import type { Filing } from '@/lib/types';
import type { TaxonomyFact } from '@/lib/server/taxonomy/types';
/**
 * Candidate concept local names for each derived metric, in priority order.
 * pickBestFact walks each list from the top and uses the first name that has
 * at least one fact, so earlier entries win over later ones.
 */
const METRIC_LOCAL_NAME_PRIORITY = {
revenue: [
'Revenues',
'SalesRevenueNet',
'RevenueFromContractWithCustomerExcludingAssessedTax',
'TotalRevenuesAndOtherIncome'
],
netIncome: ['NetIncomeLoss', 'ProfitLoss'],
totalAssets: ['Assets'],
cash: [
'CashAndCashEquivalentsAtCarryingValue',
'CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents'
],
// Debt reported as a single concept; used in preference to the component sum.
debtDirect: [
'DebtAndFinanceLeaseLiabilities',
'Debt',
'LongTermDebtAndCapitalLeaseObligations'
],
// Fallback components: debt = current + non-current, only when both are found.
debtCurrent: [
'DebtCurrent',
'ShortTermBorrowings',
'LongTermDebtCurrent'
],
debtNonCurrent: [
'LongTermDebtNoncurrent',
'LongTermDebt',
'DebtNoncurrent'
]
} as const;
/**
 * Converts a date string to epoch milliseconds via `Date.parse`.
 * Returns NaN for null/empty input (and for anything `Date.parse` rejects).
 */
function normalizeDateToEpoch(value: string | null) {
  return value ? Date.parse(value) : Number.NaN;
}
/** Case-insensitive equality check for two concept local names. */
function sameLocalName(left: string, right: string) {
  const [normalizedLeft, normalizedRight] = [left, right].map((name) => name.toLowerCase());
  return normalizedLeft === normalizedRight;
}
/**
 * Chooses the single most representative fact from a set of candidates.
 *
 * Ranking, most significant first:
 *   1. dimensionless facts before dimensional ones;
 *   2. facts with a parseable period end/instant before facts without one;
 *   3. later period end/instant first;
 *   4. larger absolute value first.
 *
 * Rule 2 keeps the comparator a consistent ordering: without it, a
 * dated-vs-undated pair falls through to the magnitude tie-break while
 * dated-vs-dated pairs are ordered by date, which can form a cycle and make
 * the winner depend on input order.
 *
 * @param facts Candidate facts; the array is not mutated.
 * @returns The preferred fact, or null when `facts` is empty.
 */
function pickPreferredFact(facts: TaxonomyFact[]) {
  const ordered = [...facts].sort((left, right) => {
    if (left.isDimensionless !== right.isDimensionless) {
      return left.isDimensionless ? -1 : 1;
    }

    const leftDate = normalizeDateToEpoch(left.periodEnd ?? left.periodInstant);
    const rightDate = normalizeDateToEpoch(right.periodEnd ?? right.periodInstant);
    const leftDated = Number.isFinite(leftDate);
    const rightDated = Number.isFinite(rightDate);

    // Dated facts always outrank undated ones so the ordering stays transitive.
    if (leftDated !== rightDated) {
      return leftDated ? -1 : 1;
    }
    if (leftDated && rightDated && leftDate !== rightDate) {
      return rightDate - leftDate;
    }

    return Math.abs(right.value) - Math.abs(left.value);
  });

  return ordered[0] ?? null;
}
/**
 * Finds the first local name in `localNames` (priority order) that at least
 * one fact carries, then returns the preferred fact among those matches.
 * Returns null when no listed name is present.
 */
function pickBestFact(facts: TaxonomyFact[], localNames: readonly string[]) {
  const winningName = localNames.find((candidate) =>
    facts.some((fact) => sameLocalName(fact.localName, candidate))
  );
  if (winningName === undefined) {
    return null;
  }

  const matches = facts.filter((fact) => sameLocalName(fact.localName, winningName));
  return pickPreferredFact(matches);
}
/** Adds two values, but only when both are present; otherwise yields null. */
function sumIfBoth(left: number | null, right: number | null) {
  return left !== null && right !== null ? left + right : null;
}
/**
 * Derives the headline filing metrics from parsed taxonomy facts.
 *
 * Each metric takes the value of the best fact for its priority list, or null
 * when no candidate concept is present. Debt uses a directly reported debt
 * concept when one exists; otherwise it is the sum of the current and
 * non-current components, and null unless both components are found.
 */
export function deriveTaxonomyMetrics(facts: TaxonomyFact[]): NonNullable<Filing['metrics']> {
  const priorities = METRIC_LOCAL_NAME_PRIORITY;
  const valueFor = (localNames: readonly string[]) => pickBestFact(facts, localNames)?.value ?? null;

  const directDebt = valueFor(priorities.debtDirect);

  return {
    revenue: valueFor(priorities.revenue),
    netIncome: valueFor(priorities.netIncome),
    totalAssets: valueFor(priorities.totalAssets),
    cash: valueFor(priorities.cash),
    debt: directDebt ?? sumIfBoth(valueFor(priorities.debtCurrent), valueFor(priorities.debtNonCurrent))
  };
}

View File

@@ -0,0 +1,49 @@
import { describe, expect, it } from 'bun:test';
import { __pdfValidationInternals } from '@/lib/server/taxonomy/pdf-validation';
describe('pdf metric validation internals', () => {
  const { parseValidationPayload, diffStatus } = __pdfValidationInternals;

  it('parses fenced json payloads and rejects invalid payloads', () => {
    // A markdown-fenced JSON reply.
    const fencedReply = [
      '```json',
      '{"revenue":{"value":1000,"pages":[3]},"cash":{"value":200,"pages":["4"]}}',
      '```'
    ].join('\n');

    const parsed = parseValidationPayload(fencedReply);

    expect(parsed).not.toBeNull();
    expect(parsed?.revenue?.value).toBe(1000);
    expect(parsed?.cash?.pages).toEqual(['4']);
    expect(parseValidationPayload('not-json')).toBeNull();
  });

  it('compares taxonomy vs llm values with fixed tolerance rules', () => {
    // [taxonomy value, llm value, expected comparison result]
    const cases: Array<[number | null, number | null, Record<string, unknown>]> = [
      [1000, 1004, { status: 'matched', absoluteDiff: 4, relativeDiff: 0.004 }],
      [1000, 1007, { status: 'mismatch', absoluteDiff: 7, relativeDiff: 0.007 }],
      [0.5, 1.2, { status: 'matched', absoluteDiff: 0.7, relativeDiff: 0.7 }],
      [null, 1, { status: 'mismatch', absoluteDiff: null, relativeDiff: null }],
      [null, null, { status: 'not_run', absoluteDiff: null, relativeDiff: null }]
    ];

    for (const [taxonomyValue, llmValue, expected] of cases) {
      expect(diffStatus(taxonomyValue, llmValue)).toEqual(expected);
    }
  });
});

View File

@@ -0,0 +1,336 @@
import { execFile } from 'node:child_process';
import { mkdtemp, rm, writeFile } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import { promisify } from 'node:util';
import type { Filing, MetricValidationResult } from '@/lib/types';
import { runAiAnalysis } from '@/lib/server/ai';
import type { TaxonomyAsset, TaxonomyMetricValidationCheck } from '@/lib/server/taxonomy/types';
// Promise-returning wrapper around child_process.execFile (used to run pdftotext).
const execFileAsync = promisify(execFile);
// Metrics that are validated; one validation check is produced per key, in this order.
const METRIC_KEYS: Array<keyof NonNullable<Filing['metrics']>> = [
'revenue',
'netIncome',
'totalAssets',
'cash',
'debt'
];
/**
 * Pulls the most likely JSON text out of a free-form model reply.
 *
 * Prefers the body of the first markdown code fence (with or without a
 * `json` tag). Without a fence, falls back to the span from the first `{` to
 * the last `}`. Returns null when neither is found.
 */
function extractJsonCandidate(raw: string) {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/i.exec(raw);
  const fencedBody = fenced?.[1];
  if (fencedBody !== undefined && fencedBody !== null) {
    return fencedBody;
  }

  const open = raw.indexOf('{');
  const close = raw.lastIndexOf('}');
  if (open < 0 || close <= open) {
    return null;
  }
  return raw.slice(open, close + 1);
}
/**
 * Parses the model reply into a per-metric payload map.
 *
 * @param raw Raw model output, optionally wrapped in a markdown code fence.
 * @returns The decoded object keyed by metric name, or null when no JSON
 *   candidate is found, the candidate is not valid JSON, or the decoded value
 *   is not a plain JSON object (arrays and primitives are rejected so they are
 *   not mistaken for a payload in which every metric is missing).
 */
function parseValidationPayload(raw: string) {
  const candidate = extractJsonCandidate(raw);
  if (!candidate) {
    return null;
  }

  let decoded: unknown;
  try {
    decoded = JSON.parse(candidate);
  } catch {
    return null;
  }

  // JSON.parse happily returns arrays, numbers, strings, booleans and null.
  if (typeof decoded !== 'object' || decoded === null || Array.isArray(decoded)) {
    return null;
  }

  // Entry shapes are still untrusted; callers coerce `value` and `pages` defensively.
  return decoded as Record<string, {
    value?: number | string | null;
    pages?: Array<number | string>;
  }>;
}
/**
 * Coerces an untrusted value to a finite number.
 *
 * Numbers pass through when finite. Strings have commas and whitespace
 * removed before conversion. An empty or whitespace-only string yields null:
 * `Number('')` is 0, which would otherwise turn a blank answer into a
 * reported value of zero.
 *
 * @returns The finite number, or null when the input cannot be interpreted.
 */
function asNumber(value: unknown) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }

  if (typeof value === 'string') {
    const compact = value.replace(/[,\s]/g, '');
    if (compact === '') {
      return null;
    }
    const parsed = Number(compact);
    return Number.isFinite(parsed) ? parsed : null;
  }

  return null;
}
/**
 * Normalizes an untrusted page list to positive integers.
 *
 * Accepts numbers and numeric strings, truncates toward zero, and drops
 * anything that is not finite or not greater than zero. Non-array input
 * yields an empty list.
 */
function asPageNumbers(raw: unknown): number[] {
  if (!Array.isArray(raw)) {
    return [];
  }

  const pages: number[] = [];
  for (const entry of raw) {
    let page = Number.NaN;
    if (typeof entry === 'number') {
      if (Number.isFinite(entry)) {
        page = Math.trunc(entry);
      }
    } else if (typeof entry === 'string') {
      const parsed = Number(entry);
      if (Number.isFinite(parsed)) {
        page = Math.trunc(parsed);
      }
    }

    if (Number.isFinite(page) && page > 0) {
      pages.push(page);
    }
  }
  return pages;
}
/**
 * Compares a taxonomy-derived value with the value the LLM extracted.
 *
 * - both missing: `not_run`;
 * - exactly one missing: `mismatch`;
 * - otherwise `matched` when the absolute difference is within
 *   max(1, 0.5% of |taxonomyValue|), else `mismatch`.
 *
 * `relativeDiff` divides by max(|taxonomyValue|, 1).
 */
function diffStatus(taxonomyValue: number | null, llmValue: number | null) {
  if (taxonomyValue === null || llmValue === null) {
    const bothMissing = taxonomyValue === null && llmValue === null;
    return {
      status: bothMissing ? ('not_run' as const) : ('mismatch' as const),
      absoluteDiff: null,
      relativeDiff: null
    };
  }

  const magnitude = Math.abs(taxonomyValue);
  const absoluteDiff = Math.abs(taxonomyValue - llmValue);
  const relativeDiff = absoluteDiff / Math.max(magnitude, 1);
  const withinTolerance = absoluteDiff <= Math.max(1, magnitude * 0.005);

  return {
    status: withinTolerance ? ('matched' as const) : ('mismatch' as const),
    absoluteDiff,
    relativeDiff
  };
}
/**
 * Downloads a PDF and returns its text as produced by the `pdftotext` CLI.
 *
 * The download is written to a fresh temporary directory, which is removed
 * again whether or not extraction succeeds.
 *
 * @param url Location of the PDF.
 * @param fetchImpl Fetch implementation used for the download.
 * @returns The trimmed text, or null when `pdftotext` produced no text.
 * @throws Error when the HTTP response is not ok, or when neither the
 *   content-type nor the URL suffix indicates a PDF. Failures from
 *   `pdftotext` or the filesystem propagate unchanged.
 */
async function extractPdfText(url: string, fetchImpl: typeof fetch) {
  const response = await fetchImpl(url, {
    headers: {
      Accept: 'application/pdf, */*;q=0.8'
    },
    cache: 'no-store'
  });
  if (!response.ok) {
    throw new Error(`PDF request failed (${response.status})`);
  }

  // Accept the asset when either the header or the file extension says PDF.
  const contentType = response.headers.get('content-type') ?? '';
  const looksLikePdf = /pdf/i.test(contentType) || /\.pdf$/i.test(url);
  if (!looksLikePdf) {
    throw new Error(`Asset is not a PDF (${contentType || 'unknown content-type'})`);
  }

  const pdfBytes = new Uint8Array(await response.arrayBuffer());
  const workDir = await mkdtemp(join(tmpdir(), 'fiscal-pdf-'));
  try {
    const pdfPath = join(workDir, 'source.pdf');
    await writeFile(pdfPath, pdfBytes);

    // A trailing "-" makes pdftotext write the text to stdout.
    const result = await execFileAsync('pdftotext', ['-layout', '-enc', 'UTF-8', pdfPath, '-'], {
      maxBuffer: 16 * 1024 * 1024
    });
    const text = result.stdout.trim();
    return text.length > 0 ? text : null;
  } finally {
    await rm(workDir, { recursive: true, force: true });
  }
}
/**
 * Builds the extraction prompt sent to the model: instructions, the taxonomy
 * baseline metrics serialized as JSON (`{}` when absent), and at most the
 * first 80,000 characters of the PDF text. Sections are separated by blank
 * lines.
 */
function validationPrompt(metrics: Filing['metrics'], pdfText: string) {
  const baseline = JSON.stringify(metrics ?? {});
  const sections = [
    'Extract numeric financial metrics from the provided financial statement PDF text.',
    `Taxonomy baseline metrics: ${baseline}`,
    'Return ONLY JSON with keys revenue, netIncome, totalAssets, cash, debt.',
    'Each key must map to: {"value": number|null, "pages": [number]}.',
    'Use null when a metric is not found.',
    'PDF text follows:',
    pdfText.slice(0, 80_000)
  ];
  return sections.join('\n\n');
}
/** Trims a provider/model label; missing or blank input becomes null. */
function providerModelOrNull(value: string | undefined | null) {
  const trimmed = (value ?? '').trim();
  return trimmed === '' ? null : trimmed;
}
/** Returns the error's message, or `fallback` for non-Error values and empty messages. */
function validationFailureMessage(error: unknown, fallback: string) {
  return error instanceof Error && error.message ? error.message : fallback;
}

/** Maps snake_case validation checks onto the camelCase shape used inside `validation_result`. */
function toResultChecks(checks: TaxonomyMetricValidationCheck[]): MetricValidationResult['checks'] {
  return checks.map((check) => ({
    metricKey: check.metric_key,
    taxonomyValue: check.taxonomy_value,
    llmValue: check.llm_value,
    absoluteDiff: check.absolute_diff,
    relativeDiff: check.relative_diff,
    status: check.status,
    evidencePages: check.evidence_pages,
    pdfUrl: check.pdf_url,
    provider: check.provider,
    model: check.model,
    error: check.error
  }));
}

/** Builds one `error` check per metric for a run that failed before any comparison happened. */
function failedValidationChecks(
  metrics: NonNullable<Filing['metrics']>,
  failure: { pdfUrl: string; provider: string | null; model: string | null; error: string }
): TaxonomyMetricValidationCheck[] {
  return METRIC_KEYS.map((metricKey) => ({
    metric_key: metricKey,
    taxonomy_value: metrics[metricKey] ?? null,
    llm_value: null,
    absolute_diff: null,
    relative_diff: null,
    status: 'error' as const,
    evidence_pages: [],
    pdf_url: failure.pdfUrl,
    provider: failure.provider,
    model: failure.model,
    error: failure.error
  }));
}

/**
 * Cross-checks taxonomy-derived metrics against values an LLM extracts from
 * the filing's selected PDF.
 *
 * Flow: pick the first selected PDF asset, extract its text, ask the model
 * for the five metrics, then compare each metric with `diffStatus`.
 *
 * Outcomes:
 * - no selected PDF: `not_run`, no checks, `validatedAt` null;
 * - PDF download/extraction throws: `error`, one error check per metric;
 * - PDF yields no text: `not_run`, no checks;
 * - model call throws or its reply has no usable JSON object: `error`, one
 *   error check per metric (provider/model are recorded when known);
 * - otherwise the overall status is `error` if any check errored, else
 *   `mismatch` if any check mismatched, else `matched` if at least one metric
 *   was actually compared, else `not_run` (neither side had a value for any
 *   metric, so nothing was verified).
 *
 * @param input.metrics Taxonomy-derived metrics; null/undefined is treated as all-null.
 * @param input.assets Discovered filing assets; only the first selected PDF is used.
 * @param input.fetchImpl Optional fetch implementation (defaults to global `fetch`).
 * @returns The aggregate result plus the per-metric checks in snake_case form.
 *   Expected failures are reported through statuses rather than thrown.
 */
export async function validateMetricsWithPdfLlm(input: {
  metrics: Filing['metrics'];
  assets: TaxonomyAsset[];
  fetchImpl?: typeof fetch;
}): Promise<{
  validation_result: MetricValidationResult | null;
  metric_validations: TaxonomyMetricValidationCheck[];
}> {
  const taxonomyMetrics = input.metrics ?? {
    revenue: null,
    netIncome: null,
    totalAssets: null,
    cash: null,
    debt: null
  };

  const selectedPdf = input.assets.find((asset) => asset.asset_type === 'pdf' && asset.is_selected);
  if (!selectedPdf) {
    return {
      validation_result: {
        status: 'not_run',
        checks: [],
        validatedAt: null
      },
      metric_validations: []
    };
  }

  // Shared shape for every outcome that got as far as attempting validation.
  const finish = (status: MetricValidationResult['status'], checks: TaxonomyMetricValidationCheck[]) => ({
    validation_result: {
      status,
      checks: toResultChecks(checks),
      validatedAt: new Date().toISOString()
    },
    metric_validations: checks
  });

  let pdfText: string | null = null;
  try {
    pdfText = await extractPdfText(selectedPdf.url, input.fetchImpl ?? fetch);
  } catch (error) {
    return finish('error', failedValidationChecks(taxonomyMetrics, {
      pdfUrl: selectedPdf.url,
      provider: null,
      model: null,
      error: validationFailureMessage(error, 'PDF extraction failed')
    }));
  }

  if (!pdfText) {
    return finish('not_run', []);
  }

  let parsed: Record<string, { value?: number | string | null; pages?: Array<number | string> }> | null = null;
  let provider: string | null = null;
  let model: string | null = null;
  let modelError: string | null = null;
  try {
    const aiResult = await runAiAnalysis(validationPrompt(taxonomyMetrics, pdfText), undefined, {
      workload: 'extraction'
    });
    provider = providerModelOrNull(aiResult.provider);
    model = providerModelOrNull(aiResult.model);
    parsed = parseValidationPayload(aiResult.text);
    if (!parsed) {
      modelError = 'LLM response did not contain valid JSON payload';
    }
  } catch (error) {
    modelError = validationFailureMessage(error, 'LLM validation failed');
  }

  if (!parsed) {
    return finish('error', failedValidationChecks(taxonomyMetrics, {
      pdfUrl: selectedPdf.url,
      provider,
      model,
      error: modelError ?? 'LLM validation failed'
    }));
  }

  // Bind the narrowed payload to a const so the callback below sees a non-null value.
  const payload = parsed;
  const validations: TaxonomyMetricValidationCheck[] = METRIC_KEYS.map((metricKey) => {
    const taxonomyValue = taxonomyMetrics[metricKey] ?? null;
    const entry = payload[metricKey as string] ?? {};
    const llmValue = asNumber(entry.value);
    const diff = diffStatus(taxonomyValue, llmValue);

    return {
      metric_key: metricKey,
      taxonomy_value: taxonomyValue,
      llm_value: llmValue,
      absolute_diff: diff.absoluteDiff,
      relative_diff: diff.relativeDiff,
      status: diff.status,
      evidence_pages: asPageNumbers(entry.pages),
      pdf_url: selectedPdf.url,
      provider,
      model,
      error: null
    };
  });

  const statuses = validations.map((entry) => entry.status);
  const overallStatus = statuses.includes('error')
    ? 'error'
    : statuses.includes('mismatch')
      ? 'mismatch'
      : statuses.includes('matched')
        ? 'matched'
        : 'not_run';

  return finish(overallStatus, validations);
}
/**
 * Exposes module-private helpers so unit tests can exercise them directly.
 * The double-underscore prefix suggests this is not meant as general-purpose API.
 */
export const __pdfValidationInternals = {
parseValidationPayload,
diffStatus
};

View File

@@ -0,0 +1,136 @@
import type { Filing, FinancialStatementKind, MetricValidationResult, TaxonomyStatementRow } from '@/lib/types';
import type {
FilingTaxonomyAssetType,
FilingTaxonomyParseStatus,
FilingTaxonomyPeriod,
FilingTaxonomySource
} from '@/lib/server/repos/filing-taxonomy';
/** A file belonging to a filing, classified by type and flagged when chosen for processing. */
export type TaxonomyAsset = {
asset_type: FilingTaxonomyAssetType;
name: string;
url: string;
// NOTE(review): presumably the size as reported by the filing directory listing; null when unknown — confirm.
size_bytes: number | null;
// NOTE(review): looks like a ranking score used to choose candidates; direction/scale not visible here — confirm.
score: number | null;
// True for assets picked for downstream use (e.g. the PDF used for metric validation).
is_selected: boolean;
};
/** Maps an XML namespace prefix (e.g. `us-gaap`) to its namespace URI. */
export type TaxonomyNamespaceMap = Record<string, string>;
/**
 * An XBRL context: the reporting period and any explicit dimension members
 * that qualify the facts referencing it. Date fields hold the element text as
 * written in the instance, or null when the element is absent.
 */
export type TaxonomyContext = {
id: string;
periodStart: string | null;
periodEnd: string | null;
periodInstant: string | null;
// One entry per explicitMember: `axis` is the dimension attribute, `member` the element text.
dimensions: Array<{ axis: string; member: string }>;
};
/** An XBRL unit declaration. */
export type TaxonomyUnit = {
id: string;
// Measure text; multiple measures are joined with "/" by the parser; null when none is declared.
measure: string | null;
};
/** A numeric fact from an XBRL instance, with its context's period and dimensions copied in. */
export type TaxonomyFact = {
// `${namespaceUri}#${localName}`.
conceptKey: string;
// `${prefix}:${localName}` as written in the instance.
qname: string;
namespaceUri: string;
localName: string;
contextId: string;
// Resolved unit measure when the unitRef is declared, otherwise the raw unitRef; null when the fact has no unitRef.
unit: string | null;
// Raw `decimals` attribute text, if present.
decimals: string | null;
value: number;
periodStart: string | null;
periodEnd: string | null;
periodInstant: string | null;
dimensions: Array<{ axis: string; member: string }>;
// True when the fact's context carries no dimensions.
isDimensionless: boolean;
// Name of the file the fact was parsed from, as supplied by the caller.
sourceFile: string | null;
};
/**
 * A concept's placement within a presentation role.
 * NOTE(review): presumably produced from the presentation linkbase — confirm against the producer.
 */
export type TaxonomyPresentationConcept = {
conceptKey: string;
qname: string;
// Role the placement belongs to; statement rows are bucketed by classifying this URI.
roleUri: string;
// Position and nesting level; the minimum across a concept's nodes is used when ordering statement rows.
order: number;
depth: number;
parentConceptKey: string | null;
isAbstract: boolean;
};
/**
 * Per-concept summary record (snake_case).
 * NOTE(review): the naming suggests a persistence row shape — confirm against the repo layer.
 */
export type TaxonomyConcept = {
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
label: string | null;
// True when the concept's namespace is not a us-gaap namespace.
is_extension: boolean;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
// Presentation placement; null for concepts that only appear as facts outside any statement row.
presentation_order: number | null;
presentation_depth: number | null;
parent_concept_key: string | null;
is_abstract: boolean;
};
/** Result of comparing one taxonomy-derived metric with the value extracted from a PDF by an LLM. */
export type TaxonomyMetricValidationCheck = {
metric_key: keyof NonNullable<Filing['metrics']>;
taxonomy_value: number | null;
llm_value: number | null;
// Both diffs are null when either value is missing.
absolute_diff: number | null;
relative_diff: number | null;
status: 'not_run' | 'matched' | 'mismatch' | 'error';
// Positive page numbers the model cited as evidence.
evidence_pages: number[];
pdf_url: string | null;
provider: string | null;
model: string | null;
// Failure message when status is 'error'; otherwise null.
error: string | null;
};
/**
 * Identifies the filing to hydrate with taxonomy data.
 * NOTE(review): consumer not visible in this chunk; field semantics are inferred from names — confirm.
 */
export type TaxonomyHydrationInput = {
filingId: number;
ticker: string;
cik: string;
accessionNumber: string;
filingDate: string;
// Only annual (10-K) and quarterly (10-Q) filings are representable.
filingType: '10-K' | '10-Q';
filingUrl: string | null;
primaryDocument: string | null;
};
/**
 * Everything produced for one filing: parse outcome, statement rows, derived
 * metrics, validation results, and the underlying assets, concepts and facts.
 * NOTE(review): producer not visible in this chunk — confirm how parse_status/source are set.
 */
export type TaxonomyHydrationResult = {
filing_id: number;
ticker: string;
filing_date: string;
filing_type: '10-K' | '10-Q';
parse_status: FilingTaxonomyParseStatus;
parse_error: string | null;
source: FilingTaxonomySource;
periods: FilingTaxonomyPeriod[];
// Rows grouped by financial statement kind.
statement_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
derived_metrics: Filing['metrics'];
validation_result: MetricValidationResult | null;
facts_count: number;
concepts_count: number;
dimensions_count: number;
assets: TaxonomyAsset[];
concepts: TaxonomyConcept[];
// Snake_case projection of the parsed facts (same information as TaxonomyFact plus statement/role).
facts: Array<{
concept_key: string;
qname: string;
namespace_uri: string;
local_name: string;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
context_id: string;
unit: string | null;
decimals: string | null;
value_num: number;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
dimensions: Array<{ axis: string; member: string }>;
is_dimensionless: boolean;
source_file: string | null;
}>;
metric_validations: TaxonomyMetricValidationCheck[];
};

View File

@@ -0,0 +1,60 @@
import { describe, expect, it } from 'bun:test';
import { parseXbrlInstance } from '@/lib/server/taxonomy/xbrl-parser';
// Minimal XBRL instance fixture: a duration context (c1), an instant context
// with one explicit dimension member (c2), one USD unit, two numeric facts
// using thousands separators, and one non-numeric dei fact.
const SAMPLE_XBRL = `
<xbrli:xbrl xmlns:xbrli="http://www.xbrl.org/2003/instance"
xmlns:xbrldi="http://xbrl.org/2006/xbrldi"
xmlns:us-gaap="http://fasb.org/us-gaap/2024"
xmlns:dei="http://xbrl.sec.gov/dei/2024">
<xbrli:context id="c1">
<xbrli:period>
<xbrli:startDate>2025-01-01</xbrli:startDate>
<xbrli:endDate>2025-12-31</xbrli:endDate>
</xbrli:period>
</xbrli:context>
<xbrli:context id="c2">
<xbrli:entity>
<xbrli:segment>
<xbrldi:explicitMember dimension="us-gaap:StatementBusinessSegmentsAxis">us-gaap:ConsolidatedGroupMember</xbrldi:explicitMember>
</xbrli:segment>
</xbrli:entity>
<xbrli:period>
<xbrli:instant>2025-12-31</xbrli:instant>
</xbrli:period>
</xbrli:context>
<xbrli:unit id="u1">
<xbrli:measure>iso4217:USD</xbrli:measure>
</xbrli:unit>
<us-gaap:Revenues contextRef="c1" unitRef="u1" decimals="-6">1,234</us-gaap:Revenues>
<us-gaap:Assets contextRef="c2" unitRef="u1" decimals="-6">5,678</us-gaap:Assets>
<dei:EntityRegistrantName contextRef="c1">Acme Corp</dei:EntityRegistrantName>
</xbrli:xbrl>
`;
describe('xbrl instance parser', () => {
  it('parses contexts, units, numeric facts, dimensions, and concept keys', () => {
    const { contexts, units, facts } = parseXbrlInstance(SAMPLE_XBRL, 'abc_htm.xml');
    const factNamed = (localName: string) => facts.find((fact) => fact.localName === localName);

    // Contexts and units.
    expect(contexts.c1?.periodStart).toBe('2025-01-01');
    expect(contexts.c1?.periodEnd).toBe('2025-12-31');
    expect(contexts.c2?.periodInstant).toBe('2025-12-31');
    expect(contexts.c2?.dimensions.length).toBe(1);
    expect(units.u1?.measure).toBe('iso4217:USD');

    // Only the two numeric facts survive; the registrant name is dropped.
    expect(facts.length).toBe(2);

    const revenueFact = factNamed('Revenues');
    expect(revenueFact?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Revenues');
    expect(revenueFact?.isDimensionless).toBe(true);
    expect(revenueFact?.value).toBe(1234);
    expect(revenueFact?.sourceFile).toBe('abc_htm.xml');

    const assetsFact = factNamed('Assets');
    expect(assetsFact?.conceptKey).toBe('http://fasb.org/us-gaap/2024#Assets');
    expect(assetsFact?.isDimensionless).toBe(false);
    expect(assetsFact?.dimensions[0]).toEqual({
      axis: 'us-gaap:StatementBusinessSegmentsAxis',
      member: 'us-gaap:ConsolidatedGroupMember'
    });
  });
});

View File

@@ -0,0 +1,264 @@
import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyContext, TaxonomyFact, TaxonomyNamespaceMap, TaxonomyUnit } from '@/lib/server/taxonomy/types';
/**
 * Decodes XML character and entity references in a single pass.
 *
 * Handles the five predefined XML entities (`amp`, `lt`, `gt`, `quot`,
 * `apos`), `&nbsp;`, and decimal/hexadecimal character references. Decoding
 * in one pass matters: replacing `&amp;` first and then re-scanning would turn
 * the escaped text `&amp;lt;` into `<` instead of the literal `&lt;`.
 *
 * `&#160;` and `&nbsp;` become a regular space. Character references that are
 * not valid code points also become a space. Unknown named entities are left
 * untouched.
 *
 * @param value Text that may contain entity references.
 * @returns The decoded text.
 */
function decodeXmlEntities(value: string) {
  // A Map avoids accidental hits on Object.prototype keys such as "constructor".
  const namedEntities = new Map<string, string>([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' ']
  ]);

  const fromCodePoint = (codePoint: number) => {
    if (!Number.isFinite(codePoint)) {
      return ' ';
    }
    try {
      return String.fromCodePoint(codePoint);
    } catch {
      // String.fromCodePoint throws a RangeError above U+10FFFF.
      return ' ';
    }
  };

  return value.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (match: string, body: string) => {
    if (!body.startsWith('#')) {
      return namedEntities.get(body.toLowerCase()) ?? match;
    }
    if (body[1] === 'x' || body[1] === 'X') {
      return fromCodePoint(Number.parseInt(body.slice(2), 16));
    }
    // The exact reference &#160; is normalized to a plain space.
    if (body === '#160') {
      return ' ';
    }
    return fromCodePoint(Number.parseInt(body.slice(1), 10));
  });
}
/**
 * Parses the text content of a fact into a number.
 *
 * Markup, thousands separators, `$` and whitespace are stripped. A value
 * wrapped in parentheses is treated as negative, and U+2212 (minus sign) is
 * accepted as `-`. The remaining text must be a complete decimal literal:
 * partially numeric text such as `2025-12-31` or `10-K` is rejected rather
 * than being read as its leading digits, so non-numeric facts are not turned
 * into numbers.
 *
 * @param value Raw fact text.
 * @returns The parsed number, or null when the text is empty, a dash
 *   placeholder (`--`), or not a plain number.
 */
function parseNumber(value: string) {
  const text = value.replace(/<[^>]+>/g, ' ').trim();
  if (!text || /^--+$/.test(text)) {
    return null;
  }

  const compact = text
    .replace(/[,$\s]/g, '')
    .replace(/\u2212/g, '-');
  // Detect parentheses after stripping "$" and markup so "$(1,234)" is also negative.
  const negative = compact.startsWith('(') && compact.endsWith(')');
  const digits = compact.replace(/[()]/g, '');

  // Require the whole string to be numeric; parseFloat alone accepts trailing garbage.
  if (!/^[+-]?(?:\d+\.?\d*|\.\d+)(?:e[+-]?\d+)?$/i.test(digits)) {
    return null;
  }

  const parsed = Number(digits);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return negative ? -Math.abs(parsed) : parsed;
}
/**
 * Collects `xmlns:prefix="uri"` declarations into a prefix → URI map.
 *
 * Declarations are read from the first tag whose text contains "xbrl"
 * (case-insensitive); when no such tag exists, the first 1200 characters of
 * the document are scanned instead. Default (unprefixed) namespaces are not
 * captured.
 */
function parseNamespaceMapFromDocument(raw: string): TaxonomyNamespaceMap {
  const rootTag = /<[^>]*xbrl[^>]*>/i.exec(raw);
  const searchArea = rootTag ? rootTag[0] : raw.slice(0, 1200);

  const namespaces: TaxonomyNamespaceMap = {};
  const declarations = searchArea.matchAll(/xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']/g);
  for (const [, rawPrefix = '', rawUri = ''] of declarations) {
    const prefix = rawPrefix.trim();
    const uri = rawUri.trim();
    if (prefix && uri) {
      namespaces[prefix] = uri;
    }
  }
  return namespaces;
}
/**
 * Extracts every `<context id="…">` element into a map keyed by context id.
 *
 * For each context the first startDate / endDate / instant element text is
 * captured (trimmed; null when the element is absent), together with all
 * explicitMember dimensions as `{ axis, member }` pairs. Element prefixes are
 * optional and matching is case-insensitive. A later context with the same id
 * overwrites an earlier one.
 */
function parseContexts(raw: string): Record<string, TaxonomyContext> {
  const contextPattern = /<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?context>/gi;
  const dimPattern = /<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>([^<]+)<\/(?:[a-z0-9_\-]+:)?explicitMember>/gi;
  const firstText = (block: string, pattern: RegExp) => block.match(pattern)?.[1]?.trim() ?? null;

  const contexts: Record<string, TaxonomyContext> = {};
  for (const [, rawId = '', block = ''] of raw.matchAll(contextPattern)) {
    const id = rawId.trim();
    if (!id) {
      continue;
    }

    const dimensions = [...block.matchAll(dimPattern)]
      .map(([, rawAxis = '', rawMember = '']) => ({
        axis: decodeXmlEntities(rawAxis.trim()),
        member: decodeXmlEntities(rawMember.trim())
      }))
      .filter(({ axis, member }) => axis !== '' && member !== '');

    contexts[id] = {
      id,
      periodStart: firstText(block, /<(?:[a-z0-9_\-]+:)?startDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?startDate>/i),
      periodEnd: firstText(block, /<(?:[a-z0-9_\-]+:)?endDate>([^<]+)<\/(?:[a-z0-9_\-]+:)?endDate>/i),
      periodInstant: firstText(block, /<(?:[a-z0-9_\-]+:)?instant>([^<]+)<\/(?:[a-z0-9_\-]+:)?instant>/i),
      dimensions
    };
  }
  return contexts;
}
/**
 * Extracts every `<unit id="…">` element into a map keyed by unit id.
 *
 * The unit's measure is the decoded text of its `<measure>` elements: a
 * single measure is used as is, several are joined with "/", and a unit
 * without any measure gets null.
 */
function parseUnits(raw: string): Record<string, TaxonomyUnit> {
  const unitPattern = /<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>([\s\S]*?)<\/(?:[a-z0-9_\-]+:)?unit>/gi;

  const units: Record<string, TaxonomyUnit> = {};
  for (const [, rawId = '', block = ''] of raw.matchAll(unitPattern)) {
    const id = rawId.trim();
    if (!id) {
      continue;
    }

    const measures: string[] = [];
    for (const [, rawMeasure = ''] of block.matchAll(/<(?:[a-z0-9_\-]+:)?measure>([^<]+)<\/(?:[a-z0-9_\-]+:)?measure>/gi)) {
      const decoded = decodeXmlEntities(rawMeasure.trim());
      if (decoded) {
        measures.push(decoded);
      }
    }

    units[id] = {
      id,
      measure: measures.length === 0 ? null : measures.join('/')
    };
  }
  return units;
}
/**
 * Guesses which financial statement a concept belongs to from keywords in its
 * local name (case-insensitive). Rules are tried in order and the first match
 * wins, so e.g. any name containing "cash" is classified as cash flow before
 * the balance-sheet keywords are considered. Returns null when nothing matches.
 */
function classifyStatementKind(localName: string): FinancialStatementKind | null {
  const rules: Array<[RegExp, FinancialStatementKind]> = [
    [/cash|operatingactivities|investingactivities|financingactivities/, 'cash_flow'],
    [/equity|retainedearnings|additionalpaidincapital/, 'equity'],
    [/comprehensiveincome/, 'comprehensive_income'],
    [/asset|liabilit|debt/, 'balance'],
    [/revenue|income|profit|expense|costof/, 'income']
  ];

  const normalized = localName.toLowerCase();
  const firstHit = rules.find(([pattern]) => pattern.test(normalized));
  return firstHit ? firstHit[1] : null;
}
/**
 * True for the prefixes treated as XBRL plumbing rather than reportable
 * concepts (xbrli, xlink, link, xbrldi, xbrldt), compared case-insensitively.
 */
function isXbrlInfrastructurePrefix(prefix: string) {
  const infrastructurePrefixes = ['xbrli', 'xlink', 'link', 'xbrldi', 'xbrldt'];
  return infrastructurePrefixes.includes(prefix.toLowerCase());
}
/** Builds the concept key for a concept: namespace URI and local name joined by "#". */
function localNameToKey(namespaceUri: string, localName: string) {
  return [namespaceUri, localName].join('#');
}
/**
 * Parses an XBRL instance document into namespaces, contexts, units and
 * numeric facts using regular expressions (no XML parser involved).
 *
 * A fact is any prefixed element that carries a `contextRef` attribute, is
 * not in an XBRL infrastructure prefix, and whose text parses as a number;
 * everything else is skipped. Each fact is enriched with its context's period
 * and dimensions, and with the resolved unit measure (falling back to the raw
 * `unitRef` when the unit is undeclared or has no measure). Prefixes without
 * a namespace declaration map to `urn:unknown:<prefix>`.
 *
 * @param raw Full text of the instance document.
 * @param sourceFile Name recorded on every fact as its origin.
 */
export function parseXbrlInstance(
  raw: string,
  sourceFile: string | null
): {
  namespaces: TaxonomyNamespaceMap;
  contexts: Record<string, TaxonomyContext>;
  units: Record<string, TaxonomyUnit>;
  facts: TaxonomyFact[];
} {
  const namespaces = parseNamespaceMapFromDocument(raw);
  const contexts = parseContexts(raw);
  const units = parseUnits(raw);

  const readAttribute = (attrs: string, pattern: RegExp) => attrs.match(pattern)?.[1]?.trim();

  const facts: TaxonomyFact[] = [];
  const factPattern = /<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>([\s\S]*?)<\/\1:\2>/g;
  for (const [, rawPrefix = '', rawLocalName = '', attrs = '', rawBody = ''] of raw.matchAll(factPattern)) {
    const prefix = rawPrefix.trim();
    const localName = rawLocalName.trim();
    if (!prefix || !localName || isXbrlInfrastructurePrefix(prefix)) {
      continue;
    }

    const contextId = readAttribute(attrs, /\bcontextRef=["']([^"']+)["']/i) ?? '';
    if (!contextId) {
      continue;
    }

    // Non-numeric facts (names, dates, flags) are dropped here.
    const value = parseNumber(decodeXmlEntities(rawBody.trim()));
    if (value === null) {
      continue;
    }

    const unitRef = readAttribute(attrs, /\bunitRef=["']([^"']+)["']/i) ?? null;
    const decimals = readAttribute(attrs, /\bdecimals=["']([^"']+)["']/i) ?? null;
    const resolvedMeasure = unitRef ? units[unitRef]?.measure : null;

    const namespaceUri = namespaces[prefix] ?? `urn:unknown:${prefix}`;
    const context = contexts[contextId];
    const dimensions = context?.dimensions ?? [];

    facts.push({
      conceptKey: localNameToKey(namespaceUri, localName),
      qname: `${prefix}:${localName}`,
      namespaceUri,
      localName,
      contextId,
      unit: resolvedMeasure ? resolvedMeasure : unitRef,
      decimals,
      value,
      periodStart: context?.periodStart ?? null,
      periodEnd: context?.periodEnd ?? null,
      periodInstant: context?.periodInstant ?? null,
      dimensions,
      isDimensionless: dimensions.length === 0,
      sourceFile
    });
  }

  return { namespaces, contexts, units, facts };
}
/**
 * Exported entry point for the keyword-based statement classifier: returns the
 * statement kind guessed from the concept's local name, or null when no
 * keyword rule matches.
 */
export function conceptStatementFallback(localName: string) {
return classifyStatementKind(localName);
}