feat(taxonomy): add rust sidecar compact surface pipeline

This commit is contained in:
2026-03-12 15:23:10 -04:00
parent f2c25fb9c6
commit 58061af006
84 changed files with 19350 additions and 265 deletions

View File

@@ -0,0 +1,53 @@
import type { FinancialStatementKind } from '@/lib/types';
/**
 * Maps a presentation role URI onto a financial statement kind.
 *
 * The URI is lower-cased and tested against an ordered list of patterns; the
 * first pattern that matches decides the result, so earlier rules take
 * precedence over later ones (e.g. a cash-flow match beats an equity match).
 *
 * @param roleUri Role URI (or role name) to classify.
 * @returns The statement kind of the first matching rule, or `null` when no rule matches.
 */
export function classifyStatementRole(roleUri: string): FinancialStatementKind | null {
  const haystack = roleUri.toLowerCase();

  // Order matters: rules are evaluated top to bottom and the first hit wins.
  const rules: Array<[RegExp, FinancialStatementKind]> = [
    [/cash\s*flow|statementsof?cashflows|netcash/, 'cash_flow'],
    [/shareholders?|stockholders?|equity|retainedearnings/, 'equity'],
    [/comprehensive\s*income/, 'comprehensive_income'],
    [/balance\s*sheet|financial\s*position|assets?andliabilities/, 'balance'],
    [/operations|income\s*statement|statementsofincome|profit/, 'income']
  ];

  for (const [pattern, kind] of rules) {
    if (pattern.test(haystack)) {
      return kind;
    }
  }

  return null;
}
/**
 * Guesses a statement kind from a concept's local name alone.
 *
 * The name is lower-cased and matched against an ordered list of keyword
 * patterns; the first match wins, so e.g. any name containing "cash" is
 * classified as cash flow before the balance-sheet keywords are consulted.
 *
 * @param localName Concept local name (the part after the namespace prefix).
 * @returns The statement kind of the first matching rule, or `null` when nothing matches.
 */
export function conceptStatementFallback(localName: string): FinancialStatementKind | null {
  const needle = localName.toLowerCase();

  // Evaluated in order; keep the sequence intact because the patterns overlap.
  const keywordRules: Array<{ pattern: RegExp; kind: FinancialStatementKind }> = [
    { pattern: /cash|operatingactivities|investingactivities|financingactivities/, kind: 'cash_flow' },
    { pattern: /equity|retainedearnings|additionalpaidincapital/, kind: 'equity' },
    { pattern: /comprehensiveincome/, kind: 'comprehensive_income' },
    { pattern: /asset|liabilit|debt/, kind: 'balance' },
    { pattern: /revenue|income|profit|expense|costof/, kind: 'income' }
  ];

  const hit = keywordRules.find((rule) => rule.pattern.test(needle));
  return hit ? hit.kind : null;
}

View File

@@ -1,185 +1,8 @@
import type { FinancialStatementKind } from '@/lib/types';
import { discoverFilingAssets } from '@/lib/server/taxonomy/asset-discovery';
import { parseLabelLinkbase, parsePresentationLinkbase } from '@/lib/server/taxonomy/linkbase-parser';
import { deriveTaxonomyMetrics } from '@/lib/server/taxonomy/metrics';
import { materializeTaxonomyStatements } from '@/lib/server/taxonomy/materialize';
import { validateMetricsWithPdfLlm } from '@/lib/server/taxonomy/pdf-validation';
import { hydrateFilingTaxonomySnapshotFromSidecar } from '@/lib/server/taxonomy/parser-client';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
import { parseXbrlInstance } from '@/lib/server/taxonomy/xbrl-parser';
/**
 * Builds a record with one entry per financial statement kind.
 *
 * The factory is invoked once per key (in the order income, balance,
 * cash_flow, equity, comprehensive_income) so every entry gets its own value
 * rather than sharing a single instance.
 *
 * @param factory Produces the value stored under each statement kind.
 * @returns A record keyed by statement kind.
 */
function createStatementRecord<T>(factory: () => T): Record<FinancialStatementKind, T> {
  const income = factory();
  const balance = factory();
  const cashFlow = factory();
  const equity = factory();
  const comprehensiveIncome = factory();

  return {
    income,
    balance,
    cash_flow: cashFlow,
    equity,
    comprehensive_income: comprehensiveIncome
  };
}
/**
 * Returns the User-Agent string for outbound requests.
 *
 * Uses the `SEC_USER_AGENT` environment variable when it is set to a
 * non-empty value; otherwise falls back to the built-in default.
 */
function envUserAgent() {
  const configured = process.env.SEC_USER_AGENT;
  if (configured) {
    return configured;
  }
  return 'Fiscal Clone <support@fiscal.local>';
}
/**
 * Fetches a URL and returns the response body as text.
 *
 * The request carries the configured User-Agent, an Accept header that
 * prefers XML/plain text, and `cache: 'no-store'`.
 *
 * @param url Absolute URL to fetch.
 * @param fetchImpl Fetch implementation to use (injectable for tests).
 * @returns The response body decoded as text.
 * @throws Error when the response status is not OK.
 */
async function fetchText(url: string, fetchImpl: typeof fetch) {
  const headers = {
    'User-Agent': envUserAgent(),
    Accept: 'text/xml, text/plain, text/html;q=0.8, */*;q=0.5'
  };

  const response = await fetchImpl(url, { headers, cache: 'no-store' });

  if (response.ok) {
    return await response.text();
  }

  throw new Error(`SEC request failed (${response.status})`);
}
/**
 * Hydrates a taxonomy snapshot for a single filing.
 *
 * NOTE(review): this span looks like a rendered diff in which the removed
 * in-process implementation and its replacement are interleaved — the `input`
 * parameter appears twice and there are two trailing `return` statements. The
 * final statement delegates to `hydrateFilingTaxonomySnapshotFromSidecar(input)`;
 * confirm against the real file which lines actually survive.
 *
 * The in-process body shown here: discovers the filing's assets, fetches and
 * parses the XBRL instance, merges selected presentation/label linkbases,
 * materializes statements, derives metrics, runs the PDF/LLM validation and
 * assembles a `TaxonomyHydrationResult`.
 */
export async function hydrateFilingTaxonomySnapshot(
input: TaxonomyHydrationInput,
options?: {
fetchImpl?: typeof fetch;
}
input: TaxonomyHydrationInput
): Promise<TaxonomyHydrationResult> {
// Fall back to the global fetch when the caller does not inject one.
const fetchImpl = options?.fetchImpl ?? fetch;
const discovered = await discoverFilingAssets({
cik: input.cik,
accessionNumber: input.accessionNumber,
filingUrl: input.filingUrl,
primaryDocument: input.primaryDocument,
fetchImpl
});
// Baseline "failed" result, returned as-is when no instance asset exists and
// reused (with a different parse_error) when the instance cannot be fetched.
const emptyResult: TaxonomyHydrationResult = {
filing_id: input.filingId,
ticker: input.ticker.trim().toUpperCase(),
filing_date: input.filingDate,
filing_type: input.filingType,
parse_status: 'failed',
parse_error: 'No XBRL instance found',
source: 'legacy_html_fallback',
periods: [],
statement_rows: createStatementRecord(() => []),
derived_metrics: null,
validation_result: {
status: 'not_run',
checks: [],
validatedAt: null
},
facts_count: 0,
concepts_count: 0,
dimensions_count: 0,
assets: discovered.assets,
concepts: [],
facts: [],
metric_validations: []
};
// Prefer the instance asset flagged as selected; otherwise take the first
// instance asset; otherwise there is nothing to parse.
const selectedInstance = discovered.assets.find((asset) => asset.asset_type === 'instance' && asset.is_selected)
?? discovered.assets.find((asset) => asset.asset_type === 'instance')
?? null;
if (!selectedInstance) {
return emptyResult;
}
let parseError: string | null = null;
let source: TaxonomyHydrationResult['source'] = 'xbrl_instance';
let instanceText = '';
try {
instanceText = await fetchText(selectedInstance.url, fetchImpl);
} catch (error) {
// Failing to fetch the instance is fatal: return the baseline result with
// the fetch error as parse_error.
parseError = error instanceof Error ? error.message : 'Unable to fetch instance file';
return {
...emptyResult,
parse_error: parseError
};
}
const parsedInstance = parseXbrlInstance(instanceText, selectedInstance.name);
const labelByConcept = new Map<string, string>();
const presentation: ReturnType<typeof parsePresentationLinkbase> = [];
// Only selected presentation and label linkbases are fetched; they are
// fetched one at a time, in asset order.
for (const asset of discovered.assets) {
if (!asset.is_selected) {
continue;
}
if (asset.asset_type !== 'presentation' && asset.asset_type !== 'label') {
continue;
}
try {
const content = await fetchText(asset.url, fetchImpl);
if (asset.asset_type === 'presentation') {
const parsed = parsePresentationLinkbase(content);
// The source is upgraded only when a presentation linkbase yields entries.
if (parsed.length > 0) {
source = 'xbrl_instance_with_linkbase';
}
presentation.push(...parsed);
} else if (asset.asset_type === 'label') {
const parsed = parseLabelLinkbase(content);
// First label seen for a concept wins; later linkbases do not overwrite it.
for (const [conceptKey, label] of parsed.entries()) {
if (!labelByConcept.has(conceptKey)) {
labelByConcept.set(conceptKey, label);
}
}
}
} catch (error) {
// Linkbase failures are non-fatal: keep the first error message and move on.
parseError = parseError ?? (error instanceof Error ? error.message : 'Failed to parse taxonomy linkbase');
}
}
const materialized = materializeTaxonomyStatements({
filingId: input.filingId,
accessionNumber: input.accessionNumber,
filingDate: input.filingDate,
filingType: input.filingType,
facts: parsedInstance.facts,
presentation,
labelByConcept
});
const derivedMetrics = deriveTaxonomyMetrics(parsedInstance.facts);
const llmValidation = await validateMetricsWithPdfLlm({
metrics: derivedMetrics,
assets: discovered.assets,
fetchImpl
});
// Status: 'ready' needs both statement rows and facts, 'partial' means facts
// without rows, anything else is 'failed'.
const hasRows = (Object.values(materialized.statement_rows).reduce((total, rows) => total + rows.length, 0)) > 0;
const hasFacts = materialized.facts.length > 0;
const parseStatus: TaxonomyHydrationResult['parse_status'] = hasRows && hasFacts
? 'ready'
: hasFacts
? 'partial'
: 'failed';
return {
filing_id: input.filingId,
ticker: input.ticker.trim().toUpperCase(),
filing_date: input.filingDate,
filing_type: input.filingType,
parse_status: parseStatus,
// A failed parse always carries a message; otherwise any non-fatal linkbase
// error (or null) is passed through.
parse_error: parseStatus === 'failed' ? (parseError ?? 'No XBRL facts extracted') : parseError,
source,
periods: materialized.periods,
statement_rows: materialized.statement_rows,
derived_metrics: derivedMetrics,
validation_result: llmValidation.validation_result,
facts_count: materialized.facts.length,
concepts_count: materialized.concepts.length,
dimensions_count: materialized.dimensionsCount,
assets: discovered.assets,
concepts: materialized.concepts,
facts: materialized.facts,
metric_validations: llmValidation.metric_validations
};
// NOTE(review): presumably the only statement of the replacement body — the
// whole hydration is delegated to the sidecar client. Unreachable as rendered
// here because of the `return` above; verify against the actual file.
return await hydrateFilingTaxonomySnapshotFromSidecar(input);
}

View File

@@ -1,8 +1,7 @@
import type { Filing, FinancialStatementKind, TaxonomyStatementRow } from '@/lib/types';
import type { TaxonomyConcept, TaxonomyFact, TaxonomyPresentationConcept } from '@/lib/server/taxonomy/types';
import type { FilingTaxonomyPeriod } from '@/lib/server/repos/filing-taxonomy';
import { classifyStatementRole } from '@/lib/server/taxonomy/linkbase-parser';
import { conceptStatementFallback } from '@/lib/server/taxonomy/xbrl-parser';
import { classifyStatementRole, conceptStatementFallback } from '@/lib/server/taxonomy/classifiers';
function compactAccessionNumber(value: string) {
return value.replace(/-/g, '');
@@ -308,8 +307,17 @@ export function materializeTaxonomyStatements(input: {
local_name: localName,
label,
is_extension: !isUsGaapNamespace(namespaceUri),
balance: null,
period_type: null,
data_type: null,
statement_kind: statement,
role_uri: orderedConcept.roleUri,
authoritative_concept_key: null,
mapping_method: null,
surface_key: null,
detail_parent_surface_key: null,
kpi_key: null,
residual_flag: false,
presentation_order: row.order,
presentation_depth: row.depth,
parent_concept_key: row.parentKey,
@@ -331,8 +339,17 @@ export function materializeTaxonomyStatements(input: {
local_name: fact.localName,
label: input.labelByConcept.get(fact.conceptKey) ?? localNameToLabel(fact.localName),
is_extension: !isUsGaapNamespace(fact.namespaceUri),
balance: null,
period_type: null,
data_type: fact.dataType,
statement_kind: fact.statement_kind,
role_uri: fact.role_uri,
authoritative_concept_key: null,
mapping_method: null,
surface_key: null,
detail_parent_surface_key: null,
kpi_key: null,
residual_flag: false,
presentation_order: null,
presentation_depth: null,
parent_concept_key: null,
@@ -346,11 +363,20 @@ export function materializeTaxonomyStatements(input: {
qname: fact.qname,
namespace_uri: fact.namespaceUri,
local_name: fact.localName,
data_type: fact.dataType,
statement_kind: fact.statement_kind,
role_uri: fact.role_uri,
authoritative_concept_key: null,
mapping_method: null,
surface_key: null,
detail_parent_surface_key: null,
kpi_key: null,
residual_flag: false,
context_id: fact.contextId,
unit: fact.unit,
decimals: fact.decimals,
precision: fact.precision,
nil: fact.nil,
value_num: fact.value,
period_start: fact.periodStart,
period_end: fact.periodEnd,

View File

@@ -8,9 +8,12 @@ function fact(localName: string, value: number, overrides?: Partial<TaxonomyFact
qname: `us-gaap:${localName}`,
namespaceUri: 'http://fasb.org/us-gaap/2024',
localName,
dataType: null,
contextId: 'c1',
unit: 'iso4217:USD',
decimals: '-6',
precision: null,
nil: false,
value,
periodStart: '2025-01-01',
periodEnd: '2025-12-31',

View File

@@ -0,0 +1,76 @@
import { existsSync } from 'node:fs';
import { join } from 'node:path';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
/**
 * Lists the locations where the `fiscal-xbrl` binary may live, in priority order:
 * the trimmed `FISCAL_XBRL_BIN` override (when non-empty), then `bin/`, then the
 * release and debug build directories under `rust/target`, all relative to the
 * current working directory.
 *
 * @returns Candidate paths; existence is not checked here.
 */
function candidateBinaryPaths() {
  const override = process.env.FISCAL_XBRL_BIN?.trim();
  const root = process.cwd();

  const candidates = [
    override,
    join(root, 'bin', 'fiscal-xbrl'),
    join(root, 'rust', 'target', 'release', 'fiscal-xbrl'),
    join(root, 'rust', 'target', 'debug', 'fiscal-xbrl')
  ];

  // Drop a missing or blank override so callers only see usable strings.
  return candidates.filter((value): value is string => typeof value === 'string' && value.length > 0);
}
/**
 * Locates the Rust XBRL sidecar binary.
 *
 * Walks the candidate paths in priority order and returns the first one that
 * exists on disk.
 *
 * @returns Path of the first existing candidate.
 * @throws Error when none of the candidate paths exists.
 */
export function resolveFiscalXbrlBinary() {
  for (const candidate of candidateBinaryPaths()) {
    if (existsSync(candidate)) {
      return candidate;
    }
  }

  throw new Error('Rust XBRL sidecar binary is required but was not found. Set FISCAL_XBRL_BIN or build `fiscal-xbrl` under rust/target.');
}
/**
 * Resolves the sidecar timeout in milliseconds from its raw env value.
 *
 * Unset values use the 45s default and numeric values are clamped to at least
 * 1s. Values that do not parse to a finite number (e.g. "45s") fall back to
 * the default: passing NaN on to `setTimeout` would fire the timer almost
 * immediately and kill the sidecar before it could do any work.
 */
function resolveSidecarTimeoutMs(raw: string | undefined): number {
  const defaultMs = 45_000;
  const minimumMs = 1_000;
  const parsed = Number(raw ?? defaultMs);
  if (!Number.isFinite(parsed)) {
    return defaultMs;
  }
  return Math.max(parsed, minimumMs);
}

/**
 * Hydrates a filing's taxonomy snapshot by running the Rust `fiscal-xbrl`
 * sidecar (`hydrate-filing` subcommand).
 *
 * The request is sent to the child as a JSON document on stdin and the result
 * is read back as JSON from stdout. Anything the child writes to stderr is
 * forwarded to `console.warn`. The child is killed if it runs longer than
 * `XBRL_ENGINE_TIMEOUT_MS` (default 45s, minimum 1s).
 *
 * @param input Filing identity and location forwarded to the sidecar.
 * @returns The sidecar's stdout parsed as a `TaxonomyHydrationResult`.
 * @throws Error when the binary cannot be found, the sidecar times out, exits
 *   with a non-zero code, or prints something that is not valid JSON.
 */
export async function hydrateFilingTaxonomySnapshotFromSidecar(
  input: TaxonomyHydrationInput
): Promise<TaxonomyHydrationResult> {
  const binary = resolveFiscalXbrlBinary();
  const timeoutMs = resolveSidecarTimeoutMs(process.env.XBRL_ENGINE_TIMEOUT_MS);
  const command = [binary, 'hydrate-filing'];
  const requestBody = JSON.stringify({
    filingId: input.filingId,
    ticker: input.ticker,
    cik: input.cik,
    accessionNumber: input.accessionNumber,
    filingDate: input.filingDate,
    filingType: input.filingType,
    filingUrl: input.filingUrl,
    primaryDocument: input.primaryDocument,
    cacheDir: process.env.FISCAL_XBRL_CACHE_DIR ?? join(process.cwd(), '.cache', 'xbrl')
  });

  const child = Bun.spawn(command, {
    stdin: 'pipe',
    stdout: 'pipe',
    stderr: 'pipe',
    env: {
      ...process.env
    }
  });

  child.stdin.write(new TextEncoder().encode(requestBody));
  child.stdin.end();

  // Remember whether we killed the child so a timeout is reported as such
  // instead of as an anonymous non-zero exit.
  let timedOut = false;
  const timeout = setTimeout(() => {
    timedOut = true;
    child.kill();
  }, timeoutMs);

  try {
    // Drain both pipes while waiting for exit so a chatty child cannot block
    // on a full pipe buffer.
    const [stdout, stderr, exitCode] = await Promise.all([
      new Response(child.stdout).text(),
      new Response(child.stderr).text(),
      child.exited
    ]);

    if (stderr.trim().length > 0) {
      console.warn(`[fiscal-xbrl] ${stderr.trim()}`);
    }

    // Only blame the timeout when the child did not manage to exit cleanly
    // just as the timer fired.
    if (timedOut && exitCode !== 0) {
      throw new Error(`Rust XBRL sidecar timed out after ${timeoutMs}ms`);
    }

    if (exitCode !== 0) {
      throw new Error(`Rust XBRL sidecar failed with exit code ${exitCode}: ${stderr.trim() || stdout.trim() || 'no error output'}`);
    }

    try {
      // NOTE(review): the payload shape is trusted as-is; no schema validation
      // is performed on the sidecar output.
      return JSON.parse(stdout) as TaxonomyHydrationResult;
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`Rust XBRL sidecar returned invalid JSON: ${reason}`);
    }
  } finally {
    clearTimeout(timeout);
  }
}

View File

@@ -1,4 +1,13 @@
import type { Filing, FinancialStatementKind, MetricValidationResult, TaxonomyStatementRow } from '@/lib/types';
import type {
Filing,
FinancialStatementKind,
MetricValidationResult,
NormalizationSummary,
StructuredKpiRow,
SurfaceDetailMap,
SurfaceFinancialRow,
TaxonomyStatementRow
} from '@/lib/types';
import type {
FilingTaxonomyAssetType,
FilingTaxonomyParseStatus,
@@ -19,10 +28,20 @@ export type TaxonomyNamespaceMap = Record<string, string>;
export type TaxonomyContext = {
id: string;
entityIdentifier: string | null;
entityScheme: string | null;
periodStart: string | null;
periodEnd: string | null;
periodInstant: string | null;
dimensions: Array<{ axis: string; member: string }>;
segment: {
explicitMembers: Array<{ axis: string; member: string }>;
typedMembers: Array<{ axis: string; value: string }>;
} | null;
scenario: {
explicitMembers: Array<{ axis: string; member: string }>;
typedMembers: Array<{ axis: string; value: string }>;
} | null;
};
export type TaxonomyUnit = {
@@ -35,9 +54,12 @@ export type TaxonomyFact = {
qname: string;
namespaceUri: string;
localName: string;
dataType: string | null;
contextId: string;
unit: string | null;
decimals: string | null;
precision: string | null;
nil: boolean;
value: number;
periodStart: string | null;
periodEnd: string | null;
@@ -64,8 +86,17 @@ export type TaxonomyConcept = {
local_name: string;
label: string | null;
is_extension: boolean;
balance: string | null;
period_type: string | null;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
presentation_order: number | null;
presentation_depth: number | null;
parent_concept_key: string | null;
@@ -105,8 +136,26 @@ export type TaxonomyHydrationResult = {
parse_status: FilingTaxonomyParseStatus;
parse_error: string | null;
source: FilingTaxonomySource;
parser_engine: string;
parser_version: string;
taxonomy_regime: 'us-gaap' | 'ifrs-full' | 'unknown';
fiscal_pack: string | null;
periods: FilingTaxonomyPeriod[];
faithful_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
statement_rows: Record<FinancialStatementKind, TaxonomyStatementRow[]>;
surface_rows: Record<FinancialStatementKind, SurfaceFinancialRow[]>;
detail_rows: Record<FinancialStatementKind, SurfaceDetailMap>;
kpi_rows: StructuredKpiRow[];
contexts: Array<{
context_id: string;
entity_identifier: string | null;
entity_scheme: string | null;
period_start: string | null;
period_end: string | null;
period_instant: string | null;
segment_json: Record<string, unknown> | null;
scenario_json: Record<string, unknown> | null;
}>;
derived_metrics: Filing['metrics'];
validation_result: MetricValidationResult | null;
facts_count: number;
@@ -119,11 +168,20 @@ export type TaxonomyHydrationResult = {
qname: string;
namespace_uri: string;
local_name: string;
data_type: string | null;
statement_kind: FinancialStatementKind | null;
role_uri: string | null;
authoritative_concept_key: string | null;
mapping_method: string | null;
surface_key: string | null;
detail_parent_surface_key: string | null;
kpi_key: string | null;
residual_flag: boolean;
context_id: string;
unit: string | null;
decimals: string | null;
precision: string | null;
nil: boolean;
value_num: number;
period_start: string | null;
period_end: string | null;
@@ -133,4 +191,5 @@ export type TaxonomyHydrationResult = {
source_file: string | null;
}>;
metric_validations: TaxonomyMetricValidationCheck[];
normalization_summary: NormalizationSummary;
};