Integrate crabrl parser into taxonomy hydration

This commit is contained in:
2026-03-16 15:18:01 -04:00
parent cf084793ed
commit a58b07456e
23 changed files with 4696 additions and 2466 deletions

View File

@@ -1,37 +1,43 @@
import { beforeEach, describe, expect, it, mock } from 'bun:test';
import { beforeEach, describe, expect, it, mock } from "bun:test";
import type { FinancialStatementKind } from '@/lib/types';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
import type { FinancialStatementKind } from "@/lib/types";
import type {
TaxonomyHydrationInput,
TaxonomyHydrationResult,
} from "@/lib/server/taxonomy/types";
function createStatementRecord<T>(factory: () => T): Record<FinancialStatementKind, T> {
function createStatementRecord<T>(
factory: () => T,
): Record<FinancialStatementKind, T> {
return {
income: factory(),
balance: factory(),
cash_flow: factory(),
equity: factory(),
comprehensive_income: factory()
comprehensive_income: factory(),
};
}
function createHydrationResult(): TaxonomyHydrationResult {
return {
filing_id: 1,
ticker: 'TEST',
filing_date: '2025-12-31',
filing_type: '10-K',
parse_status: 'ready',
ticker: "TEST",
filing_date: "2025-12-31",
filing_type: "10-K",
parse_status: "ready",
parse_error: null,
source: 'xbrl_instance_with_linkbase',
parser_engine: 'fiscal-xbrl',
parser_version: '0.1.0',
taxonomy_regime: 'us-gaap',
fiscal_pack: 'core',
source: "xbrl_instance_with_linkbase",
parser_engine: "fiscal-xbrl",
parser_version: "0.1.0",
taxonomy_regime: "us-gaap",
fiscal_pack: "core",
periods: [],
faithful_rows: createStatementRecord(() => []),
statement_rows: createStatementRecord(() => []),
surface_rows: createStatementRecord(() => []),
detail_rows: createStatementRecord(() => ({})),
kpi_rows: [],
computed_definitions: [],
contexts: [],
derived_metrics: null,
validation_result: null,
@@ -48,42 +54,44 @@ function createHydrationResult(): TaxonomyHydrationResult {
kpi_row_count: 0,
unmapped_row_count: 0,
material_unmapped_row_count: 0,
warnings: ['rust_warning']
warnings: ["rust_warning"],
},
xbrl_validation: {
status: 'passed'
}
status: "passed",
},
};
}
const mockHydrateFromSidecar = mock(async () => createHydrationResult());
mock.module('@/lib/server/taxonomy/parser-client', () => ({
hydrateFilingTaxonomySnapshotFromSidecar: mockHydrateFromSidecar
mock.module("@/lib/server/taxonomy/parser-client", () => ({
hydrateFilingTaxonomySnapshotFromSidecar: mockHydrateFromSidecar,
}));
describe('taxonomy engine rust path', () => {
describe("taxonomy engine rust path", () => {
beforeEach(() => {
mockHydrateFromSidecar.mockClear();
});
it('returns sidecar output directly from the Rust sidecar', async () => {
const { hydrateFilingTaxonomySnapshot } = await import('@/lib/server/taxonomy/engine');
it("returns sidecar output directly from the Rust sidecar", async () => {
const { hydrateFilingTaxonomySnapshot } =
await import("@/lib/server/taxonomy/engine");
const input: TaxonomyHydrationInput = {
filingId: 1,
ticker: 'TEST',
cik: '0000000001',
accessionNumber: '0000000001-25-000001',
filingDate: '2025-12-31',
filingType: '10-K',
filingUrl: 'https://www.sec.gov/Archives/edgar/data/1/000000000125000001/',
primaryDocument: 'test-20251231.htm'
ticker: "TEST",
cik: "0000000001",
accessionNumber: "0000000001-25-000001",
filingDate: "2025-12-31",
filingType: "10-K",
filingUrl:
"https://www.sec.gov/Archives/edgar/data/1/000000000125000001/",
primaryDocument: "test-20251231.htm",
};
const result = await hydrateFilingTaxonomySnapshot(input);
expect(mockHydrateFromSidecar).toHaveBeenCalledTimes(1);
expect(result.parser_engine).toBe('fiscal-xbrl');
expect(result.normalization_summary.warnings).toEqual(['rust_warning']);
expect(result.parser_engine).toBe("fiscal-xbrl");
expect(result.normalization_summary.warnings).toEqual(["rust_warning"]);
});
});

View File

@@ -0,0 +1,286 @@
import { beforeEach, describe, expect, it, mock } from "bun:test";
import type {
TaxonomyHydrationInput,
TaxonomyHydrationResult,
} from "@/lib/server/taxonomy/types";
import { __parserClientInternals } from "@/lib/server/taxonomy/parser-client";
/**
 * Wraps `text` in a one-shot byte stream: the UTF-8 encoding of the string is
 * enqueued as a single chunk and the stream is closed right after.
 */
function streamFromText(text: string) {
  const payload = new TextEncoder().encode(text);
  return new ReadableStream<Uint8Array>({
    start: (sink) => {
      sink.enqueue(payload);
      sink.close();
    },
  });
}
/**
 * Builds a fresh, empty-but-complete hydration result fixture for a "ready"
 * AAPL 10-Q parse. Every call returns new objects and arrays, so tests can
 * mutate the result without affecting each other.
 */
function sampleHydrationResult(): TaxonomyHydrationResult {
  // One independent value per financial statement kind, in a fixed key order.
  const perStatement = <T>(make: () => T) => ({
    income: make(),
    balance: make(),
    cash_flow: make(),
    equity: make(),
    comprehensive_income: make(),
  });

  const normalizationSummary = {
    surface_row_count: 0,
    detail_row_count: 0,
    kpi_row_count: 0,
    unmapped_row_count: 0,
    material_unmapped_row_count: 0,
    warnings: [],
  };

  const fixture: TaxonomyHydrationResult = {
    filing_id: 1,
    ticker: "AAPL",
    filing_date: "2026-01-30",
    filing_type: "10-Q",
    parse_status: "ready",
    parse_error: null,
    source: "xbrl_instance",
    parser_engine: "fiscal-xbrl",
    parser_version: "0.1.0",
    taxonomy_regime: "us-gaap",
    fiscal_pack: "core",
    periods: [],
    faithful_rows: perStatement(() => []),
    statement_rows: perStatement(() => []),
    surface_rows: perStatement(() => []),
    detail_rows: perStatement(() => ({})),
    kpi_rows: [],
    computed_definitions: [],
    contexts: [],
    derived_metrics: null,
    validation_result: null,
    facts_count: 0,
    concepts_count: 0,
    dimensions_count: 0,
    assets: [],
    concepts: [],
    facts: [],
    metric_validations: [],
    normalization_summary: normalizationSummary,
    xbrl_validation: {
      status: "passed",
    },
  };

  return fixture;
}
/**
 * Returns a fresh hydration request fixture describing an AAPL 10-Q filing;
 * its ticker, filing date and filing type match `sampleHydrationResult`.
 */
function sampleInput(): TaxonomyHydrationInput {
  const request: TaxonomyHydrationInput = {
    filingId: 1,
    ticker: "AAPL",
    cik: "0000320193",
    accessionNumber: "0000320193-26-000001",
    filingDate: "2026-01-30",
    filingType: "10-Q",
    filingUrl:
      "https://www.sec.gov/Archives/edgar/data/320193/000032019326000001/",
    primaryDocument: "a10q.htm",
  };
  return request;
}
/**
 * `setTimeout` stand-in that simply forwards to the real global timer, for
 * tests that inject a timer but do not want it to fire early.
 */
const passThroughTimeout = function (callback: TimerHandler, delayMs?: number) {
  return globalThis.setTimeout(callback, delayMs);
} as unknown as typeof globalThis.setTimeout;
/**
 * `setTimeout` stand-in that runs a function handler synchronously, ignoring
 * any delay, and returns the placeholder handle `1`. Non-function handlers
 * are not executed.
 */
const immediateTimeout = function (handler: TimerHandler) {
  const placeholderHandle = 1 as unknown as ReturnType<
    typeof globalThis.setTimeout
  >;
  if (typeof handler !== "function") {
    return placeholderHandle;
  }
  handler();
  return placeholderHandle;
} as unknown as typeof globalThis.setTimeout;
/**
 * Unit tests for the sidecar parser client internals. Every test injects a
 * stubbed `spawn` (and timers) so no real binary is executed.
 */
describe("parser client", () => {
  type FakeSidecarOptions = {
    stdout: string;
    stderr: string;
    exitCode: number;
    stdin?: { write: () => void; end: () => void };
    kill?: () => void;
  };

  // Builds the child-process-shaped object a stubbed `spawn` hands back.
  const fakeSidecar = (options: FakeSidecarOptions) => ({
    stdin: options.stdin ?? { write: () => {}, end: () => {} },
    stdout: streamFromText(options.stdout),
    stderr: streamFromText(options.stderr),
    exited: Promise.resolve(options.exitCode),
    kill: options.kill ?? mock(() => {}),
  });

  // Default dependency bundle: binary "exists", real timers, caller's spawn.
  const depsWithSpawn = (spawn: unknown) => ({
    existsSync: () => true,
    spawn: spawn as never,
    setTimeout: passThroughTimeout,
    clearTimeout,
  });

  beforeEach(() => {
    // Start each test without binary-path or timeout overrides in the env.
    delete process.env.FISCAL_XBRL_BIN;
    delete process.env.XBRL_ENGINE_TIMEOUT_MS;
  });

  it("throws when the sidecar binary cannot be resolved", () => {
    const resolveWithNothingOnDisk = () =>
      __parserClientInternals.resolveFiscalXbrlBinary({
        existsSync: () => false,
      });

    expect(resolveWithNothingOnDisk).toThrow(
      /Rust XBRL sidecar binary is required/,
    );
  });

  it("returns parsed sidecar JSON on success", async () => {
    const stdinWrite = mock(() => {});
    const stdinEnd = mock(() => {});
    const spawn = mock(() =>
      fakeSidecar({
        stdin: { write: stdinWrite, end: stdinEnd },
        stdout: JSON.stringify(sampleHydrationResult()),
        stderr: "",
        exitCode: 0,
      }),
    );

    const result = await __parserClientInternals.hydrateFromSidecarImpl(
      sampleInput(),
      depsWithSpawn(spawn),
    );

    expect(result.parser_engine).toBe("fiscal-xbrl");
    expect(stdinWrite).toHaveBeenCalledTimes(1);
    expect(stdinEnd).toHaveBeenCalledTimes(1);
  });

  it("throws when the sidecar exits non-zero", async () => {
    const spawn = mock(() =>
      fakeSidecar({ stdout: "", stderr: "fatal parse error", exitCode: 3 }),
    );

    const pending = __parserClientInternals.hydrateFromSidecarImpl(
      sampleInput(),
      depsWithSpawn(spawn),
    );

    await expect(pending).rejects.toThrow(/exit code 3/);
  });

  it("throws on invalid JSON stdout", async () => {
    const spawn = mock(() =>
      fakeSidecar({ stdout: "{not json", stderr: "", exitCode: 0 }),
    );

    const pending = __parserClientInternals.hydrateFromSidecarImpl(
      sampleInput(),
      depsWithSpawn(spawn),
    );

    await expect(pending).rejects.toThrow();
  });

  it("kills the sidecar when the timeout fires", async () => {
    const kill = mock(() => {});
    const spawn = mock(() =>
      fakeSidecar({ stdout: "", stderr: "killed", exitCode: 137, kill }),
    );

    // The injected timer runs its callback synchronously, so the kill
    // callback executes as soon as the timeout is registered.
    const pending = __parserClientInternals.hydrateFromSidecarImpl(
      sampleInput(),
      {
        ...depsWithSpawn(spawn),
        setTimeout: immediateTimeout,
        clearTimeout: () => {},
      },
    );

    await expect(pending).rejects.toThrow(/exit code 137/);
    expect(kill).toHaveBeenCalledTimes(1);
  });

  it("retries retryable sidecar failures but not invalid requests", async () => {
    let attempts = 0;

    // First two spawns exit with code 1; the third one succeeds.
    const spawn = mock(() => {
      attempts += 1;
      if (attempts < 3) {
        return fakeSidecar({
          stdout: "",
          stderr: "process killed",
          exitCode: 1,
        });
      }
      return fakeSidecar({
        stdout: JSON.stringify(sampleHydrationResult()),
        stderr: "",
        exitCode: 0,
      });
    });

    const result =
      await __parserClientInternals.hydrateFilingTaxonomySnapshotFromSidecarWithDeps(
        sampleInput(),
        depsWithSpawn(spawn),
      );

    expect(result.parser_version).toBe("0.1.0");
    expect(attempts).toBe(3);

    attempts = 0;

    // An "invalid request" failure must surface after a single attempt.
    const invalidRequestSpawn = mock(() => {
      attempts += 1;
      return fakeSidecar({
        stdout: "",
        stderr: "invalid request: bad command",
        exitCode: 6,
      });
    });

    const rejected =
      __parserClientInternals.hydrateFilingTaxonomySnapshotFromSidecarWithDeps(
        sampleInput(),
        depsWithSpawn(invalidRequestSpawn),
      );

    await expect(rejected).rejects.toThrow(/invalid request/);
    expect(attempts).toBe(1);
  });
});

View File

@@ -1,36 +1,89 @@
import { existsSync } from 'node:fs';
import { join } from 'node:path';
import type { TaxonomyHydrationInput, TaxonomyHydrationResult } from '@/lib/server/taxonomy/types';
import { withRetry } from '@/lib/server/utils/retry';
import { existsSync } from "node:fs";
import { join } from "node:path";
import type {
TaxonomyHydrationInput,
TaxonomyHydrationResult,
} from "@/lib/server/taxonomy/types";
import { withRetry } from "@/lib/server/utils/retry";
/**
 * Structural view of the child process handle that the result of
 * `deps.spawn(...)` is cast to in this module: a writable stdin, byte streams
 * for stdout and stderr, a promise for the exit code, and a `kill` function.
 */
type SpawnedSidecar = {
stdin: { write: (chunk: Uint8Array) => void; end: () => void };
stdout: ReadableStream<Uint8Array>;
stderr: ReadableStream<Uint8Array>;
exited: Promise<number>;
kill: () => void;
};
/**
 * Injectable dependencies of the sidecar client. The public entry point passes
 * the real `existsSync`, `Bun.spawn` and global timer functions; callers of
 * the `...WithDeps` variants may substitute their own implementations.
 */
type SidecarDeps = {
existsSync: typeof existsSync;
spawn: typeof Bun.spawn;
setTimeout: typeof globalThis.setTimeout;
clearTimeout: typeof globalThis.clearTimeout;
};
function candidateBinaryPaths() {
return [
process.env.FISCAL_XBRL_BIN?.trim(),
join(process.cwd(), 'bin', 'fiscal-xbrl'),
join(process.cwd(), 'rust', 'target', 'release', 'fiscal-xbrl'),
join(process.cwd(), 'rust', 'target', 'debug', 'fiscal-xbrl')
].filter((value): value is string => typeof value === 'string' && value.length > 0);
join(process.cwd(), "bin", "fiscal-xbrl"),
join(process.cwd(), "rust", "target", "release", "fiscal-xbrl"),
join(process.cwd(), "rust", "target", "debug", "fiscal-xbrl"),
].filter(
(value): value is string => typeof value === "string" && value.length > 0,
);
}
export function resolveFiscalXbrlBinary() {
const resolved = candidateBinaryPaths().find((path) => existsSync(path));
return resolveFiscalXbrlBinaryWithDeps({
existsSync,
});
}
function resolveFiscalXbrlBinaryWithDeps(
deps: Pick<SidecarDeps, "existsSync">,
) {
const resolved = candidateBinaryPaths().find((path) => deps.existsSync(path));
if (!resolved) {
throw new Error('Rust XBRL sidecar binary is required but was not found. Set FISCAL_XBRL_BIN or build `fiscal-xbrl` under rust/target.');
throw new Error(
"Rust XBRL sidecar binary is required but was not found. Set FISCAL_XBRL_BIN or build `fiscal-xbrl` under rust/target.",
);
}
return resolved;
}
export async function hydrateFilingTaxonomySnapshotFromSidecar(
input: TaxonomyHydrationInput
input: TaxonomyHydrationInput,
): Promise<TaxonomyHydrationResult> {
return withRetry(() => hydrateFromSidecarImpl(input));
return hydrateFilingTaxonomySnapshotFromSidecarWithDeps(input, {
existsSync,
spawn: Bun.spawn,
setTimeout: globalThis.setTimeout,
clearTimeout: globalThis.clearTimeout,
});
}
async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<TaxonomyHydrationResult> {
const binary = resolveFiscalXbrlBinary();
const timeoutMs = Math.max(Number(process.env.XBRL_ENGINE_TIMEOUT_MS ?? 45_000), 1_000);
const command = [binary, 'hydrate-filing'];
/**
 * Dependency-injected core of `hydrateFilingTaxonomySnapshotFromSidecar`:
 * a single attempt is one `hydrateFromSidecarImpl` call using the supplied
 * `deps`, and the attempts are driven by `withRetry`.
 */
async function hydrateFilingTaxonomySnapshotFromSidecarWithDeps(
  input: TaxonomyHydrationInput,
  deps: SidecarDeps,
): Promise<TaxonomyHydrationResult> {
  const runOnce = () => hydrateFromSidecarImpl(input, deps);
  return withRetry(runOnce);
}
async function hydrateFromSidecarImpl(
input: TaxonomyHydrationInput,
deps: SidecarDeps = {
existsSync,
spawn: Bun.spawn,
setTimeout: globalThis.setTimeout,
clearTimeout: globalThis.clearTimeout,
},
): Promise<TaxonomyHydrationResult> {
const binary = resolveFiscalXbrlBinaryWithDeps(deps);
const timeoutMs = Math.max(
Number(process.env.XBRL_ENGINE_TIMEOUT_MS ?? 45_000),
1_000,
);
const command = [binary, "hydrate-filing"];
const requestBody = JSON.stringify({
filingId: input.filingId,
ticker: input.ticker,
@@ -40,22 +93,24 @@ async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<Ta
filingType: input.filingType,
filingUrl: input.filingUrl,
primaryDocument: input.primaryDocument,
cacheDir: process.env.FISCAL_XBRL_CACHE_DIR ?? join(process.cwd(), '.cache', 'xbrl')
cacheDir:
process.env.FISCAL_XBRL_CACHE_DIR ??
join(process.cwd(), ".cache", "xbrl"),
});
const child = Bun.spawn(command, {
stdin: 'pipe',
stdout: 'pipe',
stderr: 'pipe',
const child = deps.spawn(command, {
stdin: "pipe",
stdout: "pipe",
stderr: "pipe",
env: {
...process.env
}
});
...process.env,
},
}) as SpawnedSidecar;
child.stdin.write(new TextEncoder().encode(requestBody));
child.stdin.end();
const timeout = setTimeout(() => {
const timeout = deps.setTimeout(() => {
child.kill();
}, timeoutMs);
@@ -63,7 +118,7 @@ async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<Ta
const [stdout, stderr, exitCode] = await Promise.all([
new Response(child.stdout).text(),
new Response(child.stderr).text(),
child.exited
child.exited,
]);
if (stderr.trim().length > 0) {
@@ -71,11 +126,20 @@ async function hydrateFromSidecarImpl(input: TaxonomyHydrationInput): Promise<Ta
}
if (exitCode !== 0) {
throw new Error(`Rust XBRL sidecar failed with exit code ${exitCode}: ${stderr.trim() || stdout.trim() || 'no error output'}`);
throw new Error(
`Rust XBRL sidecar failed with exit code ${exitCode}: ${stderr.trim() || stdout.trim() || "no error output"}`,
);
}
return JSON.parse(stdout) as TaxonomyHydrationResult;
} finally {
clearTimeout(timeout);
deps.clearTimeout(timeout);
}
}
/**
 * Internal helpers exported so they can be exercised directly with injected
 * dependencies.
 *
 * Note: the `resolveFiscalXbrlBinary` key here points at the
 * dependency-injected `resolveFiscalXbrlBinaryWithDeps`, not at the public
 * zero-argument `resolveFiscalXbrlBinary` export.
 */
export const __parserClientInternals = {
candidateBinaryPaths,
hydrateFilingTaxonomySnapshotFromSidecarWithDeps,
hydrateFromSidecarImpl,
resolveFiscalXbrlBinary: resolveFiscalXbrlBinaryWithDeps,
};

View File

@@ -1,14 +1,15 @@
import type {
Filing,
FinancialStatementKind,
MetricValidationResult
} from '@/lib/types';
MetricValidationResult,
} from "@/lib/types";
import type { ComputedDefinition } from "@/lib/generated";
import type {
FilingTaxonomyAssetType,
FilingTaxonomyParseStatus,
FilingTaxonomyPeriod,
FilingTaxonomySource
} from '@/lib/server/repos/filing-taxonomy';
FilingTaxonomySource,
} from "@/lib/server/repos/filing-taxonomy";
export type TaxonomyAsset = {
asset_type: FilingTaxonomyAssetType;
@@ -99,12 +100,12 @@ export type TaxonomyConcept = {
};
export type TaxonomyMetricValidationCheck = {
metric_key: keyof NonNullable<Filing['metrics']>;
metric_key: keyof NonNullable<Filing["metrics"]>;
taxonomy_value: number | null;
llm_value: number | null;
absolute_diff: number | null;
relative_diff: number | null;
status: 'not_run' | 'matched' | 'mismatch' | 'error';
status: "not_run" | "matched" | "mismatch" | "error";
evidence_pages: number[];
pdf_url: string | null;
provider: string | null;
@@ -119,7 +120,7 @@ export type TaxonomyHydrationPeriod = {
filing_date: string;
period_start: string | null;
period_end: string | null;
filing_type: '10-K' | '10-Q';
filing_type: "10-K" | "10-Q";
period_label: string;
};
@@ -148,7 +149,7 @@ export type TaxonomyHydrationSurfaceRow = {
category: string;
template_section?: string;
order: number;
unit: 'currency' | 'count' | 'shares' | 'percent' | 'ratio';
unit: "currency" | "count" | "shares" | "percent" | "ratio";
values: Record<string, number | null>;
source_concepts: string[];
source_row_keys: string[];
@@ -156,10 +157,14 @@ export type TaxonomyHydrationSurfaceRow = {
formula_key: string | null;
has_dimensions: boolean;
resolved_source_row_keys: Record<string, string | null>;
statement?: 'income' | 'balance' | 'cash_flow';
statement?: "income" | "balance" | "cash_flow";
detail_count?: number;
resolution_method?: 'direct' | 'surface_bridge' | 'formula_derived' | 'not_meaningful';
confidence?: 'high' | 'medium' | 'low';
resolution_method?:
| "direct"
| "surface_bridge"
| "formula_derived"
| "not_meaningful";
confidence?: "high" | "medium" | "low";
warning_codes?: string[];
};
@@ -183,7 +188,7 @@ export type TaxonomyHydrationStructuredKpiRow = {
key: string;
label: string;
category: string;
unit: 'currency' | 'count' | 'shares' | 'percent' | 'ratio';
unit: "currency" | "count" | "shares" | "percent" | "ratio";
order: number;
segment: string | null;
axis: string | null;
@@ -191,7 +196,7 @@ export type TaxonomyHydrationStructuredKpiRow = {
values: Record<string, number | null>;
source_concepts: string[];
source_fact_ids: number[];
provenance_type: 'taxonomy' | 'structured_note';
provenance_type: "taxonomy" | "structured_note";
has_dimensions: boolean;
};
@@ -205,7 +210,7 @@ export type TaxonomyHydrationNormalizationSummary = {
};
export type XbrlValidationResult = {
status: 'passed' | 'warning' | 'error';
status: "passed" | "warning" | "error";
message?: string;
};
@@ -215,7 +220,7 @@ export type TaxonomyHydrationInput = {
cik: string;
accessionNumber: string;
filingDate: string;
filingType: '10-K' | '10-Q';
filingType: "10-K" | "10-Q";
filingUrl: string | null;
primaryDocument: string | null;
};
@@ -224,20 +229,30 @@ export type TaxonomyHydrationResult = {
filing_id: number;
ticker: string;
filing_date: string;
filing_type: '10-K' | '10-Q';
filing_type: "10-K" | "10-Q";
parse_status: FilingTaxonomyParseStatus;
parse_error: string | null;
source: FilingTaxonomySource;
parser_engine: string;
parser_version: string;
taxonomy_regime: 'us-gaap' | 'ifrs-full' | 'unknown';
taxonomy_regime: "us-gaap" | "ifrs-full" | "unknown";
fiscal_pack: string | null;
periods: TaxonomyHydrationPeriod[];
faithful_rows: Record<FinancialStatementKind, TaxonomyHydrationStatementRow[]>;
statement_rows: Record<FinancialStatementKind, TaxonomyHydrationStatementRow[]>;
faithful_rows: Record<
FinancialStatementKind,
TaxonomyHydrationStatementRow[]
>;
statement_rows: Record<
FinancialStatementKind,
TaxonomyHydrationStatementRow[]
>;
surface_rows: Record<FinancialStatementKind, TaxonomyHydrationSurfaceRow[]>;
detail_rows: Record<FinancialStatementKind, Record<string, TaxonomyHydrationDetailRow[]>>;
detail_rows: Record<
FinancialStatementKind,
Record<string, TaxonomyHydrationDetailRow[]>
>;
kpi_rows: TaxonomyHydrationStructuredKpiRow[];
computed_definitions: ComputedDefinition[];
contexts: Array<{
context_id: string;
entity_identifier: string | null;
@@ -248,7 +263,7 @@ export type TaxonomyHydrationResult = {
segment_json: Record<string, unknown> | null;
scenario_json: Record<string, unknown> | null;
}>;
derived_metrics: Filing['metrics'];
derived_metrics: Filing["metrics"];
validation_result: MetricValidationResult | null;
facts_count: number;
concepts_count: number;