Files
Neon-Desk/rust/fiscal-xbrl-core/src/lib.rs
francy51 24aa8e33d4 Consolidate metric definitions with Rust JSON as single source of truth
- Add core.computed.json with 32 ratio definitions (filing + market derived)
- Add Rust types for ComputedDefinition and ComputationSpec
- Create generate-taxonomy.ts to generate TypeScript from Rust JSON
- Generate lib/generated/ (gitignored) with surfaces, computed, kpis
- Update financial-metrics.ts to use generated definitions
- Add build-time generation via 'bun run generate'
- Add taxonomy architecture documentation

Two-phase ratio computation:
- Filing-derived: margins, returns, per-share, growth (Rust computes)
- Market-derived: valuation ratios (TypeScript computes with price data)

All 32 ratios defined in core.computed.json:
- Margins: gross, operating, ebitda, net, fcf
- Returns: roa, roe, roic, roce
- Financial health: debt_to_equity, net_debt_to_ebitda, cash_to_debt, current_ratio
- Per-share: revenue, fcf, book_value
- Growth: yoy metrics + 3y/5y cagr
- Valuation: market_cap, ev, p/e, p/fcf, p/b, ev/sales, ev/ebitda, ev/fcf
2026-03-15 15:22:51 -04:00

2461 lines
81 KiB
Rust

use anyhow::{anyhow, Context, Result};
use once_cell::sync::Lazy;
use regex::Regex;
use reqwest::blocking::Client;
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, HashMap, HashSet};
mod kpi_mapper;
mod metrics;
mod pack_selector;
mod surface_mapper;
mod taxonomy_loader;
mod universal_income;
use taxonomy_loader::{ComputationSpec, ComputedDefinition};
#[cfg(feature = "with-crabrl")]
use crabrl as _;
/// Engine identifier stamped into every response for provenance.
pub const PARSER_ENGINE: &str = "fiscal-xbrl";
/// Crate version, stamped into every response for provenance.
pub const PARSER_VERSION: &str = env!("CARGO_PKG_VERSION");

// Regex-based XBRL/linkbase scanning. A full XML parser is deliberately not
// used here; every tag pattern tolerates an optional namespace prefix
// (`(?:[a-z0-9_\-]+:)?`) and runs case-insensitively with dot-matches-newline
// (`(?is)`) so elements spanning multiple lines are captured.

// `<context id="...">...</context>`: captures (1) the context id, (2) the
// inner block (entity/period/segment/scenario).
static CONTEXT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?context>"#).unwrap()
});
// `<unit id="...">...</unit>`: captures (1) unit id, (2) inner block.
static UNIT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?unit>"#).unwrap()
});
// A fact element with a contextRef: captures (1) prefix, (2) local name,
// (3) raw attribute string, (4) element body. Note the closing tag is not
// required to match the opening prefix/name.
static FACT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>(.*?)</[a-zA-Z0-9_\-]+:[a-zA-Z0-9_\-.]+>"#).unwrap()
});
// `<explicitMember dimension="...">member</explicitMember>` inside a
// segment/scenario: captures (1) dimension (axis), (2) member QName.
static EXPLICIT_MEMBER_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?explicitMember>"#).unwrap()
});
// `<typedMember dimension="...">value</typedMember>`: captures (1) dimension,
// (2) typed value body.
static TYPED_MEMBER_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?typedMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?typedMember>"#).unwrap()
});
// `<identifier scheme="...">value</identifier>` within a context's entity:
// captures (1) scheme URI, (2) identifier text.
static IDENTIFIER_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?identifier\b[^>]*\bscheme=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?identifier>"#).unwrap()
});
// Segment / scenario containers: capture (1) inner block.
static SEGMENT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?segment\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?segment>"#)
        .unwrap()
});
static SCENARIO_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?scenario\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?scenario>"#)
        .unwrap()
});
// Period element bodies: capture (1) the date/datetime text.
static START_DATE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?startDate>(.*?)</(?:[a-z0-9_\-]+:)?startDate>"#).unwrap()
});
static END_DATE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?endDate>(.*?)</(?:[a-z0-9_\-]+:)?endDate>"#).unwrap()
});
static INSTANT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?instant>(.*?)</(?:[a-z0-9_\-]+:)?instant>"#).unwrap()
});
// `<measure>` inside a unit: captures (1) the measure QName.
static MEASURE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?measure>(.*?)</(?:[a-z0-9_\-]+:)?measure>"#).unwrap()
});
// Label-linkbase structure: the link container, locators, label resources,
// and the arcs tying locators to resources.
static LABEL_LINK_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?labelLink>"#)
        .unwrap()
});
// Presentation-linkbase structure: captures (1) link attributes (role), (2)
// inner block of locators and arcs.
static PRESENTATION_LINK_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>(.*?)</(?:[a-z0-9_\-]+:)?presentationLink>"#).unwrap()
});
// `<loc .../>` locator, possibly self-closing: captures (1) attributes.
static LOC_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?loc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?loc>)?"#).unwrap()
});
// `<label ...>text</label>` resource: captures (1) attributes, (2) label text.
static LABEL_RESOURCE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?label\b([^>]*)>(.*?)</(?:[a-z0-9_\-]+:)?label>"#).unwrap()
});
// Arcs, possibly self-closing: capture (1) attributes (from/to/order).
static LABEL_ARC_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?labelArc>)?"#)
        .unwrap()
});
static PRESENTATION_ARC_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?presentationArc>)?"#).unwrap()
});
// Generic key="value" attribute pair scanner used on raw attribute strings.
static ATTR_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"([a-zA-Z0-9:_\-]+)=["']([^"']+)["']"#).unwrap());
/// Input payload for [`hydrate_filing`], describing one SEC filing to fetch
/// and parse. Deserialized from camelCase JSON.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct HydrateFilingRequest {
    pub filing_id: i64,
    pub ticker: String,
    // May be zero-padded; normalized by `normalize_cik_for_path` before
    // building EDGAR URLs.
    pub cik: String,
    pub accession_number: String,
    pub filing_date: String,
    pub filing_type: String,
    // Optional direct URL to the filing document; used to derive the EDGAR
    // directory URL and as a last-resort asset when the directory listing
    // is unavailable.
    pub filing_url: Option<String>,
    // Primary document filename, used to score candidate instance files.
    pub primary_document: Option<String>,
    // NOTE(review): not read anywhere in this file — presumably consumed by a
    // caller or another module; confirm before removing.
    pub cache_dir: Option<String>,
}
/// Full result of hydrating one filing: parse status, normalized statement
/// models, raw facts/contexts, and provenance metadata.
#[derive(Debug, Serialize)]
pub struct HydrateFilingResponse {
    pub filing_id: i64,
    // Upper-cased copy of the request ticker.
    pub ticker: String,
    pub filing_date: String,
    pub filing_type: String,
    // "ready" | "partial" | "failed" (see `hydrate_filing`).
    pub parse_status: String,
    pub parse_error: Option<String>,
    // Provenance: "xbrl_instance", "xbrl_instance_with_linkbase", or
    // "legacy_html_fallback" (when no instance document was found).
    pub source: String,
    pub parser_engine: String,
    pub parser_version: String,
    // "us-gaap" | "ifrs-full" | "unknown" (see `infer_taxonomy_regime`).
    pub taxonomy_regime: String,
    pub fiscal_pack: Option<String>,
    pub periods: Vec<PeriodOutput>,
    // Currently a clone of `statement_rows` (the un-normalized view).
    pub faithful_rows: StatementRowMap,
    pub statement_rows: StatementRowMap,
    pub surface_rows: SurfaceRowMap,
    pub detail_rows: DetailRowStatementMap,
    pub kpi_rows: Vec<KpiRowOutput>,
    pub computed_definitions: Vec<ComputedDefinitionOutput>,
    pub contexts: Vec<ContextOutput>,
    pub derived_metrics: FilingMetrics,
    pub validation_result: ValidationResultOutput,
    pub facts_count: usize,
    pub concepts_count: usize,
    // Number of distinct axis::member pairs across all facts.
    pub dimensions_count: usize,
    pub assets: Vec<AssetOutput>,
    pub concepts: Vec<ConceptOutput>,
    pub facts: Vec<FactOutput>,
    pub metric_validations: Vec<MetricValidationOutput>,
    pub normalization_summary: NormalizationSummaryOutput,
}
/// Headline metrics derived from parsed facts (see `metrics::derive_metrics`).
#[derive(Debug, Clone, Serialize, Default)]
pub struct FilingMetrics {
    pub revenue: Option<f64>,
    #[serde(rename = "netIncome")]
    pub net_income: Option<f64>,
    #[serde(rename = "totalAssets")]
    pub total_assets: Option<f64>,
    pub cash: Option<f64>,
    pub debt: Option<f64>,
}
/// Validation outcome; this module only ever emits `status: "not_run"`.
#[derive(Debug, Clone, Serialize)]
pub struct ValidationResultOutput {
    pub status: String,
    // Free-form check payloads; the shape is owned by the validator, not here.
    pub checks: Vec<serde_json::Value>,
    #[serde(rename = "validatedAt")]
    pub validated_at: Option<String>,
}
/// One file discovered in the filing's EDGAR directory.
#[derive(Debug, Clone, Serialize)]
pub struct AssetOutput {
    // One of "instance", "presentation", "label", "calculation", "definition",
    // "schema", "pdf", "other" (see `classify_asset_type`).
    pub asset_type: String,
    pub name: String,
    pub url: String,
    pub size_bytes: Option<i64>,
    // Selection score; populated only for "instance" and "pdf" assets.
    pub score: Option<f64>,
    pub is_selected: bool,
}
/// A reporting-period column attached to the filing.
#[derive(Debug, Clone, Serialize)]
pub struct PeriodOutput {
    pub id: String,
    pub filing_id: i64,
    pub accession_number: String,
    pub filing_date: String,
    pub period_start: Option<String>,
    pub period_end: Option<String>,
    pub filing_type: String,
    pub period_label: String,
}
/// Serialized form of one XBRL `<context>` element.
#[derive(Debug, Clone, Serialize)]
pub struct ContextOutput {
    pub context_id: String,
    pub entity_identifier: Option<String>,
    pub entity_scheme: Option<String>,
    // Duration contexts set start/end; instant contexts set `period_instant`.
    pub period_start: Option<String>,
    pub period_end: Option<String>,
    pub period_instant: Option<String>,
    // Raw JSON renderings of the segment/scenario containers (see
    // `parse_dimension_container`).
    pub segment_json: Option<serde_json::Value>,
    pub scenario_json: Option<serde_json::Value>,
}
/// One line item of a materialized ("faithful") financial statement.
#[derive(Debug, Clone, Serialize)]
pub struct StatementRowOutput {
    pub key: String,
    pub label: String,
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    pub is_extension: bool,
    pub statement: String,
    pub role_uri: Option<String>,
    // Presentation-tree position.
    pub order: i64,
    pub depth: i64,
    pub parent_key: Option<String>,
    // Per-period values/units; map keys identify period columns.
    pub values: BTreeMap<String, Option<f64>>,
    pub units: BTreeMap<String, Option<String>>,
    pub has_dimensions: bool,
    pub source_fact_ids: Vec<i64>,
}
/// One row of the compact "surface" model built by `surface_mapper`.
#[derive(Debug, Clone, Serialize)]
pub struct SurfaceRowOutput {
    pub key: String,
    pub label: String,
    pub category: String,
    pub template_section: String,
    pub order: i64,
    pub unit: String,
    // Per-period values; map keys identify period columns.
    pub values: BTreeMap<String, Option<f64>>,
    // Provenance back to concepts / faithful rows / raw facts.
    pub source_concepts: Vec<String>,
    pub source_row_keys: Vec<String>,
    pub source_fact_ids: Vec<i64>,
    pub formula_key: Option<String>,
    pub has_dimensions: bool,
    pub resolved_source_row_keys: BTreeMap<String, Option<String>>,
    pub statement: Option<String>,
    pub detail_count: Option<i64>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub resolution_method: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub confidence: Option<String>,
    // NOTE(review): `#[serde(default)]` only affects Deserialize, which this
    // type does not derive; it is inert here.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub warning_codes: Vec<String>,
}
/// A drill-down row attached to one surface row.
#[derive(Debug, Clone, Serialize)]
pub struct DetailRowOutput {
    pub key: String,
    pub parent_surface_key: String,
    pub label: String,
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    pub unit: Option<String>,
    pub values: BTreeMap<String, Option<f64>>,
    pub source_fact_ids: Vec<i64>,
    pub is_extension: bool,
    pub dimensions_summary: Vec<String>,
    // True when this row is a computed remainder rather than a reported fact.
    pub residual_flag: bool,
}
/// One KPI row built by `kpi_mapper::build_taxonomy_kpis`.
#[derive(Debug, Clone, Serialize)]
pub struct KpiRowOutput {
    pub key: String,
    pub label: String,
    pub category: String,
    pub unit: String,
    pub order: i64,
    // Optional dimensional qualification of the KPI.
    pub segment: Option<String>,
    pub axis: Option<String>,
    pub member: Option<String>,
    pub values: BTreeMap<String, Option<f64>>,
    pub source_concepts: Vec<String>,
    pub source_fact_ids: Vec<i64>,
    pub provenance_type: String,
    pub has_dimensions: bool,
}
/// A computed-metric (ratio) definition surfaced to downstream consumers;
/// mirrors `taxonomy_loader::ComputedDefinition`.
#[derive(Debug, Clone, Serialize)]
pub struct ComputedDefinitionOutput {
    pub key: String,
    pub label: String,
    pub category: String,
    pub order: i64,
    pub unit: String,
    pub computation: ComputationSpecOutput,
    // NOTE(review): `#[serde(default)]` only affects Deserialize, which this
    // type does not derive; it is inert here.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub supported_cadences: Vec<String>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub requires_external_data: Vec<String>,
}
/// How a computed metric is evaluated. Serialized with a `"type"` tag in
/// snake_case, e.g. `{"type": "yoy_growth", "source": "..."}`; mirrors
/// `taxonomy_loader::ComputationSpec`.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ComputationSpecOutput {
    /// Quotient of two other metric keys.
    Ratio {
        numerator: String,
        denominator: String,
    },
    /// Year-over-year growth of the `source` metric.
    YoyGrowth {
        source: String,
    },
    /// Compound annual growth rate of `source` over `years`.
    Cagr {
        source: String,
        years: i64,
    },
    /// `source` divided by the share count found under `shares_key`.
    PerShare {
        source: String,
        shares_key: String,
    },
    /// Free-form formula string, evaluated elsewhere.
    Simple {
        formula: String,
    },
}
impl From<&ComputationSpec> for ComputationSpecOutput {
fn from(spec: &ComputationSpec) -> Self {
match spec {
ComputationSpec::Ratio {
numerator,
denominator,
} => ComputationSpecOutput::Ratio {
numerator: numerator.clone(),
denominator: denominator.clone(),
},
ComputationSpec::YoyGrowth { source } => ComputationSpecOutput::YoyGrowth {
source: source.clone(),
},
ComputationSpec::Cagr { source, years } => ComputationSpecOutput::Cagr {
source: source.clone(),
years: *years,
},
ComputationSpec::PerShare { source, shares_key } => ComputationSpecOutput::PerShare {
source: source.clone(),
shares_key: shares_key.clone(),
},
ComputationSpec::Simple { formula } => ComputationSpecOutput::Simple {
formula: formula.clone(),
},
}
}
}
impl From<&ComputedDefinition> for ComputedDefinitionOutput {
fn from(def: &ComputedDefinition) -> Self {
ComputedDefinitionOutput {
key: def.key.clone(),
label: def.label.clone(),
category: def.category.clone(),
order: def.order,
unit: def.unit.clone(),
computation: ComputationSpecOutput::from(&def.computation),
supported_cadences: def.supported_cadences.clone(),
requires_external_data: def.requires_external_data.clone(),
}
}
}
/// One concept (taxonomy element) observed in the filing, plus the mapping
/// assignments applied by `surface_mapper::apply_mapping_assignments`.
#[derive(Debug, Clone, Serialize)]
pub struct ConceptOutput {
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    pub label: Option<String>,
    pub is_extension: bool,
    pub balance: Option<String>,
    pub period_type: Option<String>,
    pub data_type: Option<String>,
    pub statement_kind: Option<String>,
    pub role_uri: Option<String>,
    // Mapping results: which canonical concept / surface row / KPI this
    // concept was assigned to, and by which method.
    pub authoritative_concept_key: Option<String>,
    pub mapping_method: Option<String>,
    pub surface_key: Option<String>,
    pub detail_parent_surface_key: Option<String>,
    pub kpi_key: Option<String>,
    pub residual_flag: bool,
    // Presentation-linkbase position, when available.
    pub presentation_order: Option<f64>,
    pub presentation_depth: Option<i64>,
    pub parent_concept_key: Option<String>,
    pub is_abstract: bool,
}
/// One numeric fact, flattened with its context's period/dimensions and the
/// concept-level mapping assignments.
#[derive(Debug, Clone, Serialize)]
pub struct FactOutput {
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    pub data_type: Option<String>,
    pub statement_kind: Option<String>,
    pub role_uri: Option<String>,
    pub authoritative_concept_key: Option<String>,
    pub mapping_method: Option<String>,
    pub surface_key: Option<String>,
    pub detail_parent_surface_key: Option<String>,
    pub kpi_key: Option<String>,
    pub residual_flag: bool,
    pub context_id: String,
    pub unit: Option<String>,
    // Raw decimals/precision attributes, kept as strings.
    pub decimals: Option<String>,
    pub precision: Option<String>,
    pub nil: bool,
    pub value_num: f64,
    pub period_start: Option<String>,
    pub period_end: Option<String>,
    pub period_instant: Option<String>,
    pub dimensions: Vec<DimensionOutput>,
    pub is_dimensionless: bool,
    // Name of the instance file the fact came from.
    pub source_file: Option<String>,
}
/// One explicit dimension qualifying a fact: axis QName + member QName.
#[derive(Debug, Clone, Serialize)]
pub struct DimensionOutput {
    pub axis: String,
    pub member: String,
}
/// Comparison of a taxonomy-derived metric against an LLM-extracted value.
/// This module always emits an empty list; the shape is owned downstream.
#[derive(Debug, Clone, Serialize)]
pub struct MetricValidationOutput {
    pub metric_key: String,
    pub taxonomy_value: Option<f64>,
    pub llm_value: Option<f64>,
    pub absolute_diff: Option<f64>,
    pub relative_diff: Option<f64>,
    pub status: String,
    pub evidence_pages: Vec<i64>,
    pub pdf_url: Option<String>,
    pub provider: Option<String>,
    pub model: Option<String>,
    pub error: Option<String>,
}
/// Counters and warnings describing how well the filing normalized into the
/// surface/detail/KPI models.
#[derive(Debug, Clone, Serialize, Default)]
pub struct NormalizationSummaryOutput {
    pub surface_row_count: usize,
    pub detail_row_count: usize,
    pub kpi_row_count: usize,
    pub unmapped_row_count: usize,
    pub material_unmapped_row_count: usize,
    pub warnings: Vec<String>,
}
// Statement name -> rows; surface maps are keyed the same way. Detail rows
// are nested one level deeper (statement -> surface key -> rows).
pub type StatementRowMap = BTreeMap<String, Vec<StatementRowOutput>>;
pub type SurfaceRowMap = BTreeMap<String, Vec<SurfaceRowOutput>>;
pub type DetailRowStatementMap = BTreeMap<String, BTreeMap<String, Vec<DetailRowOutput>>>;
/// Internal representation of a parsed `<context>` element.
#[derive(Debug, Clone)]
struct ParsedContext {
    id: String,
    entity_identifier: Option<String>,
    entity_scheme: Option<String>,
    period_start: Option<String>,
    period_end: Option<String>,
    period_instant: Option<String>,
    // Flattened explicit members from both segment and scenario.
    dimensions: Vec<DimensionOutput>,
    // Raw JSON renderings of the segment/scenario containers.
    segment: Option<serde_json::Value>,
    scenario: Option<serde_json::Value>,
}
/// Internal representation of a `<unit>` element: its `<measure>` value,
/// when one was found.
#[derive(Debug, Clone)]
struct ParsedUnit {
    measure: Option<String>,
}
/// Internal representation of one numeric fact; fields largely mirror
/// [`FactOutput`] before mapping assignments are applied.
#[derive(Debug, Clone)]
struct ParsedFact {
    concept_key: String,
    qname: String,
    namespace_uri: String,
    local_name: String,
    data_type: Option<String>,
    context_id: String,
    unit: Option<String>,
    decimals: Option<String>,
    precision: Option<String>,
    nil: bool,
    value: f64,
    // Period fields denormalized from the referenced context.
    period_start: Option<String>,
    period_end: Option<String>,
    period_instant: Option<String>,
    dimensions: Vec<DimensionOutput>,
    is_dimensionless: bool,
    source_file: Option<String>,
}
/// One concept's position in a presentation-linkbase tree.
#[derive(Debug, Clone)]
struct PresentationNode {
    concept_key: String,
    role_uri: String,
    order: f64,
    depth: i64,
    parent_concept_key: Option<String>,
    is_abstract: bool,
}
/// Fetch, parse, and normalize one SEC XBRL filing end-to-end.
///
/// Pipeline:
/// 1. Discover the filing's EDGAR directory and select the best instance,
///    presentation, and label files.
/// 2. Download and regex-parse the instance into facts and contexts.
/// 3. Merge optional presentation/label linkbases (best-effort).
/// 4. Materialize statement rows, select a fiscal pack, and build the compact
///    surface model, KPI rows, and computed-metric definitions.
///
/// When no selected instance document exists, a fully-formed response with
/// `parse_status == "failed"` is returned instead of an `Err`, so callers can
/// persist the failure.
///
/// # Errors
/// Fails on HTTP-client construction, the instance fetch itself, or when
/// downstream model building (`surface_mapper`, `universal_income`,
/// `kpi_mapper`) returns an error.
pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingResponse> {
    let client = Client::builder()
        .user_agent("Fiscal Clone <support@fiscal.local>")
        .build()
        .context("unable to build HTTP client")?;
    let discovered = discover_filing_assets(&input, &client)?;
    let empty_rows = empty_statement_row_map();
    let empty_surface_rows = empty_surface_row_map();
    let empty_detail_rows = empty_detail_row_map();
    // Validation is not performed at this stage; emit a "not_run" placeholder.
    let validation_result = ValidationResultOutput {
        status: "not_run".to_string(),
        checks: vec![],
        validated_at: None,
    };
    // No selected instance document: nothing to parse, return the "failed"
    // shape with whatever assets were discovered.
    let Some(instance_asset) = discovered
        .assets
        .iter()
        .find(|asset| asset.asset_type == "instance" && asset.is_selected)
        .cloned()
    else {
        return Ok(HydrateFilingResponse {
            filing_id: input.filing_id,
            ticker: input.ticker.to_uppercase(),
            filing_date: input.filing_date,
            filing_type: input.filing_type,
            parse_status: "failed".to_string(),
            parse_error: Some("No XBRL instance found".to_string()),
            source: "legacy_html_fallback".to_string(),
            parser_engine: PARSER_ENGINE.to_string(),
            parser_version: PARSER_VERSION.to_string(),
            taxonomy_regime: "unknown".to_string(),
            fiscal_pack: Some("core".to_string()),
            periods: vec![],
            faithful_rows: empty_rows.clone(),
            statement_rows: empty_rows,
            surface_rows: empty_surface_rows,
            detail_rows: empty_detail_rows,
            kpi_rows: vec![],
            computed_definitions: vec![],
            contexts: vec![],
            derived_metrics: FilingMetrics::default(),
            validation_result,
            facts_count: 0,
            concepts_count: 0,
            dimensions_count: 0,
            assets: discovered.assets,
            concepts: vec![],
            facts: vec![],
            metric_validations: vec![],
            normalization_summary: NormalizationSummaryOutput {
                surface_row_count: 0,
                detail_row_count: 0,
                kpi_row_count: 0,
                unmapped_row_count: 0,
                material_unmapped_row_count: 0,
                warnings: vec![],
            },
        });
    };
    let instance_text = fetch_text(&client, &instance_asset.url)
        .context("fetch request failed for XBRL instance")?;
    let parsed_instance = parse_xbrl_instance(&instance_text, Some(instance_asset.name.clone()));
    // Linkbases are optional enrichments: fetch failures are recorded (first
    // error only) but never abort the parse.
    let mut label_by_concept = HashMap::new();
    let mut presentation = Vec::new();
    let mut source = "xbrl_instance".to_string();
    let mut parse_error = None;
    for asset in discovered.assets.iter().filter(|asset| {
        asset.is_selected && (asset.asset_type == "presentation" || asset.asset_type == "label")
    }) {
        match fetch_text(&client, &asset.url) {
            Ok(content) => {
                if asset.asset_type == "presentation" {
                    let parsed = parse_presentation_linkbase(&content);
                    if !parsed.is_empty() {
                        source = "xbrl_instance_with_linkbase".to_string();
                    }
                    presentation.extend(parsed);
                } else {
                    // First label wins when multiple linkbases define one.
                    for (key, value) in parse_label_linkbase(&content) {
                        label_by_concept.entry(key).or_insert(value);
                    }
                }
            }
            Err(error) => {
                if parse_error.is_none() {
                    parse_error = Some(error.to_string());
                }
            }
        }
    }
    let materialized = materialize_taxonomy_statements(
        input.filing_id,
        &input.accession_number,
        &input.filing_date,
        &input.filing_type,
        &parsed_instance.facts,
        &presentation,
        &label_by_concept,
    );
    let taxonomy_regime = infer_taxonomy_regime(&parsed_instance.facts);
    let mut concepts = materialized.concepts;
    let mut facts = materialized.facts;
    // Choose the fiscal "pack" and build the compact surface model on top of
    // the faithful statement rows.
    let pack_selection = pack_selector::select_fiscal_pack(&materialized.statement_rows, &facts);
    let fiscal_pack = pack_selection.pack.as_str().to_string();
    let mut compact_model = surface_mapper::build_compact_surface_model(
        &materialized.periods,
        &materialized.statement_rows,
        &taxonomy_regime,
        pack_selection.pack,
        pack_selection.warnings,
    )?;
    universal_income::apply_universal_income_rows(
        &materialized.periods,
        &materialized.statement_rows,
        &facts,
        &taxonomy_regime,
        pack_selection.pack,
        &mut compact_model,
    )?;
    let kpi_result =
        kpi_mapper::build_taxonomy_kpis(&materialized.periods, &facts, pack_selection.pack)?;
    compact_model.normalization_summary.kpi_row_count = kpi_result.rows.len();
    // Fold KPI warnings into the shared summary, skipping duplicates.
    for warning in kpi_result.warnings {
        if !compact_model
            .normalization_summary
            .warnings
            .contains(&warning)
        {
            compact_model.normalization_summary.warnings.push(warning);
        }
    }
    surface_mapper::merge_mapping_assignments(
        &mut compact_model.concept_mappings,
        kpi_result.mapping_assignments,
    );
    // Stamp the merged mapping assignments back onto concepts and facts.
    surface_mapper::apply_mapping_assignments(
        &mut concepts,
        &mut facts,
        &compact_model.concept_mappings,
    );
    // Ratio definitions: pack-specific first, then the core pack, then none.
    let computed_pack = taxonomy_loader::load_computed_pack(pack_selection.pack)
        .ok()
        .or_else(|| taxonomy_loader::load_computed_pack(pack_selector::FiscalPack::Core).ok());
    let computed_definitions: Vec<ComputedDefinitionOutput> = computed_pack
        .map(|pack| {
            pack.computed
                .iter()
                .map(ComputedDefinitionOutput::from)
                .collect()
        })
        .unwrap_or_default();
    // ready = rows and facts; partial = facts only; failed = neither.
    let has_rows = materialized
        .statement_rows
        .values()
        .map(|rows| rows.len())
        .sum::<usize>()
        > 0;
    let has_facts = !facts.is_empty();
    let parse_status = if has_rows && has_facts {
        "ready"
    } else if has_facts {
        "partial"
    } else {
        "failed"
    };
    Ok(HydrateFilingResponse {
        filing_id: input.filing_id,
        ticker: input.ticker.to_uppercase(),
        filing_date: input.filing_date,
        filing_type: input.filing_type,
        parse_status: parse_status.to_string(),
        // On outright failure, surface the first linkbase fetch error if one
        // was recorded, otherwise a generic message.
        parse_error: if parse_status == "failed" {
            Some(parse_error.unwrap_or_else(|| "No XBRL facts extracted".to_string()))
        } else {
            parse_error
        },
        source,
        parser_engine: PARSER_ENGINE.to_string(),
        parser_version: PARSER_VERSION.to_string(),
        taxonomy_regime,
        fiscal_pack: Some(fiscal_pack),
        periods: materialized.periods,
        faithful_rows: materialized.statement_rows.clone(),
        statement_rows: materialized.statement_rows,
        surface_rows: compact_model.surface_rows,
        detail_rows: compact_model.detail_rows,
        kpi_rows: kpi_result.rows,
        computed_definitions,
        contexts: parsed_instance.contexts,
        derived_metrics: metrics::derive_metrics(&facts),
        validation_result,
        facts_count: facts.len(),
        concepts_count: concepts.len(),
        // Distinct axis::member pairs across all facts.
        dimensions_count: facts
            .iter()
            .flat_map(|fact| {
                fact.dimensions
                    .iter()
                    .map(|dimension| format!("{}::{}", dimension.axis, dimension.member))
            })
            .collect::<HashSet<_>>()
            .len(),
        assets: discovered.assets,
        concepts,
        facts,
        metric_validations: vec![],
        normalization_summary: compact_model.normalization_summary,
    })
}
/// Classify the accounting taxonomy from fact namespace URIs.
///
/// US GAAP takes precedence over IFRS when both namespaces appear anywhere in
/// the fact set; anything else is reported as "unknown".
fn infer_taxonomy_regime(facts: &[ParsedFact]) -> String {
    let mut saw_ifrs = false;
    for fact in facts {
        let namespace = fact.namespace_uri.to_lowercase();
        if namespace.contains("us-gaap") {
            return "us-gaap".to_string();
        }
        saw_ifrs = saw_ifrs || namespace.contains("ifrs");
    }
    if saw_ifrs {
        "ifrs-full".to_string()
    } else {
        "unknown".to_string()
    }
}
/// Shape of EDGAR's directory `index.json` (only the fields this module
/// reads).
#[derive(Debug, Deserialize)]
struct FilingDirectoryPayload {
    directory: Option<FilingDirectory>,
}
#[derive(Debug, Deserialize)]
struct FilingDirectory {
    item: Option<Vec<FilingDirectoryItem>>,
}
#[derive(Debug, Deserialize)]
struct FilingDirectoryItem {
    name: Option<String>,
    // `size` may arrive as a JSON number or a numeric string; parsed leniently
    // by `parse_size`.
    size: Option<serde_json::Value>,
}
/// Result of directory discovery: all classified assets with scores and
/// selection flags populated.
#[derive(Debug)]
struct DiscoveredAssets {
    assets: Vec<AssetOutput>,
}
/// Enumerate the filing's EDGAR directory and classify/score its files.
///
/// Strategy:
/// - Fetch `<directory>/index.json` and classify each entry by filename.
/// - If the listing is empty or unreachable, fall back to `filing_url` as a
///   single asset.
/// - Score instance candidates, mark the best one selected; presentation and
///   label linkbases are always selected.
fn discover_filing_assets(
    input: &HydrateFilingRequest,
    client: &Client,
) -> Result<DiscoveredAssets> {
    let Some(directory_url) = resolve_filing_directory_url(
        input.filing_url.as_deref(),
        &input.cik,
        &input.accession_number,
    ) else {
        // No way to locate the directory: report no assets rather than error.
        return Ok(DiscoveredAssets { assets: vec![] });
    };
    // Best-effort fetch: a failure just leaves `discovered` empty and
    // triggers the filing_url fallback below.
    let payload =
        fetch_json::<FilingDirectoryPayload>(client, &format!("{directory_url}index.json")).ok();
    let mut discovered = Vec::new();
    if let Some(items) =
        payload.and_then(|payload| payload.directory.and_then(|directory| directory.item))
    {
        for item in items {
            // Skip entries with a missing or blank name.
            let Some(name) = item
                .name
                .map(|name| name.trim().to_string())
                .filter(|name| !name.is_empty())
            else {
                continue;
            };
            let asset_type = classify_asset_type(&name);
            let size_bytes = parse_size(item.size.as_ref());
            discovered.push(AssetOutput {
                asset_type: asset_type.to_string(),
                name: name.clone(),
                url: format!("{directory_url}{}", name.trim_start_matches('/')),
                size_bytes,
                score: None,
                is_selected: false,
            });
        }
    }
    if discovered.is_empty() {
        // Directory listing unavailable: treat the filing URL itself as the
        // only asset, guessing "instance" from the .xml extension.
        // NOTE(review): the selection loop below recomputes `is_selected`;
        // a non-".xml" fallback ("other") therefore ends up unselected.
        if let Some(filing_url) = &input.filing_url {
            discovered.push(AssetOutput {
                asset_type: if filing_url.to_lowercase().ends_with(".xml") {
                    "instance".to_string()
                } else {
                    "other".to_string()
                },
                name: input
                    .primary_document
                    .clone()
                    .or_else(|| filing_url.split('/').last().map(|part| part.to_string()))
                    .unwrap_or_else(|| "primary_document".to_string()),
                url: filing_url.clone(),
                size_bytes: None,
                score: None,
                is_selected: true,
            });
        }
    }
    // Pick the highest-scoring instance candidate by URL.
    let selected_instance_url = discovered
        .iter()
        .filter(|asset| asset.asset_type == "instance")
        .map(|asset| {
            (
                asset.url.clone(),
                score_instance(&asset.name, input.primary_document.as_deref()),
            )
        })
        .max_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|entry| entry.0);
    for asset in &mut discovered {
        // Scores are only meaningful for instance and pdf assets.
        asset.score = if asset.asset_type == "instance" {
            Some(score_instance(
                &asset.name,
                input.primary_document.as_deref(),
            ))
        } else if asset.asset_type == "pdf" {
            Some(score_pdf(&asset.name, asset.size_bytes))
        } else {
            None
        };
        asset.is_selected = match asset.asset_type.as_str() {
            "instance" => selected_instance_url
                .as_ref()
                .map(|url| url == &asset.url)
                .unwrap_or(false),
            // Linkbases are always fetched when present.
            "presentation" | "label" => true,
            _ => false,
        };
    }
    Ok(DiscoveredAssets { assets: discovered })
}
/// Resolve the EDGAR directory URL (with trailing slash) containing a
/// filing's assets.
///
/// Preference order:
/// 1. Strip the final path segment from `filing_url` when it has one.
/// 2. Fall back to the canonical EDGAR layout built from the CIK and the
///    accession number (dashes removed).
///
/// Returns `None` only when no usable `filing_url` exists and the CIK
/// contains no digits.
fn resolve_filing_directory_url(
    filing_url: Option<&str>,
    cik: &str,
    accession_number: &str,
) -> Option<String> {
    if let Some(url) = filing_url.map(str::trim).filter(|value| !value.is_empty()) {
        // Only treat a slash as a path separator when it appears after the
        // scheme's "://" and a non-empty host. The previous check compared
        // the slash index against `"https://".len()`, which wrongly rejected
        // `http://` URLs with short hosts.
        let path = url.split_once("://").map(|(_, rest)| rest).unwrap_or(url);
        if let Some(relative_slash) = path.rfind('/') {
            if relative_slash > 0 {
                let absolute_slash = url.len() - path.len() + relative_slash;
                return Some(url[..=absolute_slash].to_string());
            }
        }
    }
    let cik_path = normalize_cik_for_path(cik)?;
    let accession_path = accession_number.replace('-', "");
    Some(format!(
        "https://www.sec.gov/Archives/edgar/data/{cik_path}/{accession_path}/"
    ))
}
/// Normalize a CIK for use in an EDGAR path: keep only ASCII digits and drop
/// leading zeros (EDGAR directory paths use the unpadded numeric form).
///
/// Returns `None` when the input has no digits or overflows `u64`.
fn normalize_cik_for_path(value: &str) -> Option<String> {
    let digits = value
        .chars()
        .filter(|char| char.is_ascii_digit())
        .collect::<String>();
    if digits.is_empty() {
        return None;
    }
    digits.parse::<u64>().ok().map(|parsed| parsed.to_string())
}
/// Classify a filing-directory filename into an asset category.
///
/// Precedence: pdf, schema, then (for `.xml`) the linkbase kinds in the order
/// presentation, label, calculation, definition; any other `.xml` is assumed
/// to be the XBRL instance, and everything else is "other".
fn classify_asset_type(name: &str) -> &'static str {
    let lower = name.to_lowercase();
    // A linkbase is recognized by its conventional suffix (`_tag.xml` /
    // `-tag.xml`) or by the full word appearing anywhere in the name.
    let is_linkbase = |tag: &str, word: &str| {
        lower.ends_with(&format!("_{tag}.xml"))
            || lower.ends_with(&format!("-{tag}.xml"))
            || lower.contains(word)
    };
    if lower.ends_with(".pdf") {
        "pdf"
    } else if lower.ends_with(".xsd") {
        "schema"
    } else if !lower.ends_with(".xml") {
        "other"
    } else if is_linkbase("pre", "presentation") {
        "presentation"
    } else if is_linkbase("lab", "label") {
        "label"
    } else if is_linkbase("cal", "calculation") {
        "calculation"
    } else if is_linkbase("def", "definition") {
        "definition"
    } else {
        "instance"
    }
}
/// Score an XBRL-instance candidate filename; the highest score wins
/// selection in `discover_filing_assets`.
///
/// Heuristics:
/// - `*_htm.xml` / `*_ins.xml` are the conventional instance suffixes (+4).
/// - Sharing the primary document's base name earns a strong bonus (+5).
/// - Names that look like linkbases (`cal`/`def`/`lab`/`pre`) are
///   penalized (-3).
fn score_instance(name: &str, primary_document: Option<&str>) -> f64 {
    let lower = name.to_lowercase();
    let mut score = 1.0;
    if lower.ends_with("_htm.xml") {
        score += 4.0;
    }
    if lower.ends_with("_ins.xml") {
        score += 4.0;
    }
    if let Some(base_primary) = primary_document
        .map(|value| value.replace(|char: char| char == '.' || char == '-', "_"))
        .map(|value| value.to_lowercase())
    {
        // Drop the extension segment from the normalized primary name
        // ("aapl_20230930_htm" -> "aapl_20230930").
        let base = base_primary
            .rsplit_once('_')
            .map(|(head, _)| head.to_string())
            .unwrap_or(base_primary);
        // Normalize the candidate the same way before comparing. The previous
        // code matched the underscore-normalized base against the RAW
        // candidate name, so "aapl-20230930_htm.xml" never contained
        // "aapl_20230930" and the bonus was effectively dead for the common
        // dashed EDGAR naming scheme.
        let normalized_candidate = lower.replace(|char: char| char == '.' || char == '-', "_");
        if !base.is_empty() && normalized_candidate.contains(&base) {
            score += 5.0;
        }
    }
    if lower.contains("cal")
        || lower.contains("def")
        || lower.contains("lab")
        || lower.contains("pre")
    {
        score -= 3.0;
    }
    score
}
/// Score a PDF asset as a likely financial-statement document.
///
/// Report-like keywords earn +8, "exhibit" costs -2, and anything over
/// ~100 KB earns +1 (stubs tend to be tiny).
fn score_pdf(name: &str, size_bytes: Option<i64>) -> f64 {
    const REPORT_HINTS: [&str; 6] = ["financial", "statement", "annual", "quarter", "10k", "10q"];
    let lower = name.to_lowercase();
    let looks_like_report = REPORT_HINTS.iter().any(|needle| lower.contains(needle));
    let mut score = if looks_like_report { 8.0 } else { 0.0 };
    if lower.contains("exhibit") {
        score -= 2.0;
    }
    if size_bytes.unwrap_or_default() > 100_000 {
        score += 1.0;
    }
    score
}
fn parse_size(value: Option<&serde_json::Value>) -> Option<i64> {
match value {
Some(serde_json::Value::Number(number)) => number.as_i64(),
Some(serde_json::Value::String(raw)) => raw.parse::<i64>().ok(),
_ => None,
}
}
/// GET `url` and return the response body as text.
///
/// # Errors
/// Fails on transport errors, any non-2xx status, or an unreadable body.
fn fetch_text(client: &Client, url: &str) -> Result<String> {
    let response = client
        .get(url)
        .send()
        .with_context(|| format!("request failed for {url}"))?;
    if !response.status().is_success() {
        return Err(anyhow!("request failed for {url} ({})", response.status()));
    }
    response
        .text()
        .with_context(|| format!("unable to read response body for {url}"))
}
/// GET `url` and deserialize the JSON response body into `T`.
///
/// # Errors
/// Fails on transport errors, any non-2xx status, or a body that does not
/// deserialize as `T`.
fn fetch_json<T: for<'de> Deserialize<'de>>(client: &Client, url: &str) -> Result<T> {
    let response = client
        .get(url)
        .send()
        .with_context(|| format!("request failed for {url}"))?;
    if !response.status().is_success() {
        return Err(anyhow!("request failed for {url} ({})", response.status()));
    }
    response
        .json::<T>()
        .with_context(|| format!("unable to parse JSON response for {url}"))
}
/// Everything extracted from one instance document.
struct ParsedInstance {
    contexts: Vec<ContextOutput>,
    facts: Vec<ParsedFact>,
}
/// Regex-parse a raw XBRL instance document into contexts and numeric facts.
///
/// A fact is kept only when it has a contextRef and its body parses as a
/// number via `parse_number`; facts using an XBRL infrastructure prefix
/// (per `is_xbrl_infrastructure_prefix`) are skipped. `source_file` is the
/// instance filename, attached to every fact for provenance.
fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> ParsedInstance {
    let namespaces = parse_namespace_map(raw, "xbrl");
    let context_by_id = parse_contexts(raw);
    let unit_by_id = parse_units(raw);
    let mut facts = Vec::new();
    for captures in FACT_RE.captures_iter(raw) {
        // FACT_RE captures: (1) prefix, (2) local name, (3) raw attribute
        // string, (4) element body.
        let prefix = captures
            .get(1)
            .map(|value| value.as_str().trim())
            .unwrap_or_default();
        let local_name = captures
            .get(2)
            .map(|value| value.as_str().trim())
            .unwrap_or_default();
        let attrs = captures
            .get(3)
            .map(|value| value.as_str())
            .unwrap_or_default();
        let body = decode_xml_entities(
            captures
                .get(4)
                .map(|value| value.as_str())
                .unwrap_or_default()
                .trim(),
        );
        if prefix.is_empty() || local_name.is_empty() || is_xbrl_infrastructure_prefix(prefix) {
            continue;
        }
        let attr_map = parse_attrs(attrs);
        // contextRef is mandatory for facts; lookup tolerates lowercased
        // attribute names.
        let Some(context_id) = attr_map
            .get("contextRef")
            .cloned()
            .or_else(|| attr_map.get("contextref").cloned())
        else {
            continue;
        };
        // Non-numeric facts (text blocks, dates, ...) are dropped here.
        let Some(value) = parse_number(&body) else {
            continue;
        };
        // Unknown prefixes still get a stable synthetic namespace so the
        // concept key stays unique.
        let namespace_uri = namespaces
            .get(prefix)
            .cloned()
            .unwrap_or_else(|| format!("urn:unknown:{prefix}"));
        let context = context_by_id.get(&context_id);
        let unit_ref = attr_map
            .get("unitRef")
            .cloned()
            .or_else(|| attr_map.get("unitref").cloned());
        // Prefer the unit's <measure>; fall back to the raw unitRef id.
        let unit = unit_ref
            .as_ref()
            .and_then(|unit_ref| unit_by_id.get(unit_ref))
            .and_then(|unit| unit.measure.clone())
            .or(unit_ref);
        facts.push(ParsedFact {
            concept_key: format!("{namespace_uri}#{local_name}"),
            qname: format!("{prefix}:{local_name}"),
            namespace_uri,
            local_name: local_name.to_string(),
            data_type: None,
            context_id: context_id.clone(),
            unit,
            decimals: attr_map.get("decimals").cloned(),
            precision: attr_map.get("precision").cloned(),
            nil: attr_map
                .get("xsi:nil")
                .or_else(|| attr_map.get("nil"))
                .map(|value| value.eq_ignore_ascii_case("true"))
                .unwrap_or(false),
            value,
            // Period fields are denormalized from the referenced context.
            period_start: context.and_then(|value| value.period_start.clone()),
            period_end: context.and_then(|value| value.period_end.clone()),
            period_instant: context.and_then(|value| value.period_instant.clone()),
            dimensions: context
                .map(|value| value.dimensions.clone())
                .unwrap_or_default(),
            is_dimensionless: context
                .map(|value| value.dimensions.is_empty())
                .unwrap_or(true),
            source_file: source_file.clone(),
        });
    }
    // Emit every parsed context, not just those referenced by facts.
    let contexts = context_by_id
        .values()
        .map(|context| ContextOutput {
            context_id: context.id.clone(),
            entity_identifier: context.entity_identifier.clone(),
            entity_scheme: context.entity_scheme.clone(),
            period_start: context.period_start.clone(),
            period_end: context.period_end.clone(),
            period_instant: context.period_instant.clone(),
            segment_json: context.segment.clone(),
            scenario_json: context.scenario.clone(),
        })
        .collect::<Vec<_>>();
    ParsedInstance { contexts, facts }
}
fn parse_namespace_map(raw: &str, root_tag_hint: &str) -> HashMap<String, String> {
let mut map = HashMap::new();
let root_start = Regex::new(&format!(r#"(?is)<[^>]*{root_tag_hint}[^>]*>"#))
.unwrap()
.find(raw)
.map(|match_| match_.as_str().to_string())
.unwrap_or_else(|| raw.chars().take(1200).collect::<String>());
for captures in Regex::new(r#"xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']"#)
.unwrap()
.captures_iter(&root_start)
{
if let (Some(prefix), Some(uri)) = (captures.get(1), captures.get(2)) {
map.insert(
prefix.as_str().trim().to_string(),
uri.as_str().trim().to_string(),
);
}
}
map
}
fn parse_contexts(raw: &str) -> HashMap<String, ParsedContext> {
let mut contexts = HashMap::new();
for captures in CONTEXT_RE.captures_iter(raw) {
let Some(context_id) = captures
.get(1)
.map(|value| value.as_str().trim().to_string())
else {
continue;
};
let block = captures
.get(2)
.map(|value| value.as_str())
.unwrap_or_default();
let (entity_identifier, entity_scheme) = IDENTIFIER_RE
.captures(block)
.map(|captures| {
(
captures
.get(2)
.map(|value| decode_xml_entities(value.as_str().trim())),
captures
.get(1)
.map(|value| decode_xml_entities(value.as_str().trim())),
)
})
.unwrap_or((None, None));
let period_start = START_DATE_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| decode_xml_entities(value.as_str().trim()));
let period_end = END_DATE_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| decode_xml_entities(value.as_str().trim()));
let period_instant = INSTANT_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| decode_xml_entities(value.as_str().trim()));
let segment = SEGMENT_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| parse_dimension_container(value.as_str()));
let scenario = SCENARIO_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| parse_dimension_container(value.as_str()));
let mut dimensions = Vec::new();
if let Some(segment_value) = segment.as_ref() {
if let Some(members) = segment_value
.get("explicitMembers")
.and_then(|value| value.as_array())
{
for member in members {
if let (Some(axis), Some(member_value)) = (
member.get("axis").and_then(|value| value.as_str()),
member.get("member").and_then(|value| value.as_str()),
) {
dimensions.push(DimensionOutput {
axis: axis.to_string(),
member: member_value.to_string(),
});
}
}
}
}
if let Some(scenario_value) = scenario.as_ref() {
if let Some(members) = scenario_value
.get("explicitMembers")
.and_then(|value| value.as_array())
{
for member in members {
if let (Some(axis), Some(member_value)) = (
member.get("axis").and_then(|value| value.as_str()),
member.get("member").and_then(|value| value.as_str()),
) {
dimensions.push(DimensionOutput {
axis: axis.to_string(),
member: member_value.to_string(),
});
}
}
}
}
contexts.insert(
context_id.clone(),
ParsedContext {
id: context_id,
entity_identifier,
entity_scheme,
period_start,
period_end,
period_instant,
dimensions,
segment,
scenario,
},
);
}
contexts
}
/// Parses a `<segment>`/`<scenario>` body into a JSON object with two
/// arrays: `explicitMembers` ([{axis, member}]) and `typedMembers`
/// ([{axis, value}]). Entries missing either capture group are skipped.
fn parse_dimension_container(raw: &str) -> serde_json::Value {
    let mut explicit_members = Vec::new();
    for captures in EXPLICIT_MEMBER_RE.captures_iter(raw) {
        if let (Some(axis), Some(member)) = (captures.get(1), captures.get(2)) {
            explicit_members.push(serde_json::json!({
                "axis": decode_xml_entities(axis.as_str().trim()),
                "member": decode_xml_entities(member.as_str().trim())
            }));
        }
    }
    let mut typed_members = Vec::new();
    for captures in TYPED_MEMBER_RE.captures_iter(raw) {
        if let (Some(axis), Some(value)) = (captures.get(1), captures.get(2)) {
            typed_members.push(serde_json::json!({
                "axis": decode_xml_entities(axis.as_str().trim()),
                "value": decode_xml_entities(value.as_str().trim())
            }));
        }
    }
    serde_json::json!({
        "explicitMembers": explicit_members,
        "typedMembers": typed_members
    })
}
/// Collects `<unit>` declarations into a map keyed by unit id.
///
/// A single measure is used verbatim; multiple measures are joined with '/'
/// (e.g. per-share units); a unit with no non-empty measure gets `None`.
fn parse_units(raw: &str) -> HashMap<String, ParsedUnit> {
    let mut units = HashMap::new();
    for captures in UNIT_RE.captures_iter(raw) {
        let id = match captures.get(1) {
            Some(value) => value.as_str().trim().to_string(),
            None => continue,
        };
        let block = captures.get(2).map_or("", |value| value.as_str());
        let mut measures = Vec::new();
        for measure_captures in MEASURE_RE.captures_iter(block) {
            if let Some(value) = measure_captures.get(1) {
                let decoded = decode_xml_entities(value.as_str().trim());
                if !decoded.is_empty() {
                    measures.push(decoded);
                }
            }
        }
        let measure = match measures.len() {
            0 => None,
            1 => measures.into_iter().next(),
            _ => Some(measures.join("/")),
        };
        units.insert(id, ParsedUnit { measure });
    }
    units
}
/// True when `prefix` belongs to XBRL structural machinery (instance,
/// linking, dimension namespaces) rather than a reporting taxonomy.
/// Comparison is case-insensitive.
fn is_xbrl_infrastructure_prefix(prefix: &str) -> bool {
    const INFRASTRUCTURE_PREFIXES: [&str; 5] = ["xbrli", "xlink", "link", "xbrldi", "xbrldt"];
    let normalized = prefix.to_ascii_lowercase();
    INFRASTRUCTURE_PREFIXES
        .iter()
        .any(|candidate| *candidate == normalized)
}
/// Parses `name="value"` attribute pairs from a raw attribute string into a
/// map, XML-decoding each value. A repeated attribute name keeps the last
/// occurrence, matching repeated-insert semantics.
fn parse_attrs(raw: &str) -> HashMap<String, String> {
    ATTR_RE
        .captures_iter(raw)
        .filter_map(|captures| {
            let name = captures.get(1)?;
            let value = captures.get(2)?;
            Some((
                name.as_str().to_string(),
                decode_xml_entities(value.as_str()),
            ))
        })
        .collect()
}
/// Decodes the small set of XML/HTML entities that SEC filings actually use.
///
/// Replacements run sequentially in a fixed order, matching the behavior of
/// the equivalent chain of `str::replace` calls (so a double-encoded
/// `&amp;lt;` decodes all the way down to `<`).
fn decode_xml_entities(value: &str) -> String {
    const REPLACEMENTS: [(&str, &str); 7] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&#160;", " "),
        ("&nbsp;", " "),
    ];
    REPLACEMENTS
        .iter()
        .fold(value.to_string(), |decoded, (entity, replacement)| {
            decoded.replace(entity, replacement)
        })
}
fn parse_number(raw: &str) -> Option<f64> {
let trimmed = raw.trim();
if trimmed.is_empty() || trimmed.chars().all(|char| char == '-') {
return None;
}
let negative = trimmed.starts_with('(') && trimmed.ends_with(')');
let normalized = Regex::new(r#"<[^>]+>"#)
.unwrap()
.replace_all(trimmed, " ")
.replace(',', "")
.replace('$', "")
.replace(['(', ')'], "")
.replace('\u{2212}', "-")
.split_whitespace()
.collect::<String>();
let parsed = normalized.parse::<f64>().ok()?;
Some(if negative { -parsed.abs() } else { parsed })
}
/// Parses a label linkbase and returns the preferred human-readable label
/// for each concept key.
///
/// Within each `<labelLink>` block: `<loc>` elements tie xlink labels to
/// concept hrefs, label resources carry the display text (plus an optional
/// role), and label arcs connect the two. Among competing labels for one
/// concept, the highest [`label_priority`] role wins; an equal priority does
/// NOT displace the label already recorded (first-seen wins on ties).
fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
    let namespaces = parse_namespace_map(raw, "linkbase");
    // concept_key -> (best label text so far, its role priority).
    let mut preferred = HashMap::<String, (String, i64)>::new();
    for captures in LABEL_LINK_RE.captures_iter(raw) {
        let block = captures
            .get(1)
            .map(|value| value.as_str())
            .unwrap_or_default();
        // xlink label -> concept key, built from <loc> elements.
        let mut loc_by_label = HashMap::<String, String>::new();
        // xlink label -> (label text, optional role), from label resources.
        let mut resource_by_label = HashMap::<String, (String, Option<String>)>::new();
        for captures in LOC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            let Some(href) = attrs.get("xlink:href").cloned() else {
                continue;
            };
            let Some(qname) = qname_from_href(&href) else {
                continue;
            };
            let Some((concept_key, _, _)) = concept_from_qname(&qname, &namespaces) else {
                continue;
            };
            loc_by_label.insert(label, concept_key);
        }
        for captures in LABEL_RESOURCE_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            // Decode entities, then collapse internal whitespace/newlines to
            // single spaces.
            let body = decode_xml_entities(
                captures
                    .get(2)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            )
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");
            if body.is_empty() {
                continue;
            }
            resource_by_label.insert(label, (body, attrs.get("xlink:role").cloned()));
        }
        for captures in LABEL_ARC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(from) = attrs.get("xlink:from").cloned() else {
                continue;
            };
            let Some(to) = attrs.get("xlink:to").cloned() else {
                continue;
            };
            let Some(concept_key) = loc_by_label.get(&from) else {
                continue;
            };
            let Some((label, role)) = resource_by_label.get(&to) else {
                continue;
            };
            let priority = label_priority(role.as_deref());
            let current = preferred.get(concept_key).cloned();
            // Strictly-greater comparison keeps the first label on ties.
            if current
                .as_ref()
                .map(|(_, current_priority)| priority > *current_priority)
                .unwrap_or(true)
            {
                preferred.insert(concept_key.clone(), (label.clone(), priority));
            }
        }
    }
    // Drop the priorities; callers only need concept -> label text.
    preferred
        .into_iter()
        .map(|(key, (value, _))| (key, value))
        .collect()
}
/// Parses a presentation linkbase into a flat list of [`PresentationNode`]
/// rows, one per (role, concept, tree-position).
///
/// For each `<presentationLink>`: `<loc>` elements map xlink labels to
/// concepts, presentation arcs form parent->child edges with an optional
/// `order` attribute. Roots are the labels with no incoming arc; each root's
/// subtree is walked depth-first, producing rows whose fractional `order`
/// encodes sibling position beneath the parent's order.
fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
    let namespaces = parse_namespace_map(raw, "linkbase");
    let mut rows = Vec::new();
    for captures in PRESENTATION_LINK_RE.captures_iter(raw) {
        let link_attrs = parse_attrs(
            captures
                .get(1)
                .map(|value| value.as_str())
                .unwrap_or_default(),
        );
        // A link without a role URI cannot be classified; skip it entirely.
        let Some(role_uri) = link_attrs.get("xlink:role").cloned() else {
            continue;
        };
        let block = captures
            .get(2)
            .map(|value| value.as_str())
            .unwrap_or_default();
        // xlink label -> (concept_key, qname, is_abstract heuristic).
        let mut loc_by_label = HashMap::<String, (String, String, bool)>::new();
        // parent label -> [(child label, order)] edges.
        let mut children_by_label = HashMap::<String, Vec<(String, f64)>>::new();
        // Labels with at least one incoming arc (i.e. non-roots).
        let mut incoming = HashSet::<String>::new();
        let mut all_referenced = HashSet::<String>::new();
        for captures in LOC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            let Some(href) = attrs.get("xlink:href").cloned() else {
                continue;
            };
            let Some(qname) = qname_from_href(&href) else {
                continue;
            };
            let Some((concept_key, qname, local_name)) = concept_from_qname(&qname, &namespaces)
            else {
                continue;
            };
            loc_by_label.insert(
                label,
                (
                    concept_key,
                    qname,
                    // Heuristic: "Abstract" in the local name marks header rows.
                    local_name.to_ascii_lowercase().contains("abstract"),
                ),
            );
        }
        for captures in PRESENTATION_ARC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(from) = attrs.get("xlink:from").cloned() else {
                continue;
            };
            let Some(to) = attrs.get("xlink:to").cloned() else {
                continue;
            };
            // Ignore arcs whose endpoints never appeared as <loc> entries.
            if !loc_by_label.contains_key(&from) || !loc_by_label.contains_key(&to) {
                continue;
            }
            // Missing/unparseable order falls back to append position (1-based).
            let order = attrs
                .get("order")
                .and_then(|value| value.parse::<f64>().ok())
                .unwrap_or_else(|| {
                    children_by_label
                        .get(&from)
                        .map(|children| children.len() as f64 + 1.0)
                        .unwrap_or(1.0)
                });
            children_by_label
                .entry(from.clone())
                .or_default()
                .push((to.clone(), order));
            incoming.insert(to.clone());
            all_referenced.insert(from);
            all_referenced.insert(to);
        }
        // Roots = referenced labels that are never an arc target.
        let roots = all_referenced
            .iter()
            .filter(|label| !incoming.contains(*label))
            .cloned()
            .collect::<Vec<_>>();
        let mut visited = HashSet::<String>::new();
        // Depth-first walk emitting one row per tree position. The visited
        // set keys on (parent, label, depth) so shared nodes can appear under
        // several parents while cycles still terminate.
        fn dfs(
            label: &str,
            depth: i64,
            parent_label: Option<&str>,
            base_order: f64,
            role_uri: &str,
            loc_by_label: &HashMap<String, (String, String, bool)>,
            children_by_label: &HashMap<String, Vec<(String, f64)>>,
            rows: &mut Vec<PresentationNode>,
            visited: &mut HashSet<String>,
        ) {
            let Some((concept_key, _qname, is_abstract)) = loc_by_label.get(label) else {
                return;
            };
            let path_key = format!("{}::{label}::{depth}", parent_label.unwrap_or("root"));
            if !visited.insert(path_key) {
                return;
            }
            let parent_concept_key = parent_label.and_then(|parent| {
                loc_by_label
                    .get(parent)
                    .map(|(concept_key, _, _)| concept_key.clone())
            });
            rows.push(PresentationNode {
                concept_key: concept_key.clone(),
                role_uri: role_uri.to_string(),
                order: base_order,
                depth,
                parent_concept_key,
                is_abstract: *is_abstract,
            });
            let mut children = children_by_label.get(label).cloned().unwrap_or_default();
            children.sort_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
            // Children get fractional orders below the parent's order so a
            // flat sort of rows reproduces the tree's reading order.
            for (index, (child_label, _)) in children.into_iter().enumerate() {
                dfs(
                    &child_label,
                    depth + 1,
                    Some(label),
                    base_order + (index as f64 + 1.0) / 1000.0,
                    role_uri,
                    loc_by_label,
                    children_by_label,
                    rows,
                    visited,
                );
            }
        }
        for (index, root) in roots.iter().enumerate() {
            dfs(
                root,
                0,
                None,
                index as f64 + 1.0,
                &role_uri,
                &loc_by_label,
                &children_by_label,
                &mut rows,
                &mut visited,
            );
        }
    }
    rows
}
/// Derives a `prefix:LocalName` QName from an xlink href fragment such as
/// `schema.xsd#us-gaap_Assets` or `#loc_us-gaap_Assets`.
///
/// With no '#', the whole href is treated as the fragment. A fragment that
/// already contains ':' is returned as-is (after stripping a `loc_` prefix);
/// otherwise the first '_' is converted into the prefix separator.
fn qname_from_href(href: &str) -> Option<String> {
    let fragment = match href.split('#').nth(1) {
        Some(part) => part.trim(),
        None => href.trim(),
    };
    if fragment.is_empty() {
        return None;
    }
    let cleaned = fragment.trim_start_matches("loc_");
    if cleaned.contains(':') {
        Some(cleaned.to_string())
    } else {
        let (prefix, local) = cleaned.split_once('_')?;
        Some(format!("{prefix}:{local}"))
    }
}
/// Resolves a `prefix:LocalName` QName against the namespace map, returning
/// `(concept_key, qname, local_name)` where `concept_key` is
/// `namespace_uri#LocalName`. An unknown prefix maps to
/// `urn:unknown:<prefix>`; a QName with no ':' yields `None`.
fn concept_from_qname(
    qname: &str,
    namespaces: &HashMap<String, String>,
) -> Option<(String, String, String)> {
    let (prefix, local_name) = qname.split_once(':')?;
    let namespace_uri = match namespaces.get(prefix) {
        Some(uri) => uri.clone(),
        None => format!("urn:unknown:{prefix}"),
    };
    let concept_key = format!("{namespace_uri}#{local_name}");
    Some((concept_key, qname.to_string(), local_name.to_string()))
}
/// Ranks label-resource roles so the standard label outranks terse and
/// verbose variants. Unknown roles score above a missing role so any
/// explicit label beats "no role at all".
fn label_priority(role: Option<&str>) -> i64 {
    let normalized = role.unwrap_or_default().to_ascii_lowercase();
    match normalized.as_str() {
        "" => 0,
        value if value.ends_with("/label") => 4,
        value if value.ends_with("/terselabel") => 3,
        value if value.ends_with("/verboselabel") => 2,
        _ => 1,
    }
}
/// Output bundle produced by `materialize_taxonomy_statements`: deduplicated
/// reporting periods, per-statement row tables, the concept catalog, and the
/// enriched fact list.
struct MaterializedStatements {
    // Unique periods found across all facts, sorted by period end date.
    periods: Vec<PeriodOutput>,
    // Statement key ("income", "balance", ...) -> rows in presentation order.
    statement_rows: StatementRowMap,
    // One catalog entry per concept emitted into any statement.
    concepts: Vec<ConceptOutput>,
    // Every parsed fact, annotated with statement/presentation metadata.
    facts: Vec<FactOutput>,
}
/// Assembles the per-filing output bundle from parsed facts and the
/// presentation linkbase.
///
/// Three phases:
/// 1. Deduplicate reporting periods by start/end/instant signature, giving
///    each a stable id derived from date + compact accession number.
/// 2. Annotate each fact with a statement classification — presentation role
///    first, concept-name fallback second — and group classified facts by
///    statement and concept.
/// 3. For each statement, emit rows in presentation order with one value per
///    period (preferring dimensionless facts), plus one concept catalog
///    entry per row.
fn materialize_taxonomy_statements(
    filing_id: i64,
    accession_number: &str,
    filing_date: &str,
    filing_type: &str,
    facts: &[ParsedFact],
    presentation: &[PresentationNode],
    label_by_concept: &HashMap<String, String>,
) -> MaterializedStatements {
    let compact_accession = accession_number.replace('-', "");
    // Phase 1: one PeriodOutput per distinct period signature.
    let mut period_by_signature = HashMap::<String, PeriodOutput>::new();
    for fact in facts {
        let signature = period_signature(fact);
        if period_by_signature.contains_key(&signature) {
            continue;
        }
        // Date component of the period id: end date, else instant, else
        // the filing date.
        let date = fact
            .period_end
            .clone()
            .or_else(|| fact.period_instant.clone())
            .unwrap_or_else(|| filing_date.to_string());
        let id = format!(
            "{date}-{compact_accession}-{}",
            period_by_signature.len() + 1
        );
        let period_label = if fact.period_instant.is_some() && fact.period_start.is_none() {
            "Instant".to_string()
        } else if fact.period_start.is_some() && fact.period_end.is_some() {
            format!(
                "{} to {}",
                fact.period_start.clone().unwrap_or_default(),
                fact.period_end.clone().unwrap_or_default()
            )
        } else {
            "Filing Period".to_string()
        };
        period_by_signature.insert(
            signature,
            PeriodOutput {
                id,
                filing_id,
                accession_number: accession_number.to_string(),
                filing_date: filing_date.to_string(),
                period_start: fact.period_start.clone(),
                period_end: fact
                    .period_end
                    .clone()
                    .or_else(|| fact.period_instant.clone()),
                filing_type: filing_type.to_string(),
                period_label,
            },
        );
    }
    // Sort periods by end date (filing date as fallback), then by id.
    let mut periods = period_by_signature.values().cloned().collect::<Vec<_>>();
    periods.sort_by(|left, right| {
        let left_key = left
            .period_end
            .clone()
            .unwrap_or_else(|| left.filing_date.clone());
        let right_key = right
            .period_end
            .clone()
            .unwrap_or_else(|| right.filing_date.clone());
        left_key
            .cmp(&right_key)
            .then_with(|| left.id.cmp(&right.id))
    });
    let period_id_by_signature = period_by_signature
        .iter()
        .map(|(signature, period)| (signature.clone(), period.id.clone()))
        .collect::<HashMap<_, _>>();
    // Index presentation nodes by concept for O(1) lookup per fact.
    let mut presentation_by_concept = HashMap::<String, Vec<&PresentationNode>>::new();
    for node in presentation {
        presentation_by_concept
            .entry(node.concept_key.clone())
            .or_default()
            .push(node);
    }
    // Phase 2: classify and enrich each fact; group classified facts as
    // statement -> concept -> [(1-based fact id, fact, presentation node)].
    let mut grouped_by_statement = empty_parsed_fact_map();
    let mut enriched_facts = Vec::new();
    for (index, fact) in facts.iter().enumerate() {
        let nodes = presentation_by_concept
            .get(&fact.concept_key)
            .cloned()
            .unwrap_or_default();
        let best_node = nodes.first().copied();
        // Role-based classification wins; local-name heuristics are the
        // fallback for concepts absent from the presentation linkbase.
        let statement_kind = best_node
            .and_then(|node| classify_statement_role(&node.role_uri))
            .or_else(|| concept_statement_fallback(&fact.local_name));
        let fact_output = FactOutput {
            concept_key: fact.concept_key.clone(),
            qname: fact.qname.clone(),
            namespace_uri: fact.namespace_uri.clone(),
            local_name: fact.local_name.clone(),
            data_type: fact.data_type.clone(),
            statement_kind: statement_kind.clone(),
            role_uri: best_node.map(|node| node.role_uri.clone()),
            authoritative_concept_key: None,
            mapping_method: None,
            surface_key: None,
            detail_parent_surface_key: None,
            kpi_key: None,
            residual_flag: false,
            context_id: fact.context_id.clone(),
            unit: fact.unit.clone(),
            decimals: fact.decimals.clone(),
            precision: fact.precision.clone(),
            nil: fact.nil,
            value_num: fact.value,
            period_start: fact.period_start.clone(),
            period_end: fact.period_end.clone(),
            period_instant: fact.period_instant.clone(),
            dimensions: fact.dimensions.clone(),
            is_dimensionless: fact.is_dimensionless,
            source_file: fact.source_file.clone(),
        };
        if let Some(statement_kind) = statement_kind.clone() {
            if let Some(statement_key) = statement_key_ref(&statement_kind) {
                grouped_by_statement
                    .entry(statement_key)
                    .or_default()
                    .entry(fact.concept_key.clone())
                    .or_default()
                    .push((index as i64 + 1, fact.clone(), best_node.cloned()));
            }
        }
        enriched_facts.push(fact_output);
    }
    // Phase 3: build ordered rows and concept catalog per statement.
    let mut statement_rows = empty_statement_row_map();
    let mut concepts = Vec::<ConceptOutput>::new();
    for statement_kind in statement_keys() {
        let concept_groups = grouped_by_statement
            .remove(statement_kind)
            .unwrap_or_default();
        // Union of concepts from this statement's presentation roles and
        // concepts that carried classified facts.
        let mut concept_keys = HashSet::<String>::new();
        for node in presentation.iter().filter(|node| {
            classify_statement_role(&node.role_uri).as_deref() == Some(statement_kind)
        }) {
            concept_keys.insert(node.concept_key.clone());
        }
        for concept_key in concept_groups.keys() {
            concept_keys.insert(concept_key.clone());
        }
        let mut ordered_concepts = concept_keys
            .into_iter()
            .map(|concept_key| {
                let nodes = presentation
                    .iter()
                    .filter(|node| {
                        node.concept_key == concept_key
                            && classify_statement_role(&node.role_uri).as_deref()
                                == Some(statement_kind)
                    })
                    .collect::<Vec<_>>();
                // Earliest presentation order / shallowest depth among the
                // concept's appearances in this statement.
                let order = nodes
                    .iter()
                    .map(|node| node.order)
                    .fold(f64::INFINITY, f64::min);
                let depth = nodes.iter().map(|node| node.depth).min().unwrap_or(0);
                let role_uri = nodes.first().map(|node| node.role_uri.clone());
                let parent_concept_key = nodes
                    .first()
                    .and_then(|node| node.parent_concept_key.clone());
                (concept_key, order, depth, role_uri, parent_concept_key)
            })
            .collect::<Vec<_>>();
        ordered_concepts.sort_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| left.0.cmp(&right.0))
        });
        for (concept_key, presentation_order, depth, role_uri, parent_concept_key) in
            ordered_concepts
        {
            let fact_group = concept_groups
                .get(&concept_key)
                .cloned()
                .unwrap_or_default();
            let (namespace_uri, local_name) = split_concept_key(&concept_key);
            let qname = fact_group
                .first()
                .map(|(_, fact, _)| fact.qname.clone())
                .unwrap_or_else(|| format!("unknown:{local_name}"));
            // Linkbase label if present, else a label derived from CamelCase.
            let label = label_by_concept
                .get(&concept_key)
                .cloned()
                .unwrap_or_else(|| local_name_to_label(&local_name));
            let mut values = BTreeMap::<String, Option<f64>>::new();
            let mut units = BTreeMap::<String, Option<String>>::new();
            let mut source_fact_ids = Vec::<i64>::new();
            let mut has_dimensions = false;
            // Re-group this concept's facts by period, then pick one
            // representative fact per period.
            let mut fact_groups = HashMap::<String, Vec<(i64, ParsedFact)>>::new();
            for (fact_id, fact, _) in fact_group.iter() {
                fact_groups
                    .entry(period_signature(fact))
                    .or_default()
                    .push((*fact_id, fact.clone()));
            }
            for (signature, grouped_facts) in fact_groups {
                let Some(period_id) = period_id_by_signature.get(&signature) else {
                    continue;
                };
                let preferred = pick_preferred_fact(&grouped_facts);
                if let Some((fact_id, fact)) = preferred {
                    values.insert(period_id.clone(), Some(fact.value));
                    units.insert(period_id.clone(), fact.unit.clone());
                    source_fact_ids.push(*fact_id);
                    has_dimensions = has_dimensions || !fact.is_dimensionless;
                }
            }
            let row = StatementRowOutput {
                key: concept_key.clone(),
                label: label.clone(),
                concept_key: concept_key.clone(),
                qname: qname.clone(),
                namespace_uri: namespace_uri.clone(),
                local_name: local_name.clone(),
                is_extension: !is_standard_namespace(&namespace_uri),
                statement: statement_kind.to_string(),
                role_uri: role_uri.clone(),
                // Integer order: scaled presentation order, or a large
                // sentinel for concepts with no presentation position.
                order: if presentation_order.is_finite() {
                    (presentation_order * 1000.0).round() as i64
                } else {
                    1_000_000
                },
                depth,
                parent_key: parent_concept_key.clone(),
                values,
                units,
                has_dimensions,
                source_fact_ids: {
                    source_fact_ids.sort();
                    source_fact_ids
                },
            };
            if let Some(statement_rows) = statement_rows.get_mut(statement_kind) {
                statement_rows.push(row.clone());
            }
            concepts.push(ConceptOutput {
                concept_key,
                qname,
                namespace_uri,
                local_name,
                label: Some(label),
                is_extension: !is_standard_namespace(&row.namespace_uri),
                balance: None,
                period_type: None,
                data_type: None,
                statement_kind: Some(statement_kind.to_string()),
                role_uri,
                authoritative_concept_key: None,
                mapping_method: None,
                surface_key: None,
                detail_parent_surface_key: None,
                kpi_key: None,
                residual_flag: false,
                presentation_order: if presentation_order.is_finite() {
                    Some(presentation_order)
                } else {
                    None
                },
                presentation_depth: Some(depth),
                parent_concept_key,
                is_abstract: presentation
                    .iter()
                    .find(|node| node.concept_key == row.concept_key)
                    .map(|node| node.is_abstract)
                    .unwrap_or(false),
            });
        }
    }
    MaterializedStatements {
        periods,
        statement_rows,
        concepts,
        facts: enriched_facts,
    }
}
/// Builds an empty fact-grouping map seeded with every statement key.
fn empty_parsed_fact_map(
) -> HashMap<&'static str, HashMap<String, Vec<(i64, ParsedFact, Option<PresentationNode>)>>> {
    statement_keys()
        .into_iter()
        .map(|key| (key, HashMap::new()))
        .collect()
}
/// Builds a statement-row map with an empty row list for each statement key.
fn empty_statement_row_map() -> StatementRowMap {
    let mut entries = Vec::new();
    for key in statement_keys() {
        entries.push((key.to_string(), Vec::new()));
    }
    entries.into_iter().collect()
}
/// Builds a surface-row map with an empty row list for each statement key.
fn empty_surface_row_map() -> SurfaceRowMap {
    let mut entries = Vec::new();
    for key in statement_keys() {
        entries.push((key.to_string(), Vec::new()));
    }
    entries.into_iter().collect()
}
/// Builds a detail-row map with an empty group tree for each statement key.
fn empty_detail_row_map() -> DetailRowStatementMap {
    let mut entries = Vec::new();
    for key in statement_keys() {
        entries.push((key.to_string(), BTreeMap::new()));
    }
    entries.into_iter().collect()
}
/// The five financial statements a concept can be classified into, in
/// canonical display order.
fn statement_keys() -> [&'static str; 5] {
    const KEYS: [&'static str; 5] = [
        "income",
        "balance",
        "cash_flow",
        "equity",
        "comprehensive_income",
    ];
    KEYS
}
/// Maps a runtime statement string onto its canonical `&'static str` key,
/// returning `None` for unrecognized values.
fn statement_key_ref(value: &str) -> Option<&'static str> {
    [
        "income",
        "balance",
        "cash_flow",
        "equity",
        "comprehensive_income",
    ]
    .into_iter()
    .find(|candidate| *candidate == value)
}
/// Chooses the representative fact among duplicates that share one period:
/// dimensionless facts win first, then the latest end/instant date, then the
/// largest absolute value.
fn pick_preferred_fact(grouped_facts: &[(i64, ParsedFact)]) -> Option<&(i64, ParsedFact)> {
    // 1 when the fact has no dimensions (preferred), 0 otherwise.
    fn dimension_rank(fact: &ParsedFact) -> i64 {
        if fact.is_dimensionless {
            1
        } else {
            0
        }
    }
    // End date, else instant, else "" — lexicographic compare works for
    // ISO-formatted dates.
    fn date_key(fact: &ParsedFact) -> String {
        fact.period_end
            .as_ref()
            .or(fact.period_instant.as_ref())
            .cloned()
            .unwrap_or_default()
    }
    grouped_facts.iter().max_by(|left, right| {
        dimension_rank(&left.1)
            .cmp(&dimension_rank(&right.1))
            .then_with(|| date_key(&left.1).cmp(&date_key(&right.1)))
            .then_with(|| {
                left.1
                    .value
                    .abs()
                    .partial_cmp(&right.1.value.abs())
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
    })
}
/// Canonical string identity for a fact's reporting period, used to group
/// facts sharing the same start/end/instant combination.
fn period_signature(fact: &ParsedFact) -> String {
    let start = fact.period_start.as_deref().unwrap_or_default();
    let end = fact.period_end.as_deref().unwrap_or_default();
    let instant = fact.period_instant.as_deref().unwrap_or_default();
    format!("start:{start}|end:{end}|instant:{instant}")
}
/// Splits a `namespace#LocalName` concept key at the LAST '#', falling back
/// to `urn:unknown` when no separator is present.
fn split_concept_key(concept_key: &str) -> (String, String) {
    match concept_key.rsplit_once('#') {
        Some((namespace_uri, local_name)) => {
            (namespace_uri.to_string(), local_name.to_string())
        }
        None => ("urn:unknown".to_string(), concept_key.to_string()),
    }
}
fn local_name_to_label(local_name: &str) -> String {
let spaced = Regex::new(r#"([a-z0-9])([A-Z])"#)
.unwrap()
.replace_all(local_name, "$1 $2")
.to_string();
Regex::new(r#"([A-Z]+)([A-Z][a-z])"#)
.unwrap()
.replace_all(&spaced, "$1 $2")
.replace('_', " ")
.trim()
.to_string()
}
fn classify_statement_role(role_uri: &str) -> Option<String> {
let normalized = role_uri.to_ascii_lowercase();
if Regex::new(r#"cash\s*flow|statementsof?cashflows|netcash"#)
.unwrap()
.is_match(&normalized)
{
return Some("cash_flow".to_string());
}
if Regex::new(r#"shareholders?|stockholders?|equity|retainedearnings"#)
.unwrap()
.is_match(&normalized)
{
return Some("equity".to_string());
}
if Regex::new(r#"comprehensive\s*income"#)
.unwrap()
.is_match(&normalized)
{
return Some("comprehensive_income".to_string());
}
if Regex::new(r#"balance\s*sheet|financial\s*position|assets?andliabilities"#)
.unwrap()
.is_match(&normalized)
{
return Some("balance".to_string());
}
if Regex::new(r#"operations|income\s*statement|statementsofincome|profit"#)
.unwrap()
.is_match(&normalized)
{
return Some("income".to_string());
}
None
}
fn concept_statement_fallback(local_name: &str) -> Option<String> {
let normalized = local_name.to_ascii_lowercase();
if Regex::new(r#"equity|retainedearnings|additionalpaidincapital"#)
.unwrap()
.is_match(&normalized)
{
return Some("equity".to_string());
}
if normalized.contains("comprehensiveincome") {
return Some("comprehensive_income".to_string());
}
if Regex::new(
r#"deferredpolicyacquisitioncosts(andvalueofbusinessacquired)?$|supplementaryinsuranceinformationdeferredpolicyacquisitioncosts$|deferredacquisitioncosts$"#,
)
.unwrap()
.is_match(&normalized)
{
return Some("balance".to_string());
}
if Regex::new(
r#"netcashprovidedbyusedin.*activities|increasedecreasein|paymentstoacquire|paymentsforcapitalimprovements$|paymentsfordepositsonrealestateacquisitions$|paymentsforrepurchase|paymentsofdividends|dividendscommonstockcash$|proceedsfrom|repaymentsofdebt|sharebasedcompensation$|allocatedsharebasedcompensationexpense$|depreciationdepletionandamortization$|depreciationamortizationandaccretionnet$|depreciationandamortization$|depreciationamortizationandother$|otheradjustmentstoreconcilenetincomelosstocashprovidedbyusedinoperatingactivities"#,
)
.unwrap()
.is_match(&normalized)
{
return Some("cash_flow".to_string());
}
if Regex::new(
r#"asset|liabilit|debt|financingreceivable|loansreceivable|deposits|allowanceforcreditloss|futurepolicybenefits|policyholderaccountbalances|unearnedpremiums|realestateinvestmentproperty|grossatcarryingvalue|investmentproperty"#,
)
.unwrap()
.is_match(&normalized)
{
return Some("balance".to_string());
}
if Regex::new(
r#"revenue|income|profit|expense|costof|leaseincome|rental|premiums|claims|underwriting|policyacquisition|interestincome|interestexpense|noninterest|leasedandrentedproperty"#,
)
.unwrap()
.is_match(&normalized)
{
return Some("income".to_string());
}
None
}
/// True when the namespace belongs to a standard taxonomy (US-GAAP, IFRS,
/// SEC DEI) rather than a company-specific extension. Case-insensitive.
fn is_standard_namespace(namespace_uri: &str) -> bool {
    let lower = namespace_uri.to_ascii_lowercase();
    ["us-gaap", "ifrs", "/dei/", "xbrl.sec.gov/dei"]
        .iter()
        .any(|needle| lower.contains(needle))
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pack_selector::FiscalPack;
    /// Builds a minimal annual-period fixture with the given id and end date.
    fn period(id: &str, period_end: &str) -> PeriodOutput {
        PeriodOutput {
            id: id.to_string(),
            filing_id: 1,
            accession_number: "0000000000-00-000001".to_string(),
            filing_date: "2025-12-31".to_string(),
            period_start: Some("2025-01-01".to_string()),
            period_end: Some(period_end.to_string()),
            filing_type: "10-K".to_string(),
            period_label: period_end.to_string(),
        }
    }
    /// Builds a statement-row fixture from a QName; `us-gaap`-prefixed
    /// names get the real 2024 namespace URI, everything else `urn:<prefix>`.
    /// `values` are (period id, USD amount) pairs.
    fn row(
        key: &str,
        qname: &str,
        statement: &str,
        order: i64,
        values: &[(&str, f64)],
    ) -> StatementRowOutput {
        let namespace_uri = qname
            .split_once(':')
            .map(|(prefix, _)| {
                if prefix == "us-gaap" {
                    "http://fasb.org/us-gaap/2024".to_string()
                } else {
                    format!("urn:{prefix}")
                }
            })
            .unwrap_or_else(|| "urn:unknown".to_string());
        let local_name = qname
            .split_once(':')
            .map(|(_, local_name)| local_name.to_string())
            .unwrap_or_else(|| qname.to_string());
        StatementRowOutput {
            key: key.to_string(),
            label: local_name_to_label(&local_name),
            concept_key: format!("{namespace_uri}#{local_name}"),
            qname: qname.to_string(),
            namespace_uri,
            local_name,
            is_extension: false,
            statement: statement.to_string(),
            role_uri: Some(statement.to_string()),
            order,
            depth: 0,
            parent_key: None,
            values: values
                .iter()
                .map(|(period_id, value)| (period_id.to_string(), Some(*value)))
                .collect(),
            units: values
                .iter()
                .map(|(period_id, _)| (period_id.to_string(), Some("iso4217:USD".to_string())))
                .collect(),
            has_dimensions: false,
            source_fact_ids: vec![order],
        }
    }
    // End-to-end check of the surface mapper: mapped rows keep their values,
    // SG&A and R&D roll up under operating expenses, and the extension
    // concept lands in the "unmapped" residual bucket.
    #[test]
    fn builds_compact_surface_rows_from_core_pack() {
        let periods = vec![period("2024", "2024-12-31"), period("2025", "2025-12-31")];
        let mut statement_rows = empty_statement_row_map();
        statement_rows.insert(
            "income".to_string(),
            vec![
                row(
                    "revenue-row",
                    "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
                    "income",
                    10,
                    &[("2024", 100.0), ("2025", 120.0)],
                ),
                row(
                    "operating-expenses-row",
                    "us-gaap:OperatingExpenses",
                    "income",
                    20,
                    &[("2024", 40.0), ("2025", 50.0)],
                ),
                row(
                    "sga-row",
                    "us-gaap:SellingGeneralAndAdministrativeExpense",
                    "income",
                    30,
                    &[("2024", 25.0), ("2025", 31.0)],
                ),
                row(
                    "rd-row",
                    "us-gaap:ResearchAndDevelopmentExpense",
                    "income",
                    40,
                    &[("2024", 15.0), ("2025", 19.0)],
                ),
                row(
                    "net-income-row",
                    "us-gaap:NetIncomeLoss",
                    "income",
                    50,
                    &[("2024", 22.0), ("2025", 30.0)],
                ),
                row(
                    "unmapped-row",
                    "company:OtherOperatingCharges",
                    "income",
                    60,
                    &[("2024", 3.0), ("2025", 4.0)],
                ),
            ],
        );
        statement_rows.insert(
            "balance".to_string(),
            vec![row(
                "assets-row",
                "us-gaap:Assets",
                "balance",
                70,
                &[("2024", 500.0), ("2025", 550.0)],
            )],
        );
        statement_rows.insert(
            "cash_flow".to_string(),
            vec![row(
                "ocf-row",
                "us-gaap:NetCashProvidedByUsedInOperatingActivities",
                "cash_flow",
                80,
                &[("2024", 60.0), ("2025", 65.0)],
            )],
        );
        let model = surface_mapper::build_compact_surface_model(
            &periods,
            &statement_rows,
            "us-gaap",
            FiscalPack::Core,
            vec![],
        )
        .expect("core pack should load and map");
        let income_surface_rows = model
            .surface_rows
            .get("income")
            .expect("income surface rows");
        let op_expenses = income_surface_rows
            .iter()
            .find(|row| row.key == "operating_expenses")
            .expect("operating expenses surface row");
        let revenue = income_surface_rows
            .iter()
            .find(|row| row.key == "revenue")
            .expect("revenue surface row");
        assert_eq!(revenue.values.get("2025").copied().flatten(), Some(120.0));
        assert_eq!(
            op_expenses.values.get("2024").copied().flatten(),
            Some(40.0)
        );
        assert_eq!(op_expenses.detail_count, Some(2));
        let operating_expense_details = model
            .detail_rows
            .get("income")
            .and_then(|groups| groups.get("operating_expenses"))
            .expect("operating expenses details");
        assert_eq!(operating_expense_details.len(), 2);
        assert!(operating_expense_details
            .iter()
            .any(|row| row.key == "sga-row"));
        assert!(operating_expense_details
            .iter()
            .any(|row| row.key == "rd-row"));
        let residual_rows = model
            .detail_rows
            .get("income")
            .and_then(|groups| groups.get("unmapped"))
            .expect("unmapped detail rows");
        assert_eq!(residual_rows.len(), 1);
        assert_eq!(residual_rows[0].key, "unmapped-row");
        assert!(residual_rows[0].residual_flag);
        let rd_mapping = model
            .concept_mappings
            .get("http://fasb.org/us-gaap/2024#ResearchAndDevelopmentExpense")
            .expect("rd mapping");
        assert_eq!(
            rd_mapping.detail_parent_surface_key.as_deref(),
            Some("operating_expenses")
        );
        assert_eq!(
            rd_mapping.surface_key.as_deref(),
            Some("operating_expenses")
        );
        let residual_mapping = model
            .concept_mappings
            .get("urn:company#OtherOperatingCharges")
            .expect("residual mapping");
        assert!(residual_mapping.residual_flag);
        assert_eq!(
            residual_mapping.detail_parent_surface_key.as_deref(),
            Some("unmapped")
        );
        assert_eq!(model.normalization_summary.surface_row_count, 6);
        assert_eq!(model.normalization_summary.detail_row_count, 3);
        assert_eq!(model.normalization_summary.unmapped_row_count, 1);
    }
    // Smoke test for the instance parser: one context, one unit, one fact.
    #[test]
    fn parses_basic_xbrl_facts_without_regex_backreferences() {
        let raw = r#"
        <xbrli:xbrl xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:us-gaap="http://fasb.org/us-gaap/2024">
          <xbrli:context id="c1">
            <xbrli:entity>
              <xbrli:identifier scheme="http://www.sec.gov/CIK">0000320193</xbrli:identifier>
            </xbrli:entity>
            <xbrli:period>
              <xbrli:startDate>2025-01-01</xbrli:startDate>
              <xbrli:endDate>2025-12-31</xbrli:endDate>
            </xbrli:period>
          </xbrli:context>
          <xbrli:unit id="u1">
            <xbrli:measure>iso4217:USD</xbrli:measure>
          </xbrli:unit>
          <us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax contextRef="c1" unitRef="u1" decimals="-6">1000</us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax>
        </xbrli:xbrl>
        "#;
        let parsed = parse_xbrl_instance(raw, Some("test.xml".to_string()));
        assert_eq!(parsed.facts.len(), 1);
        assert_eq!(
            parsed.facts[0].qname,
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax"
        );
        assert_eq!(parsed.facts[0].value, 1000.0);
        assert_eq!(parsed.facts[0].unit.as_deref(), Some("iso4217:USD"));
    }
    // Exercises the local-name fallback heuristics used when a concept has
    // no presentation role (banking/insurance/REIT pack concepts).
    #[test]
    fn classifies_pack_specific_concepts_without_presentation_roles() {
        assert_eq!(
            concept_statement_fallback(
                "FinancingReceivableExcludingAccruedInterestAfterAllowanceForCreditLoss"
            )
            .as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("Deposits").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("RealEstateInvestmentPropertyNet").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("DeferredPolicyAcquisitionCosts").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("DeferredPolicyAcquisitionCostsAndValueOfBusinessAcquired")
                .as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("IncreaseDecreaseInAccountsReceivable").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsOfDividends").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("RepaymentsOfDebt").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("ShareBasedCompensation").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsForCapitalImprovements").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsForDepositsOnRealEstateAcquisitions").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("LeaseIncome").as_deref(),
            Some("income")
        );
        assert_eq!(
            concept_statement_fallback("DirectCostsOfLeasedAndRentedPropertyOrEquipment")
                .as_deref(),
            Some("income")
        );
    }
}