- Add core.computed.json with 32 ratio definitions (filing + market derived) - Add Rust types for ComputedDefinition and ComputationSpec - Create generate-taxonomy.ts to generate TypeScript from Rust JSON - Generate lib/generated/ (gitignored) with surfaces, computed, kpis - Update financial-metrics.ts to use generated definitions - Add build-time generation via 'bun run generate' - Add taxonomy architecture documentation Two-phase ratio computation: - Filing-derived: margins, returns, per-share, growth (Rust computes) - Market-derived: valuation ratios (TypeScript computes with price data) All 32 ratios defined in core.computed.json: - Margins: gross, operating, ebitda, net, fcf - Returns: roa, roe, roic, roce - Financial health: debt_to_equity, net_debt_to_ebitda, cash_to_debt, current_ratio - Per-share: revenue, fcf, book_value - Growth: yoy metrics + 3y/5y cagr - Valuation: market_cap, ev, p/e, p/fcf, p/b, ev/sales, ev/ebitda, ev/fcf
2461 lines
81 KiB
Rust
2461 lines
81 KiB
Rust
use anyhow::{anyhow, Context, Result};
|
|
use once_cell::sync::Lazy;
|
|
use regex::Regex;
|
|
use reqwest::blocking::Client;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::collections::{BTreeMap, HashMap, HashSet};
|
|
|
|
mod kpi_mapper;
|
|
mod metrics;
|
|
mod pack_selector;
|
|
mod surface_mapper;
|
|
mod taxonomy_loader;
|
|
mod universal_income;
|
|
|
|
use taxonomy_loader::{ComputationSpec, ComputedDefinition};
|
|
|
|
#[cfg(feature = "with-crabrl")]
|
|
use crabrl as _;
|
|
|
|
/// Identifier for this parser implementation, stamped on every response.
pub const PARSER_ENGINE: &str = "fiscal-xbrl";
/// Parser version, taken from the crate version at compile time.
pub const PARSER_VERSION: &str = env!("CARGO_PKG_VERSION");
|
|
|
|
// Regexes for lenient, namespace-prefix-agnostic scanning of XBRL instance
// documents and linkbases. All are compiled once via `Lazy` and use `(?is)`
// (case-insensitive, dot-matches-newline) so they tolerate arbitrary
// formatting. NOTE(review): regex-based XML scanning is intentionally
// best-effort; it assumes attribute values contain no embedded quotes and
// that elements are not nested within elements of the same name.

// <context id="...">...</context>: captures (1) id, (2) inner XML.
static CONTEXT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?context>"#).unwrap()
});
// <unit id="...">...</unit>: captures (1) id, (2) inner XML.
static UNIT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?unit>"#).unwrap()
});
// A fact element carrying a contextRef: captures (1) namespace prefix,
// (2) local name, (3) raw attribute string, (4) body text.
static FACT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>(.*?)</[a-zA-Z0-9_\-]+:[a-zA-Z0-9_\-.]+>"#).unwrap()
});
// <explicitMember dimension="...">member</explicitMember>: (1) axis, (2) member.
static EXPLICIT_MEMBER_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?explicitMember>"#).unwrap()
});
// <typedMember dimension="...">...</typedMember>: (1) axis, (2) typed value XML.
static TYPED_MEMBER_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?typedMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?typedMember>"#).unwrap()
});
// <identifier scheme="...">value</identifier>: (1) scheme URI, (2) entity id.
static IDENTIFIER_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?identifier\b[^>]*\bscheme=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?identifier>"#).unwrap()
});
// <segment>...</segment>: (1) inner XML (dimension members).
static SEGMENT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?segment\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?segment>"#)
        .unwrap()
});
// <scenario>...</scenario>: (1) inner XML.
static SCENARIO_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?scenario\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?scenario>"#)
        .unwrap()
});
// Period boundary elements inside a context: each captures the date text.
static START_DATE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?startDate>(.*?)</(?:[a-z0-9_\-]+:)?startDate>"#).unwrap()
});
static END_DATE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?endDate>(.*?)</(?:[a-z0-9_\-]+:)?endDate>"#).unwrap()
});
static INSTANT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?instant>(.*?)</(?:[a-z0-9_\-]+:)?instant>"#).unwrap()
});
// <measure>...</measure> inside a unit: captures the measure QName.
static MEASURE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?measure>(.*?)</(?:[a-z0-9_\-]+:)?measure>"#).unwrap()
});
// Label linkbase: <labelLink>...</labelLink> — (1) inner XML.
static LABEL_LINK_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?labelLink>"#)
        .unwrap()
});
// Presentation linkbase: <presentationLink ...>...</presentationLink> —
// (1) raw attribute string (role), (2) inner XML.
static PRESENTATION_LINK_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>(.*?)</(?:[a-z0-9_\-]+:)?presentationLink>"#).unwrap()
});
// <loc .../> locator, self-closing or paired: (1) raw attribute string.
static LOC_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?loc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?loc>)?"#).unwrap()
});
// <label ...>text</label> resource: (1) attributes, (2) label text.
static LABEL_RESOURCE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?label\b([^>]*)>(.*?)</(?:[a-z0-9_\-]+:)?label>"#).unwrap()
});
// <labelArc .../>: (1) raw attribute string (from/to).
static LABEL_ARC_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?labelArc>)?"#)
        .unwrap()
});
// <presentationArc .../>: (1) raw attribute string (from/to/order).
static PRESENTATION_ARC_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?presentationArc>)?"#).unwrap()
});
// Generic key="value" attribute pairs within a raw attribute string.
static ATTR_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"([a-zA-Z0-9:_\-]+)=["']([^"']+)["']"#).unwrap());
|
|
|
|
/// Input payload describing a single SEC filing to hydrate.
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct HydrateFilingRequest {
    pub filing_id: i64,
    pub ticker: String,
    /// SEC Central Index Key; may carry leading zeros or punctuation
    /// (normalized before building EDGAR URLs).
    pub cik: String,
    /// Accession number; dashes are stripped when building directory URLs.
    pub accession_number: String,
    pub filing_date: String,
    pub filing_type: String,
    /// Direct URL to the filing's primary document, if known; used to derive
    /// the EDGAR directory and as a fallback asset.
    pub filing_url: Option<String>,
    /// Primary document filename hint, used when scoring instance candidates.
    pub primary_document: Option<String>,
    /// Optional local cache directory. NOTE(review): not consumed in this
    /// chunk — confirm usage elsewhere before relying on it.
    pub cache_dir: Option<String>,
}
|
|
|
|
/// Full result of hydrating one filing: parse status, discovered assets,
/// materialized statement/surface/detail/KPI rows, raw facts, and summaries.
#[derive(Debug, Serialize)]
pub struct HydrateFilingResponse {
    pub filing_id: i64,
    /// Ticker, uppercased by the hydrator.
    pub ticker: String,
    pub filing_date: String,
    pub filing_type: String,
    /// "ready", "partial", or "failed" (see `hydrate_filing`).
    pub parse_status: String,
    /// Populated whenever `parse_status` is "failed"; otherwise the first
    /// non-fatal linkbase fetch error, if any.
    pub parse_error: Option<String>,
    /// Data provenance, e.g. "xbrl_instance" or "xbrl_instance_with_linkbase".
    pub source: String,
    pub parser_engine: String,
    pub parser_version: String,
    /// Inferred accounting regime: "us-gaap", "ifrs-full", or "unknown".
    pub taxonomy_regime: String,
    pub fiscal_pack: Option<String>,
    pub periods: Vec<PeriodOutput>,
    /// Statement rows exactly as materialized (pre-normalization snapshot).
    pub faithful_rows: StatementRowMap,
    pub statement_rows: StatementRowMap,
    pub surface_rows: SurfaceRowMap,
    pub detail_rows: DetailRowStatementMap,
    pub kpi_rows: Vec<KpiRowOutput>,
    /// Ratio definitions from the selected computed pack (filing-derived only;
    /// market-derived ratios are computed downstream).
    pub computed_definitions: Vec<ComputedDefinitionOutput>,
    pub contexts: Vec<ContextOutput>,
    pub derived_metrics: FilingMetrics,
    pub validation_result: ValidationResultOutput,
    pub facts_count: usize,
    pub concepts_count: usize,
    /// Number of distinct axis::member pairs across all facts.
    pub dimensions_count: usize,
    pub assets: Vec<AssetOutput>,
    pub concepts: Vec<ConceptOutput>,
    pub facts: Vec<FactOutput>,
    pub metric_validations: Vec<MetricValidationOutput>,
    pub normalization_summary: NormalizationSummaryOutput,
}
|
|
|
|
/// Headline metrics derived from the parsed facts; any may be absent when the
/// corresponding concepts are missing from the filing.
#[derive(Debug, Clone, Serialize, Default)]
pub struct FilingMetrics {
    pub revenue: Option<f64>,
    #[serde(rename = "netIncome")]
    pub net_income: Option<f64>,
    #[serde(rename = "totalAssets")]
    pub total_assets: Option<f64>,
    pub cash: Option<f64>,
    pub debt: Option<f64>,
}
|
|
|
|
/// Outcome of the (optional, downstream) validation pass. The hydrator itself
/// always emits status "not_run" with no checks.
#[derive(Debug, Clone, Serialize)]
pub struct ValidationResultOutput {
    pub status: String,
    /// Free-form per-check payloads; schema owned by the validator.
    pub checks: Vec<serde_json::Value>,
    #[serde(rename = "validatedAt")]
    pub validated_at: Option<String>,
}
|
|
|
|
/// One file discovered in the filing's EDGAR directory listing.
#[derive(Debug, Clone, Serialize)]
pub struct AssetOutput {
    /// Classification: "instance", "presentation", "label", "calculation",
    /// "definition", "schema", "pdf", or "other".
    pub asset_type: String,
    pub name: String,
    pub url: String,
    pub size_bytes: Option<i64>,
    /// Heuristic ranking score; only set for instance and pdf assets.
    pub score: Option<f64>,
    /// Whether this asset was chosen for fetching (best instance; all
    /// presentation/label linkbases).
    pub is_selected: bool,
}
|
|
|
|
/// A reporting period materialized from the filing's contexts.
#[derive(Debug, Clone, Serialize)]
pub struct PeriodOutput {
    pub id: String,
    pub filing_id: i64,
    pub accession_number: String,
    pub filing_date: String,
    /// Start/end are set for duration periods; instant periods carry only end.
    pub period_start: Option<String>,
    pub period_end: Option<String>,
    pub filing_type: String,
    pub period_label: String,
}
|
|
|
|
/// Serialized view of one XBRL `<context>` element.
#[derive(Debug, Clone, Serialize)]
pub struct ContextOutput {
    pub context_id: String,
    pub entity_identifier: Option<String>,
    pub entity_scheme: Option<String>,
    /// Duration contexts set start+end; instant contexts set `period_instant`.
    pub period_start: Option<String>,
    pub period_end: Option<String>,
    pub period_instant: Option<String>,
    /// Raw segment/scenario content preserved as JSON for downstream use.
    pub segment_json: Option<serde_json::Value>,
    pub scenario_json: Option<serde_json::Value>,
}
|
|
|
|
/// One line item of a materialized financial statement, keyed per period.
#[derive(Debug, Clone, Serialize)]
pub struct StatementRowOutput {
    pub key: String,
    pub label: String,
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    /// True when the concept comes from a company extension namespace rather
    /// than a standard taxonomy.
    pub is_extension: bool,
    /// Statement kind this row belongs to (key of the StatementRowMap).
    pub statement: String,
    pub role_uri: Option<String>,
    /// Presentation ordering and nesting within the statement.
    pub order: i64,
    pub depth: i64,
    pub parent_key: Option<String>,
    /// Period id -> value; None means no fact reported for that period.
    pub values: BTreeMap<String, Option<f64>>,
    /// Period id -> unit of the reported value.
    pub units: BTreeMap<String, Option<String>>,
    pub has_dimensions: bool,
    pub source_fact_ids: Vec<i64>,
}
|
|
|
|
/// A normalized "surface" row: a canonical line item mapped from one or more
/// raw statement rows/concepts, suitable for cross-company comparison.
#[derive(Debug, Clone, Serialize)]
pub struct SurfaceRowOutput {
    pub key: String,
    pub label: String,
    pub category: String,
    pub template_section: String,
    pub order: i64,
    pub unit: String,
    /// Period id -> value after mapping/aggregation.
    pub values: BTreeMap<String, Option<f64>>,
    /// Provenance: contributing concepts, statement rows, and fact ids.
    pub source_concepts: Vec<String>,
    pub source_row_keys: Vec<String>,
    pub source_fact_ids: Vec<i64>,
    /// Formula identifier when the value is derived rather than mapped.
    pub formula_key: Option<String>,
    pub has_dimensions: bool,
    /// Period id -> the specific source row chosen for that period.
    pub resolved_source_row_keys: BTreeMap<String, Option<String>>,
    pub statement: Option<String>,
    /// Number of detail rows attached under this surface row, if any.
    pub detail_count: Option<i64>,
    /// How the mapping was resolved (mapper-defined vocabulary).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub resolution_method: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub confidence: Option<String>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub warning_codes: Vec<String>,
}
|
|
|
|
/// A drill-down row attached beneath a surface row (e.g. a segment breakout).
#[derive(Debug, Clone, Serialize)]
pub struct DetailRowOutput {
    pub key: String,
    /// Surface row this detail belongs to.
    pub parent_surface_key: String,
    pub label: String,
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    pub unit: Option<String>,
    /// Period id -> value.
    pub values: BTreeMap<String, Option<f64>>,
    pub source_fact_ids: Vec<i64>,
    pub is_extension: bool,
    /// Human-readable summaries of the dimensions qualifying this row.
    pub dimensions_summary: Vec<String>,
    /// True when this row is a computed remainder rather than a reported fact.
    pub residual_flag: bool,
}
|
|
|
|
/// A KPI row extracted by the KPI mapper (possibly dimension-qualified, e.g.
/// a per-segment metric).
#[derive(Debug, Clone, Serialize)]
pub struct KpiRowOutput {
    pub key: String,
    pub label: String,
    pub category: String,
    pub unit: String,
    pub order: i64,
    /// Segment/axis/member identify the dimensional slice, when applicable.
    pub segment: Option<String>,
    pub axis: Option<String>,
    pub member: Option<String>,
    /// Period id -> value.
    pub values: BTreeMap<String, Option<f64>>,
    pub source_concepts: Vec<String>,
    pub source_fact_ids: Vec<i64>,
    /// How the value was obtained (mapper-defined vocabulary).
    pub provenance_type: String,
    pub has_dimensions: bool,
}
|
|
|
|
/// Serialized form of a ratio/metric definition from the computed pack
/// (e.g. core.computed.json). Mirrors `taxonomy_loader::ComputedDefinition`.
#[derive(Debug, Clone, Serialize)]
pub struct ComputedDefinitionOutput {
    pub key: String,
    pub label: String,
    pub category: String,
    pub order: i64,
    pub unit: String,
    /// The formula describing how to compute this metric.
    pub computation: ComputationSpecOutput,
    /// Cadences (e.g. annual/quarterly) the metric supports, if restricted.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub supported_cadences: Vec<String>,
    /// External inputs (e.g. market price) needed to evaluate the metric;
    /// non-empty for market-derived ratios computed downstream.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub requires_external_data: Vec<String>,
}
|
|
|
|
/// Formula variants for computed metrics. Serialized with an internal
/// `"type"` tag in snake_case (e.g. `{"type":"yoy_growth","source":...}`).
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ComputationSpecOutput {
    /// numerator / denominator of two metric keys.
    Ratio {
        numerator: String,
        denominator: String,
    },
    /// Year-over-year growth of a source metric.
    YoyGrowth {
        source: String,
    },
    /// Compound annual growth rate of a source metric over `years`.
    Cagr {
        source: String,
        years: i64,
    },
    /// Source metric divided by a share-count metric.
    PerShare {
        source: String,
        shares_key: String,
    },
    /// Free-form formula expression, evaluated downstream.
    Simple {
        formula: String,
    },
}
|
|
|
|
impl From<&ComputationSpec> for ComputationSpecOutput {
|
|
fn from(spec: &ComputationSpec) -> Self {
|
|
match spec {
|
|
ComputationSpec::Ratio {
|
|
numerator,
|
|
denominator,
|
|
} => ComputationSpecOutput::Ratio {
|
|
numerator: numerator.clone(),
|
|
denominator: denominator.clone(),
|
|
},
|
|
ComputationSpec::YoyGrowth { source } => ComputationSpecOutput::YoyGrowth {
|
|
source: source.clone(),
|
|
},
|
|
ComputationSpec::Cagr { source, years } => ComputationSpecOutput::Cagr {
|
|
source: source.clone(),
|
|
years: *years,
|
|
},
|
|
ComputationSpec::PerShare { source, shares_key } => ComputationSpecOutput::PerShare {
|
|
source: source.clone(),
|
|
shares_key: shares_key.clone(),
|
|
},
|
|
ComputationSpec::Simple { formula } => ComputationSpecOutput::Simple {
|
|
formula: formula.clone(),
|
|
},
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Convert a loader-side computed definition into its serializable mirror.
/// Pure field copy; the nested computation spec is converted via its own
/// `From` impl.
impl From<&ComputedDefinition> for ComputedDefinitionOutput {
    fn from(def: &ComputedDefinition) -> Self {
        Self {
            key: def.key.clone(),
            label: def.label.clone(),
            category: def.category.clone(),
            order: def.order,
            unit: def.unit.clone(),
            computation: (&def.computation).into(),
            supported_cadences: def.supported_cadences.clone(),
            requires_external_data: def.requires_external_data.clone(),
        }
    }
}
|
|
|
|
/// A taxonomy concept observed in the filing, plus where the mappers placed it.
#[derive(Debug, Clone, Serialize)]
pub struct ConceptOutput {
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    pub label: Option<String>,
    /// True for company-extension concepts.
    pub is_extension: bool,
    /// XBRL schema metadata, when available from the taxonomy.
    pub balance: Option<String>,
    pub period_type: Option<String>,
    pub data_type: Option<String>,
    pub statement_kind: Option<String>,
    pub role_uri: Option<String>,
    /// Canonical concept this one was mapped onto, and how.
    pub authoritative_concept_key: Option<String>,
    pub mapping_method: Option<String>,
    /// Destination rows assigned by the surface/KPI mappers, if any.
    pub surface_key: Option<String>,
    pub detail_parent_surface_key: Option<String>,
    pub kpi_key: Option<String>,
    pub residual_flag: bool,
    /// Position in the presentation linkbase hierarchy, when present.
    pub presentation_order: Option<f64>,
    pub presentation_depth: Option<i64>,
    pub parent_concept_key: Option<String>,
    pub is_abstract: bool,
}
|
|
|
|
/// A single numeric XBRL fact with its context, unit, dimensions, and the
/// mapping destinations assigned by the surface/KPI mappers.
#[derive(Debug, Clone, Serialize)]
pub struct FactOutput {
    pub concept_key: String,
    pub qname: String,
    pub namespace_uri: String,
    pub local_name: String,
    pub data_type: Option<String>,
    pub statement_kind: Option<String>,
    pub role_uri: Option<String>,
    /// Canonical concept this fact's concept was mapped onto, and how.
    pub authoritative_concept_key: Option<String>,
    pub mapping_method: Option<String>,
    pub surface_key: Option<String>,
    pub detail_parent_surface_key: Option<String>,
    pub kpi_key: Option<String>,
    pub residual_flag: bool,
    pub context_id: String,
    /// Resolved unit measure (or the raw unitRef when unresolved).
    pub unit: Option<String>,
    /// Raw decimals/precision attributes as reported.
    pub decimals: Option<String>,
    pub precision: Option<String>,
    /// True when xsi:nil was set on the fact element.
    pub nil: bool,
    pub value_num: f64,
    /// Period copied from the fact's context (duration or instant).
    pub period_start: Option<String>,
    pub period_end: Option<String>,
    pub period_instant: Option<String>,
    pub dimensions: Vec<DimensionOutput>,
    /// True when the fact's context carries no dimension members.
    pub is_dimensionless: bool,
    /// Name of the instance document the fact was parsed from.
    pub source_file: Option<String>,
}
|
|
|
|
/// One axis/member pair qualifying a fact (explicit or typed dimension).
#[derive(Debug, Clone, Serialize)]
pub struct DimensionOutput {
    pub axis: String,
    pub member: String,
}
|
|
|
|
/// Result of cross-checking one taxonomy-derived metric against an
/// LLM-extracted value. NOTE(review): emitted empty by `hydrate_filing`;
/// populated by a separate validation stage.
#[derive(Debug, Clone, Serialize)]
pub struct MetricValidationOutput {
    pub metric_key: String,
    pub taxonomy_value: Option<f64>,
    pub llm_value: Option<f64>,
    pub absolute_diff: Option<f64>,
    pub relative_diff: Option<f64>,
    pub status: String,
    /// PDF pages supporting the LLM value.
    pub evidence_pages: Vec<i64>,
    pub pdf_url: Option<String>,
    /// LLM provider/model used for the extraction.
    pub provider: Option<String>,
    pub model: Option<String>,
    pub error: Option<String>,
}
|
|
|
|
/// Aggregate counts and warnings produced by the normalization pipeline.
#[derive(Debug, Clone, Serialize, Default)]
pub struct NormalizationSummaryOutput {
    pub surface_row_count: usize,
    pub detail_row_count: usize,
    pub kpi_row_count: usize,
    /// Rows the mappers could not place anywhere.
    pub unmapped_row_count: usize,
    /// Unmapped rows considered material (mapper-defined threshold).
    pub material_unmapped_row_count: usize,
    /// De-duplicated warning messages from all normalization stages.
    pub warnings: Vec<String>,
}
|
|
|
|
/// Statement kind -> ordered statement rows.
pub type StatementRowMap = BTreeMap<String, Vec<StatementRowOutput>>;
/// Statement kind -> ordered surface rows.
pub type SurfaceRowMap = BTreeMap<String, Vec<SurfaceRowOutput>>;
/// Statement kind -> parent surface key -> detail rows.
pub type DetailRowStatementMap = BTreeMap<String, BTreeMap<String, Vec<DetailRowOutput>>>;
|
|
|
|
/// Internal parse result for one `<context>` element.
#[derive(Debug, Clone)]
struct ParsedContext {
    id: String,
    entity_identifier: Option<String>,
    entity_scheme: Option<String>,
    /// Duration contexts set start+end; instant contexts set `period_instant`.
    period_start: Option<String>,
    period_end: Option<String>,
    period_instant: Option<String>,
    /// Axis/member pairs from segment and scenario content.
    dimensions: Vec<DimensionOutput>,
    /// Raw segment/scenario content preserved as JSON.
    segment: Option<serde_json::Value>,
    scenario: Option<serde_json::Value>,
}
|
|
|
|
/// Internal parse result for one `<unit>` element; only the first measure is
/// retained.
#[derive(Debug, Clone)]
struct ParsedUnit {
    measure: Option<String>,
}
|
|
|
|
/// Internal representation of one numeric fact parsed from the instance,
/// with its context's period and dimensions already denormalized onto it.
#[derive(Debug, Clone)]
struct ParsedFact {
    /// "{namespace_uri}#{local_name}".
    concept_key: String,
    /// "{prefix}:{local_name}" as written in the document.
    qname: String,
    namespace_uri: String,
    local_name: String,
    data_type: Option<String>,
    context_id: String,
    /// Resolved unit measure (or raw unitRef when unresolved).
    unit: Option<String>,
    decimals: Option<String>,
    precision: Option<String>,
    /// True when xsi:nil was set.
    nil: bool,
    value: f64,
    period_start: Option<String>,
    period_end: Option<String>,
    period_instant: Option<String>,
    dimensions: Vec<DimensionOutput>,
    is_dimensionless: bool,
    /// Instance document the fact came from.
    source_file: Option<String>,
}
|
|
|
|
/// One node of the presentation-linkbase hierarchy for a given role.
#[derive(Debug, Clone)]
struct PresentationNode {
    concept_key: String,
    role_uri: String,
    /// Sibling ordering within the parent (XBRL arc order attribute).
    order: f64,
    depth: i64,
    parent_concept_key: Option<String>,
    is_abstract: bool,
}
|
|
|
|
/// Hydrate a single SEC filing end-to-end: discover its EDGAR assets, fetch
/// and parse the XBRL instance plus presentation/label linkbases, materialize
/// statement/surface/detail/KPI rows, and derive summary metrics.
///
/// Returns a structured "failed" response (rather than `Err`) when no XBRL
/// instance asset can be found, so callers always receive a payload; `Err` is
/// reserved for infrastructure failures (HTTP client build, instance fetch,
/// downstream mapper errors).
pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingResponse> {
    let client = Client::builder()
        .user_agent("Fiscal Clone <support@fiscal.local>")
        .build()
        .context("unable to build HTTP client")?;

    let discovered = discover_filing_assets(&input, &client)?;
    let empty_rows = empty_statement_row_map();
    let empty_surface_rows = empty_surface_row_map();
    let empty_detail_rows = empty_detail_row_map();
    // Validation is not performed here; a downstream stage fills this in.
    let validation_result = ValidationResultOutput {
        status: "not_run".to_string(),
        checks: vec![],
        validated_at: None,
    };

    // No selected instance document => structured failure response, not Err.
    let Some(instance_asset) = discovered
        .assets
        .iter()
        .find(|asset| asset.asset_type == "instance" && asset.is_selected)
        .cloned()
    else {
        return Ok(HydrateFilingResponse {
            filing_id: input.filing_id,
            ticker: input.ticker.to_uppercase(),
            filing_date: input.filing_date,
            filing_type: input.filing_type,
            parse_status: "failed".to_string(),
            parse_error: Some("No XBRL instance found".to_string()),
            source: "legacy_html_fallback".to_string(),
            parser_engine: PARSER_ENGINE.to_string(),
            parser_version: PARSER_VERSION.to_string(),
            taxonomy_regime: "unknown".to_string(),
            fiscal_pack: Some("core".to_string()),
            periods: vec![],
            faithful_rows: empty_rows.clone(),
            statement_rows: empty_rows,
            surface_rows: empty_surface_rows,
            detail_rows: empty_detail_rows,
            kpi_rows: vec![],
            computed_definitions: vec![],
            contexts: vec![],
            derived_metrics: FilingMetrics::default(),
            validation_result,
            facts_count: 0,
            concepts_count: 0,
            dimensions_count: 0,
            assets: discovered.assets,
            concepts: vec![],
            facts: vec![],
            metric_validations: vec![],
            normalization_summary: NormalizationSummaryOutput {
                surface_row_count: 0,
                detail_row_count: 0,
                kpi_row_count: 0,
                unmapped_row_count: 0,
                material_unmapped_row_count: 0,
                warnings: vec![],
            },
        });
    };

    let instance_text = fetch_text(&client, &instance_asset.url)
        .context("fetch request failed for XBRL instance")?;
    let parsed_instance = parse_xbrl_instance(&instance_text, Some(instance_asset.name.clone()));

    let mut label_by_concept = HashMap::new();
    let mut presentation = Vec::new();
    let mut source = "xbrl_instance".to_string();
    let mut parse_error = None;

    // Linkbases are best-effort: a fetch failure records the first error but
    // does not abort hydration.
    for asset in discovered.assets.iter().filter(|asset| {
        asset.is_selected && (asset.asset_type == "presentation" || asset.asset_type == "label")
    }) {
        match fetch_text(&client, &asset.url) {
            Ok(content) => {
                if asset.asset_type == "presentation" {
                    let parsed = parse_presentation_linkbase(&content);
                    if !parsed.is_empty() {
                        source = "xbrl_instance_with_linkbase".to_string();
                    }
                    presentation.extend(parsed);
                } else {
                    // First label wins: keep the earliest label per concept.
                    for (key, value) in parse_label_linkbase(&content) {
                        label_by_concept.entry(key).or_insert(value);
                    }
                }
            }
            Err(error) => {
                if parse_error.is_none() {
                    parse_error = Some(error.to_string());
                }
            }
        }
    }

    let materialized = materialize_taxonomy_statements(
        input.filing_id,
        &input.accession_number,
        &input.filing_date,
        &input.filing_type,
        &parsed_instance.facts,
        &presentation,
        &label_by_concept,
    );
    let taxonomy_regime = infer_taxonomy_regime(&parsed_instance.facts);
    let mut concepts = materialized.concepts;
    let mut facts = materialized.facts;
    // Choose the fiscal "pack" (taxonomy flavor) that best matches the rows,
    // then build the normalized surface model on top of the raw statements.
    let pack_selection = pack_selector::select_fiscal_pack(&materialized.statement_rows, &facts);
    let fiscal_pack = pack_selection.pack.as_str().to_string();
    let mut compact_model = surface_mapper::build_compact_surface_model(
        &materialized.periods,
        &materialized.statement_rows,
        &taxonomy_regime,
        pack_selection.pack,
        pack_selection.warnings,
    )?;
    universal_income::apply_universal_income_rows(
        &materialized.periods,
        &materialized.statement_rows,
        &facts,
        &taxonomy_regime,
        pack_selection.pack,
        &mut compact_model,
    )?;
    let kpi_result =
        kpi_mapper::build_taxonomy_kpis(&materialized.periods, &facts, pack_selection.pack)?;
    compact_model.normalization_summary.kpi_row_count = kpi_result.rows.len();
    // Merge KPI warnings, de-duplicated against the existing summary.
    for warning in kpi_result.warnings {
        if !compact_model
            .normalization_summary
            .warnings
            .contains(&warning)
        {
            compact_model.normalization_summary.warnings.push(warning);
        }
    }
    // Fold KPI mapping assignments into the concept mappings, then stamp the
    // final mappings back onto concepts and facts.
    surface_mapper::merge_mapping_assignments(
        &mut compact_model.concept_mappings,
        kpi_result.mapping_assignments,
    );
    surface_mapper::apply_mapping_assignments(
        &mut concepts,
        &mut facts,
        &compact_model.concept_mappings,
    );

    // Load ratio definitions for the selected pack, falling back to "core".
    let computed_pack = taxonomy_loader::load_computed_pack(pack_selection.pack)
        .ok()
        .or_else(|| taxonomy_loader::load_computed_pack(pack_selector::FiscalPack::Core).ok());
    let computed_definitions: Vec<ComputedDefinitionOutput> = computed_pack
        .map(|pack| {
            pack.computed
                .iter()
                .map(ComputedDefinitionOutput::from)
                .collect()
        })
        .unwrap_or_default();

    // "ready" needs both statement rows and facts; facts alone is "partial".
    let has_rows = materialized
        .statement_rows
        .values()
        .map(|rows| rows.len())
        .sum::<usize>()
        > 0;
    let has_facts = !facts.is_empty();
    let parse_status = if has_rows && has_facts {
        "ready"
    } else if has_facts {
        "partial"
    } else {
        "failed"
    };

    Ok(HydrateFilingResponse {
        filing_id: input.filing_id,
        ticker: input.ticker.to_uppercase(),
        filing_date: input.filing_date,
        filing_type: input.filing_type,
        parse_status: parse_status.to_string(),
        // A failed parse always carries an error message.
        parse_error: if parse_status == "failed" {
            Some(parse_error.unwrap_or_else(|| "No XBRL facts extracted".to_string()))
        } else {
            parse_error
        },
        source,
        parser_engine: PARSER_ENGINE.to_string(),
        parser_version: PARSER_VERSION.to_string(),
        taxonomy_regime,
        fiscal_pack: Some(fiscal_pack),
        periods: materialized.periods,
        faithful_rows: materialized.statement_rows.clone(),
        statement_rows: materialized.statement_rows,
        surface_rows: compact_model.surface_rows,
        detail_rows: compact_model.detail_rows,
        kpi_rows: kpi_result.rows,
        computed_definitions,
        contexts: parsed_instance.contexts,
        derived_metrics: metrics::derive_metrics(&facts),
        validation_result,
        facts_count: facts.len(),
        concepts_count: concepts.len(),
        // Count distinct axis::member pairs across all facts.
        dimensions_count: facts
            .iter()
            .flat_map(|fact| {
                fact.dimensions
                    .iter()
                    .map(|dimension| format!("{}::{}", dimension.axis, dimension.member))
            })
            .collect::<HashSet<_>>()
            .len(),
        assets: discovered.assets,
        concepts,
        facts,
        metric_validations: vec![],
        normalization_summary: compact_model.normalization_summary,
    })
}
|
|
|
|
fn infer_taxonomy_regime(facts: &[ParsedFact]) -> String {
|
|
if facts
|
|
.iter()
|
|
.any(|fact| fact.namespace_uri.to_lowercase().contains("us-gaap"))
|
|
{
|
|
return "us-gaap".to_string();
|
|
}
|
|
|
|
if facts
|
|
.iter()
|
|
.any(|fact| fact.namespace_uri.to_lowercase().contains("ifrs"))
|
|
{
|
|
return "ifrs-full".to_string();
|
|
}
|
|
|
|
"unknown".to_string()
|
|
}
|
|
|
|
/// Top-level shape of EDGAR's `index.json` directory listing.
#[derive(Debug, Deserialize)]
struct FilingDirectoryPayload {
    directory: Option<FilingDirectory>,
}
|
|
|
|
/// `directory` object of EDGAR's `index.json`; `item` lists the files.
#[derive(Debug, Deserialize)]
struct FilingDirectory {
    item: Option<Vec<FilingDirectoryItem>>,
}
|
|
|
|
/// One file entry in the EDGAR directory listing. `size` is loosely typed
/// because EDGAR serves it as either a number or a string.
#[derive(Debug, Deserialize)]
struct FilingDirectoryItem {
    name: Option<String>,
    size: Option<serde_json::Value>,
}
|
|
|
|
/// Result of `discover_filing_assets`: classified, scored, and selected files.
#[derive(Debug)]
struct DiscoveredAssets {
    assets: Vec<AssetOutput>,
}
|
|
|
|
/// Discover, classify, score, and select the filing's assets from EDGAR.
///
/// Flow: resolve the directory URL, fetch `index.json` (best-effort), build
/// one `AssetOutput` per listed file, fall back to the filing's primary URL
/// when the listing is empty/unavailable, then score instance/pdf candidates
/// and mark the winning instance plus every presentation/label linkbase as
/// selected. Returns an empty asset list (not an error) when no directory URL
/// can be resolved.
fn discover_filing_assets(
    input: &HydrateFilingRequest,
    client: &Client,
) -> Result<DiscoveredAssets> {
    let Some(directory_url) = resolve_filing_directory_url(
        input.filing_url.as_deref(),
        &input.cik,
        &input.accession_number,
    ) else {
        return Ok(DiscoveredAssets { assets: vec![] });
    };

    // Directory listing is best-effort; a fetch/parse failure yields None.
    let payload =
        fetch_json::<FilingDirectoryPayload>(client, &format!("{directory_url}index.json")).ok();
    let mut discovered = Vec::new();

    if let Some(items) =
        payload.and_then(|payload| payload.directory.and_then(|directory| directory.item))
    {
        for item in items {
            // Skip entries with missing/blank names.
            let Some(name) = item
                .name
                .map(|name| name.trim().to_string())
                .filter(|name| !name.is_empty())
            else {
                continue;
            };

            let asset_type = classify_asset_type(&name);
            let size_bytes = parse_size(item.size.as_ref());
            discovered.push(AssetOutput {
                asset_type: asset_type.to_string(),
                name: name.clone(),
                url: format!("{directory_url}{}", name.trim_start_matches('/')),
                size_bytes,
                score: None,
                is_selected: false,
            });
        }
    }

    // Fallback: no listing — treat the filing's primary URL as the only
    // asset, pre-selected so hydration can still proceed for .xml instances.
    if discovered.is_empty() {
        if let Some(filing_url) = &input.filing_url {
            discovered.push(AssetOutput {
                asset_type: if filing_url.to_lowercase().ends_with(".xml") {
                    "instance".to_string()
                } else {
                    "other".to_string()
                },
                name: input
                    .primary_document
                    .clone()
                    .or_else(|| filing_url.split('/').last().map(|part| part.to_string()))
                    .unwrap_or_else(|| "primary_document".to_string()),
                url: filing_url.clone(),
                size_bytes: None,
                score: None,
                is_selected: true,
            });
        }
    }

    // Pick the highest-scoring instance candidate (NaN scores compare equal).
    let selected_instance_url = discovered
        .iter()
        .filter(|asset| asset.asset_type == "instance")
        .map(|asset| {
            (
                asset.url.clone(),
                score_instance(&asset.name, input.primary_document.as_deref()),
            )
        })
        .max_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .map(|entry| entry.0);

    for asset in &mut discovered {
        asset.score = if asset.asset_type == "instance" {
            Some(score_instance(
                &asset.name,
                input.primary_document.as_deref(),
            ))
        } else if asset.asset_type == "pdf" {
            Some(score_pdf(&asset.name, asset.size_bytes))
        } else {
            None
        };

        // Exactly one instance wins; all presentation/label linkbases are
        // fetched; everything else is left unselected.
        asset.is_selected = match asset.asset_type.as_str() {
            "instance" => selected_instance_url
                .as_ref()
                .map(|url| url == &asset.url)
                .unwrap_or(false),
            "presentation" | "label" => true,
            _ => false,
        };
    }

    Ok(DiscoveredAssets { assets: discovered })
}
|
|
|
|
/// Resolve the EDGAR directory URL for a filing.
///
/// Prefers truncating a known `filing_url` at its last `/` (only when that
/// slash sits past the `https://` protocol prefix); otherwise rebuilds the
/// canonical `https://www.sec.gov/Archives/edgar/data/{cik}/{accession}/`
/// path. Returns `None` only when the CIK contains no digits.
fn resolve_filing_directory_url(
    filing_url: Option<&str>,
    cik: &str,
    accession_number: &str,
) -> Option<String> {
    let candidate = filing_url.map(str::trim).filter(|value| !value.is_empty());
    if let Some(url) = candidate {
        match url.rfind('/') {
            // Only treat the slash as a directory separator when it lies
            // beyond "https://"; otherwise fall through to the rebuilt path.
            Some(last_slash) if last_slash > "https://".len() => {
                return Some(url[..=last_slash].to_string());
            }
            _ => {}
        }
    }

    let cik_path = normalize_cik_for_path(cik)?;
    let accession_path = accession_number.replace('-', "");
    Some(format!(
        "https://www.sec.gov/Archives/edgar/data/{cik_path}/{accession_path}/"
    ))
}

/// Extract the digits of a CIK and strip leading zeros (via u64 round-trip),
/// matching EDGAR's un-padded path segments. `None` when no digits remain or
/// the digit string overflows u64.
fn normalize_cik_for_path(value: &str) -> Option<String> {
    let digits: String = value.chars().filter(char::is_ascii_digit).collect();
    if digits.is_empty() {
        return None;
    }
    let parsed: u64 = digits.parse().ok()?;
    Some(parsed.to_string())
}
|
|
|
|
/// Classify a directory entry by filename (case-insensitive).
///
/// `.pdf` -> "pdf", `.xsd` -> "schema"; `.xml` files are bucketed by linkbase
/// suffix/keyword (`_pre`/`-pre`/"presentation", `_lab`/"label",
/// `_cal`/"calculation", `_def`/"definition") with any remaining `.xml`
/// treated as an "instance" document; everything else is "other".
fn classify_asset_type(name: &str) -> &'static str {
    let lower = name.to_lowercase();
    if lower.ends_with(".pdf") {
        return "pdf";
    }
    if lower.ends_with(".xsd") {
        return "schema";
    }
    if !lower.ends_with(".xml") {
        return "other";
    }

    // Linkbase check order matters: first match wins.
    let is_linkbase = |suffix: &str, keyword: &str| {
        lower.ends_with(&format!("_{suffix}.xml"))
            || lower.ends_with(&format!("-{suffix}.xml"))
            || lower.contains(keyword)
    };
    if is_linkbase("pre", "presentation") {
        "presentation"
    } else if is_linkbase("lab", "label") {
        "label"
    } else if is_linkbase("cal", "calculation") {
        "calculation"
    } else if is_linkbase("def", "definition") {
        "definition"
    } else {
        "instance"
    }
}
|
|
|
|
/// Heuristically score an instance-document candidate.
///
/// Base 1.0; +4 for `_htm.xml` or `_ins.xml` suffixes; +5 when the filename
/// contains the primary document's base name (punctuation normalized to `_`);
/// -3 when the name contains a linkbase marker ("cal"/"def"/"lab"/"pre").
fn score_instance(name: &str, primary_document: Option<&str>) -> f64 {
    let lower = name.to_lowercase();
    let mut score = 1.0;

    for suffix in ["_htm.xml", "_ins.xml"] {
        if lower.ends_with(suffix) {
            score += 4.0;
        }
    }

    if let Some(primary) = primary_document {
        let normalized = primary.replace(['.', '-'], "_").to_lowercase();
        // Drop the extension-ish tail after the last '_' to get the base name.
        let base = match normalized.rsplit_once('_') {
            Some((head, _)) => head.to_string(),
            None => normalized,
        };
        if !base.is_empty() && lower.contains(&base) {
            score += 5.0;
        }
    }

    if ["cal", "def", "lab", "pre"]
        .iter()
        .any(|marker| lower.contains(marker))
    {
        score -= 3.0;
    }

    score
}
|
|
|
|
/// Heuristically score a PDF asset as a likely financial report.
///
/// +8 for report-ish keywords in the name, -2 for "exhibit", +1 when the file
/// exceeds 100 KB (missing size counts as 0).
fn score_pdf(name: &str, size_bytes: Option<i64>) -> f64 {
    let lower = name.to_lowercase();
    let keywords = ["financial", "statement", "annual", "quarter", "10k", "10q"];

    let mut score = if keywords.iter().any(|needle| lower.contains(needle)) {
        8.0
    } else {
        0.0
    };
    if lower.contains("exhibit") {
        score -= 2.0;
    }
    if matches!(size_bytes, Some(size) if size > 100_000) {
        score += 1.0;
    }
    score
}
|
|
|
|
/// Coerce a directory-listing "size" field to bytes. EDGAR serves it as
/// either a JSON number or a numeric string; anything else maps to `None`.
fn parse_size(value: Option<&serde_json::Value>) -> Option<i64> {
    match value? {
        serde_json::Value::Number(number) => number.as_i64(),
        serde_json::Value::String(raw) => raw.parse().ok(),
        _ => None,
    }
}
|
|
|
|
fn fetch_text(client: &Client, url: &str) -> Result<String> {
|
|
let response = client
|
|
.get(url)
|
|
.send()
|
|
.with_context(|| format!("request failed for {url}"))?;
|
|
if !response.status().is_success() {
|
|
return Err(anyhow!("request failed for {url} ({})", response.status()));
|
|
}
|
|
response
|
|
.text()
|
|
.with_context(|| format!("unable to read response body for {url}"))
|
|
}
|
|
|
|
fn fetch_json<T: for<'de> Deserialize<'de>>(client: &Client, url: &str) -> Result<T> {
|
|
let response = client
|
|
.get(url)
|
|
.send()
|
|
.with_context(|| format!("request failed for {url}"))?;
|
|
if !response.status().is_success() {
|
|
return Err(anyhow!("request failed for {url} ({})", response.status()));
|
|
}
|
|
response
|
|
.json::<T>()
|
|
.with_context(|| format!("unable to parse JSON response for {url}"))
|
|
}
|
|
|
|
/// Result of scanning one XBRL instance document: its contexts and all
/// numeric facts.
struct ParsedInstance {
    contexts: Vec<ContextOutput>,
    facts: Vec<ParsedFact>,
}
|
|
|
|
/// Parse a raw XBRL instance document into contexts and numeric facts.
///
/// `source_file` is carried through onto every emitted fact so callers can
/// trace facts back to the filing document they came from. Facts are kept
/// only when they have a non-infrastructure namespace prefix, a contextRef,
/// and a numerically parseable body; everything else is skipped.
fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> ParsedInstance {
    let namespaces = parse_namespace_map(raw, "xbrl");
    let context_by_id = parse_contexts(raw);
    let unit_by_id = parse_units(raw);
    let mut facts = Vec::new();

    for captures in FACT_RE.captures_iter(raw) {
        // Capture groups: 1 = namespace prefix, 2 = local element name,
        // 3 = attribute text, 4 = element body.
        let prefix = captures
            .get(1)
            .map(|value| value.as_str().trim())
            .unwrap_or_default();
        let local_name = captures
            .get(2)
            .map(|value| value.as_str().trim())
            .unwrap_or_default();
        let attrs = captures
            .get(3)
            .map(|value| value.as_str())
            .unwrap_or_default();
        let body = decode_xml_entities(
            captures
                .get(4)
                .map(|value| value.as_str())
                .unwrap_or_default()
                .trim(),
        );

        // Skip structural XBRL elements (xbrli:, link:, ...) — only
        // taxonomy concepts become facts.
        if prefix.is_empty() || local_name.is_empty() || is_xbrl_infrastructure_prefix(prefix) {
            continue;
        }

        let attr_map = parse_attrs(attrs);
        // contextRef is required for a fact; accept either attribute casing.
        let Some(context_id) = attr_map
            .get("contextRef")
            .cloned()
            .or_else(|| attr_map.get("contextref").cloned())
        else {
            continue;
        };

        // Only numeric facts are retained.
        let Some(value) = parse_number(&body) else {
            continue;
        };

        // Unknown prefixes still produce a stable (synthetic) namespace URI.
        let namespace_uri = namespaces
            .get(prefix)
            .cloned()
            .unwrap_or_else(|| format!("urn:unknown:{prefix}"));
        let context = context_by_id.get(&context_id);
        let unit_ref = attr_map
            .get("unitRef")
            .cloned()
            .or_else(|| attr_map.get("unitref").cloned());
        // Resolve the unitRef to its measure; fall back to the raw unitRef
        // id when the unit definition was missing or had no measure.
        let unit = unit_ref
            .as_ref()
            .and_then(|unit_ref| unit_by_id.get(unit_ref))
            .and_then(|unit| unit.measure.clone())
            .or(unit_ref);

        facts.push(ParsedFact {
            concept_key: format!("{namespace_uri}#{local_name}"),
            qname: format!("{prefix}:{local_name}"),
            namespace_uri,
            local_name: local_name.to_string(),
            data_type: None,
            context_id: context_id.clone(),
            unit,
            decimals: attr_map.get("decimals").cloned(),
            precision: attr_map.get("precision").cloned(),
            // xsi:nil="true" (either attribute spelling) marks a nil fact.
            nil: attr_map
                .get("xsi:nil")
                .or_else(|| attr_map.get("nil"))
                .map(|value| value.eq_ignore_ascii_case("true"))
                .unwrap_or(false),
            value,
            // Period fields are copied from the referenced context when it
            // was found; a dangling contextRef leaves them None.
            period_start: context.and_then(|value| value.period_start.clone()),
            period_end: context.and_then(|value| value.period_end.clone()),
            period_instant: context.and_then(|value| value.period_instant.clone()),
            dimensions: context
                .map(|value| value.dimensions.clone())
                .unwrap_or_default(),
            is_dimensionless: context
                .map(|value| value.dimensions.is_empty())
                .unwrap_or(true),
            source_file: source_file.clone(),
        });
    }

    // Emit every context seen, whether or not a fact referenced it.
    let contexts = context_by_id
        .values()
        .map(|context| ContextOutput {
            context_id: context.id.clone(),
            entity_identifier: context.entity_identifier.clone(),
            entity_scheme: context.entity_scheme.clone(),
            period_start: context.period_start.clone(),
            period_end: context.period_end.clone(),
            period_instant: context.period_instant.clone(),
            segment_json: context.segment.clone(),
            scenario_json: context.scenario.clone(),
        })
        .collect::<Vec<_>>();

    ParsedInstance { contexts, facts }
}
|
|
|
|
fn parse_namespace_map(raw: &str, root_tag_hint: &str) -> HashMap<String, String> {
|
|
let mut map = HashMap::new();
|
|
let root_start = Regex::new(&format!(r#"(?is)<[^>]*{root_tag_hint}[^>]*>"#))
|
|
.unwrap()
|
|
.find(raw)
|
|
.map(|match_| match_.as_str().to_string())
|
|
.unwrap_or_else(|| raw.chars().take(1200).collect::<String>());
|
|
|
|
for captures in Regex::new(r#"xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']"#)
|
|
.unwrap()
|
|
.captures_iter(&root_start)
|
|
{
|
|
if let (Some(prefix), Some(uri)) = (captures.get(1), captures.get(2)) {
|
|
map.insert(
|
|
prefix.as_str().trim().to_string(),
|
|
uri.as_str().trim().to_string(),
|
|
);
|
|
}
|
|
}
|
|
|
|
map
|
|
}
|
|
|
|
fn parse_contexts(raw: &str) -> HashMap<String, ParsedContext> {
|
|
let mut contexts = HashMap::new();
|
|
|
|
for captures in CONTEXT_RE.captures_iter(raw) {
|
|
let Some(context_id) = captures
|
|
.get(1)
|
|
.map(|value| value.as_str().trim().to_string())
|
|
else {
|
|
continue;
|
|
};
|
|
let block = captures
|
|
.get(2)
|
|
.map(|value| value.as_str())
|
|
.unwrap_or_default();
|
|
let (entity_identifier, entity_scheme) = IDENTIFIER_RE
|
|
.captures(block)
|
|
.map(|captures| {
|
|
(
|
|
captures
|
|
.get(2)
|
|
.map(|value| decode_xml_entities(value.as_str().trim())),
|
|
captures
|
|
.get(1)
|
|
.map(|value| decode_xml_entities(value.as_str().trim())),
|
|
)
|
|
})
|
|
.unwrap_or((None, None));
|
|
|
|
let period_start = START_DATE_RE
|
|
.captures(block)
|
|
.and_then(|captures| captures.get(1))
|
|
.map(|value| decode_xml_entities(value.as_str().trim()));
|
|
let period_end = END_DATE_RE
|
|
.captures(block)
|
|
.and_then(|captures| captures.get(1))
|
|
.map(|value| decode_xml_entities(value.as_str().trim()));
|
|
let period_instant = INSTANT_RE
|
|
.captures(block)
|
|
.and_then(|captures| captures.get(1))
|
|
.map(|value| decode_xml_entities(value.as_str().trim()));
|
|
|
|
let segment = SEGMENT_RE
|
|
.captures(block)
|
|
.and_then(|captures| captures.get(1))
|
|
.map(|value| parse_dimension_container(value.as_str()));
|
|
let scenario = SCENARIO_RE
|
|
.captures(block)
|
|
.and_then(|captures| captures.get(1))
|
|
.map(|value| parse_dimension_container(value.as_str()));
|
|
|
|
let mut dimensions = Vec::new();
|
|
if let Some(segment_value) = segment.as_ref() {
|
|
if let Some(members) = segment_value
|
|
.get("explicitMembers")
|
|
.and_then(|value| value.as_array())
|
|
{
|
|
for member in members {
|
|
if let (Some(axis), Some(member_value)) = (
|
|
member.get("axis").and_then(|value| value.as_str()),
|
|
member.get("member").and_then(|value| value.as_str()),
|
|
) {
|
|
dimensions.push(DimensionOutput {
|
|
axis: axis.to_string(),
|
|
member: member_value.to_string(),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if let Some(scenario_value) = scenario.as_ref() {
|
|
if let Some(members) = scenario_value
|
|
.get("explicitMembers")
|
|
.and_then(|value| value.as_array())
|
|
{
|
|
for member in members {
|
|
if let (Some(axis), Some(member_value)) = (
|
|
member.get("axis").and_then(|value| value.as_str()),
|
|
member.get("member").and_then(|value| value.as_str()),
|
|
) {
|
|
dimensions.push(DimensionOutput {
|
|
axis: axis.to_string(),
|
|
member: member_value.to_string(),
|
|
});
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
contexts.insert(
|
|
context_id.clone(),
|
|
ParsedContext {
|
|
id: context_id,
|
|
entity_identifier,
|
|
entity_scheme,
|
|
period_start,
|
|
period_end,
|
|
period_instant,
|
|
dimensions,
|
|
segment,
|
|
scenario,
|
|
},
|
|
);
|
|
}
|
|
|
|
contexts
|
|
}
|
|
|
|
fn parse_dimension_container(raw: &str) -> serde_json::Value {
|
|
let explicit_members = EXPLICIT_MEMBER_RE
|
|
.captures_iter(raw)
|
|
.filter_map(|captures| {
|
|
Some(serde_json::json!({
|
|
"axis": decode_xml_entities(captures.get(1)?.as_str().trim()),
|
|
"member": decode_xml_entities(captures.get(2)?.as_str().trim())
|
|
}))
|
|
})
|
|
.collect::<Vec<_>>();
|
|
let typed_members = TYPED_MEMBER_RE
|
|
.captures_iter(raw)
|
|
.filter_map(|captures| {
|
|
Some(serde_json::json!({
|
|
"axis": decode_xml_entities(captures.get(1)?.as_str().trim()),
|
|
"value": decode_xml_entities(captures.get(2)?.as_str().trim())
|
|
}))
|
|
})
|
|
.collect::<Vec<_>>();
|
|
|
|
serde_json::json!({
|
|
"explicitMembers": explicit_members,
|
|
"typedMembers": typed_members
|
|
})
|
|
}
|
|
|
|
fn parse_units(raw: &str) -> HashMap<String, ParsedUnit> {
|
|
let mut units = HashMap::new();
|
|
for captures in UNIT_RE.captures_iter(raw) {
|
|
let Some(id) = captures
|
|
.get(1)
|
|
.map(|value| value.as_str().trim().to_string())
|
|
else {
|
|
continue;
|
|
};
|
|
let block = captures
|
|
.get(2)
|
|
.map(|value| value.as_str())
|
|
.unwrap_or_default();
|
|
let measures = MEASURE_RE
|
|
.captures_iter(block)
|
|
.filter_map(|captures| captures.get(1))
|
|
.map(|value| decode_xml_entities(value.as_str().trim()))
|
|
.filter(|value| !value.is_empty())
|
|
.collect::<Vec<_>>();
|
|
|
|
let measure = if measures.len() == 1 {
|
|
measures.first().cloned()
|
|
} else if measures.len() > 1 {
|
|
Some(measures.join("/"))
|
|
} else {
|
|
None
|
|
};
|
|
|
|
units.insert(id, ParsedUnit { measure });
|
|
}
|
|
units
|
|
}
|
|
|
|
/// True for namespace prefixes that belong to XBRL plumbing rather than
/// taxonomy concepts; facts under these prefixes are not real data.
fn is_xbrl_infrastructure_prefix(prefix: &str) -> bool {
    const INFRASTRUCTURE_PREFIXES: [&str; 5] = ["xbrli", "xlink", "link", "xbrldi", "xbrldt"];
    INFRASTRUCTURE_PREFIXES
        .iter()
        .any(|candidate| prefix.eq_ignore_ascii_case(candidate))
}
|
|
|
|
fn parse_attrs(raw: &str) -> HashMap<String, String> {
|
|
let mut map = HashMap::new();
|
|
for captures in ATTR_RE.captures_iter(raw) {
|
|
if let (Some(name), Some(value)) = (captures.get(1), captures.get(2)) {
|
|
map.insert(
|
|
name.as_str().to_string(),
|
|
decode_xml_entities(value.as_str()),
|
|
);
|
|
}
|
|
}
|
|
map
|
|
}
|
|
|
|
/// Decode the small set of XML entities that appear in SEC filings.
///
/// The source for this function had been entity-decoded itself (e.g.
/// `.replace("&", "&")`), which is a no-op at best and invalid Rust at
/// worst; this restores the intended entity names. `&amp;` is decoded
/// LAST so that an escaped sequence such as `&amp;lt;` yields the literal
/// text `&lt;` instead of being double-decoded to `<`.
fn decode_xml_entities(value: &str) -> String {
    value
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&nbsp;", " ")
        .replace("&#160;", " ")
        .replace("&amp;", "&")
}
|
|
|
|
fn parse_number(raw: &str) -> Option<f64> {
|
|
let trimmed = raw.trim();
|
|
if trimmed.is_empty() || trimmed.chars().all(|char| char == '-') {
|
|
return None;
|
|
}
|
|
let negative = trimmed.starts_with('(') && trimmed.ends_with(')');
|
|
let normalized = Regex::new(r#"<[^>]+>"#)
|
|
.unwrap()
|
|
.replace_all(trimmed, " ")
|
|
.replace(',', "")
|
|
.replace('$', "")
|
|
.replace(['(', ')'], "")
|
|
.replace('\u{2212}', "-")
|
|
.split_whitespace()
|
|
.collect::<String>();
|
|
let parsed = normalized.parse::<f64>().ok()?;
|
|
Some(if negative { -parsed.abs() } else { parsed })
|
|
}
|
|
|
|
/// Parse an XBRL label linkbase into a concept-key → human label map.
///
/// Within each labelLink block this runs three passes: locators map xlink
/// labels to concept keys, label resources map xlink labels to text (with
/// their role), and labelArcs join the two. When several labels exist for
/// one concept, the one whose role scores highest in `label_priority` wins.
fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
    let namespaces = parse_namespace_map(raw, "linkbase");
    // concept key -> (best label text so far, its priority).
    let mut preferred = HashMap::<String, (String, i64)>::new();

    for captures in LABEL_LINK_RE.captures_iter(raw) {
        let block = captures
            .get(1)
            .map(|value| value.as_str())
            .unwrap_or_default();
        let mut loc_by_label = HashMap::<String, String>::new();
        let mut resource_by_label = HashMap::<String, (String, Option<String>)>::new();

        // Pass 1: locators — xlink:label -> concept key (via the href QName).
        for captures in LOC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            let Some(href) = attrs.get("xlink:href").cloned() else {
                continue;
            };
            let Some(qname) = qname_from_href(&href) else {
                continue;
            };
            let Some((concept_key, _, _)) = concept_from_qname(&qname, &namespaces) else {
                continue;
            };
            loc_by_label.insert(label, concept_key);
        }

        // Pass 2: label resources — xlink:label -> (whitespace-normalized
        // text, optional xlink:role).
        for captures in LABEL_RESOURCE_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            let body = decode_xml_entities(
                captures
                    .get(2)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            )
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");
            if body.is_empty() {
                continue;
            }
            resource_by_label.insert(label, (body, attrs.get("xlink:role").cloned()));
        }

        // Pass 3: arcs join locators to resources; keep the highest-priority
        // label per concept (strictly greater, so ties keep the first seen).
        for captures in LABEL_ARC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(from) = attrs.get("xlink:from").cloned() else {
                continue;
            };
            let Some(to) = attrs.get("xlink:to").cloned() else {
                continue;
            };
            let Some(concept_key) = loc_by_label.get(&from) else {
                continue;
            };
            let Some((label, role)) = resource_by_label.get(&to) else {
                continue;
            };
            let priority = label_priority(role.as_deref());
            let current = preferred.get(concept_key).cloned();
            if current
                .as_ref()
                .map(|(_, current_priority)| priority > *current_priority)
                .unwrap_or(true)
            {
                preferred.insert(concept_key.clone(), (label.clone(), priority));
            }
        }
    }

    // Drop the priorities; callers only need the winning text.
    preferred
        .into_iter()
        .map(|(key, (value, _))| (key, value))
        .collect()
}
|
|
|
|
/// Parse an XBRL presentation linkbase into a flat list of
/// `PresentationNode` rows (one per concept occurrence, depth-first).
///
/// For each presentationLink role: locators are collected, arcs build a
/// parent → children adjacency (ordered by the arc's `order` attribute),
/// roots are the labels with no incoming arc, and a DFS emits rows with a
/// fractional `order` that encodes the tree position.
fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
    let namespaces = parse_namespace_map(raw, "linkbase");
    let mut rows = Vec::new();

    for captures in PRESENTATION_LINK_RE.captures_iter(raw) {
        let link_attrs = parse_attrs(
            captures
                .get(1)
                .map(|value| value.as_str())
                .unwrap_or_default(),
        );
        // Links without a role cannot be classified into a statement; skip.
        let Some(role_uri) = link_attrs.get("xlink:role").cloned() else {
            continue;
        };
        let block = captures
            .get(2)
            .map(|value| value.as_str())
            .unwrap_or_default();
        // xlink label -> (concept key, qname, is_abstract — by local-name heuristic).
        let mut loc_by_label = HashMap::<String, (String, String, bool)>::new();
        // parent label -> [(child label, arc order)].
        let mut children_by_label = HashMap::<String, Vec<(String, f64)>>::new();
        let mut incoming = HashSet::<String>::new();
        let mut all_referenced = HashSet::<String>::new();

        for captures in LOC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            let Some(href) = attrs.get("xlink:href").cloned() else {
                continue;
            };
            let Some(qname) = qname_from_href(&href) else {
                continue;
            };
            let Some((concept_key, qname, local_name)) = concept_from_qname(&qname, &namespaces)
            else {
                continue;
            };
            loc_by_label.insert(
                label,
                (
                    concept_key,
                    qname,
                    // Abstract headers are detected purely from the name.
                    local_name.to_ascii_lowercase().contains("abstract"),
                ),
            );
        }

        for captures in PRESENTATION_ARC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(from) = attrs.get("xlink:from").cloned() else {
                continue;
            };
            let Some(to) = attrs.get("xlink:to").cloned() else {
                continue;
            };
            // Both endpoints must resolve to known locators.
            if !loc_by_label.contains_key(&from) || !loc_by_label.contains_key(&to) {
                continue;
            }
            // Missing/unparseable order: append after existing children.
            let order = attrs
                .get("order")
                .and_then(|value| value.parse::<f64>().ok())
                .unwrap_or_else(|| {
                    children_by_label
                        .get(&from)
                        .map(|children| children.len() as f64 + 1.0)
                        .unwrap_or(1.0)
                });
            children_by_label
                .entry(from.clone())
                .or_default()
                .push((to.clone(), order));
            incoming.insert(to.clone());
            all_referenced.insert(from);
            all_referenced.insert(to);
        }

        // Roots = nodes no arc points at. NOTE: derived from HashSet
        // iteration, so root ordering is not deterministic across runs.
        let roots = all_referenced
            .iter()
            .filter(|label| !incoming.contains(*label))
            .cloned()
            .collect::<Vec<_>>();
        let mut visited = HashSet::<String>::new();

        // Depth-first emission. `visited` keys on (parent, label, depth) so
        // a concept may appear under several parents but cycles terminate.
        fn dfs(
            label: &str,
            depth: i64,
            parent_label: Option<&str>,
            base_order: f64,
            role_uri: &str,
            loc_by_label: &HashMap<String, (String, String, bool)>,
            children_by_label: &HashMap<String, Vec<(String, f64)>>,
            rows: &mut Vec<PresentationNode>,
            visited: &mut HashSet<String>,
        ) {
            let Some((concept_key, _qname, is_abstract)) = loc_by_label.get(label) else {
                return;
            };
            let path_key = format!("{}::{label}::{depth}", parent_label.unwrap_or("root"));
            if !visited.insert(path_key) {
                return;
            }

            let parent_concept_key = parent_label.and_then(|parent| {
                loc_by_label
                    .get(parent)
                    .map(|(concept_key, _, _)| concept_key.clone())
            });
            rows.push(PresentationNode {
                concept_key: concept_key.clone(),
                role_uri: role_uri.to_string(),
                order: base_order,
                depth,
                parent_concept_key,
                is_abstract: *is_abstract,
            });

            // Children sorted by arc order; each child gets a fractional
            // offset under its parent's order (1/1000 per sibling slot).
            let mut children = children_by_label.get(label).cloned().unwrap_or_default();
            children.sort_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
            for (index, (child_label, _)) in children.into_iter().enumerate() {
                dfs(
                    &child_label,
                    depth + 1,
                    Some(label),
                    base_order + (index as f64 + 1.0) / 1000.0,
                    role_uri,
                    loc_by_label,
                    children_by_label,
                    rows,
                    visited,
                );
            }
        }

        for (index, root) in roots.iter().enumerate() {
            dfs(
                root,
                0,
                None,
                index as f64 + 1.0,
                &role_uri,
                &loc_by_label,
                &children_by_label,
                &mut rows,
                &mut visited,
            );
        }
    }

    rows
}
|
|
|
|
/// Derive a concept QName ("prefix:Local") from an xlink:href fragment.
///
/// Accepts both "…#prefix:Local" and the SEC filing convention
/// "…#loc_prefix_Local" / "…#prefix_Local". Hrefs without a fragment are
/// used whole; an empty fragment yields `None`.
fn qname_from_href(href: &str) -> Option<String> {
    // Text after the first '#' (the piece between the first and second '#'
    // when there are several), or the whole href when none exists.
    let fragment = href.split('#').nth(1).unwrap_or(href).trim();
    if fragment.is_empty() {
        return None;
    }
    let cleaned = fragment.trim_start_matches("loc_");
    if cleaned.contains(':') {
        // Already in prefix:Local form.
        Some(cleaned.to_string())
    } else {
        // Underscore convention: "us-gaap_Assets" -> "us-gaap:Assets".
        let (prefix, local) = cleaned.split_once('_')?;
        Some(format!("{prefix}:{local}"))
    }
}
|
|
|
|
/// Resolve a QName against the namespace map, producing
/// `(concept_key, qname, local_name)`.
///
/// Unknown prefixes still resolve via a synthetic `urn:unknown:<prefix>`
/// namespace; a QName without a colon yields `None`.
fn concept_from_qname(
    qname: &str,
    namespaces: &HashMap<String, String>,
) -> Option<(String, String, String)> {
    let (prefix, local_name) = qname.split_once(':')?;
    let namespace_uri = match namespaces.get(prefix) {
        Some(uri) => uri.clone(),
        None => format!("urn:unknown:{prefix}"),
    };
    let concept_key = format!("{namespace_uri}#{local_name}");
    Some((concept_key, qname.to_string(), local_name.to_string()))
}
|
|
|
|
/// Rank a label resource's role; when a concept has several labels, the
/// highest-ranked one wins (standard label > terse > verbose > other role
/// > no role).
fn label_priority(role: Option<&str>) -> i64 {
    let normalized = role.unwrap_or_default().to_ascii_lowercase();
    match normalized.as_str() {
        role if role.ends_with("/label") => 4,
        role if role.ends_with("/terselabel") => 3,
        role if role.ends_with("/verboselabel") => 2,
        "" => 0,
        _ => 1,
    }
}
|
|
|
|
/// Output bundle of `materialize_taxonomy_statements`: the filing's
/// periods, statement rows grouped per statement kind, concept metadata,
/// and the classified (enriched) facts.
struct MaterializedStatements {
    // Distinct reporting periods, sorted by period end then id.
    periods: Vec<PeriodOutput>,
    // Rows per statement bucket ("income", "balance", ...).
    statement_rows: StatementRowMap,
    // One metadata record per concept that appeared in any statement.
    concepts: Vec<ConceptOutput>,
    // Every input fact, annotated with its statement classification.
    facts: Vec<FactOutput>,
}
|
|
|
|
/// Assemble parsed facts + presentation tree + labels into statement-shaped
/// output: periods, per-statement rows, concept metadata, and enriched facts.
///
/// Pipeline:
/// 1. derive distinct periods from fact period signatures;
/// 2. classify each fact into a statement bucket via its presentation role
///    (falling back to local-name heuristics);
/// 3. per statement, order concepts by presentation order and pick one
///    preferred fact per period to fill row values.
fn materialize_taxonomy_statements(
    filing_id: i64,
    accession_number: &str,
    filing_date: &str,
    filing_type: &str,
    facts: &[ParsedFact],
    presentation: &[PresentationNode],
    label_by_concept: &HashMap<String, String>,
) -> MaterializedStatements {
    let compact_accession = accession_number.replace('-', "");
    // One PeriodOutput per distinct (start, end, instant) signature.
    let mut period_by_signature = HashMap::<String, PeriodOutput>::new();

    for fact in facts {
        let signature = period_signature(fact);
        if period_by_signature.contains_key(&signature) {
            continue;
        }
        // Period ids embed the best available date, the compact accession
        // number, and a 1-based discovery index.
        let date = fact
            .period_end
            .clone()
            .or_else(|| fact.period_instant.clone())
            .unwrap_or_else(|| filing_date.to_string());
        let id = format!(
            "{date}-{compact_accession}-{}",
            period_by_signature.len() + 1
        );
        let period_label = if fact.period_instant.is_some() && fact.period_start.is_none() {
            "Instant".to_string()
        } else if fact.period_start.is_some() && fact.period_end.is_some() {
            format!(
                "{} to {}",
                fact.period_start.clone().unwrap_or_default(),
                fact.period_end.clone().unwrap_or_default()
            )
        } else {
            "Filing Period".to_string()
        };
        period_by_signature.insert(
            signature,
            PeriodOutput {
                id,
                filing_id,
                accession_number: accession_number.to_string(),
                filing_date: filing_date.to_string(),
                period_start: fact.period_start.clone(),
                // Instant contexts surface their instant as the period end.
                period_end: fact
                    .period_end
                    .clone()
                    .or_else(|| fact.period_instant.clone()),
                filing_type: filing_type.to_string(),
                period_label,
            },
        );
    }

    // Sort periods chronologically (string compare works for ISO dates),
    // tie-broken by id for determinism.
    let mut periods = period_by_signature.values().cloned().collect::<Vec<_>>();
    periods.sort_by(|left, right| {
        let left_key = left
            .period_end
            .clone()
            .unwrap_or_else(|| left.filing_date.clone());
        let right_key = right
            .period_end
            .clone()
            .unwrap_or_else(|| right.filing_date.clone());
        left_key
            .cmp(&right_key)
            .then_with(|| left.id.cmp(&right.id))
    });
    let period_id_by_signature = period_by_signature
        .iter()
        .map(|(signature, period)| (signature.clone(), period.id.clone()))
        .collect::<HashMap<_, _>>();

    // Index presentation nodes by concept for fast role lookup.
    let mut presentation_by_concept = HashMap::<String, Vec<&PresentationNode>>::new();
    for node in presentation {
        presentation_by_concept
            .entry(node.concept_key.clone())
            .or_default()
            .push(node);
    }

    let mut grouped_by_statement = empty_parsed_fact_map();
    let mut enriched_facts = Vec::new();

    for (index, fact) in facts.iter().enumerate() {
        let nodes = presentation_by_concept
            .get(&fact.concept_key)
            .cloned()
            .unwrap_or_default();
        let best_node = nodes.first().copied();
        // Role-based classification first, local-name heuristic second.
        let statement_kind = best_node
            .and_then(|node| classify_statement_role(&node.role_uri))
            .or_else(|| concept_statement_fallback(&fact.local_name));

        let fact_output = FactOutput {
            concept_key: fact.concept_key.clone(),
            qname: fact.qname.clone(),
            namespace_uri: fact.namespace_uri.clone(),
            local_name: fact.local_name.clone(),
            data_type: fact.data_type.clone(),
            statement_kind: statement_kind.clone(),
            role_uri: best_node.map(|node| node.role_uri.clone()),
            // Mapping/surface/KPI fields are filled by later pipeline stages.
            authoritative_concept_key: None,
            mapping_method: None,
            surface_key: None,
            detail_parent_surface_key: None,
            kpi_key: None,
            residual_flag: false,
            context_id: fact.context_id.clone(),
            unit: fact.unit.clone(),
            decimals: fact.decimals.clone(),
            precision: fact.precision.clone(),
            nil: fact.nil,
            value_num: fact.value,
            period_start: fact.period_start.clone(),
            period_end: fact.period_end.clone(),
            period_instant: fact.period_instant.clone(),
            dimensions: fact.dimensions.clone(),
            is_dimensionless: fact.is_dimensionless,
            source_file: fact.source_file.clone(),
        };

        if let Some(statement_kind) = statement_kind.clone() {
            if let Some(statement_key) = statement_key_ref(&statement_kind) {
                // Fact ids are the 1-based input index.
                grouped_by_statement
                    .entry(statement_key)
                    .or_default()
                    .entry(fact.concept_key.clone())
                    .or_default()
                    .push((index as i64 + 1, fact.clone(), best_node.cloned()));
            }
        }

        enriched_facts.push(fact_output);
    }

    let mut statement_rows = empty_statement_row_map();
    let mut concepts = Vec::<ConceptOutput>::new();

    for statement_kind in statement_keys() {
        let concept_groups = grouped_by_statement
            .remove(statement_kind)
            .unwrap_or_default();
        // Union of concepts seen in this statement's presentation tree and
        // concepts that actually produced facts for it.
        let mut concept_keys = HashSet::<String>::new();
        for node in presentation.iter().filter(|node| {
            classify_statement_role(&node.role_uri).as_deref() == Some(statement_kind)
        }) {
            concept_keys.insert(node.concept_key.clone());
        }
        for concept_key in concept_groups.keys() {
            concept_keys.insert(concept_key.clone());
        }

        // Resolve each concept's presentation order/depth/parent; concepts
        // absent from the tree get order = +INF (sorted last).
        let mut ordered_concepts = concept_keys
            .into_iter()
            .map(|concept_key| {
                let nodes = presentation
                    .iter()
                    .filter(|node| {
                        node.concept_key == concept_key
                            && classify_statement_role(&node.role_uri).as_deref()
                                == Some(statement_kind)
                    })
                    .collect::<Vec<_>>();
                let order = nodes
                    .iter()
                    .map(|node| node.order)
                    .fold(f64::INFINITY, f64::min);
                let depth = nodes.iter().map(|node| node.depth).min().unwrap_or(0);
                let role_uri = nodes.first().map(|node| node.role_uri.clone());
                let parent_concept_key = nodes
                    .first()
                    .and_then(|node| node.parent_concept_key.clone());
                (concept_key, order, depth, role_uri, parent_concept_key)
            })
            .collect::<Vec<_>>();
        ordered_concepts.sort_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| left.0.cmp(&right.0))
        });

        for (concept_key, presentation_order, depth, role_uri, parent_concept_key) in
            ordered_concepts
        {
            let fact_group = concept_groups
                .get(&concept_key)
                .cloned()
                .unwrap_or_default();
            let (namespace_uri, local_name) = split_concept_key(&concept_key);
            let qname = fact_group
                .first()
                .map(|(_, fact, _)| fact.qname.clone())
                .unwrap_or_else(|| format!("unknown:{local_name}"));
            // Prefer the linkbase label; fall back to a de-camel-cased name.
            let label = label_by_concept
                .get(&concept_key)
                .cloned()
                .unwrap_or_else(|| local_name_to_label(&local_name));
            let mut values = BTreeMap::<String, Option<f64>>::new();
            let mut units = BTreeMap::<String, Option<String>>::new();
            let mut source_fact_ids = Vec::<i64>::new();
            let mut has_dimensions = false;

            // Re-group this concept's facts by period signature so exactly
            // one preferred fact fills each period cell.
            let mut fact_groups = HashMap::<String, Vec<(i64, ParsedFact)>>::new();
            for (fact_id, fact, _) in fact_group.iter() {
                fact_groups
                    .entry(period_signature(fact))
                    .or_default()
                    .push((*fact_id, fact.clone()));
            }

            for (signature, grouped_facts) in fact_groups {
                let Some(period_id) = period_id_by_signature.get(&signature) else {
                    continue;
                };
                let preferred = pick_preferred_fact(&grouped_facts);
                if let Some((fact_id, fact)) = preferred {
                    values.insert(period_id.clone(), Some(fact.value));
                    units.insert(period_id.clone(), fact.unit.clone());
                    source_fact_ids.push(*fact_id);
                    has_dimensions = has_dimensions || !fact.is_dimensionless;
                }
            }

            let row = StatementRowOutput {
                key: concept_key.clone(),
                label: label.clone(),
                concept_key: concept_key.clone(),
                qname: qname.clone(),
                namespace_uri: namespace_uri.clone(),
                local_name: local_name.clone(),
                is_extension: !is_standard_namespace(&namespace_uri),
                statement: statement_kind.to_string(),
                role_uri: role_uri.clone(),
                // Fractional presentation order scaled to an integer;
                // concepts with no presentation node sort last (1_000_000).
                order: if presentation_order.is_finite() {
                    (presentation_order * 1000.0).round() as i64
                } else {
                    1_000_000
                },
                depth,
                parent_key: parent_concept_key.clone(),
                values,
                units,
                has_dimensions,
                source_fact_ids: {
                    source_fact_ids.sort();
                    source_fact_ids
                },
            };

            if let Some(statement_rows) = statement_rows.get_mut(statement_kind) {
                statement_rows.push(row.clone());
            }

            concepts.push(ConceptOutput {
                concept_key,
                qname,
                namespace_uri,
                local_name,
                label: Some(label),
                is_extension: !is_standard_namespace(&row.namespace_uri),
                balance: None,
                period_type: None,
                data_type: None,
                statement_kind: Some(statement_kind.to_string()),
                role_uri,
                authoritative_concept_key: None,
                mapping_method: None,
                surface_key: None,
                detail_parent_surface_key: None,
                kpi_key: None,
                residual_flag: false,
                presentation_order: if presentation_order.is_finite() {
                    Some(presentation_order)
                } else {
                    None
                },
                presentation_depth: Some(depth),
                parent_concept_key,
                // Abstract flag comes from the first matching presentation
                // node anywhere in the tree.
                is_abstract: presentation
                    .iter()
                    .find(|node| node.concept_key == row.concept_key)
                    .map(|node| node.is_abstract)
                    .unwrap_or(false),
            });
        }
    }

    MaterializedStatements {
        periods,
        statement_rows,
        concepts,
        facts: enriched_facts,
    }
}
|
|
|
|
fn empty_parsed_fact_map(
|
|
) -> HashMap<&'static str, HashMap<String, Vec<(i64, ParsedFact, Option<PresentationNode>)>>> {
|
|
let mut map = HashMap::new();
|
|
for key in statement_keys() {
|
|
map.insert(key, HashMap::new());
|
|
}
|
|
map
|
|
}
|
|
|
|
fn empty_statement_row_map() -> StatementRowMap {
|
|
statement_keys()
|
|
.into_iter()
|
|
.map(|key| (key.to_string(), Vec::new()))
|
|
.collect()
|
|
}
|
|
|
|
fn empty_surface_row_map() -> SurfaceRowMap {
|
|
statement_keys()
|
|
.into_iter()
|
|
.map(|key| (key.to_string(), Vec::new()))
|
|
.collect()
|
|
}
|
|
|
|
fn empty_detail_row_map() -> DetailRowStatementMap {
|
|
statement_keys()
|
|
.into_iter()
|
|
.map(|key| (key.to_string(), BTreeMap::new()))
|
|
.collect()
|
|
}
|
|
|
|
/// Canonical ordering of the five financial-statement buckets; this order
/// drives iteration everywhere statements are materialized.
fn statement_keys() -> [&'static str; 5] {
    const KEYS: [&'static str; 5] = [
        "income",
        "balance",
        "cash_flow",
        "equity",
        "comprehensive_income",
    ];
    KEYS
}
|
|
|
|
fn statement_key_ref(value: &str) -> Option<&'static str> {
|
|
match value {
|
|
"income" => Some("income"),
|
|
"balance" => Some("balance"),
|
|
"cash_flow" => Some("cash_flow"),
|
|
"equity" => Some("equity"),
|
|
"comprehensive_income" => Some("comprehensive_income"),
|
|
_ => None,
|
|
}
|
|
}
|
|
|
|
/// Choose the best fact among several sharing one period: prefer
/// dimensionless facts, then the latest period date, then the largest
/// absolute value.
fn pick_preferred_fact(grouped_facts: &[(i64, ParsedFact)]) -> Option<&(i64, ParsedFact)> {
    // Best available date for ordering: period end, else instant, else "".
    fn period_date(fact: &ParsedFact) -> String {
        fact.period_end
            .as_ref()
            .or(fact.period_instant.as_ref())
            .cloned()
            .unwrap_or_default()
    }

    grouped_facts.iter().max_by(|left, right| {
        let left_rank = if left.1.is_dimensionless { 1 } else { 0 };
        let right_rank = if right.1.is_dimensionless { 1 } else { 0 };
        left_rank
            .cmp(&right_rank)
            .then_with(|| period_date(&left.1).cmp(&period_date(&right.1)))
            .then_with(|| {
                // NaN-safe: incomparable magnitudes count as equal.
                left.1
                    .value
                    .abs()
                    .partial_cmp(&right.1.value.abs())
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
    })
}
|
|
|
|
fn period_signature(fact: &ParsedFact) -> String {
|
|
format!(
|
|
"start:{}|end:{}|instant:{}",
|
|
fact.period_start.clone().unwrap_or_default(),
|
|
fact.period_end.clone().unwrap_or_default(),
|
|
fact.period_instant.clone().unwrap_or_default()
|
|
)
|
|
}
|
|
|
|
/// Split a "namespace#LocalName" concept key into its two halves.
/// Keys without '#' get the synthetic "urn:unknown" namespace.
fn split_concept_key(concept_key: &str) -> (String, String) {
    match concept_key.rsplit_once('#') {
        Some((namespace_uri, local_name)) => {
            (namespace_uri.to_string(), local_name.to_string())
        }
        None => ("urn:unknown".to_string(), concept_key.to_string()),
    }
}
|
|
|
|
fn local_name_to_label(local_name: &str) -> String {
|
|
let spaced = Regex::new(r#"([a-z0-9])([A-Z])"#)
|
|
.unwrap()
|
|
.replace_all(local_name, "$1 $2")
|
|
.to_string();
|
|
Regex::new(r#"([A-Z]+)([A-Z][a-z])"#)
|
|
.unwrap()
|
|
.replace_all(&spaced, "$1 $2")
|
|
.replace('_', " ")
|
|
.trim()
|
|
.to_string()
|
|
}
|
|
|
|
fn classify_statement_role(role_uri: &str) -> Option<String> {
|
|
let normalized = role_uri.to_ascii_lowercase();
|
|
if Regex::new(r#"cash\s*flow|statementsof?cashflows|netcash"#)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("cash_flow".to_string());
|
|
}
|
|
if Regex::new(r#"shareholders?|stockholders?|equity|retainedearnings"#)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("equity".to_string());
|
|
}
|
|
if Regex::new(r#"comprehensive\s*income"#)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("comprehensive_income".to_string());
|
|
}
|
|
if Regex::new(r#"balance\s*sheet|financial\s*position|assets?andliabilities"#)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("balance".to_string());
|
|
}
|
|
if Regex::new(r#"operations|income\s*statement|statementsofincome|profit"#)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("income".to_string());
|
|
}
|
|
None
|
|
}
|
|
|
|
fn concept_statement_fallback(local_name: &str) -> Option<String> {
|
|
let normalized = local_name.to_ascii_lowercase();
|
|
if Regex::new(r#"equity|retainedearnings|additionalpaidincapital"#)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("equity".to_string());
|
|
}
|
|
if normalized.contains("comprehensiveincome") {
|
|
return Some("comprehensive_income".to_string());
|
|
}
|
|
if Regex::new(
|
|
r#"deferredpolicyacquisitioncosts(andvalueofbusinessacquired)?$|supplementaryinsuranceinformationdeferredpolicyacquisitioncosts$|deferredacquisitioncosts$"#,
|
|
)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("balance".to_string());
|
|
}
|
|
if Regex::new(
|
|
r#"netcashprovidedbyusedin.*activities|increasedecreasein|paymentstoacquire|paymentsforcapitalimprovements$|paymentsfordepositsonrealestateacquisitions$|paymentsforrepurchase|paymentsofdividends|dividendscommonstockcash$|proceedsfrom|repaymentsofdebt|sharebasedcompensation$|allocatedsharebasedcompensationexpense$|depreciationdepletionandamortization$|depreciationamortizationandaccretionnet$|depreciationandamortization$|depreciationamortizationandother$|otheradjustmentstoreconcilenetincomelosstocashprovidedbyusedinoperatingactivities"#,
|
|
)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("cash_flow".to_string());
|
|
}
|
|
if Regex::new(
|
|
r#"asset|liabilit|debt|financingreceivable|loansreceivable|deposits|allowanceforcreditloss|futurepolicybenefits|policyholderaccountbalances|unearnedpremiums|realestateinvestmentproperty|grossatcarryingvalue|investmentproperty"#,
|
|
)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("balance".to_string());
|
|
}
|
|
if Regex::new(
|
|
r#"revenue|income|profit|expense|costof|leaseincome|rental|premiums|claims|underwriting|policyacquisition|interestincome|interestexpense|noninterest|leasedandrentedproperty"#,
|
|
)
|
|
.unwrap()
|
|
.is_match(&normalized)
|
|
{
|
|
return Some("income".to_string());
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Reports whether `namespace_uri` belongs to a standard taxonomy
/// (US-GAAP, IFRS, or SEC DEI) rather than a filer extension namespace.
/// Matching is case-insensitive substring containment.
fn is_standard_namespace(namespace_uri: &str) -> bool {
    const STANDARD_MARKERS: [&str; 4] = ["us-gaap", "ifrs", "/dei/", "xbrl.sec.gov/dei"];
    let lower = namespace_uri.to_ascii_lowercase();
    STANDARD_MARKERS
        .iter()
        .any(|marker| lower.contains(marker))
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pack_selector::FiscalPack;

    /// Builds a minimal annual-period fixture: a 10-K period whose id is
    /// `id` and whose label and end date both come from `period_end`.
    fn period(id: &str, period_end: &str) -> PeriodOutput {
        PeriodOutput {
            id: id.to_string(),
            filing_id: 1,
            accession_number: "0000000000-00-000001".to_string(),
            filing_date: "2025-12-31".to_string(),
            period_start: Some("2025-01-01".to_string()),
            period_end: Some(period_end.to_string()),
            filing_type: "10-K".to_string(),
            period_label: period_end.to_string(),
        }
    }

    /// Builds a statement-row fixture for `qname` with one USD value per
    /// `(period_id, value)` pair.
    ///
    /// A `us-gaap:` prefix resolves to the 2024 FASB namespace; any other
    /// prefix becomes `urn:<prefix>`, which keeps the row outside the
    /// standard-namespace set so it exercises the residual/unmapped path.
    /// `order` is reused as the sole source fact id for traceability checks.
    fn row(
        key: &str,
        qname: &str,
        statement: &str,
        order: i64,
        values: &[(&str, f64)],
    ) -> StatementRowOutput {
        let namespace_uri = qname
            .split_once(':')
            .map(|(prefix, _)| {
                if prefix == "us-gaap" {
                    "http://fasb.org/us-gaap/2024".to_string()
                } else {
                    format!("urn:{prefix}")
                }
            })
            .unwrap_or_else(|| "urn:unknown".to_string());
        let local_name = qname
            .split_once(':')
            .map(|(_, local_name)| local_name.to_string())
            .unwrap_or_else(|| qname.to_string());

        StatementRowOutput {
            key: key.to_string(),
            label: local_name_to_label(&local_name),
            concept_key: format!("{namespace_uri}#{local_name}"),
            qname: qname.to_string(),
            namespace_uri,
            local_name,
            is_extension: false,
            statement: statement.to_string(),
            role_uri: Some(statement.to_string()),
            order,
            depth: 0,
            parent_key: None,
            values: values
                .iter()
                .map(|(period_id, value)| (period_id.to_string(), Some(*value)))
                .collect(),
            units: values
                .iter()
                .map(|(period_id, _)| (period_id.to_string(), Some("iso4217:USD".to_string())))
                .collect(),
            has_dimensions: false,
            source_fact_ids: vec![order],
        }
    }

    /// End-to-end check of the compact-surface mapping for the Core pack:
    /// mapped concepts roll up into surface rows (SG&A + R&D become details
    /// under operating_expenses), the extension concept falls into the
    /// "unmapped" residual group, and the normalization summary counts match.
    #[test]
    fn builds_compact_surface_rows_from_core_pack() {
        let periods = vec![period("2024", "2024-12-31"), period("2025", "2025-12-31")];
        let mut statement_rows = empty_statement_row_map();
        statement_rows.insert(
            "income".to_string(),
            vec![
                row(
                    "revenue-row",
                    "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
                    "income",
                    10,
                    &[("2024", 100.0), ("2025", 120.0)],
                ),
                row(
                    "operating-expenses-row",
                    "us-gaap:OperatingExpenses",
                    "income",
                    20,
                    &[("2024", 40.0), ("2025", 50.0)],
                ),
                row(
                    "sga-row",
                    "us-gaap:SellingGeneralAndAdministrativeExpense",
                    "income",
                    30,
                    &[("2024", 25.0), ("2025", 31.0)],
                ),
                row(
                    "rd-row",
                    "us-gaap:ResearchAndDevelopmentExpense",
                    "income",
                    40,
                    &[("2024", 15.0), ("2025", 19.0)],
                ),
                row(
                    "net-income-row",
                    "us-gaap:NetIncomeLoss",
                    "income",
                    50,
                    &[("2024", 22.0), ("2025", 30.0)],
                ),
                // Extension concept (non us-gaap prefix) — should land in the
                // residual "unmapped" detail group, not on a surface row.
                row(
                    "unmapped-row",
                    "company:OtherOperatingCharges",
                    "income",
                    60,
                    &[("2024", 3.0), ("2025", 4.0)],
                ),
            ],
        );
        statement_rows.insert(
            "balance".to_string(),
            vec![row(
                "assets-row",
                "us-gaap:Assets",
                "balance",
                70,
                &[("2024", 500.0), ("2025", 550.0)],
            )],
        );
        statement_rows.insert(
            "cash_flow".to_string(),
            vec![row(
                "ocf-row",
                "us-gaap:NetCashProvidedByUsedInOperatingActivities",
                "cash_flow",
                80,
                &[("2024", 60.0), ("2025", 65.0)],
            )],
        );

        let model = surface_mapper::build_compact_surface_model(
            &periods,
            &statement_rows,
            "us-gaap",
            FiscalPack::Core,
            vec![],
        )
        .expect("core pack should load and map");
        let income_surface_rows = model
            .surface_rows
            .get("income")
            .expect("income surface rows");
        let op_expenses = income_surface_rows
            .iter()
            .find(|row| row.key == "operating_expenses")
            .expect("operating expenses surface row");
        let revenue = income_surface_rows
            .iter()
            .find(|row| row.key == "revenue")
            .expect("revenue surface row");

        // Surface values carry through from the source rows per period.
        assert_eq!(revenue.values.get("2025").copied().flatten(), Some(120.0));
        assert_eq!(
            op_expenses.values.get("2024").copied().flatten(),
            Some(40.0)
        );
        // SG&A and R&D both roll up beneath operating_expenses.
        assert_eq!(op_expenses.detail_count, Some(2));

        let operating_expense_details = model
            .detail_rows
            .get("income")
            .and_then(|groups| groups.get("operating_expenses"))
            .expect("operating expenses details");
        assert_eq!(operating_expense_details.len(), 2);
        assert!(operating_expense_details
            .iter()
            .any(|row| row.key == "sga-row"));
        assert!(operating_expense_details
            .iter()
            .any(|row| row.key == "rd-row"));

        // The extension concept is preserved as a flagged residual row.
        let residual_rows = model
            .detail_rows
            .get("income")
            .and_then(|groups| groups.get("unmapped"))
            .expect("unmapped detail rows");
        assert_eq!(residual_rows.len(), 1);
        assert_eq!(residual_rows[0].key, "unmapped-row");
        assert!(residual_rows[0].residual_flag);

        // Concept-level mappings record both the surface key and the detail
        // parent for rolled-up concepts.
        let rd_mapping = model
            .concept_mappings
            .get("http://fasb.org/us-gaap/2024#ResearchAndDevelopmentExpense")
            .expect("rd mapping");
        assert_eq!(
            rd_mapping.detail_parent_surface_key.as_deref(),
            Some("operating_expenses")
        );
        assert_eq!(
            rd_mapping.surface_key.as_deref(),
            Some("operating_expenses")
        );

        let residual_mapping = model
            .concept_mappings
            .get("urn:company#OtherOperatingCharges")
            .expect("residual mapping");
        assert!(residual_mapping.residual_flag);
        assert_eq!(
            residual_mapping.detail_parent_surface_key.as_deref(),
            Some("unmapped")
        );

        // Summary: 6 surface rows, 3 detail rows (SG&A, R&D, residual),
        // 1 unmapped row.
        assert_eq!(model.normalization_summary.surface_row_count, 6);
        assert_eq!(model.normalization_summary.detail_row_count, 3);
        assert_eq!(model.normalization_summary.unmapped_row_count, 1);
    }

    /// Smoke test for the regex-based instance parser: one context, one
    /// unit, and one revenue fact should round-trip with qname, numeric
    /// value, and unit intact. (Named for the port away from backreference
    /// regexes, which Rust's `regex` crate does not support.)
    #[test]
    fn parses_basic_xbrl_facts_without_regex_backreferences() {
        let raw = r#"
<xbrli:xbrl xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:us-gaap="http://fasb.org/us-gaap/2024">
  <xbrli:context id="c1">
    <xbrli:entity>
      <xbrli:identifier scheme="http://www.sec.gov/CIK">0000320193</xbrli:identifier>
    </xbrli:entity>
    <xbrli:period>
      <xbrli:startDate>2025-01-01</xbrli:startDate>
      <xbrli:endDate>2025-12-31</xbrli:endDate>
    </xbrli:period>
  </xbrli:context>
  <xbrli:unit id="u1">
    <xbrli:measure>iso4217:USD</xbrli:measure>
  </xbrli:unit>
  <us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax contextRef="c1" unitRef="u1" decimals="-6">1000</us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax>
</xbrli:xbrl>
"#;

        let parsed = parse_xbrl_instance(raw, Some("test.xml".to_string()));
        assert_eq!(parsed.facts.len(), 1);
        assert_eq!(
            parsed.facts[0].qname,
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax"
        );
        assert_eq!(parsed.facts[0].value, 1000.0);
        assert_eq!(parsed.facts[0].unit.as_deref(), Some("iso4217:USD"));
    }

    /// Exercises `concept_statement_fallback` for pack-specific concepts
    /// (banking, insurance, REIT) that carry no presentation role, covering
    /// each of the balance / cash_flow / income heuristics.
    #[test]
    fn classifies_pack_specific_concepts_without_presentation_roles() {
        assert_eq!(
            concept_statement_fallback(
                "FinancingReceivableExcludingAccruedInterestAfterAllowanceForCreditLoss"
            )
            .as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("Deposits").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("RealEstateInvestmentPropertyNet").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("DeferredPolicyAcquisitionCosts").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("DeferredPolicyAcquisitionCostsAndValueOfBusinessAcquired")
                .as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("IncreaseDecreaseInAccountsReceivable").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsOfDividends").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("RepaymentsOfDebt").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("ShareBasedCompensation").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsForCapitalImprovements").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsForDepositsOnRealEstateAcquisitions").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("LeaseIncome").as_deref(),
            Some("income")
        );
        assert_eq!(
            concept_statement_fallback("DirectCostsOfLeasedAndRentedPropertyOrEquipment")
                .as_deref(),
            Some("income")
        );
    }
}
|