use anyhow::{anyhow, Context, Result}; use once_cell::sync::Lazy; use regex::Regex; use reqwest::blocking::Client; use serde::{Deserialize, Serialize}; use std::collections::{BTreeMap, HashMap, HashSet}; mod kpi_mapper; mod metrics; mod pack_selector; mod surface_mapper; mod taxonomy_loader; mod universal_income; use taxonomy_loader::{ComputationSpec, ComputedDefinition}; #[cfg(feature = "with-crabrl")] use crabrl as _; pub const PARSER_ENGINE: &str = "fiscal-xbrl"; pub const PARSER_VERSION: &str = env!("CARGO_PKG_VERSION"); static CONTEXT_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)"#).unwrap() }); static UNIT_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)"#).unwrap() }); static FACT_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>(.*?)"#).unwrap() }); static EXPLICIT_MEMBER_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)"#).unwrap() }); static TYPED_MEMBER_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?typedMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)"#).unwrap() }); static IDENTIFIER_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?identifier\b[^>]*\bscheme=["']([^"']+)["'][^>]*>(.*?)"#).unwrap() }); static SEGMENT_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?segment\b[^>]*>(.*?)"#) .unwrap() }); static SCENARIO_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?scenario\b[^>]*>(.*?)"#) .unwrap() }); static START_DATE_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?startDate>(.*?)"#).unwrap() }); static END_DATE_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?endDate>(.*?)"#).unwrap() }); static INSTANT_RE: Lazy = Lazy::new(|| { 
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?instant>(.*?)"#).unwrap() }); static MEASURE_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?measure>(.*?)"#).unwrap() }); static LABEL_LINK_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>(.*?)"#) .unwrap() }); static PRESENTATION_LINK_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>(.*?)"#).unwrap() }); static LOC_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?loc\b([^>]*)/?>(?:)?"#).unwrap() }); static LABEL_RESOURCE_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?label\b([^>]*)>(.*?)"#).unwrap() }); static LABEL_ARC_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)/?>(?:)?"#) .unwrap() }); static PRESENTATION_ARC_RE: Lazy = Lazy::new(|| { Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)/?>(?:)?"#).unwrap() }); static ATTR_RE: Lazy = Lazy::new(|| Regex::new(r#"([a-zA-Z0-9:_\-]+)=["']([^"']+)["']"#).unwrap()); #[derive(Debug, Deserialize)] #[serde(rename_all = "camelCase")] pub struct HydrateFilingRequest { pub filing_id: i64, pub ticker: String, pub cik: String, pub accession_number: String, pub filing_date: String, pub filing_type: String, pub filing_url: Option, pub primary_document: Option, pub cache_dir: Option, } #[derive(Debug, Serialize)] pub struct HydrateFilingResponse { pub filing_id: i64, pub ticker: String, pub filing_date: String, pub filing_type: String, pub parse_status: String, pub parse_error: Option, pub source: String, pub parser_engine: String, pub parser_version: String, pub taxonomy_regime: String, pub fiscal_pack: Option, pub periods: Vec, pub faithful_rows: StatementRowMap, pub statement_rows: StatementRowMap, pub surface_rows: SurfaceRowMap, pub detail_rows: DetailRowStatementMap, pub kpi_rows: Vec, pub computed_definitions: Vec, pub contexts: Vec, pub derived_metrics: FilingMetrics, pub validation_result: ValidationResultOutput, pub 
facts_count: usize, pub concepts_count: usize, pub dimensions_count: usize, pub assets: Vec, pub concepts: Vec, pub facts: Vec, pub metric_validations: Vec, pub normalization_summary: NormalizationSummaryOutput, } #[derive(Debug, Clone, Serialize, Default)] pub struct FilingMetrics { pub revenue: Option, #[serde(rename = "netIncome")] pub net_income: Option, #[serde(rename = "totalAssets")] pub total_assets: Option, pub cash: Option, pub debt: Option, } #[derive(Debug, Clone, Serialize)] pub struct ValidationResultOutput { pub status: String, pub checks: Vec, #[serde(rename = "validatedAt")] pub validated_at: Option, } #[derive(Debug, Clone, Serialize)] pub struct AssetOutput { pub asset_type: String, pub name: String, pub url: String, pub size_bytes: Option, pub score: Option, pub is_selected: bool, } #[derive(Debug, Clone, Serialize)] pub struct PeriodOutput { pub id: String, pub filing_id: i64, pub accession_number: String, pub filing_date: String, pub period_start: Option, pub period_end: Option, pub filing_type: String, pub period_label: String, } #[derive(Debug, Clone, Serialize)] pub struct ContextOutput { pub context_id: String, pub entity_identifier: Option, pub entity_scheme: Option, pub period_start: Option, pub period_end: Option, pub period_instant: Option, pub segment_json: Option, pub scenario_json: Option, } #[derive(Debug, Clone, Serialize)] pub struct StatementRowOutput { pub key: String, pub label: String, pub concept_key: String, pub qname: String, pub namespace_uri: String, pub local_name: String, pub is_extension: bool, pub statement: String, pub role_uri: Option, pub order: i64, pub depth: i64, pub parent_key: Option, pub values: BTreeMap>, pub units: BTreeMap>, pub has_dimensions: bool, pub source_fact_ids: Vec, } #[derive(Debug, Clone, Serialize)] pub struct SurfaceRowOutput { pub key: String, pub label: String, pub category: String, pub template_section: String, pub order: i64, pub unit: String, pub values: BTreeMap>, pub source_concepts: 
Vec, pub source_row_keys: Vec, pub source_fact_ids: Vec, pub formula_key: Option, pub has_dimensions: bool, pub resolved_source_row_keys: BTreeMap>, pub statement: Option, pub detail_count: Option, #[serde(skip_serializing_if = "Option::is_none")] pub resolution_method: Option, #[serde(skip_serializing_if = "Option::is_none")] pub confidence: Option, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub warning_codes: Vec, } #[derive(Debug, Clone, Serialize)] pub struct DetailRowOutput { pub key: String, pub parent_surface_key: String, pub label: String, pub concept_key: String, pub qname: String, pub namespace_uri: String, pub local_name: String, pub unit: Option, pub values: BTreeMap>, pub source_fact_ids: Vec, pub is_extension: bool, pub dimensions_summary: Vec, pub residual_flag: bool, } #[derive(Debug, Clone, Serialize)] pub struct KpiRowOutput { pub key: String, pub label: String, pub category: String, pub unit: String, pub order: i64, pub segment: Option, pub axis: Option, pub member: Option, pub values: BTreeMap>, pub source_concepts: Vec, pub source_fact_ids: Vec, pub provenance_type: String, pub has_dimensions: bool, } #[derive(Debug, Clone, Serialize)] pub struct ComputedDefinitionOutput { pub key: String, pub label: String, pub category: String, pub order: i64, pub unit: String, pub computation: ComputationSpecOutput, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub supported_cadences: Vec, #[serde(default, skip_serializing_if = "Vec::is_empty")] pub requires_external_data: Vec, } #[derive(Debug, Clone, Serialize)] #[serde(tag = "type", rename_all = "snake_case")] pub enum ComputationSpecOutput { Ratio { numerator: String, denominator: String, }, YoyGrowth { source: String, }, Cagr { source: String, years: i64, }, PerShare { source: String, shares_key: String, }, Simple { formula: String, }, } impl From<&ComputationSpec> for ComputationSpecOutput { fn from(spec: &ComputationSpec) -> Self { match spec { ComputationSpec::Ratio { 
numerator, denominator, } => ComputationSpecOutput::Ratio { numerator: numerator.clone(), denominator: denominator.clone(), }, ComputationSpec::YoyGrowth { source } => ComputationSpecOutput::YoyGrowth { source: source.clone(), }, ComputationSpec::Cagr { source, years } => ComputationSpecOutput::Cagr { source: source.clone(), years: *years, }, ComputationSpec::PerShare { source, shares_key } => ComputationSpecOutput::PerShare { source: source.clone(), shares_key: shares_key.clone(), }, ComputationSpec::Simple { formula } => ComputationSpecOutput::Simple { formula: formula.clone(), }, } } } impl From<&ComputedDefinition> for ComputedDefinitionOutput { fn from(def: &ComputedDefinition) -> Self { ComputedDefinitionOutput { key: def.key.clone(), label: def.label.clone(), category: def.category.clone(), order: def.order, unit: def.unit.clone(), computation: ComputationSpecOutput::from(&def.computation), supported_cadences: def.supported_cadences.clone(), requires_external_data: def.requires_external_data.clone(), } } } #[derive(Debug, Clone, Serialize)] pub struct ConceptOutput { pub concept_key: String, pub qname: String, pub namespace_uri: String, pub local_name: String, pub label: Option, pub is_extension: bool, pub balance: Option, pub period_type: Option, pub data_type: Option, pub statement_kind: Option, pub role_uri: Option, pub authoritative_concept_key: Option, pub mapping_method: Option, pub surface_key: Option, pub detail_parent_surface_key: Option, pub kpi_key: Option, pub residual_flag: bool, pub presentation_order: Option, pub presentation_depth: Option, pub parent_concept_key: Option, pub is_abstract: bool, } #[derive(Debug, Clone, Serialize)] pub struct FactOutput { pub concept_key: String, pub qname: String, pub namespace_uri: String, pub local_name: String, pub data_type: Option, pub statement_kind: Option, pub role_uri: Option, pub authoritative_concept_key: Option, pub mapping_method: Option, pub surface_key: Option, pub detail_parent_surface_key: 
Option, pub kpi_key: Option, pub residual_flag: bool, pub context_id: String, pub unit: Option, pub decimals: Option, pub precision: Option, pub nil: bool, pub value_num: f64, pub period_start: Option, pub period_end: Option, pub period_instant: Option, pub dimensions: Vec, pub is_dimensionless: bool, pub source_file: Option, } #[derive(Debug, Clone, Serialize)] pub struct DimensionOutput { pub axis: String, pub member: String, } #[derive(Debug, Clone, Serialize)] pub struct MetricValidationOutput { pub metric_key: String, pub taxonomy_value: Option, pub llm_value: Option, pub absolute_diff: Option, pub relative_diff: Option, pub status: String, pub evidence_pages: Vec, pub pdf_url: Option, pub provider: Option, pub model: Option, pub error: Option, } #[derive(Debug, Clone, Serialize, Default)] pub struct NormalizationSummaryOutput { pub surface_row_count: usize, pub detail_row_count: usize, pub kpi_row_count: usize, pub unmapped_row_count: usize, pub material_unmapped_row_count: usize, pub warnings: Vec, } pub type StatementRowMap = BTreeMap>; pub type SurfaceRowMap = BTreeMap>; pub type DetailRowStatementMap = BTreeMap>>; #[derive(Debug, Clone)] struct ParsedContext { id: String, entity_identifier: Option, entity_scheme: Option, period_start: Option, period_end: Option, period_instant: Option, dimensions: Vec, segment: Option, scenario: Option, } #[derive(Debug, Clone)] struct ParsedUnit { measure: Option, } #[derive(Debug, Clone)] struct ParsedFact { concept_key: String, qname: String, namespace_uri: String, local_name: String, data_type: Option, context_id: String, unit: Option, decimals: Option, precision: Option, nil: bool, value: f64, period_start: Option, period_end: Option, period_instant: Option, dimensions: Vec, is_dimensionless: bool, source_file: Option, } #[derive(Debug, Clone)] struct PresentationNode { concept_key: String, role_uri: String, order: f64, depth: i64, parent_concept_key: Option, is_abstract: bool, } pub fn hydrate_filing(input: 
HydrateFilingRequest) -> Result { let client = Client::builder() .user_agent("Fiscal Clone ") .build() .context("unable to build HTTP client")?; let discovered = discover_filing_assets(&input, &client)?; let empty_rows = empty_statement_row_map(); let empty_surface_rows = empty_surface_row_map(); let empty_detail_rows = empty_detail_row_map(); let validation_result = ValidationResultOutput { status: "not_run".to_string(), checks: vec![], validated_at: None, }; let Some(instance_asset) = discovered .assets .iter() .find(|asset| asset.asset_type == "instance" && asset.is_selected) .cloned() else { return Ok(HydrateFilingResponse { filing_id: input.filing_id, ticker: input.ticker.to_uppercase(), filing_date: input.filing_date, filing_type: input.filing_type, parse_status: "failed".to_string(), parse_error: Some("No XBRL instance found".to_string()), source: "legacy_html_fallback".to_string(), parser_engine: PARSER_ENGINE.to_string(), parser_version: PARSER_VERSION.to_string(), taxonomy_regime: "unknown".to_string(), fiscal_pack: Some("core".to_string()), periods: vec![], faithful_rows: empty_rows.clone(), statement_rows: empty_rows, surface_rows: empty_surface_rows, detail_rows: empty_detail_rows, kpi_rows: vec![], computed_definitions: vec![], contexts: vec![], derived_metrics: FilingMetrics::default(), validation_result, facts_count: 0, concepts_count: 0, dimensions_count: 0, assets: discovered.assets, concepts: vec![], facts: vec![], metric_validations: vec![], normalization_summary: NormalizationSummaryOutput { surface_row_count: 0, detail_row_count: 0, kpi_row_count: 0, unmapped_row_count: 0, material_unmapped_row_count: 0, warnings: vec![], }, }); }; let instance_text = fetch_text(&client, &instance_asset.url) .context("fetch request failed for XBRL instance")?; let parsed_instance = parse_xbrl_instance(&instance_text, Some(instance_asset.name.clone())); let mut label_by_concept = HashMap::new(); let mut presentation = Vec::new(); let mut source = 
"xbrl_instance".to_string(); let mut parse_error = None; for asset in discovered.assets.iter().filter(|asset| { asset.is_selected && (asset.asset_type == "presentation" || asset.asset_type == "label") }) { match fetch_text(&client, &asset.url) { Ok(content) => { if asset.asset_type == "presentation" { let parsed = parse_presentation_linkbase(&content); if !parsed.is_empty() { source = "xbrl_instance_with_linkbase".to_string(); } presentation.extend(parsed); } else { for (key, value) in parse_label_linkbase(&content) { label_by_concept.entry(key).or_insert(value); } } } Err(error) => { if parse_error.is_none() { parse_error = Some(error.to_string()); } } } } let materialized = materialize_taxonomy_statements( input.filing_id, &input.accession_number, &input.filing_date, &input.filing_type, &parsed_instance.facts, &presentation, &label_by_concept, ); let taxonomy_regime = infer_taxonomy_regime(&parsed_instance.facts); let mut concepts = materialized.concepts; let mut facts = materialized.facts; let pack_selection = pack_selector::select_fiscal_pack(&materialized.statement_rows, &facts); let fiscal_pack = pack_selection.pack.as_str().to_string(); let mut compact_model = surface_mapper::build_compact_surface_model( &materialized.periods, &materialized.statement_rows, &taxonomy_regime, pack_selection.pack, pack_selection.warnings, )?; universal_income::apply_universal_income_rows( &materialized.periods, &materialized.statement_rows, &facts, &taxonomy_regime, pack_selection.pack, &mut compact_model, )?; let kpi_result = kpi_mapper::build_taxonomy_kpis(&materialized.periods, &facts, pack_selection.pack)?; compact_model.normalization_summary.kpi_row_count = kpi_result.rows.len(); for warning in kpi_result.warnings { if !compact_model .normalization_summary .warnings .contains(&warning) { compact_model.normalization_summary.warnings.push(warning); } } surface_mapper::merge_mapping_assignments( &mut compact_model.concept_mappings, kpi_result.mapping_assignments, ); 
surface_mapper::apply_mapping_assignments( &mut concepts, &mut facts, &compact_model.concept_mappings, ); let computed_pack = taxonomy_loader::load_computed_pack(pack_selection.pack) .ok() .or_else(|| taxonomy_loader::load_computed_pack(pack_selector::FiscalPack::Core).ok()); let computed_definitions: Vec = computed_pack .map(|pack| { pack.computed .iter() .map(ComputedDefinitionOutput::from) .collect() }) .unwrap_or_default(); let has_rows = materialized .statement_rows .values() .map(|rows| rows.len()) .sum::() > 0; let has_facts = !facts.is_empty(); let parse_status = if has_rows && has_facts { "ready" } else if has_facts { "partial" } else { "failed" }; Ok(HydrateFilingResponse { filing_id: input.filing_id, ticker: input.ticker.to_uppercase(), filing_date: input.filing_date, filing_type: input.filing_type, parse_status: parse_status.to_string(), parse_error: if parse_status == "failed" { Some(parse_error.unwrap_or_else(|| "No XBRL facts extracted".to_string())) } else { parse_error }, source, parser_engine: PARSER_ENGINE.to_string(), parser_version: PARSER_VERSION.to_string(), taxonomy_regime, fiscal_pack: Some(fiscal_pack), periods: materialized.periods, faithful_rows: materialized.statement_rows.clone(), statement_rows: materialized.statement_rows, surface_rows: compact_model.surface_rows, detail_rows: compact_model.detail_rows, kpi_rows: kpi_result.rows, computed_definitions, contexts: parsed_instance.contexts, derived_metrics: metrics::derive_metrics(&facts), validation_result, facts_count: facts.len(), concepts_count: concepts.len(), dimensions_count: facts .iter() .flat_map(|fact| { fact.dimensions .iter() .map(|dimension| format!("{}::{}", dimension.axis, dimension.member)) }) .collect::>() .len(), assets: discovered.assets, concepts, facts, metric_validations: vec![], normalization_summary: compact_model.normalization_summary, }) } fn infer_taxonomy_regime(facts: &[ParsedFact]) -> String { if facts .iter() .any(|fact| 
fact.namespace_uri.to_lowercase().contains("us-gaap")) { return "us-gaap".to_string(); } if facts .iter() .any(|fact| fact.namespace_uri.to_lowercase().contains("ifrs")) { return "ifrs-full".to_string(); } "unknown".to_string() } #[derive(Debug, Deserialize)] struct FilingDirectoryPayload { directory: Option, } #[derive(Debug, Deserialize)] struct FilingDirectory { item: Option>, } #[derive(Debug, Deserialize)] struct FilingDirectoryItem { name: Option, size: Option, } #[derive(Debug)] struct DiscoveredAssets { assets: Vec, } fn discover_filing_assets( input: &HydrateFilingRequest, client: &Client, ) -> Result { let Some(directory_url) = resolve_filing_directory_url( input.filing_url.as_deref(), &input.cik, &input.accession_number, ) else { return Ok(DiscoveredAssets { assets: vec![] }); }; let payload = fetch_json::(client, &format!("{directory_url}index.json")).ok(); let mut discovered = Vec::new(); if let Some(items) = payload.and_then(|payload| payload.directory.and_then(|directory| directory.item)) { for item in items { let Some(name) = item .name .map(|name| name.trim().to_string()) .filter(|name| !name.is_empty()) else { continue; }; let asset_type = classify_asset_type(&name); let size_bytes = parse_size(item.size.as_ref()); discovered.push(AssetOutput { asset_type: asset_type.to_string(), name: name.clone(), url: format!("{directory_url}{}", name.trim_start_matches('/')), size_bytes, score: None, is_selected: false, }); } } if discovered.is_empty() { if let Some(filing_url) = &input.filing_url { discovered.push(AssetOutput { asset_type: if filing_url.to_lowercase().ends_with(".xml") { "instance".to_string() } else { "other".to_string() }, name: input .primary_document .clone() .or_else(|| filing_url.split('/').last().map(|part| part.to_string())) .unwrap_or_else(|| "primary_document".to_string()), url: filing_url.clone(), size_bytes: None, score: None, is_selected: true, }); } } let selected_instance_url = discovered .iter() .filter(|asset| 
asset.asset_type == "instance") .map(|asset| { ( asset.url.clone(), score_instance(&asset.name, input.primary_document.as_deref()), ) }) .max_by(|left, right| { left.1 .partial_cmp(&right.1) .unwrap_or(std::cmp::Ordering::Equal) }) .map(|entry| entry.0); for asset in &mut discovered { asset.score = if asset.asset_type == "instance" { Some(score_instance( &asset.name, input.primary_document.as_deref(), )) } else if asset.asset_type == "pdf" { Some(score_pdf(&asset.name, asset.size_bytes)) } else { None }; asset.is_selected = match asset.asset_type.as_str() { "instance" => selected_instance_url .as_ref() .map(|url| url == &asset.url) .unwrap_or(false), "presentation" | "label" => true, _ => false, }; } Ok(DiscoveredAssets { assets: discovered }) } fn resolve_filing_directory_url( filing_url: Option<&str>, cik: &str, accession_number: &str, ) -> Option { if let Some(filing_url) = filing_url.map(str::trim).filter(|value| !value.is_empty()) { if let Some(last_slash) = filing_url.rfind('/') { if last_slash > "https://".len() { return Some(filing_url[..=last_slash].to_string()); } } } let cik_path = normalize_cik_for_path(cik)?; let accession_path = accession_number.replace('-', ""); Some(format!( "https://www.sec.gov/Archives/edgar/data/{cik_path}/{accession_path}/" )) } fn normalize_cik_for_path(value: &str) -> Option { let digits = value .chars() .filter(|char| char.is_ascii_digit()) .collect::(); if digits.is_empty() { return None; } digits.parse::().ok().map(|parsed| parsed.to_string()) } fn classify_asset_type(name: &str) -> &'static str { let lower = name.to_lowercase(); if lower.ends_with(".pdf") { return "pdf"; } if lower.ends_with(".xsd") { return "schema"; } if lower.ends_with(".xml") { if lower.ends_with("_pre.xml") || lower.ends_with("-pre.xml") || lower.contains("presentation") { return "presentation"; } if lower.ends_with("_lab.xml") || lower.ends_with("-lab.xml") || lower.contains("label") { return "label"; } if lower.ends_with("_cal.xml") || 
lower.ends_with("-cal.xml") || lower.contains("calculation") { return "calculation"; } if lower.ends_with("_def.xml") || lower.ends_with("-def.xml") || lower.contains("definition") { return "definition"; } return "instance"; } "other" } fn score_instance(name: &str, primary_document: Option<&str>) -> f64 { let lower = name.to_lowercase(); let mut score = 1.0; if lower.ends_with("_htm.xml") { score += 4.0; } if lower.ends_with("_ins.xml") { score += 4.0; } if let Some(base_primary) = primary_document .map(|value| value.replace(|char: char| char == '.' || char == '-', "_")) .map(|value| value.to_lowercase()) { let base = base_primary .rsplit_once('_') .map(|(head, _)| head.to_string()) .unwrap_or(base_primary); if !base.is_empty() && lower.contains(&base) { score += 5.0; } } if lower.contains("cal") || lower.contains("def") || lower.contains("lab") || lower.contains("pre") { score -= 3.0; } score } fn score_pdf(name: &str, size_bytes: Option) -> f64 { let lower = name.to_lowercase(); let mut score = 0.0; if ["financial", "statement", "annual", "quarter", "10k", "10q"] .iter() .any(|needle| lower.contains(needle)) { score += 8.0; } if lower.contains("exhibit") { score -= 2.0; } if size_bytes.unwrap_or_default() > 100_000 { score += 1.0; } score } fn parse_size(value: Option<&serde_json::Value>) -> Option { match value { Some(serde_json::Value::Number(number)) => number.as_i64(), Some(serde_json::Value::String(raw)) => raw.parse::().ok(), _ => None, } } fn fetch_text(client: &Client, url: &str) -> Result { let response = client .get(url) .send() .with_context(|| format!("request failed for {url}"))?; if !response.status().is_success() { return Err(anyhow!("request failed for {url} ({})", response.status())); } response .text() .with_context(|| format!("unable to read response body for {url}")) } fn fetch_json Deserialize<'de>>(client: &Client, url: &str) -> Result { let response = client .get(url) .send() .with_context(|| format!("request failed for {url}"))?; if 
!response.status().is_success() { return Err(anyhow!("request failed for {url} ({})", response.status())); } response .json::() .with_context(|| format!("unable to parse JSON response for {url}")) } struct ParsedInstance { contexts: Vec, facts: Vec, } fn parse_xbrl_instance(raw: &str, source_file: Option) -> ParsedInstance { let namespaces = parse_namespace_map(raw, "xbrl"); let context_by_id = parse_contexts(raw); let unit_by_id = parse_units(raw); let mut facts = Vec::new(); for captures in FACT_RE.captures_iter(raw) { let prefix = captures .get(1) .map(|value| value.as_str().trim()) .unwrap_or_default(); let local_name = captures .get(2) .map(|value| value.as_str().trim()) .unwrap_or_default(); let attrs = captures .get(3) .map(|value| value.as_str()) .unwrap_or_default(); let body = decode_xml_entities( captures .get(4) .map(|value| value.as_str()) .unwrap_or_default() .trim(), ); if prefix.is_empty() || local_name.is_empty() || is_xbrl_infrastructure_prefix(prefix) { continue; } let attr_map = parse_attrs(attrs); let Some(context_id) = attr_map .get("contextRef") .cloned() .or_else(|| attr_map.get("contextref").cloned()) else { continue; }; let Some(value) = parse_number(&body) else { continue; }; let namespace_uri = namespaces .get(prefix) .cloned() .unwrap_or_else(|| format!("urn:unknown:{prefix}")); let context = context_by_id.get(&context_id); let unit_ref = attr_map .get("unitRef") .cloned() .or_else(|| attr_map.get("unitref").cloned()); let unit = unit_ref .as_ref() .and_then(|unit_ref| unit_by_id.get(unit_ref)) .and_then(|unit| unit.measure.clone()) .or(unit_ref); facts.push(ParsedFact { concept_key: format!("{namespace_uri}#{local_name}"), qname: format!("{prefix}:{local_name}"), namespace_uri, local_name: local_name.to_string(), data_type: None, context_id: context_id.clone(), unit, decimals: attr_map.get("decimals").cloned(), precision: attr_map.get("precision").cloned(), nil: attr_map .get("xsi:nil") .or_else(|| attr_map.get("nil")) .map(|value| 
value.eq_ignore_ascii_case("true")) .unwrap_or(false), value, period_start: context.and_then(|value| value.period_start.clone()), period_end: context.and_then(|value| value.period_end.clone()), period_instant: context.and_then(|value| value.period_instant.clone()), dimensions: context .map(|value| value.dimensions.clone()) .unwrap_or_default(), is_dimensionless: context .map(|value| value.dimensions.is_empty()) .unwrap_or(true), source_file: source_file.clone(), }); } let contexts = context_by_id .values() .map(|context| ContextOutput { context_id: context.id.clone(), entity_identifier: context.entity_identifier.clone(), entity_scheme: context.entity_scheme.clone(), period_start: context.period_start.clone(), period_end: context.period_end.clone(), period_instant: context.period_instant.clone(), segment_json: context.segment.clone(), scenario_json: context.scenario.clone(), }) .collect::>(); ParsedInstance { contexts, facts } } fn parse_namespace_map(raw: &str, root_tag_hint: &str) -> HashMap { let mut map = HashMap::new(); let root_start = Regex::new(&format!(r#"(?is)<[^>]*{root_tag_hint}[^>]*>"#)) .unwrap() .find(raw) .map(|match_| match_.as_str().to_string()) .unwrap_or_else(|| raw.chars().take(1200).collect::()); for captures in Regex::new(r#"xmlns:([a-zA-Z0-9_\-]+)=["']([^"']+)["']"#) .unwrap() .captures_iter(&root_start) { if let (Some(prefix), Some(uri)) = (captures.get(1), captures.get(2)) { map.insert( prefix.as_str().trim().to_string(), uri.as_str().trim().to_string(), ); } } map } fn parse_contexts(raw: &str) -> HashMap { let mut contexts = HashMap::new(); for captures in CONTEXT_RE.captures_iter(raw) { let Some(context_id) = captures .get(1) .map(|value| value.as_str().trim().to_string()) else { continue; }; let block = captures .get(2) .map(|value| value.as_str()) .unwrap_or_default(); let (entity_identifier, entity_scheme) = IDENTIFIER_RE .captures(block) .map(|captures| { ( captures .get(2) .map(|value| decode_xml_entities(value.as_str().trim())), 
captures .get(1) .map(|value| decode_xml_entities(value.as_str().trim())), ) }) .unwrap_or((None, None)); let period_start = START_DATE_RE .captures(block) .and_then(|captures| captures.get(1)) .map(|value| decode_xml_entities(value.as_str().trim())); let period_end = END_DATE_RE .captures(block) .and_then(|captures| captures.get(1)) .map(|value| decode_xml_entities(value.as_str().trim())); let period_instant = INSTANT_RE .captures(block) .and_then(|captures| captures.get(1)) .map(|value| decode_xml_entities(value.as_str().trim())); let segment = SEGMENT_RE .captures(block) .and_then(|captures| captures.get(1)) .map(|value| parse_dimension_container(value.as_str())); let scenario = SCENARIO_RE .captures(block) .and_then(|captures| captures.get(1)) .map(|value| parse_dimension_container(value.as_str())); let mut dimensions = Vec::new(); if let Some(segment_value) = segment.as_ref() { if let Some(members) = segment_value .get("explicitMembers") .and_then(|value| value.as_array()) { for member in members { if let (Some(axis), Some(member_value)) = ( member.get("axis").and_then(|value| value.as_str()), member.get("member").and_then(|value| value.as_str()), ) { dimensions.push(DimensionOutput { axis: axis.to_string(), member: member_value.to_string(), }); } } } } if let Some(scenario_value) = scenario.as_ref() { if let Some(members) = scenario_value .get("explicitMembers") .and_then(|value| value.as_array()) { for member in members { if let (Some(axis), Some(member_value)) = ( member.get("axis").and_then(|value| value.as_str()), member.get("member").and_then(|value| value.as_str()), ) { dimensions.push(DimensionOutput { axis: axis.to_string(), member: member_value.to_string(), }); } } } } contexts.insert( context_id.clone(), ParsedContext { id: context_id, entity_identifier, entity_scheme, period_start, period_end, period_instant, dimensions, segment, scenario, }, ); } contexts } fn parse_dimension_container(raw: &str) -> serde_json::Value { let explicit_members = 
EXPLICIT_MEMBER_RE
    .captures_iter(raw)
    .filter_map(|captures| {
        Some(serde_json::json!({
            "axis": decode_xml_entities(captures.get(1)?.as_str().trim()),
            "member": decode_xml_entities(captures.get(2)?.as_str().trim())
        }))
    })
    .collect::<Vec<_>>();
    let typed_members = TYPED_MEMBER_RE
        .captures_iter(raw)
        .filter_map(|captures| {
            Some(serde_json::json!({
                "axis": decode_xml_entities(captures.get(1)?.as_str().trim()),
                "value": decode_xml_entities(captures.get(2)?.as_str().trim())
            }))
        })
        .collect::<Vec<_>>();
    serde_json::json!({
        "explicitMembers": explicit_members,
        "typedMembers": typed_members
    })
}

/// Parses `<unit>` blocks into a map of unit id -> `ParsedUnit`.
///
/// A unit with exactly one `<measure>` keeps that measure verbatim; compound
/// units (several measures, e.g. divide constructs) are flattened by joining
/// the measures with `/`; a unit with no usable measure yields `None`.
fn parse_units(raw: &str) -> HashMap<String, ParsedUnit> {
    let mut units = HashMap::new();
    for captures in UNIT_RE.captures_iter(raw) {
        let Some(id) = captures
            .get(1)
            .map(|value| value.as_str().trim().to_string())
        else {
            continue;
        };
        let block = captures
            .get(2)
            .map(|value| value.as_str())
            .unwrap_or_default();
        let measures = MEASURE_RE
            .captures_iter(block)
            .filter_map(|captures| captures.get(1))
            .map(|value| decode_xml_entities(value.as_str().trim()))
            .filter(|value| !value.is_empty())
            .collect::<Vec<_>>();
        let measure = if measures.len() == 1 {
            measures.first().cloned()
        } else if measures.len() > 1 {
            // Compound unit: flatten "USD over shares" style units into one token.
            Some(measures.join("/"))
        } else {
            None
        };
        units.insert(id, ParsedUnit { measure });
    }
    units
}

/// Returns true for namespace prefixes that belong to XBRL plumbing
/// (contexts, links, dimensions) rather than reportable facts.
fn is_xbrl_infrastructure_prefix(prefix: &str) -> bool {
    matches!(
        prefix.to_ascii_lowercase().as_str(),
        "xbrli" | "xlink" | "link" | "xbrldi" | "xbrldt"
    )
}

/// Extracts `name="value"` / `name='value'` attribute pairs from a raw
/// attribute string. Values are XML-entity-decoded; attribute names are not.
fn parse_attrs(raw: &str) -> HashMap<String, String> {
    let mut map = HashMap::new();
    for captures in ATTR_RE.captures_iter(raw) {
        if let (Some(name), Some(value)) = (captures.get(1), captures.get(2)) {
            map.insert(
                name.as_str().to_string(),
                decode_xml_entities(value.as_str()),
            );
        }
    }
    map
}

/// Decodes the small set of XML entities that appear in XBRL attribute
/// values and element text.
///
/// `&amp;` is decoded LAST: decoding it first would turn a doubly-escaped
/// sequence such as `&amp;lt;` into `&lt;` and then into a literal `<`,
/// silently double-decoding the input.
fn decode_xml_entities(value: &str) -> String {
    value
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&#160;", " ")
        .replace("&nbsp;", " ")
        .replace("&amp;", "&")
}

/// Parses a human/XBRL-formatted numeric string into an `f64`.
///
/// Handles embedded markup, thousands separators, currency symbols,
/// parenthesized negatives (`(1,234)` -> `-1234`), Unicode minus, and
/// dash-only placeholder cells (which yield `None`).
fn parse_number(raw: &str) -> Option<f64> {
    // Compiled once: stripping per-call Regex::new from this hot path.
    static TAG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"<[^>]+>"#).unwrap());
    let trimmed = raw.trim();
    if trimmed.is_empty() || trimmed.chars().all(|char| char == '-') {
        return None;
    }
    // Accounting convention: parentheses mean a negative amount.
    let negative = trimmed.starts_with('(') && trimmed.ends_with(')');
    let normalized = TAG_RE
        .replace_all(trimmed, " ")
        .replace(',', "")
        .replace('$', "")
        .replace(['(', ')'], "")
        .replace('\u{2212}', "-")
        .split_whitespace()
        .collect::<String>();
    let parsed = normalized.parse::<f64>().ok()?;
    Some(if negative { -parsed.abs() } else { parsed })
}

/// Parses a label linkbase into concept key -> preferred label.
///
/// Within each `<labelLink>`, `<loc>` elements map xlink labels to concepts,
/// `<label>` resources carry the text, and `<labelArc>` elements connect the
/// two. When several labels target one concept, the highest role priority
/// wins (see `label_priority`).
fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
    let namespaces = parse_namespace_map(raw, "linkbase");
    let mut preferred = HashMap::<String, (String, i64)>::new();
    for link in LABEL_LINK_RE.captures_iter(raw) {
        let block = link.get(1).map(|value| value.as_str()).unwrap_or_default();
        // xlink:label -> concept key, from <loc> elements.
        let mut loc_by_label = HashMap::<String, String>::new();
        // xlink:label -> (label text, optional xlink:role), from <label> resources.
        let mut resource_by_label = HashMap::<String, (String, Option<String>)>::new();
        for captures in LOC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            let Some(href) = attrs.get("xlink:href").cloned() else {
                continue;
            };
            let Some(qname) = qname_from_href(&href) else {
                continue;
            };
            let Some((concept_key, _, _)) = concept_from_qname(&qname, &namespaces) else {
                continue;
            };
            loc_by_label.insert(label, concept_key);
        }
        for captures in LABEL_RESOURCE_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            // Collapse internal whitespace runs to single spaces.
            let body = decode_xml_entities(
                captures
                    .get(2)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            )
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");
            if body.is_empty() {
                continue;
            }
            resource_by_label.insert(label, (body, attrs.get("xlink:role").cloned()));
        }
        for captures in LABEL_ARC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(from) = attrs.get("xlink:from").cloned() else {
                continue;
            };
            let Some(to) = attrs.get("xlink:to").cloned() else {
                continue;
            };
            let Some(concept_key) = loc_by_label.get(&from) else {
                continue;
            };
            let Some((label, role)) = resource_by_label.get(&to) else {
                continue;
            };
            let priority = label_priority(role.as_deref());
            let current = preferred.get(concept_key).cloned();
            // Strictly-greater keeps the first label seen on priority ties.
            if current
                .as_ref()
                .map(|(_, current_priority)| priority > *current_priority)
                .unwrap_or(true)
            {
                preferred.insert(concept_key.clone(), (label.clone(), priority));
            }
        }
    }
    preferred
        .into_iter()
        .map(|(key, (value, _))| (key, value))
        .collect()
}

/// Parses a presentation linkbase into a flat list of `PresentationNode`s.
///
/// For each `<presentationLink>` role, builds the loc/arc graph, finds the
/// roots (labels with no incoming arc), and walks each tree depth-first,
/// recording order, depth, and parent concept per node.
fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
    let namespaces = parse_namespace_map(raw, "linkbase");
    let mut rows = Vec::new();
    for link in PRESENTATION_LINK_RE.captures_iter(raw) {
        let link_attrs = parse_attrs(
            link.get(1).map(|value| value.as_str()).unwrap_or_default(),
        );
        let Some(role_uri) = link_attrs.get("xlink:role").cloned() else {
            continue;
        };
        let block = link.get(2).map(|value| value.as_str()).unwrap_or_default();
        // xlink:label -> (concept key, qname, is_abstract-by-name).
        let mut loc_by_label = HashMap::<String, (String, String, bool)>::new();
        let mut children_by_label = HashMap::<String, Vec<(String, f64)>>::new();
        let mut incoming = HashSet::<String>::new();
        let mut all_referenced = HashSet::<String>::new();
        for captures in LOC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(label) = attrs.get("xlink:label").cloned() else {
                continue;
            };
            let Some(href) = attrs.get("xlink:href").cloned() else {
                continue;
            };
            let Some(qname) = qname_from_href(&href) else {
                continue;
            };
            let Some((concept_key, qname, local_name)) = concept_from_qname(&qname, &namespaces)
            else {
                continue;
            };
            loc_by_label.insert(
                label,
                (
                    concept_key,
                    qname,
                    // Heuristic: taxonomy abstract concepts carry "Abstract" in the name.
                    local_name.to_ascii_lowercase().contains("abstract"),
                ),
            );
        }
        for captures in PRESENTATION_ARC_RE.captures_iter(block) {
            let attrs = parse_attrs(
                captures
                    .get(1)
                    .map(|value| value.as_str())
                    .unwrap_or_default(),
            );
            let Some(from) = attrs.get("xlink:from").cloned() else {
                continue;
            };
            let Some(to) = attrs.get("xlink:to").cloned() else {
                continue;
            };
            if !loc_by_label.contains_key(&from) || !loc_by_label.contains_key(&to) {
                continue;
            }
            // Missing/unparsable order falls back to append position.
            let order = attrs
                .get("order")
                .and_then(|value| value.parse::<f64>().ok())
                .unwrap_or_else(|| {
                    children_by_label
                        .get(&from)
                        .map(|children| children.len() as f64 + 1.0)
                        .unwrap_or(1.0)
                });
            children_by_label
                .entry(from.clone())
                .or_default()
                .push((to.clone(), order));
            incoming.insert(to.clone());
            all_referenced.insert(from);
            all_referenced.insert(to);
        }
        let roots = all_referenced
            .iter()
            .filter(|label| !incoming.contains(*label))
            .cloned()
            .collect::<Vec<_>>();
        let mut visited = HashSet::<String>::new();
        /// Depth-first walk of one presentation tree, appending one
        /// `PresentationNode` per (parent, label, depth) path.
        fn dfs(
            label: &str,
            depth: i64,
            parent_label: Option<&str>,
            base_order: f64,
            role_uri: &str,
            loc_by_label: &HashMap<String, (String, String, bool)>,
            children_by_label: &HashMap<String, Vec<(String, f64)>>,
            rows: &mut Vec<PresentationNode>,
            visited: &mut HashSet<String>,
        ) {
            let Some((concept_key, _qname, is_abstract)) = loc_by_label.get(label) else {
                return;
            };
            // Cycle/duplicate guard keyed by (parent, label, depth).
            let path_key = format!("{}::{label}::{depth}", parent_label.unwrap_or("root"));
            if !visited.insert(path_key) {
                return;
            }
            let parent_concept_key = parent_label.and_then(|parent| {
                loc_by_label
                    .get(parent)
                    .map(|(concept_key, _, _)| concept_key.clone())
            });
            rows.push(PresentationNode {
                concept_key: concept_key.clone(),
                role_uri: role_uri.to_string(),
                order: base_order,
                depth,
                parent_concept_key,
                is_abstract: *is_abstract,
            });
            let mut children = children_by_label.get(label).cloned().unwrap_or_default();
            children.sort_by(|left, right| {
                left.1
                    .partial_cmp(&right.1)
                    .unwrap_or(std::cmp::Ordering::Equal)
            });
            for (index, (child_label, _)) in children.into_iter().enumerate() {
                dfs(
                    &child_label,
                    depth + 1,
                    Some(label),
                    // Fractional offsets keep children ordered under the parent.
                    base_order + (index as f64 + 1.0) / 1000.0,
                    role_uri,
                    loc_by_label,
                    children_by_label,
                    rows,
                    visited,
                );
            }
        }
        for (index, root) in roots.iter().enumerate() {
            dfs(
                root,
                0,
                None,
                index as f64 + 1.0,
                &role_uri,
                &loc_by_label,
                &children_by_label,
                &mut rows,
                &mut visited,
            );
        }
    }
    rows
}

/// Extracts a `prefix:local` qname from an xlink href fragment such as
/// `schema.xsd#loc_us-gaap_Assets` or `schema.xsd#us-gaap:Assets`.
fn qname_from_href(href: &str) -> Option<String> {
    let fragment = href.split('#').nth(1).unwrap_or(href).trim();
    if fragment.is_empty() {
        return None;
    }
    let cleaned = fragment.trim_start_matches("loc_");
    if cleaned.contains(':') {
        return Some(cleaned.to_string());
    }
    cleaned
        .split_once('_')
        .map(|(prefix, local)| format!("{prefix}:{local}"))
}

/// Resolves a `prefix:local` qname against the namespace map, returning
/// `(concept_key, qname, local_name)` where the concept key is
/// `namespace_uri#local_name`. Unknown prefixes get a synthetic URN so the
/// key is still stable and unique.
fn concept_from_qname(
    qname: &str,
    namespaces: &HashMap<String, String>,
) -> Option<(String, String, String)> {
    let (prefix, local_name) = qname.split_once(':')?;
    let namespace_uri = namespaces
        .get(prefix)
        .cloned()
        .unwrap_or_else(|| format!("urn:unknown:{prefix}"));
    Some((
        format!("{namespace_uri}#{local_name}"),
        qname.to_string(),
        local_name.to_string(),
    ))
}

/// Ranks label roles: standard label > terse > verbose > other named role,
/// with no role at all ranked lowest.
fn label_priority(role: Option<&str>) -> i64 {
    let normalized = role.unwrap_or_default().to_ascii_lowercase();
    if normalized.ends_with("/label") {
        4
    } else if normalized.ends_with("/terselabel") {
        3
    } else if normalized.ends_with("/verboselabel") {
        2
    } else if normalized.is_empty() {
        0
    } else {
        1
    }
}

/// Bundle returned by `materialize_taxonomy_statements`.
struct MaterializedStatements {
    periods: Vec<PeriodOutput>,
    statement_rows: StatementRowMap,
    concepts: Vec<ConceptOutput>,
    facts: Vec<FactOutput>,
}

/// Assembles parsed facts and presentation structure into per-statement rows.
///
/// Steps: (1) derive deduplicated reporting periods from fact period
/// signatures; (2) classify each fact into a statement via its presentation
/// role, falling back to concept-name heuristics; (3) for each statement,
/// order concepts by presentation order and pick one preferred fact per
/// period (see `pick_preferred_fact`), emitting rows and concept metadata.
fn materialize_taxonomy_statements(
    filing_id: i64,
    accession_number: &str,
    filing_date: &str,
    filing_type: &str,
    facts: &[ParsedFact],
    presentation: &[PresentationNode],
    label_by_concept: &HashMap<String, String>,
) -> MaterializedStatements {
    let compact_accession = accession_number.replace('-', "");
    // One PeriodOutput per distinct (start, end, instant) signature.
    let mut period_by_signature = HashMap::<String, PeriodOutput>::new();
    for fact in facts {
        let signature = period_signature(fact);
        if period_by_signature.contains_key(&signature) {
            continue;
        }
        let date = fact
            .period_end
            .clone()
            .or_else(|| fact.period_instant.clone())
            .unwrap_or_else(|| filing_date.to_string());
        let id = format!(
            "{date}-{compact_accession}-{}",
            period_by_signature.len() + 1
        );
        let period_label = if fact.period_instant.is_some() && fact.period_start.is_none() {
            "Instant".to_string()
        } else if fact.period_start.is_some() && fact.period_end.is_some() {
            format!(
                "{} to {}",
                fact.period_start.clone().unwrap_or_default(),
                fact.period_end.clone().unwrap_or_default()
            )
        } else {
            "Filing Period".to_string()
        };
        period_by_signature.insert(
            signature,
            PeriodOutput {
                id,
                filing_id,
                accession_number: accession_number.to_string(),
                filing_date: filing_date.to_string(),
                period_start: fact.period_start.clone(),
                period_end: fact
                    .period_end
                    .clone()
                    .or_else(|| fact.period_instant.clone()),
                filing_type: filing_type.to_string(),
                period_label,
            },
        );
    }
    let mut periods = period_by_signature.values().cloned().collect::<Vec<_>>();
    // Sort chronologically by end date (falling back to filing date), then id.
    periods.sort_by(|left, right| {
        let left_key = left
            .period_end
            .clone()
            .unwrap_or_else(|| left.filing_date.clone());
        let right_key = right
            .period_end
            .clone()
            .unwrap_or_else(|| right.filing_date.clone());
        left_key
            .cmp(&right_key)
            .then_with(|| left.id.cmp(&right.id))
    });
    let period_id_by_signature = period_by_signature
        .iter()
        .map(|(signature, period)| (signature.clone(), period.id.clone()))
        .collect::<HashMap<_, _>>();
    let mut presentation_by_concept = HashMap::<String, Vec<&PresentationNode>>::new();
    for node in presentation {
        presentation_by_concept
            .entry(node.concept_key.clone())
            .or_default()
            .push(node);
    }
    let mut grouped_by_statement = empty_parsed_fact_map();
    let mut enriched_facts = Vec::new();
    for (index, fact) in facts.iter().enumerate() {
        let nodes = presentation_by_concept
            .get(&fact.concept_key)
            .cloned()
            .unwrap_or_default();
        let best_node = nodes.first().copied();
        // Role-based classification wins; concept-name heuristic is the fallback.
        let statement_kind = best_node
            .and_then(|node| classify_statement_role(&node.role_uri))
            .or_else(|| concept_statement_fallback(&fact.local_name));
        let fact_output = FactOutput {
            concept_key: fact.concept_key.clone(),
            qname: fact.qname.clone(),
            namespace_uri: fact.namespace_uri.clone(),
            local_name: fact.local_name.clone(),
            data_type: fact.data_type.clone(),
            statement_kind: statement_kind.clone(),
            role_uri: best_node.map(|node| node.role_uri.clone()),
            authoritative_concept_key: None,
            mapping_method: None,
            surface_key: None,
            detail_parent_surface_key: None,
            kpi_key: None,
            residual_flag: false,
            context_id: fact.context_id.clone(),
            unit: fact.unit.clone(),
            decimals: fact.decimals.clone(),
            precision: fact.precision.clone(),
            nil: fact.nil,
            value_num: fact.value,
            period_start: fact.period_start.clone(),
            period_end: fact.period_end.clone(),
            period_instant: fact.period_instant.clone(),
            dimensions: fact.dimensions.clone(),
            is_dimensionless: fact.is_dimensionless,
            source_fact_ids: fact.source_file.clone(),
        };
        if let Some(statement_kind) = statement_kind.clone() {
            if let Some(statement_key) = statement_key_ref(&statement_kind) {
                grouped_by_statement
                    .entry(statement_key)
                    .or_default()
                    .entry(fact.concept_key.clone())
                    .or_default()
                    // Fact ids are 1-based positions in the input slice.
                    .push((index as i64 + 1, fact.clone(), best_node.cloned()));
            }
        }
        enriched_facts.push(fact_output);
    }
    let mut statement_rows = empty_statement_row_map();
    let mut concepts = Vec::<ConceptOutput>::new();
    for statement_kind in statement_keys() {
        let concept_groups = grouped_by_statement
            .remove(statement_kind)
            .unwrap_or_default();
        // Union of concepts seen in the presentation tree and in the facts.
        let mut concept_keys = HashSet::<String>::new();
        for node in presentation.iter().filter(|node| {
            classify_statement_role(&node.role_uri).as_deref() == Some(statement_kind)
        }) {
            concept_keys.insert(node.concept_key.clone());
        }
        for concept_key in concept_groups.keys() {
            concept_keys.insert(concept_key.clone());
        }
        let mut ordered_concepts = concept_keys
            .into_iter()
            .map(|concept_key| {
                let nodes = presentation
                    .iter()
                    .filter(|node| {
                        node.concept_key == concept_key
                            && classify_statement_role(&node.role_uri).as_deref()
                                == Some(statement_kind)
                    })
                    .collect::<Vec<_>>();
                // Concepts missing from the tree sort last (order = +inf).
                let order = nodes
                    .iter()
                    .map(|node| node.order)
                    .fold(f64::INFINITY, f64::min);
                let depth = nodes.iter().map(|node| node.depth).min().unwrap_or(0);
                let role_uri = nodes.first().map(|node| node.role_uri.clone());
                let parent_concept_key = nodes
                    .first()
                    .and_then(|node| node.parent_concept_key.clone());
                (concept_key, order, depth, role_uri, parent_concept_key)
            })
            .collect::<Vec<_>>();
        ordered_concepts.sort_by(|left, right| {
            left.1
                .partial_cmp(&right.1)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| left.0.cmp(&right.0))
        });
        for (concept_key, presentation_order, depth, role_uri, parent_concept_key) in
            ordered_concepts
        {
            let fact_group = concept_groups
                .get(&concept_key)
                .cloned()
                .unwrap_or_default();
            let (namespace_uri, local_name) = split_concept_key(&concept_key);
            let qname = fact_group
                .first()
                .map(|(_, fact, _)| fact.qname.clone())
                .unwrap_or_else(|| format!("unknown:{local_name}"));
            let label = label_by_concept
                .get(&concept_key)
                .cloned()
                .unwrap_or_else(|| local_name_to_label(&local_name));
            let mut values = BTreeMap::<String, Option<f64>>::new();
            let mut units = BTreeMap::<String, Option<String>>::new();
            let mut source_fact_ids = Vec::<i64>::new();
            let mut has_dimensions = false;
            // Re-group this concept's facts by period signature.
            let mut fact_groups = HashMap::<String, Vec<(i64, ParsedFact)>>::new();
            for (fact_id, fact, _) in fact_group.iter() {
                fact_groups
                    .entry(period_signature(fact))
                    .or_default()
                    .push((*fact_id, fact.clone()));
            }
            for (signature, grouped_facts) in fact_groups {
                let Some(period_id) = period_id_by_signature.get(&signature) else {
                    continue;
                };
                let preferred = pick_preferred_fact(&grouped_facts);
                if let Some((fact_id, fact)) = preferred {
                    values.insert(period_id.clone(), Some(fact.value));
                    units.insert(period_id.clone(), fact.unit.clone());
                    source_fact_ids.push(*fact_id);
                    has_dimensions = has_dimensions || !fact.is_dimensionless;
                }
            }
            let row = StatementRowOutput {
                key: concept_key.clone(),
                label: label.clone(),
                concept_key: concept_key.clone(),
                qname: qname.clone(),
                namespace_uri: namespace_uri.clone(),
                local_name: local_name.clone(),
                is_extension: !is_standard_namespace(&namespace_uri),
                statement: statement_kind.to_string(),
                role_uri: role_uri.clone(),
                // Scale fractional presentation orders into stable integers.
                order: if presentation_order.is_finite() {
                    (presentation_order * 1000.0).round() as i64
                } else {
                    1_000_000
                },
                depth,
                parent_key: parent_concept_key.clone(),
                values,
                units,
                has_dimensions,
                source_fact_ids: {
                    source_fact_ids.sort();
                    source_fact_ids
                },
            };
            if let Some(statement_rows) = statement_rows.get_mut(statement_kind) {
                statement_rows.push(row.clone());
            }
            concepts.push(ConceptOutput {
                concept_key,
                qname,
                namespace_uri,
                local_name,
                label: Some(label),
                is_extension: !is_standard_namespace(&row.namespace_uri),
                balance: None,
                period_type: None,
                data_type: None,
                statement_kind: Some(statement_kind.to_string()),
                role_uri,
                authoritative_concept_key: None,
                mapping_method: None,
                surface_key: None,
                detail_parent_surface_key: None,
                kpi_key: None,
                residual_flag: false,
                presentation_order: if presentation_order.is_finite() {
                    Some(presentation_order)
                } else {
                    None
                },
                presentation_depth: Some(depth),
                parent_concept_key,
                is_abstract: presentation
                    .iter()
                    .find(|node| node.concept_key == row.concept_key)
                    .map(|node| node.is_abstract)
                    .unwrap_or(false),
            });
        }
    }
    MaterializedStatements {
        periods,
        statement_rows,
        concepts,
        facts: enriched_facts,
    }
}

/// Empty fact-grouping map with one entry per statement kind.
fn empty_parsed_fact_map(
) -> HashMap<&'static str, HashMap<String, Vec<(i64, ParsedFact, Option<PresentationNode>)>>> {
    let mut map = HashMap::new();
    for key in statement_keys() {
        map.insert(key, HashMap::new());
    }
    map
}

/// Empty statement-row map keyed by statement kind.
fn empty_statement_row_map() -> StatementRowMap {
    statement_keys()
        .into_iter()
        .map(|key| (key.to_string(), Vec::new()))
        .collect()
}

/// Empty surface-row map keyed by statement kind.
fn empty_surface_row_map() -> SurfaceRowMap {
    statement_keys()
        .into_iter()
        .map(|key| (key.to_string(), Vec::new()))
        .collect()
}

/// Empty detail-row map keyed by statement kind.
fn empty_detail_row_map() -> DetailRowStatementMap {
    statement_keys()
        .into_iter()
        .map(|key| (key.to_string(), BTreeMap::new()))
        .collect()
}

/// Canonical statement identifiers, in materialization order.
fn statement_keys() -> [&'static str; 5] {
    [
        "income",
        "balance",
        "cash_flow",
        "equity",
        "comprehensive_income",
    ]
}

/// Maps a dynamic statement-kind string to its canonical `'static` key.
fn statement_key_ref(value: &str) -> Option<&'static str> {
    match value {
        "income" => Some("income"),
        "balance" => Some("balance"),
        "cash_flow" => Some("cash_flow"),
        "equity" => Some("equity"),
        "comprehensive_income" => Some("comprehensive_income"),
        _ => None,
    }
}

/// Picks the preferred fact among duplicates for one concept/period:
/// dimensionless facts beat dimensional ones, later period dates beat
/// earlier, and larger absolute values break remaining ties.
fn pick_preferred_fact(grouped_facts: &[(i64, ParsedFact)]) -> Option<&(i64, ParsedFact)> {
    grouped_facts.iter().max_by(|left, right| {
        let left_dimension_score = if left.1.is_dimensionless { 1 } else { 0 };
        let right_dimension_score = if right.1.is_dimensionless { 1 } else { 0 };
        left_dimension_score
            .cmp(&right_dimension_score)
            .then_with(|| {
                let left_date = left
                    .1
                    .period_end
                    .as_ref()
                    .or(left.1.period_instant.as_ref())
                    .cloned()
                    .unwrap_or_default();
                let right_date = right
                    .1
                    .period_end
                    .as_ref()
                    .or(right.1.period_instant.as_ref())
                    .cloned()
                    .unwrap_or_default();
                left_date.cmp(&right_date)
            })
            .then_with(|| {
                left.1
                    .value
                    .abs()
                    .partial_cmp(&right.1.value.abs())
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
    })
}

/// Stable dedupe key for a fact's reporting period.
fn period_signature(fact: &ParsedFact) -> String {
    format!(
        "start:{}|end:{}|instant:{}",
        fact.period_start.clone().unwrap_or_default(),
        fact.period_end.clone().unwrap_or_default(),
        fact.period_instant.clone().unwrap_or_default()
    )
}

/// Splits `namespace#local` into its parts; a key with no `#` gets the
/// whole string as the local name under an unknown namespace.
fn split_concept_key(concept_key: &str) -> (String, String) {
    concept_key
        .rsplit_once('#')
        .map(|(namespace_uri, local_name)| (namespace_uri.to_string(), local_name.to_string()))
        .unwrap_or_else(|| ("urn:unknown".to_string(), concept_key.to_string()))
}

/// Turns a CamelCase concept local name into a spaced label, splitting at
/// lower->upper transitions and before acronym boundaries.
fn local_name_to_label(local_name: &str) -> String {
    // Compiled once instead of per call.
    static CAMEL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"([a-z0-9])([A-Z])"#).unwrap());
    static ACRONYM_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"([A-Z]+)([A-Z][a-z])"#).unwrap());
    let spaced = CAMEL_RE.replace_all(local_name, "$1 $2").to_string();
    ACRONYM_RE
        .replace_all(&spaced, "$1 $2")
        .replace('_', " ")
        .trim()
        .to_string()
}

/// Classifies a presentation role URI into a statement kind.
///
/// Order matters: cash flow is tested before equity (cash-flow roles often
/// mention "financing"), and balance before income. Regexes are compiled
/// once instead of on every call.
fn classify_statement_role(role_uri: &str) -> Option<String> {
    static CASH_FLOW_RE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r#"cash\s*flow|statementsof?cashflows|netcash"#).unwrap());
    static EQUITY_RE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r#"shareholders?|stockholders?|equity|retainedearnings"#).unwrap());
    static COMPREHENSIVE_RE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r#"comprehensive\s*income"#).unwrap());
    static BALANCE_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r#"balance\s*sheet|financial\s*position|assets?andliabilities"#).unwrap()
    });
    static INCOME_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r#"operations|income\s*statement|statementsofincome|profit"#).unwrap()
    });
    let normalized = role_uri.to_ascii_lowercase();
    if CASH_FLOW_RE.is_match(&normalized) {
        return Some("cash_flow".to_string());
    }
    if EQUITY_RE.is_match(&normalized) {
        return Some("equity".to_string());
    }
    if COMPREHENSIVE_RE.is_match(&normalized) {
        return Some("comprehensive_income".to_string());
    }
    if BALANCE_RE.is_match(&normalized) {
        return Some("balance".to_string());
    }
    if INCOME_RE.is_match(&normalized) {
        return Some("income".to_string());
    }
    None
}

/// Heuristic fallback classification from a concept's local name, used when
/// no presentation role is available. Check order is significant: equity and
/// comprehensive income first, then insurance DAC balance items, then cash
/// flow verbs, then balance nouns, then income/expense terms.
fn concept_statement_fallback(local_name: &str) -> Option<String> {
    static EQUITY_RE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r#"equity|retainedearnings|additionalpaidincapital"#).unwrap());
    static DAC_BALANCE_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(
            r#"deferredpolicyacquisitioncosts(andvalueofbusinessacquired)?$|supplementaryinsuranceinformationdeferredpolicyacquisitioncosts$|deferredacquisitioncosts$"#,
        )
        .unwrap()
    });
    static CASH_FLOW_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(
            r#"netcashprovidedbyusedin.*activities|increasedecreasein|paymentstoacquire|paymentsforcapitalimprovements$|paymentsfordepositsonrealestateacquisitions$|paymentsforrepurchase|paymentsofdividends|dividendscommonstockcash$|proceedsfrom|repaymentsofdebt|sharebasedcompensation$|allocatedsharebasedcompensationexpense$|depreciationdepletionandamortization$|depreciationamortizationandaccretionnet$|depreciationandamortization$|depreciationamortizationandother$|otheradjustmentstoreconcilenetincomelosstocashprovidedbyusedinoperatingactivities"#,
        )
        .unwrap()
    });
    static BALANCE_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(
            r#"asset|liabilit|debt|financingreceivable|loansreceivable|deposits|allowanceforcreditloss|futurepolicybenefits|policyholderaccountbalances|unearnedpremiums|realestateinvestmentproperty|grossatcarryingvalue|investmentproperty"#,
        )
        .unwrap()
    });
    static INCOME_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(
            r#"revenue|income|profit|expense|costof|leaseincome|rental|premiums|claims|underwriting|policyacquisition|interestincome|interestexpense|noninterest|leasedandrentedproperty"#,
        )
        .unwrap()
    });
    let normalized = local_name.to_ascii_lowercase();
    if EQUITY_RE.is_match(&normalized) {
        return Some("equity".to_string());
    }
    if normalized.contains("comprehensiveincome") {
        return Some("comprehensive_income".to_string());
    }
    if DAC_BALANCE_RE.is_match(&normalized) {
        return Some("balance".to_string());
    }
    if CASH_FLOW_RE.is_match(&normalized) {
        return Some("cash_flow".to_string());
    }
    if BALANCE_RE.is_match(&normalized) {
        return Some("balance".to_string());
    }
    if INCOME_RE.is_match(&normalized) {
        return Some("income".to_string());
    }
    None
}

/// True for namespaces of standard (non-company-extension) taxonomies.
fn is_standard_namespace(namespace_uri: &str) -> bool {
    let lower = namespace_uri.to_ascii_lowercase();
    lower.contains("us-gaap")
        || lower.contains("ifrs")
        || lower.contains("/dei/")
        || lower.contains("xbrl.sec.gov/dei")
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::pack_selector::FiscalPack;

    /// Builds a minimal duration period fixture.
    fn period(id: &str, period_end: &str) -> PeriodOutput {
        PeriodOutput {
            id: id.to_string(),
            filing_id: 1,
            accession_number: "0000000000-00-000001".to_string(),
            filing_date: "2025-12-31".to_string(),
            period_start: Some("2025-01-01".to_string()),
            period_end: Some(period_end.to_string()),
            filing_type: "10-K".to_string(),
            period_label: period_end.to_string(),
        }
    }

    /// Builds a statement row fixture from a qname and (period, value) pairs.
    fn row(
        key: &str,
        qname: &str,
        statement: &str,
        order: i64,
        values: &[(&str, f64)],
    ) -> StatementRowOutput {
        let namespace_uri = qname
            .split_once(':')
            .map(|(prefix, _)| {
                if prefix == "us-gaap" {
                    "http://fasb.org/us-gaap/2024".to_string()
                } else {
                    format!("urn:{prefix}")
                }
            })
            .unwrap_or_else(|| "urn:unknown".to_string());
        let local_name = qname
            .split_once(':')
            .map(|(_, local_name)| local_name.to_string())
            .unwrap_or_else(|| qname.to_string());
        StatementRowOutput {
            key: key.to_string(),
            label: local_name_to_label(&local_name),
            concept_key: format!("{namespace_uri}#{local_name}"),
            qname: qname.to_string(),
            namespace_uri,
            local_name,
            is_extension: false,
            statement: statement.to_string(),
            role_uri: Some(statement.to_string()),
            order,
            depth: 0,
            parent_key: None,
            values: values
                .iter()
                .map(|(period_id, value)| (period_id.to_string(), Some(*value)))
                .collect(),
            units: values
                .iter()
                .map(|(period_id, _)| (period_id.to_string(), Some("iso4217:USD".to_string())))
                .collect(),
            has_dimensions: false,
            source_fact_ids: vec![order],
        }
    }

    #[test]
    fn builds_compact_surface_rows_from_core_pack() {
        let periods = vec![period("2024", "2024-12-31"), period("2025", "2025-12-31")];
        let mut statement_rows = empty_statement_row_map();
        statement_rows.insert(
            "income".to_string(),
            vec![
                row(
                    "revenue-row",
                    "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax",
                    "income",
                    10,
                    &[("2024", 100.0), ("2025", 120.0)],
                ),
                row(
                    "operating-expenses-row",
                    "us-gaap:OperatingExpenses",
                    "income",
                    20,
                    &[("2024", 40.0), ("2025", 50.0)],
                ),
                row(
                    "sga-row",
                    "us-gaap:SellingGeneralAndAdministrativeExpense",
                    "income",
                    30,
                    &[("2024", 25.0), ("2025", 31.0)],
                ),
                row(
                    "rd-row",
                    "us-gaap:ResearchAndDevelopmentExpense",
                    "income",
                    40,
                    &[("2024", 15.0), ("2025", 19.0)],
                ),
                row(
                    "net-income-row",
                    "us-gaap:NetIncomeLoss",
                    "income",
                    50,
                    &[("2024", 22.0), ("2025", 30.0)],
                ),
                row(
                    "unmapped-row",
                    "company:OtherOperatingCharges",
                    "income",
                    60,
                    &[("2024", 3.0), ("2025", 4.0)],
                ),
            ],
        );
        statement_rows.insert(
            "balance".to_string(),
            vec![row(
                "assets-row",
                "us-gaap:Assets",
                "balance",
                70,
                &[("2024", 500.0), ("2025", 550.0)],
            )],
        );
        statement_rows.insert(
            "cash_flow".to_string(),
            vec![row(
                "ocf-row",
                "us-gaap:NetCashProvidedByUsedInOperatingActivities",
                "cash_flow",
                80,
                &[("2024", 60.0), ("2025", 65.0)],
            )],
        );
        let model = surface_mapper::build_compact_surface_model(
            &periods,
            &statement_rows,
            "us-gaap",
            FiscalPack::Core,
            vec![],
        )
        .expect("core pack should load and map");
        let income_surface_rows = model
            .surface_rows
            .get("income")
            .expect("income surface rows");
        let op_expenses = income_surface_rows
            .iter()
            .find(|row| row.key == "operating_expenses")
            .expect("operating expenses surface row");
        let revenue = income_surface_rows
            .iter()
            .find(|row| row.key == "revenue")
            .expect("revenue surface row");
        assert_eq!(revenue.values.get("2025").copied().flatten(), Some(120.0));
        assert_eq!(
            op_expenses.values.get("2024").copied().flatten(),
            Some(40.0)
        );
        assert_eq!(op_expenses.detail_count, Some(2));
        let operating_expense_details = model
            .detail_rows
            .get("income")
            .and_then(|groups| groups.get("operating_expenses"))
            .expect("operating expenses details");
        assert_eq!(operating_expense_details.len(), 2);
        assert!(operating_expense_details
            .iter()
            .any(|row| row.key == "sga-row"));
        assert!(operating_expense_details
            .iter()
            .any(|row| row.key == "rd-row"));
        let residual_rows = model
            .detail_rows
            .get("income")
            .and_then(|groups| groups.get("unmapped"))
            .expect("unmapped detail rows");
        assert_eq!(residual_rows.len(), 1);
        assert_eq!(residual_rows[0].key, "unmapped-row");
        assert!(residual_rows[0].residual_flag);
        let rd_mapping = model
            .concept_mappings
            .get("http://fasb.org/us-gaap/2024#ResearchAndDevelopmentExpense")
            .expect("rd mapping");
        assert_eq!(
            rd_mapping.detail_parent_surface_key.as_deref(),
            Some("operating_expenses")
        );
        assert_eq!(
            rd_mapping.surface_key.as_deref(),
            Some("operating_expenses")
        );
        let residual_mapping = model
            .concept_mappings
            .get("urn:company#OtherOperatingCharges")
            .expect("residual mapping");
        assert!(residual_mapping.residual_flag);
        assert_eq!(
            residual_mapping.detail_parent_surface_key.as_deref(),
            Some("unmapped")
        );
        assert_eq!(model.normalization_summary.surface_row_count, 6);
        assert_eq!(model.normalization_summary.detail_row_count, 3);
        assert_eq!(model.normalization_summary.unmapped_row_count, 1);
    }

    #[test]
    fn parses_basic_xbrl_facts_without_regex_backreferences() {
        // NOTE(review): the XML markup of this fixture was lost to extraction
        // garbling (only the text nodes "0000320193", the dates, "iso4217:USD"
        // and "1000" survived); the instance below is reconstructed from those
        // remnants — confirm it matches the original fixture's intent.
        let raw = r#"<xbrl xmlns:us-gaap="http://fasb.org/us-gaap/2024" xmlns:iso4217="http://www.xbrl.org/2003/iso4217">
  <context id="c1">
    <entity><identifier scheme="http://www.sec.gov/CIK">0000320193</identifier></entity>
    <period><startDate>2025-01-01</startDate><endDate>2025-12-31</endDate></period>
  </context>
  <unit id="u1"><measure>iso4217:USD</measure></unit>
  <us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax contextRef="c1" unitRef="u1" decimals="0">1000</us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax>
</xbrl>"#;
        let parsed = parse_xbrl_instance(raw, Some("test.xml".to_string()));
        assert_eq!(parsed.facts.len(), 1);
        assert_eq!(
            parsed.facts[0].qname,
            "us-gaap:RevenueFromContractWithCustomerExcludingAssessedTax"
        );
        assert_eq!(parsed.facts[0].value, 1000.0);
        assert_eq!(parsed.facts[0].unit.as_deref(), Some("iso4217:USD"));
    }

    #[test]
    fn classifies_pack_specific_concepts_without_presentation_roles() {
        assert_eq!(
            concept_statement_fallback(
                "FinancingReceivableExcludingAccruedInterestAfterAllowanceForCreditLoss"
            )
            .as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("Deposits").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("RealEstateInvestmentPropertyNet").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("DeferredPolicyAcquisitionCosts").as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("DeferredPolicyAcquisitionCostsAndValueOfBusinessAcquired")
                .as_deref(),
            Some("balance")
        );
        assert_eq!(
            concept_statement_fallback("IncreaseDecreaseInAccountsReceivable").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsOfDividends").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("RepaymentsOfDebt").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("ShareBasedCompensation").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsForCapitalImprovements").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("PaymentsForDepositsOnRealEstateAcquisitions").as_deref(),
            Some("cash_flow")
        );
        assert_eq!(
            concept_statement_fallback("LeaseIncome").as_deref(),
            Some("income")
        );
        assert_eq!(
            concept_statement_fallback("DirectCostsOfLeasedAndRentedPropertyOrEquipment")
                .as_deref(),
            Some("income")
        );
    }
}