feat(taxonomy): add rust sidecar compact surface pipeline

This commit is contained in:
2026-03-12 15:23:10 -04:00
parent f2c25fb9c6
commit 58061af006
84 changed files with 19350 additions and 265 deletions

View File

@@ -0,0 +1,667 @@
use anyhow::Result;
use std::collections::{BTreeMap, HashMap, HashSet};
use crate::pack_selector::FiscalPack;
use crate::taxonomy_loader::{load_crosswalk, load_surface_pack, CrosswalkFile, SurfaceDefinition};
use crate::{
ConceptOutput, DetailRowOutput, DetailRowStatementMap, FactOutput, NormalizationSummaryOutput,
PeriodOutput, StatementRowMap, StatementRowOutput, SurfaceRowMap, SurfaceRowOutput,
};
/// How a source concept was attached to the compact surface model.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MappingMethod {
    /// Matched through an authoritative concept (crosswalk entry or allowed
    /// authoritative concept of the surface definition).
    AuthoritativeDirect,
    /// Matched because the row's own qname/local name is an allowed source concept.
    DirectSourceConcept,
    /// The surface value was produced by summing matched child rows.
    AggregateChildren,
    /// Mapping originating from a taxonomy KPI.
    TaxonomyKpi,
    /// The row matched no surface definition and was kept as a residual.
    UnmappedResidual,
}

impl MappingMethod {
    /// Returns the stable snake_case identifier of the mapping method.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::AuthoritativeDirect => "authoritative_direct",
            Self::DirectSourceConcept => "direct_source_concept",
            Self::AggregateChildren => "aggregate_children",
            Self::TaxonomyKpi => "taxonomy_kpi",
            Self::UnmappedResidual => "unmapped_residual",
        }
    }
}
/// Mapping metadata recorded per concept key and later copied onto concepts and facts.
#[derive(Debug, Clone, Default)]
pub struct MappingAssignment {
/// Authoritative concept resolved for the source concept, if one was found.
pub authoritative_concept_key: Option<String>,
/// Method by which the concept was mapped; `None` until an assignment is made.
pub mapping_method: Option<MappingMethod>,
/// Key of the surface row the concept was assigned to.
pub surface_key: Option<String>,
/// Surface key under which the concept is listed as a detail row
/// (`"unmapped"` for residual rows).
pub detail_parent_surface_key: Option<String>,
/// KPI key for the concept; never set to `Some` by the code in this file section.
pub kpi_key: Option<String>,
/// `true` when the concept was kept as an unmapped residual.
pub residual_flag: bool,
}
/// Result of `build_compact_surface_model`.
#[derive(Debug, Default)]
pub struct CompactSurfaceModel {
/// Surface rows keyed by statement.
pub surface_rows: SurfaceRowMap,
/// Detail rows keyed by statement, then by parent surface key
/// (residual rows are stored under `"unmapped"`).
pub detail_rows: DetailRowStatementMap,
/// Row counters and the warnings passed in by the caller.
pub normalization_summary: NormalizationSummaryOutput,
/// Mapping assignment per concept key.
pub concept_mappings: HashMap<String, MappingAssignment>,
}
/// Role a matched statement row plays for a surface definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MatchRole {
/// The row can supply the surface value directly.
Surface,
/// The row is a child component of the surface (assigned to source-concept
/// matches when the definition's rollup policy is `aggregate_children`).
Detail,
}
/// A statement row that matched a surface definition, plus how it matched.
#[derive(Debug, Clone)]
struct MatchedStatementRow<'a> {
/// The matched statement row (borrowed, not copied).
row: &'a StatementRowOutput,
/// Authoritative concept resolved for the row, if any.
authoritative_concept_key: Option<String>,
/// Method by which the row matched.
mapping_method: MappingMethod,
/// Whether the row acts as the surface source or as a detail component.
match_role: MatchRole,
/// Match priority: 0 for authoritative matches, 1 for source-concept matches;
/// `pick_best_match` prefers the lower value.
rank: i64,
}
pub fn build_compact_surface_model(
periods: &[PeriodOutput],
statement_rows: &StatementRowMap,
taxonomy_regime: &str,
fiscal_pack: FiscalPack,
warnings: Vec<String>,
) -> Result<CompactSurfaceModel> {
let pack = load_surface_pack(fiscal_pack)?;
let crosswalk = load_crosswalk(taxonomy_regime)?;
let mut surface_rows = empty_surface_row_map();
let mut detail_rows = empty_detail_row_map();
let mut concept_mappings = HashMap::<String, MappingAssignment>::new();
let mut surface_row_count = 0usize;
let mut detail_row_count = 0usize;
let mut unmapped_row_count = 0usize;
let mut material_unmapped_row_count = 0usize;
for statement in statement_keys() {
let rows = statement_rows
.get(statement)
.cloned()
.unwrap_or_default();
let statement_definitions = pack
.surfaces
.iter()
.filter(|definition| definition.statement == statement)
.collect::<Vec<_>>();
let mut used_row_keys = HashSet::<String>::new();
let mut statement_surface_rows = Vec::<SurfaceRowOutput>::new();
let mut statement_detail_rows = BTreeMap::<String, Vec<DetailRowOutput>>::new();
for definition in statement_definitions {
let matches = rows
.iter()
.filter(|row| !used_row_keys.contains(&row.key))
.filter_map(|row| match_statement_row(row, definition, crosswalk.as_ref()))
.collect::<Vec<_>>();
if matches.is_empty() {
continue;
}
let direct_surface_matches = matches
.iter()
.filter(|matched| matched.match_role == MatchRole::Surface)
.cloned()
.collect::<Vec<_>>();
let detail_component_matches = matches
.iter()
.filter(|matched| matched.match_role == MatchRole::Detail)
.cloned()
.collect::<Vec<_>>();
let mut surface_source_matches = if !direct_surface_matches.is_empty() {
vec![pick_best_match(&direct_surface_matches).clone()]
} else if definition.rollup_policy == "aggregate_children" {
detail_component_matches.clone()
} else {
Vec::new()
};
if surface_source_matches.is_empty() {
continue;
}
let detail_matches = if definition.detail_grouping_policy == "group_all_children" {
if detail_component_matches.is_empty() && definition.rollup_policy == "aggregate_children" {
Vec::new()
} else {
detail_component_matches.clone()
}
} else {
Vec::new()
};
if definition.rollup_policy == "aggregate_children"
&& direct_surface_matches.is_empty()
&& !surface_source_matches.is_empty()
{
for matched in &mut surface_source_matches {
matched.mapping_method = MappingMethod::AggregateChildren;
}
}
let values = build_surface_values(periods, &surface_source_matches);
if !has_any_value(&values) {
continue;
}
let resolved_source_row_keys = periods
.iter()
.map(|period| {
let resolved = if surface_source_matches.len() == 1 {
surface_source_matches
.first()
.and_then(|matched| matched.row.values.get(&period.id).copied().flatten().map(|_| matched.row.key.clone()))
} else {
None
};
(period.id.clone(), resolved)
})
.collect::<BTreeMap<_, _>>();
let source_concepts = unique_sorted_strings(
surface_source_matches
.iter()
.map(|matched| matched.row.qname.clone())
.collect::<Vec<_>>(),
);
let source_row_keys = unique_sorted_strings(
surface_source_matches
.iter()
.map(|matched| matched.row.key.clone())
.collect::<Vec<_>>(),
);
let source_fact_ids = unique_sorted_i64(
surface_source_matches
.iter()
.flat_map(|matched| matched.row.source_fact_ids.clone())
.collect::<Vec<_>>(),
);
let has_dimensions = surface_source_matches.iter().any(|matched| matched.row.has_dimensions);
for matched in &surface_source_matches {
used_row_keys.insert(matched.row.key.clone());
concept_mappings.insert(
matched.row.concept_key.clone(),
MappingAssignment {
authoritative_concept_key: matched.authoritative_concept_key.clone(),
mapping_method: Some(matched.mapping_method),
surface_key: Some(definition.surface_key.clone()),
detail_parent_surface_key: None,
kpi_key: None,
residual_flag: false,
},
);
}
let details = detail_matches
.iter()
.map(|matched| {
used_row_keys.insert(matched.row.key.clone());
concept_mappings.insert(
matched.row.concept_key.clone(),
MappingAssignment {
authoritative_concept_key: matched.authoritative_concept_key.clone(),
mapping_method: Some(matched.mapping_method),
surface_key: Some(definition.surface_key.clone()),
detail_parent_surface_key: Some(definition.surface_key.clone()),
kpi_key: None,
residual_flag: false,
},
);
build_detail_row(matched.row, &definition.surface_key, false)
})
.collect::<Vec<_>>();
if !details.is_empty() {
detail_row_count += details.len();
statement_detail_rows.insert(definition.surface_key.clone(), details);
}
statement_surface_rows.push(SurfaceRowOutput {
key: definition.surface_key.clone(),
label: definition.label.clone(),
category: definition.category.clone(),
template_section: definition.category.clone(),
order: definition.order,
unit: definition.unit.clone(),
values,
source_concepts,
source_row_keys,
source_fact_ids,
formula_key: definition.formula_fallback.as_ref().map(|_| definition.surface_key.clone()),
has_dimensions,
resolved_source_row_keys,
statement: Some(definition.statement.clone()),
detail_count: statement_detail_rows
.get(&definition.surface_key)
.map(|rows| rows.len() as i64),
resolution_method: None,
confidence: None,
warning_codes: vec![],
});
surface_row_count += 1;
let _ = &definition.materiality_policy;
}
statement_surface_rows.sort_by(|left, right| left.order.cmp(&right.order).then_with(|| left.label.cmp(&right.label)));
let baseline = baseline_for_statement(statement, &statement_surface_rows);
let threshold = materiality_threshold(statement, baseline);
let residual_rows = rows
.iter()
.filter(|row| !used_row_keys.contains(&row.key))
.filter(|row| has_any_value(&row.values))
.map(|row| {
concept_mappings.insert(
row.concept_key.clone(),
MappingAssignment {
authoritative_concept_key: None,
mapping_method: Some(MappingMethod::UnmappedResidual),
surface_key: None,
detail_parent_surface_key: Some("unmapped".to_string()),
kpi_key: None,
residual_flag: true,
},
);
build_detail_row(row, "unmapped", true)
})
.collect::<Vec<_>>();
if !residual_rows.is_empty() {
unmapped_row_count += residual_rows.len();
material_unmapped_row_count += residual_rows
.iter()
.filter(|row| max_abs_value(&row.values) >= threshold)
.count();
detail_row_count += residual_rows.len();
statement_detail_rows.insert("unmapped".to_string(), residual_rows);
}
surface_rows.insert(statement.to_string(), statement_surface_rows);
detail_rows.insert(statement.to_string(), statement_detail_rows);
}
Ok(CompactSurfaceModel {
surface_rows,
detail_rows,
normalization_summary: NormalizationSummaryOutput {
surface_row_count,
detail_row_count,
kpi_row_count: 0,
unmapped_row_count,
material_unmapped_row_count,
warnings,
},
concept_mappings,
})
}
pub fn merge_mapping_assignments(
primary: &mut HashMap<String, MappingAssignment>,
secondary: HashMap<String, MappingAssignment>,
) {
for (concept_key, assignment) in secondary {
let existing = primary.entry(concept_key).or_default();
existing.authoritative_concept_key = existing
.authoritative_concept_key
.clone()
.or(assignment.authoritative_concept_key);
if existing.mapping_method.is_none()
|| matches!(existing.mapping_method, Some(MappingMethod::UnmappedResidual))
{
existing.mapping_method = assignment.mapping_method;
}
if existing.surface_key.is_none() {
existing.surface_key = assignment.surface_key;
}
if existing.detail_parent_surface_key.is_none() {
existing.detail_parent_surface_key = assignment.detail_parent_surface_key;
}
if existing.kpi_key.is_none() {
existing.kpi_key = assignment.kpi_key;
}
existing.residual_flag = existing.residual_flag && assignment.residual_flag;
}
}
/// Copies the mapping metadata stored in `mappings` onto every concept and fact
/// whose `concept_key` has an entry; items without an entry are left untouched.
pub fn apply_mapping_assignments(
    concepts: &mut [ConceptOutput],
    facts: &mut [FactOutput],
    mappings: &HashMap<String, MappingAssignment>,
) {
    // Textual form of the mapping method as stored on the output records.
    let method_label = |assignment: &MappingAssignment| {
        assignment
            .mapping_method
            .map(|method| method.as_str().to_string())
    };

    for concept in concepts.iter_mut() {
        let assignment = match mappings.get(&concept.concept_key) {
            Some(assignment) => assignment,
            None => continue,
        };
        concept.authoritative_concept_key = assignment.authoritative_concept_key.clone();
        concept.mapping_method = method_label(assignment);
        concept.surface_key = assignment.surface_key.clone();
        concept.detail_parent_surface_key = assignment.detail_parent_surface_key.clone();
        concept.kpi_key = assignment.kpi_key.clone();
        concept.residual_flag = assignment.residual_flag;
    }

    for fact in facts.iter_mut() {
        let assignment = match mappings.get(&fact.concept_key) {
            Some(assignment) => assignment,
            None => continue,
        };
        fact.authoritative_concept_key = assignment.authoritative_concept_key.clone();
        fact.mapping_method = method_label(assignment);
        fact.surface_key = assignment.surface_key.clone();
        fact.detail_parent_surface_key = assignment.detail_parent_surface_key.clone();
        fact.kpi_key = assignment.kpi_key.clone();
        fact.residual_flag = assignment.residual_flag;
    }
}
/// Tries to match a statement row against a surface definition.
///
/// The authoritative concept of the row is the crosswalk entry for its qname
/// when one exists; otherwise it is the qname itself for non-extension rows and
/// `None` for extension rows.
///
/// Returns an authoritative match (`rank` 0, `MatchRole::Surface`) when the
/// authoritative concept is among the definition's allowed authoritative
/// concepts, or when the crosswalk entry targets the definition's surface key.
/// Failing that, returns a source-concept match (`rank` 1) when the row's qname
/// or local name is an allowed source concept; its role is `MatchRole::Detail`
/// for `aggregate_children` definitions and `MatchRole::Surface` otherwise.
/// Returns `None` when neither applies.
fn match_statement_row<'a>(
    row: &'a StatementRowOutput,
    definition: &SurfaceDefinition,
    crosswalk: Option<&CrosswalkFile>,
) -> Option<MatchedStatementRow<'a>> {
    let crosswalk_entry = crosswalk.and_then(|file| file.mappings.get(&row.qname));
    let authoritative_concept_key = match crosswalk_entry {
        Some(entry) => Some(entry.authoritative_concept_key.clone()),
        None if row.is_extension => None,
        None => Some(row.qname.clone()),
    };

    let concept_is_allowed = match authoritative_concept_key.as_ref() {
        Some(concept) => definition
            .allowed_authoritative_concepts
            .iter()
            .any(|candidate| candidate_matches(candidate, concept)),
        None => false,
    };
    let is_authoritative = concept_is_allowed
        || match crosswalk_entry {
            Some(entry) => entry.surface_key == definition.surface_key,
            None => false,
        };
    if is_authoritative {
        return Some(MatchedStatementRow {
            row,
            authoritative_concept_key,
            mapping_method: MappingMethod::AuthoritativeDirect,
            match_role: MatchRole::Surface,
            rank: 0,
        });
    }

    let is_source_concept = definition.allowed_source_concepts.iter().any(|candidate| {
        candidate_matches(candidate, &row.qname) || candidate_matches(candidate, &row.local_name)
    });
    if !is_source_concept {
        return None;
    }

    let match_role = if definition.rollup_policy == "aggregate_children" {
        MatchRole::Detail
    } else {
        MatchRole::Surface
    };
    Some(MatchedStatementRow {
        row,
        authoritative_concept_key,
        mapping_method: MappingMethod::DirectSourceConcept,
        match_role,
        rank: 1,
    })
}
/// Selects the preferred match from a non-empty slice.
///
/// Preference order: lower `rank`, rows without dimensions, lower row `order`,
/// larger maximum absolute value, then label. When several matches compare
/// equal, the earliest one in the slice is returned.
///
/// # Panics
///
/// Panics when `matches` is empty.
fn pick_best_match<'a>(matches: &'a [MatchedStatementRow<'a>]) -> &'a MatchedStatementRow<'a> {
    use std::cmp::Ordering;

    let compare = |left: &MatchedStatementRow<'_>, right: &MatchedStatementRow<'_>| -> Ordering {
        left.rank
            .cmp(&right.rank)
            // `false < true`, so rows without dimensions sort first.
            .then_with(|| left.row.has_dimensions.cmp(&right.row.has_dimensions))
            .then_with(|| left.row.order.cmp(&right.row.order))
            .then_with(|| {
                // Operands are swapped on purpose: the larger magnitude wins.
                max_abs_value(&right.row.values)
                    .partial_cmp(&max_abs_value(&left.row.values))
                    .unwrap_or(Ordering::Equal)
            })
            .then_with(|| left.row.label.cmp(&right.row.label))
    };

    let mut remaining = matches.iter();
    let mut best = remaining
        .next()
        .expect("pick_best_match requires at least one match");
    for candidate in remaining {
        // Replace only on a strict improvement so ties keep the earliest entry.
        if compare(best, candidate) == Ordering::Greater {
            best = candidate;
        }
    }
    best
}
/// Computes the surface value for each period from the matched source rows.
///
/// With exactly one match the row's value for the period is used as is; with
/// any other number of matches the values are combined via `sum_nullable_values`.
fn build_surface_values(
    periods: &[PeriodOutput],
    matches: &[MatchedStatementRow<'_>],
) -> BTreeMap<String, Option<f64>> {
    let mut values_by_period = BTreeMap::new();
    for period in periods {
        let value_of = |matched: &MatchedStatementRow<'_>| {
            matched.row.values.get(&period.id).copied().flatten()
        };
        let value = match matches {
            [only] => value_of(only),
            several => sum_nullable_values(several.iter().map(value_of).collect()),
        };
        values_by_period.insert(period.id.clone(), value);
    }
    values_by_period
}
/// Sums optional values, counting missing entries as zero.
///
/// Returns `None` when no entry carries a value (including the empty input).
fn sum_nullable_values(values: Vec<Option<f64>>) -> Option<f64> {
    let any_present = values.iter().any(Option::is_some);
    if !any_present {
        return None;
    }
    let total = values
        .iter()
        .map(|entry| entry.unwrap_or(0.0))
        .sum::<f64>();
    Some(total)
}
/// Converts a statement row into a detail row attached to `parent_surface_key`.
///
/// The unit is the first non-empty unit found among the row's per-period units.
/// `dimensions_summary` holds the single marker `"has_dimensions"` when the row
/// has dimensions and is empty otherwise.
fn build_detail_row(
    row: &StatementRowOutput,
    parent_surface_key: &str,
    residual_flag: bool,
) -> DetailRowOutput {
    let unit = row.units.values().flatten().next().cloned();
    let dimensions_summary: Vec<String> = row
        .has_dimensions
        .then(|| "has_dimensions".to_string())
        .into_iter()
        .collect();

    DetailRowOutput {
        key: row.key.clone(),
        parent_surface_key: parent_surface_key.to_owned(),
        label: row.label.clone(),
        concept_key: row.concept_key.clone(),
        qname: row.qname.clone(),
        namespace_uri: row.namespace_uri.clone(),
        local_name: row.local_name.clone(),
        unit,
        values: row.values.clone(),
        source_fact_ids: row.source_fact_ids.clone(),
        is_extension: row.is_extension,
        dimensions_summary,
        residual_flag,
    }
}
/// Returns `true` when at least one period carries a value.
fn has_any_value(values: &BTreeMap<String, Option<f64>>) -> bool {
    values.values().flatten().next().is_some()
}
/// Returns the largest absolute value across all periods, or `0.0` when the
/// map holds no values.
fn max_abs_value(values: &BTreeMap<String, Option<f64>>) -> f64 {
    values
        .values()
        .flatten()
        .fold(0.0_f64, |largest, value| largest.max(value.abs()))
}
/// Returns the materiality baseline of a statement: the largest absolute value
/// of its anchor surface row (`total_assets` for `"balance"`, `revenue` for
/// every other statement), or `0.0` when that row is absent.
fn baseline_for_statement(statement: &str, surface_rows: &[SurfaceRowOutput]) -> f64 {
    let anchor_key = match statement {
        "balance" => "total_assets",
        _ => "revenue",
    };
    for row in surface_rows {
        if row.key == anchor_key {
            return max_abs_value(&row.values);
        }
    }
    0.0
}
/// Returns the magnitude at or above which an unmapped row counts as material.
///
/// The threshold is a share of `baseline` with a fixed floor: 0.5% and
/// 5,000,000 for `"balance"`, 1% and 1,000,000 for every other statement.
fn materiality_threshold(statement: &str, baseline: f64) -> f64 {
    let (share, floor) = match statement {
        "balance" => (0.005, 5_000_000.0),
        _ => (0.01, 1_000_000.0),
    };
    (baseline * share).max(floor)
}
/// Returns the distinct strings of `values` in ascending order.
fn unique_sorted_strings(values: Vec<String>) -> Vec<String> {
    let mut ordered = values;
    ordered.sort_unstable();
    // After sorting, duplicates are adjacent and can be dropped in one pass.
    ordered.dedup();
    ordered
}
/// Returns the distinct integers of `values` in ascending order.
fn unique_sorted_i64(values: Vec<i64>) -> Vec<i64> {
    let mut ordered = values;
    ordered.sort_unstable();
    // After sorting, duplicates are adjacent and can be dropped in one pass.
    ordered.dedup();
    ordered
}
/// Compares a configured concept name with an actual one, ignoring ASCII case.
///
/// The names match when they are equal as written, or when the part after the
/// last `:` of either name equals the other name in full. Two prefixed names
/// with different prefixes therefore do not match.
fn candidate_matches(candidate: &str, actual: &str) -> bool {
    // Part of a qualified name after its last ':' separator, if it has one.
    fn local_part(name: &str) -> Option<&str> {
        name.rsplit_once(':').map(|(_, local)| local)
    }

    if candidate.eq_ignore_ascii_case(actual) {
        return true;
    }
    if matches!(local_part(candidate), Some(local) if local.eq_ignore_ascii_case(actual)) {
        return true;
    }
    matches!(local_part(actual), Some(local) if local.eq_ignore_ascii_case(candidate))
}
/// Returns the statement keys handled by the pipeline, in processing order.
fn statement_keys() -> [&'static str; 5] {
    const STATEMENT_KEYS: [&str; 5] = [
        "income",
        "balance",
        "cash_flow",
        "equity",
        "comprehensive_income",
    ];
    STATEMENT_KEYS
}
fn empty_surface_row_map() -> SurfaceRowMap {
statement_keys()
.into_iter()
.map(|key| (key.to_string(), Vec::new()))
.collect()
}
fn empty_detail_row_map() -> DetailRowStatementMap {
statement_keys()
.into_iter()
.map(|key| (key.to_string(), BTreeMap::new()))
.collect()
}
// Unit tests for the compact surface model builder. They call
// `build_compact_surface_model` with the `FiscalPack::Core` pack and the
// "us-gaap" regime, so they rely on that pack and crosswalk being loadable.
#[cfg(test)]
mod tests {
use super::*;
use crate::pack_selector::FiscalPack;
use crate::{PeriodOutput, StatementRowOutput};
// Builds a 10-K period fixture for fiscal year 2025 whose id and label are `id`.
fn period(id: &str) -> PeriodOutput {
PeriodOutput {
id: id.to_string(),
filing_id: 1,
accession_number: "0000000000-00-000001".to_string(),
filing_date: "2025-12-31".to_string(),
period_start: Some("2025-01-01".to_string()),
period_end: Some("2025-12-31".to_string()),
filing_type: "10-K".to_string(),
period_label: id.to_string(),
}
}
// Builds a non-extension, dimensionless statement row with a single USD value
// for period "p1". The local name is the part of `qname` after the first ':'
// and falls back to `key` when `qname` has no prefix.
fn row(key: &str, qname: &str, statement: &str, value: f64) -> StatementRowOutput {
StatementRowOutput {
key: key.to_string(),
label: key.to_string(),
concept_key: format!("http://fasb.org/us-gaap/2024#{}", qname.split(':').nth(1).unwrap_or(key)),
qname: qname.to_string(),
namespace_uri: "http://fasb.org/us-gaap/2024".to_string(),
local_name: qname.split(':').nth(1).unwrap_or(key).to_string(),
is_extension: false,
statement: statement.to_string(),
role_uri: Some(statement.to_string()),
order: 1,
depth: 0,
parent_key: None,
values: BTreeMap::from([("p1".to_string(), Some(value))]),
units: BTreeMap::from([("p1".to_string(), Some("iso4217:USD".to_string()))]),
has_dimensions: false,
source_fact_ids: vec![1],
}
}
// Builds a statement row map with an empty row list for each of the five statements.
fn empty_map() -> StatementRowMap {
BTreeMap::from([
("income".to_string(), Vec::new()),
("balance".to_string(), Vec::new()),
("cash_flow".to_string(), Vec::new()),
("equity".to_string(), Vec::new()),
("comprehensive_income".to_string(), Vec::new()),
])
}
// A directly reported operating-expenses total (40.0) must be used as the
// surface value instead of the sum of its two children, while both children
// are still attached as detail rows.
// NOTE(review): assumes the Core pack defines an `operating_expenses` surface
// that groups SG&A and R&D as children — confirm against the pack data.
#[test]
fn prefers_direct_authoritative_row_over_child_aggregation() {
let mut rows = empty_map();
rows.get_mut("income").unwrap().extend([
row("op-expenses", "us-gaap:OperatingExpenses", "income", 40.0),
row("sga", "us-gaap:SellingGeneralAndAdministrativeExpense", "income", 25.0),
row("rd", "us-gaap:ResearchAndDevelopmentExpense", "income", 15.0),
]);
let model = build_compact_surface_model(
&[period("p1")],
&rows,
"us-gaap",
FiscalPack::Core,
vec![],
)
.expect("compact model should build");
let op_expenses = model
.surface_rows
.get("income")
.unwrap()
.iter()
.find(|row| row.key == "operating_expenses")
.unwrap();
assert_eq!(op_expenses.values.get("p1").copied().flatten(), Some(40.0));
assert_eq!(op_expenses.detail_count, Some(2));
}
// A row that matches no surface definition must be emitted as a flagged
// residual detail row under the "unmapped" parent key.
#[test]
fn emits_unmapped_residual_rows() {
let mut rows = empty_map();
rows.get_mut("income").unwrap().push(row("custom", "company:CustomMetric", "income", 12.0));
let model = build_compact_surface_model(
&[period("p1")],
&rows,
"us-gaap",
FiscalPack::Core,
vec![],
)
.expect("compact model should build");
let residual_rows = model.detail_rows.get("income").unwrap().get("unmapped").unwrap();
assert_eq!(residual_rows.len(), 1);
assert!(residual_rows[0].residual_flag);
}
}