feat(taxonomy): add rust sidecar compact surface pipeline

This commit is contained in:
2026-03-12 15:23:10 -04:00
parent f2c25fb9c6
commit 58061af006
84 changed files with 19350 additions and 265 deletions

View File

@@ -0,0 +1,700 @@
use anyhow::Result;
use std::collections::{BTreeMap, HashMap, HashSet};
use crate::pack_selector::FiscalPack;
use crate::surface_mapper::{MappingAssignment, MappingMethod};
use crate::taxonomy_loader::{load_kpi_pack, KpiDefinition};
use crate::{FactOutput, KpiRowOutput, PeriodOutput};
/// Output of taxonomy-pack KPI extraction for a single filing.
#[derive(Debug, Default)]
pub struct KpiExtractionResult {
    /// Built KPI rows; sorted by `order` then `label` when produced by
    /// `build_taxonomy_kpis`.
    pub rows: Vec<KpiRowOutput>,
    /// Concept key -> mapping assignment for every source concept a KPI consumed.
    pub mapping_assignments: HashMap<String, MappingAssignment>,
    /// Non-fatal diagnostics collected during extraction.
    pub warnings: Vec<String>,
}
/// Builds the taxonomy-pack KPI rows for a filing.
///
/// Returns an empty result for the `Core` pack (no specialized KPIs apply).
/// Each concept that contributed to a built row is recorded in
/// `mapping_assignments` with `MappingMethod::TaxonomyKpi`.
///
/// # Errors
/// Propagates any error from `load_kpi_pack`.
pub fn build_taxonomy_kpis(
    periods: &[PeriodOutput],
    facts: &[FactOutput],
    fiscal_pack: FiscalPack,
) -> Result<KpiExtractionResult> {
    // The core pack carries no KPI definitions; short-circuit with defaults.
    if fiscal_pack == FiscalPack::Core {
        return Ok(KpiExtractionResult::default());
    }
    let kpi_pack = load_kpi_pack(fiscal_pack)?;
    let mut rows = Vec::<KpiRowOutput>::new();
    let mut mapping_assignments = HashMap::<String, MappingAssignment>::new();
    for (index, definition) in kpi_pack.kpis.iter().enumerate() {
        // Definitions with no matching facts are skipped silently.
        let Some(kpi_row) = build_kpi_row(definition, index as i64, periods, facts) else {
            continue;
        };
        // Register a TaxonomyKpi mapping for every concept the row consumed.
        for concept_key in unique_sorted_strings(kpi_row.source_concepts.iter().map(|qname| concept_key_from_qname(qname)).collect()) {
            mapping_assignments.insert(
                concept_key,
                MappingAssignment {
                    authoritative_concept_key: None,
                    mapping_method: Some(MappingMethod::TaxonomyKpi),
                    surface_key: None,
                    detail_parent_surface_key: None,
                    kpi_key: Some(kpi_row.key.clone()),
                    residual_flag: false,
                },
            );
        }
        rows.push(kpi_row);
    }
    // Deterministic presentation order: numeric `order`, then label as tie-break.
    rows.sort_by(|left, right| left.order.cmp(&right.order).then_with(|| left.label.cmp(&right.label)));
    Ok(KpiExtractionResult {
        rows,
        mapping_assignments,
        warnings: vec![],
    })
}
/// Dispatches a KPI definition to its extraction strategy by `definition.key`.
///
/// Growth KPIs compare matching values period-over-period; direct/formula KPIs
/// first look for a directly reported concept and may fall back to a
/// numerator/denominator formula. Unknown keys yield `None`.
fn build_kpi_row(
    definition: &KpiDefinition,
    order_index: i64,
    periods: &[PeriodOutput],
    facts: &[FactOutput],
) -> Option<KpiRowOutput> {
    match definition.key.as_str() {
        // Period-over-period growth in loan balances.
        "loan_growth" => growth_kpi_row(
            definition,
            order_index,
            periods,
            facts,
            &[
                "FinancingReceivableRecordedInvestment",
                "LoansReceivableNetReportedAmount",
                "FinancingReceivableExcludingAccruedInterestBeforeAllowanceForCreditLoss",
                "FinancingReceivableExcludingAccruedInterestAfterAllowanceForCreditLoss",
                "FinanceReceivableAllowanceForCreditLossesExcluded",
            ],
        ),
        // Period-over-period growth in deposit balances.
        "deposit_growth" => growth_kpi_row(
            definition,
            order_index,
            periods,
            facts,
            &["DepositsLiabilities", "Deposits", "DepositsDomestic", "DepositsForeign"],
        ),
        // Period-over-period growth in insurance premium revenue.
        "premium_growth" => growth_kpi_row(
            definition,
            order_index,
            periods,
            facts,
            &[
                "Premiums",
                "PremiumsEarned",
                "PremiumsWritten",
                "PremiumsEarnedNet",
                "PremiumsWrittenNet",
                "SupplementaryInsuranceInformationPremiumRevenue",
            ],
        ),
        // Direct NIM concept if reported; otherwise net interest income / assets.
        "net_interest_margin" => direct_or_formula_row(
            definition,
            order_index,
            periods,
            facts,
            &["NetInterestMargin", "NetInterestSpread"],
            Some((
                &[
                    "InterestAndDividendIncomeOperating",
                    "InterestIncomeExpenseOperatingNet",
                    "InterestIncomeExpenseNet",
                ],
                &["Assets", "AverageInterestEarningAssets"],
                true,
            )),
        ),
        // Direct combined ratio if reported; otherwise claims / premiums.
        "combined_ratio" => direct_or_formula_row(
            definition,
            order_index,
            periods,
            facts,
            &["CombinedRatio"],
            Some((
                &[
                    "PolicyholderBenefitsAndClaimsIncurredNet",
                    "BenefitsLossesAndExpenses",
                    "LossesAndLossAdjustmentExpenses",
                    "SupplementaryInsuranceInformationBenefitsClaimsLossesAndSettlementExpense",
                ],
                &[
                    "Premiums",
                    "PremiumsEarned",
                    "PremiumsWritten",
                    "PremiumsEarnedNet",
                    "PremiumsWrittenNet",
                    "SupplementaryInsuranceInformationPremiumRevenue",
                ],
                true,
            )),
        ),
        // Direct-only KPIs (no fallback formula).
        "property_count" => direct_or_formula_row(
            definition,
            order_index,
            periods,
            facts,
            &["NumberOfRealEstateProperties", "SECScheduleIIIRealEstateNumberOfUnits"],
            None,
        ),
        "investment_property_growth" => growth_kpi_row(
            definition,
            order_index,
            periods,
            facts,
            &[
                "RealEstateInvestmentPropertyNet",
                "RealEstateInvestmentPropertyAtCost",
                "RealEstateGrossAtCarryingValue",
            ],
        ),
        "aum" => direct_or_formula_row(
            definition,
            order_index,
            periods,
            facts,
            &["AssetsUnderManagementCarryingAmount"],
            None,
        ),
        "fee_paying_aum" => direct_or_formula_row(
            definition,
            order_index,
            periods,
            facts,
            &["FeePayingAssetUnderManagement"],
            None,
        ),
        // Unrecognized KPI key: nothing to build.
        _ => None,
    }
}
/// Builds a period-over-period growth KPI row from the facts matching
/// `local_names`. Growth for a period is `current / previous - 1`, and is
/// `None` when either side is missing or the previous value is zero.
fn growth_kpi_row(
    definition: &KpiDefinition,
    order_index: i64,
    periods: &[PeriodOutput],
    facts: &[FactOutput],
    local_names: &[&str],
) -> Option<KpiRowOutput> {
    let matched = collect_period_values(periods, facts, local_names);
    if matched.values.is_empty() {
        return None;
    }
    let ordered = sort_periods(periods);
    let mut values = BTreeMap::<String, Option<f64>>::new();
    // Walk consecutive period pairs in chronological order.
    for pair in ordered.windows(2) {
        let [previous, current] = pair else { continue };
        let current_value = matched.values.get(&current.id).copied().flatten();
        let previous_value = matched.values.get(&previous.id).copied().flatten();
        // Only a nonzero previous value yields a defined growth rate.
        let growth = current_value
            .zip(previous_value)
            .and_then(|(cur, prev)| (prev != 0.0).then(|| cur / prev - 1.0));
        values.insert(current.id.clone(), growth);
    }
    build_kpi_output(definition, order_index, "operating_kpi", values, matched)
}
/// Builds a KPI row by trying, in order:
/// 1. a directly reported concept aligned to known periods,
/// 2. a directly reported concept aligned by period end date,
/// 3. a numerator/denominator formula aligned to known periods,
/// 4. the same formula aligned by period end date.
///
/// NOTE(review): when the formula's `divide` flag is false the computed value
/// is always `None` (both formula branches only handle division); all current
/// call sites pass `true`.
fn direct_or_formula_row(
    definition: &KpiDefinition,
    order_index: i64,
    periods: &[PeriodOutput],
    facts: &[FactOutput],
    direct_local_names: &[&str],
    formula: Option<(&[&str], &[&str], bool)>,
) -> Option<KpiRowOutput> {
    // Strategy 1: directly reported concept matched to a known period.
    let direct = collect_period_values(periods, facts, direct_local_names);
    if !direct.values.is_empty() {
        return build_kpi_output(
            definition,
            order_index,
            "operating_kpi",
            direct.values.clone(),
            direct,
        );
    }
    // Strategy 2: directly reported concept matched by period end date only.
    let direct_by_end_date = collect_end_date_values(facts, direct_local_names);
    if !direct_by_end_date.values.is_empty() {
        return build_date_aligned_kpi_output(
            definition,
            order_index,
            "operating_kpi",
            periods,
            direct_by_end_date,
        );
    }
    // No fallback formula configured: give up.
    let Some((numerator_names, denominator_names, divide)) = formula else {
        return None;
    };
    // Strategy 3: numerator/denominator formula over known periods.
    let numerator = collect_period_values(periods, facts, numerator_names);
    let denominator = collect_period_values(periods, facts, denominator_names);
    let mut values = BTreeMap::<String, Option<f64>>::new();
    let mut sources = PeriodFactValues::default();
    for period in periods {
        let numerator_value = numerator.values.get(&period.id).copied().flatten();
        let denominator_value = denominator.values.get(&period.id).copied().flatten();
        // Division by zero (or a missing side) yields None for that period.
        let next_value = if divide {
            match (numerator_value, denominator_value) {
                (Some(numerator_value), Some(denominator_value)) if denominator_value != 0.0 => {
                    Some(numerator_value / denominator_value)
                }
                _ => None,
            }
        } else {
            None
        };
        values.insert(period.id.clone(), next_value);
        // Accumulate provenance from both sides of the formula.
        for qname in numerator.source_concepts.iter().chain(denominator.source_concepts.iter()) {
            sources.source_concepts.insert(qname.clone());
        }
        for fact_id in numerator.source_fact_ids.iter().chain(denominator.source_fact_ids.iter()) {
            sources.source_fact_ids.insert(*fact_id);
        }
        sources.has_dimensions = sources.has_dimensions || numerator.has_dimensions || denominator.has_dimensions;
    }
    if values.values().any(|value| value.is_some()) {
        return Some(KpiRowOutput {
            key: definition.key.clone(),
            label: definition.label.clone(),
            category: "operating_kpi".to_string(),
            unit: definition.unit.clone(),
            order: (order_index + 1) * 10,
            segment: None,
            axis: None,
            member: None,
            values,
            source_concepts: unique_sorted_strings(sources.source_concepts.into_iter().collect()),
            source_fact_ids: unique_sorted_i64(sources.source_fact_ids.into_iter().collect()),
            provenance_type: "taxonomy".to_string(),
            has_dimensions: sources.has_dimensions,
        });
    }
    // Strategy 4: formula aligned by end date (handles facts whose contexts do
    // not line up exactly with the known periods).
    let numerator_by_end_date = collect_end_date_values(facts, numerator_names);
    let denominator_by_end_date = collect_end_date_values(facts, denominator_names);
    let mut aligned_values = BTreeMap::<String, Option<f64>>::new();
    for end_date in numerator_by_end_date.values.keys() {
        let numerator_value = numerator_by_end_date.values.get(end_date).copied().flatten();
        let denominator_value = denominator_by_end_date.values.get(end_date).copied().flatten();
        let next_value = if divide {
            match (numerator_value, denominator_value) {
                (Some(numerator_value), Some(denominator_value)) if denominator_value != 0.0 => {
                    Some(numerator_value / denominator_value)
                }
                _ => None,
            }
        } else {
            None
        };
        // Duration periods are preferred here since the numerator is a flow.
        let Some(period_id) = select_period_id_for_end_date(periods, end_date, true) else {
            continue;
        };
        aligned_values.insert(period_id, next_value);
    }
    if aligned_values.values().all(|value| value.is_none()) {
        return None;
    }
    let mut aligned_sources = DateFactValues {
        prefer_duration: true,
        ..DateFactValues::default()
    };
    aligned_sources.source_concepts.extend(numerator_by_end_date.source_concepts);
    aligned_sources.source_concepts.extend(denominator_by_end_date.source_concepts);
    aligned_sources.source_fact_ids.extend(numerator_by_end_date.source_fact_ids);
    aligned_sources.source_fact_ids.extend(denominator_by_end_date.source_fact_ids);
    aligned_sources.has_dimensions = numerator_by_end_date.has_dimensions || denominator_by_end_date.has_dimensions;
    Some(KpiRowOutput {
        key: definition.key.clone(),
        label: definition.label.clone(),
        category: "operating_kpi".to_string(),
        unit: definition.unit.clone(),
        order: (order_index + 1) * 10,
        segment: None,
        axis: None,
        member: None,
        values: aligned_values,
        source_concepts: unique_sorted_strings(aligned_sources.source_concepts.into_iter().collect()),
        source_fact_ids: unique_sorted_i64(aligned_sources.source_fact_ids.into_iter().collect()),
        provenance_type: "taxonomy".to_string(),
        has_dimensions: aligned_sources.has_dimensions,
    })
}
/// Values keyed by period id, plus provenance of the facts that supplied them.
#[derive(Debug, Default, Clone)]
struct PeriodFactValues {
    // period id -> selected numeric value (None when no usable fact).
    values: BTreeMap<String, Option<f64>>,
    // qnames of every fact that contributed a value.
    source_concepts: HashSet<String>,
    // 1-based positions of contributing facts in the input slice.
    source_fact_ids: HashSet<i64>,
    // True if any contributing fact carried dimensions.
    has_dimensions: bool,
}
/// Values keyed by period end date (or instant date), plus fact provenance.
#[derive(Debug, Default, Clone)]
struct DateFactValues {
    // end date -> selected numeric value.
    values: BTreeMap<String, Option<f64>>,
    // qnames of every fact that contributed a value.
    source_concepts: HashSet<String>,
    // 1-based positions of contributing facts in the input slice.
    source_fact_ids: HashSet<i64>,
    // True if any contributing fact carried dimensions.
    has_dimensions: bool,
    // True if any contributing fact had a period start (i.e. was a duration),
    // used later to prefer duration periods when resolving end dates.
    prefer_duration: bool,
}
/// Collects, per known period, the preferred fact value among facts whose
/// local name (case-insensitively) matches one of `local_names`.
fn collect_period_values(
    periods: &[PeriodOutput],
    facts: &[FactOutput],
    local_names: &[&str],
) -> PeriodFactValues {
    let wanted = local_names
        .iter()
        .map(|name| name.to_ascii_lowercase())
        .collect::<HashSet<_>>();
    // Bucket candidate facts by the period they resolve to; the stored i64 is
    // the fact's 1-based position in the input slice.
    let mut by_period = HashMap::<String, Vec<(i64, &FactOutput)>>::new();
    for (position, fact) in facts.iter().enumerate() {
        if !wanted.contains(&fact.local_name.to_ascii_lowercase()) {
            continue;
        }
        if let Some(period_id) = period_id_for_fact(periods, fact) {
            by_period
                .entry(period_id)
                .or_default()
                .push((position as i64 + 1, fact));
        }
    }
    // Pick one preferred fact per period and record its value + provenance.
    let mut out = PeriodFactValues::default();
    for period in periods {
        let Some(candidates) = by_period.get(&period.id) else { continue };
        let Some((fact_id, fact)) = pick_preferred_fact(candidates) else { continue };
        out.values.insert(period.id.clone(), Some(fact.value_num));
        out.source_concepts.insert(fact.qname.clone());
        out.source_fact_ids.insert(*fact_id);
        out.has_dimensions = out.has_dimensions || !fact.is_dimensionless;
    }
    out
}
/// Collects, per end date (or instant date), the preferred fact value among
/// facts whose local name matches `local_names` case-insensitively.
fn collect_end_date_values(
    facts: &[FactOutput],
    local_names: &[&str],
) -> DateFactValues {
    let wanted = local_names
        .iter()
        .map(|name| name.to_ascii_lowercase())
        .collect::<HashSet<_>>();
    // Bucket candidates by end date; instants fall back to their instant date.
    let mut by_end_date = HashMap::<String, Vec<(i64, &FactOutput)>>::new();
    for (position, fact) in facts.iter().enumerate() {
        if !wanted.contains(&fact.local_name.to_ascii_lowercase()) {
            continue;
        }
        if let Some(end_date) = fact.period_end.clone().or_else(|| fact.period_instant.clone()) {
            by_end_date
                .entry(end_date)
                .or_default()
                .push((position as i64 + 1, fact));
        }
    }
    let mut out = DateFactValues::default();
    for (end_date, candidates) in by_end_date {
        let Some((fact_id, fact)) = pick_preferred_fact(&candidates) else { continue };
        out.values.insert(end_date, Some(fact.value_num));
        out.source_concepts.insert(fact.qname.clone());
        out.source_fact_ids.insert(*fact_id);
        out.has_dimensions = out.has_dimensions || !fact.is_dimensionless;
        // Any duration fact flips the preference toward duration periods.
        out.prefer_duration = out.prefer_duration || fact.period_start.is_some();
    }
    out
}
/// Resolves the period whose start and end exactly match this fact's context;
/// instant facts match on their instant date as the end.
fn period_id_for_fact(periods: &[PeriodOutput], fact: &FactOutput) -> Option<String> {
    let fact_end = fact.period_end.clone().or_else(|| fact.period_instant.clone());
    for period in periods {
        if period.period_start == fact.period_start && period.period_end == fact_end {
            return Some(period.id.clone());
        }
    }
    None
}
/// Picks the preferred fact from a group: dimensionless facts beat dimensioned
/// ones, and among equals the larger absolute value wins (NaNs compare equal).
fn pick_preferred_fact<'a>(grouped_facts: &'a [(i64, &'a FactOutput)]) -> Option<&'a (i64, &'a FactOutput)> {
    grouped_facts.iter().max_by(|left, right| {
        // bool ordering: false < true, matching the original 0/1 scoring.
        left.1
            .is_dimensionless
            .cmp(&right.1.is_dimensionless)
            .then_with(|| {
                let left_magnitude = left.1.value_num.abs();
                let right_magnitude = right.1.value_num.abs();
                left_magnitude
                    .partial_cmp(&right_magnitude)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
    })
}
fn select_period_id_for_end_date(
periods: &[PeriodOutput],
end_date: &str,
prefer_duration: bool,
) -> Option<String> {
periods
.iter()
.filter(|period| period.period_end.as_deref() == Some(end_date))
.max_by(|left, right| {
let left_score = if prefer_duration {
if left.period_start.is_some() { 1 } else { 0 }
} else if left.period_start.is_none() {
1
} else {
0
};
let right_score = if prefer_duration {
if right.period_start.is_some() { 1 } else { 0 }
} else if right.period_start.is_none() {
1
} else {
0
};
left_score
.cmp(&right_score)
.then_with(|| left.id.cmp(&right.id))
})
.map(|period| period.id.clone())
}
/// Returns the periods sorted chronologically by end date (falling back to the
/// filing date when the end date is absent), with period id as tie-break.
fn sort_periods(periods: &[PeriodOutput]) -> Vec<&PeriodOutput> {
    let mut ordered: Vec<&PeriodOutput> = periods.iter().collect();
    ordered.sort_by_key(|period| {
        let date = period
            .period_end
            .clone()
            .unwrap_or_else(|| period.filing_date.clone());
        (date, period.id.clone())
    });
    ordered
}
/// Assembles the final KPI row from precomputed per-period values and the
/// provenance gathered while matching facts; yields `None` when every period
/// value is absent.
fn build_kpi_output(
    definition: &KpiDefinition,
    order_index: i64,
    category: &str,
    values: BTreeMap<String, Option<f64>>,
    matched: PeriodFactValues,
) -> Option<KpiRowOutput> {
    let has_any_value = values.values().any(|value| value.is_some());
    if !has_any_value {
        return None;
    }
    let row = KpiRowOutput {
        key: definition.key.clone(),
        label: definition.label.clone(),
        category: category.to_string(),
        unit: definition.unit.clone(),
        // Rows are spaced by 10 to leave room for later insertions.
        order: (order_index + 1) * 10,
        segment: None,
        axis: None,
        member: None,
        values,
        source_concepts: unique_sorted_strings(matched.source_concepts.into_iter().collect()),
        source_fact_ids: unique_sorted_i64(matched.source_fact_ids.into_iter().collect()),
        provenance_type: "taxonomy".to_string(),
        has_dimensions: matched.has_dimensions,
    };
    Some(row)
}
/// Assembles a KPI row from values keyed by end date, remapping each end date
/// onto a known period id (duration vs. instant preference comes from the
/// matched facts). Yields `None` when no value survives the remapping.
fn build_date_aligned_kpi_output(
    definition: &KpiDefinition,
    order_index: i64,
    category: &str,
    periods: &[PeriodOutput],
    matched: DateFactValues,
) -> Option<KpiRowOutput> {
    let mut values = BTreeMap::<String, Option<f64>>::new();
    for (end_date, value) in &matched.values {
        // End dates with no corresponding period are dropped.
        let Some(period_id) = select_period_id_for_end_date(periods, end_date, matched.prefer_duration) else {
            continue;
        };
        values.insert(period_id, *value);
    }
    if values.values().all(|value| value.is_none()) {
        return None;
    }
    Some(KpiRowOutput {
        key: definition.key.clone(),
        label: definition.label.clone(),
        category: category.to_string(),
        unit: definition.unit.clone(),
        order: (order_index + 1) * 10,
        segment: None,
        axis: None,
        member: None,
        values,
        source_concepts: unique_sorted_strings(matched.source_concepts.into_iter().collect()),
        source_fact_ids: unique_sorted_i64(matched.source_fact_ids.into_iter().collect()),
        provenance_type: "taxonomy".to_string(),
        has_dimensions: matched.has_dimensions,
    })
}
/// Expands a prefixed qname (`prefix:LocalName`) into a `namespace#LocalName`
/// concept key. Known prefixes map to their taxonomy namespace URIs; unknown
/// prefixes become `urn:<prefix>`. Unprefixed input is returned unchanged.
fn concept_key_from_qname(qname: &str) -> String {
    let Some((prefix, local_name)) = qname.split_once(':') else {
        return qname.to_string();
    };
    let namespace_uri = if prefix.eq_ignore_ascii_case("us-gaap") {
        String::from("http://fasb.org/us-gaap/2024")
    } else if prefix.eq_ignore_ascii_case("ifrs-full") {
        String::from("http://xbrl.ifrs.org/taxonomy/2024-03-27/ifrs-full")
    } else {
        // Unrecognized prefixes get a synthetic urn namespace.
        format!("urn:{prefix}")
    };
    format!("{namespace_uri}#{local_name}")
}
/// Deduplicates and sorts the given strings.
///
/// The original round-tripped through a `HashSet` before sorting; sorting in
/// place and deduplicating adjacent equals produces the identical result with
/// no intermediate allocation.
fn unique_sorted_strings(mut values: Vec<String>) -> Vec<String> {
    values.sort_unstable();
    values.dedup();
    values
}
/// Deduplicates and sorts the given ids.
///
/// Same fix as `unique_sorted_strings`: the `HashSet` round-trip is replaced
/// by an in-place `sort_unstable` + `dedup`, which yields the same sorted,
/// duplicate-free output without the extra allocation.
fn unique_sorted_i64(mut values: Vec<i64>) -> Vec<i64> {
    values.sort_unstable();
    values.dedup();
    values
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pack_selector::FiscalPack;
    use crate::{FactOutput, PeriodOutput};
    // Minimal PeriodOutput fixture; `start: None` models an instant period.
    fn period(id: &str, end: &str, start: Option<&str>) -> PeriodOutput {
        PeriodOutput {
            id: id.to_string(),
            filing_id: 1,
            accession_number: "0000000000-00-000001".to_string(),
            filing_date: end.to_string(),
            period_start: start.map(|value| value.to_string()),
            period_end: Some(end.to_string()),
            filing_type: "10-Q".to_string(),
            period_label: id.to_string(),
        }
    }
    // Minimal dimensionless us-gaap fact fixture for the given local name.
    fn fact(local_name: &str, period_start: Option<&str>, period_end: &str, value: f64) -> FactOutput {
        FactOutput {
            concept_key: format!("http://fasb.org/us-gaap/2024#{local_name}"),
            qname: format!("us-gaap:{local_name}"),
            namespace_uri: "http://fasb.org/us-gaap/2024".to_string(),
            local_name: local_name.to_string(),
            data_type: None,
            statement_kind: Some("balance".to_string()),
            role_uri: Some("balance".to_string()),
            authoritative_concept_key: None,
            mapping_method: None,
            surface_key: None,
            detail_parent_surface_key: None,
            kpi_key: None,
            residual_flag: false,
            context_id: "c1".to_string(),
            unit: Some("iso4217:USD".to_string()),
            decimals: None,
            precision: None,
            nil: false,
            value_num: value,
            period_start: period_start.map(|value| value.to_string()),
            period_end: Some(period_end.to_string()),
            period_instant: None,
            dimensions: vec![],
            is_dimensionless: true,
        source_file: None,
        }
    }
    // Growth KPIs (loan/deposit) should appear with taxonomy provenance when
    // the bank pack has matching facts in consecutive periods.
    #[test]
    fn emits_taxonomy_growth_kpis_for_bank_pack() {
        let periods = vec![
            period("prev", "2024-12-31", None),
            period("curr", "2025-12-31", None),
        ];
        let facts = vec![
            fact("FinancingReceivableRecordedInvestment", None, "2024-12-31", 100.0),
            fact("FinancingReceivableRecordedInvestment", None, "2025-12-31", 120.0),
            fact("DepositsLiabilities", None, "2024-12-31", 200.0),
            fact("DepositsLiabilities", None, "2025-12-31", 250.0),
        ];
        let result = build_taxonomy_kpis(&periods, &facts, FiscalPack::BankLender)
            .expect("taxonomy kpis should build");
        assert!(result.rows.iter().all(|row| row.provenance_type == "taxonomy"));
        assert!(result.rows.iter().any(|row| row.key == "loan_growth"));
        assert!(result.rows.iter().any(|row| row.key == "deposit_growth"));
    }
    // NIM should resolve via the end-date alignment path and land on the
    // duration periods even when instant periods share the same end dates.
    #[test]
    fn emits_net_interest_margin_when_duration_and_instant_periods_share_end_date() {
        let periods = vec![
            period("dur-prev", "2024-12-31", Some("2024-01-01")),
            period("inst-prev", "2024-12-31", None),
            period("dur-curr", "2025-12-31", Some("2025-01-01")),
            period("inst-curr", "2025-12-31", None),
        ];
        let facts = vec![
            fact("InterestIncomeExpenseNet", Some("2024-01-01"), "2024-12-31", 90.0),
            fact("InterestIncomeExpenseNet", Some("2025-01-01"), "2025-12-31", 100.0),
            fact("Assets", None, "2024-12-31", 1000.0),
            fact("Assets", None, "2025-12-31", 1200.0),
        ];
        let result = build_taxonomy_kpis(&periods, &facts, FiscalPack::BankLender)
            .expect("taxonomy kpis should build");
        let net_interest_margin = result
            .rows
            .iter()
            .find(|row| row.key == "net_interest_margin")
            .expect("net interest margin should be present");
        assert_eq!(net_interest_margin.values.get("dur-prev").copied().flatten(), Some(0.09));
        assert_eq!(net_interest_margin.values.get("dur-curr").copied().flatten(), Some(100.0 / 1200.0));
    }
}

File diff suppressed because it is too large. Load Diff

View File

@@ -0,0 +1,90 @@
use crate::{FactOutput, FilingMetrics};
/// Derives headline filing metrics (revenue, net income, total assets, cash,
/// debt) from the raw fact list by matching well-known local names.
pub fn derive_metrics(facts: &[FactOutput]) -> FilingMetrics {
    // Picks the single "best" candidate fact's value: dimensionless beats
    // dimensioned, then the latest end/instant date wins, then the largest
    // absolute value (NaNs compare equal).
    fn pick_best(facts: &[&FactOutput]) -> Option<f64> {
        facts
            .iter()
            .max_by(|left, right| {
                let left_dimension_score = if left.is_dimensionless { 1 } else { 0 };
                let right_dimension_score = if right.is_dimensionless { 1 } else { 0 };
                left_dimension_score
                    .cmp(&right_dimension_score)
                    .then_with(|| {
                        // Date tie-break: end date, falling back to instant
                        // date, falling back to "" (lexicographic ISO dates).
                        let left_date = left
                            .period_end
                            .as_ref()
                            .or(left.period_instant.as_ref())
                            .cloned()
                            .unwrap_or_default();
                        let right_date = right
                            .period_end
                            .as_ref()
                            .or(right.period_instant.as_ref())
                            .cloned()
                            .unwrap_or_default();
                        left_date.cmp(&right_date)
                    })
                    .then_with(|| {
                        // Final tie-break: prefer the larger magnitude.
                        left.value_num
                            .abs()
                            .partial_cmp(&right.value_num.abs())
                            .unwrap_or(std::cmp::Ordering::Equal)
                    })
            })
            .map(|fact| fact.value_num)
    }
    // Case-insensitive filter of facts by concept local name.
    fn by_local_names<'a>(facts: &'a [FactOutput], names: &[&str]) -> Vec<&'a FactOutput> {
        let targets = names.iter().map(|name| name.to_ascii_lowercase()).collect::<Vec<_>>();
        facts
            .iter()
            .filter(|fact| targets.iter().any(|target| fact.local_name.eq_ignore_ascii_case(target)))
            .collect()
    }
    let revenue = pick_best(&by_local_names(
        facts,
        &[
            "Revenues",
            "SalesRevenueNet",
            "RevenueFromContractWithCustomerExcludingAssessedTax",
            "TotalRevenuesAndOtherIncome",
        ],
    ));
    let net_income = pick_best(&by_local_names(facts, &["NetIncomeLoss", "ProfitLoss"]));
    let total_assets = pick_best(&by_local_names(facts, &["Assets"]));
    let cash = pick_best(&by_local_names(
        facts,
        &[
            "CashAndCashEquivalentsAtCarryingValue",
            "CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalents",
        ],
    ));
    // Debt resolution: a directly reported total first; otherwise the sum of
    // current + long-term components, but only when BOTH components exist.
    let direct_debt = pick_best(&by_local_names(
        facts,
        &[
            "DebtAndFinanceLeaseLiabilities",
            "Debt",
            "LongTermDebtAndCapitalLeaseObligations",
        ],
    ));
    let current_debt = pick_best(&by_local_names(
        facts,
        &["DebtCurrent", "ShortTermBorrowings", "LongTermDebtCurrent"],
    ));
    let long_term_debt = pick_best(&by_local_names(
        facts,
        &["LongTermDebtNoncurrent", "LongTermDebt", "DebtNoncurrent"],
    ));
    FilingMetrics {
        revenue,
        net_income,
        total_assets,
        cash,
        debt: direct_debt.or_else(|| match (current_debt, long_term_debt) {
            (Some(left), Some(right)) => Some(left + right),
            _ => None,
        }),
    }
}

View File

@@ -0,0 +1,361 @@
use std::collections::HashSet;
use crate::{FactOutput, StatementRowMap};
/// Industry-specific reporting pack selected for a filing. `Core` is the
/// neutral default used when no industry signature is confidently detected.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FiscalPack {
    Core,
    BankLender,
    Insurance,
    ReitRealEstate,
    BrokerAssetManager,
}
impl FiscalPack {
    /// Stable snake_case identifier for serialization and logging.
    pub fn as_str(&self) -> &'static str {
        match self {
            FiscalPack::Core => "core",
            FiscalPack::BankLender => "bank_lender",
            FiscalPack::Insurance => "insurance",
            FiscalPack::ReitRealEstate => "reit_real_estate",
            FiscalPack::BrokerAssetManager => "broker_asset_manager",
        }
    }
}
/// Result of pack selection: the chosen pack plus any selection warnings
/// (e.g. a low-confidence fallback to `Core`).
#[derive(Debug, Clone)]
pub struct PackSelection {
    pub pack: FiscalPack,
    pub warnings: Vec<String>,
}
/// Scores each industry pack against the filing's concept names and role URIs
/// and returns the winner, falling back to `Core` unless the top score is at
/// least 10 and leads the runner-up by at least 4. A warning is emitted when
/// a nonzero score was still not confident enough.
pub fn select_fiscal_pack(statement_rows: &StatementRowMap, facts: &[FactOutput]) -> PackSelection {
    let concept_names = collect_concept_names(statement_rows, facts);
    let role_tokens = collect_role_tokens(statement_rows, facts);
    // Stable descending sort: ties keep declaration order (bank first).
    let mut ranked = vec![
        (FiscalPack::BankLender, score_bank_lender(&concept_names, &role_tokens)),
        (FiscalPack::Insurance, score_insurance(&concept_names, &role_tokens)),
        (FiscalPack::ReitRealEstate, score_reit_real_estate(&concept_names, &role_tokens)),
        (FiscalPack::BrokerAssetManager, score_broker_asset_manager(&concept_names, &role_tokens)),
    ];
    ranked.sort_by(|left, right| right.1.cmp(&left.1));
    let (best_pack, best_score) = ranked[0];
    let runner_up_score = ranked.get(1).map_or(0, |(_, score)| *score);
    // Confidence gate: absolute threshold plus a margin over second place.
    let confident = best_score >= 10 && best_score - runner_up_score >= 4;
    let pack = if confident { best_pack } else { FiscalPack::Core };
    let mut warnings = Vec::new();
    if pack == FiscalPack::Core && best_score > 0 {
        warnings.push("fiscal_pack_defaulted_to_core".to_string());
    }
    PackSelection { pack, warnings }
}
/// Gathers every local name and qname (lowercased) seen in the statement rows
/// and the raw facts into one case-normalized lookup set.
fn collect_concept_names(statement_rows: &StatementRowMap, facts: &[FactOutput]) -> HashSet<String> {
    let row_names = statement_rows
        .values()
        .flatten()
        .flat_map(|row| [row.local_name.to_ascii_lowercase(), row.qname.to_ascii_lowercase()]);
    let fact_names = facts
        .iter()
        .flat_map(|fact| [fact.local_name.to_ascii_lowercase(), fact.qname.to_ascii_lowercase()]);
    row_names.chain(fact_names).collect()
}
/// Gathers every role URI (lowercased) present on statement rows and facts.
fn collect_role_tokens(statement_rows: &StatementRowMap, facts: &[FactOutput]) -> HashSet<String> {
    let row_roles = statement_rows
        .values()
        .flatten()
        .filter_map(|row| row.role_uri.as_ref());
    let fact_roles = facts.iter().filter_map(|fact| fact.role_uri.as_ref());
    row_roles
        .chain(fact_roles)
        .map(|role_uri| role_uri.to_ascii_lowercase())
        .collect()
}
/// Scores bank/lender signatures: deposits and loan receivables are the
/// strongest signals (8 each), credit-loss/interest concepts are secondary
/// (4), and loan/deposit role URIs add a small boost (2).
fn score_bank_lender(concepts: &HashSet<String>, roles: &HashSet<String>) -> i64 {
    weighted_match(
        concepts,
        &[
            "depositsliabilities",
            "us-gaap:depositsliabilities",
            "deposits",
        ],
        8,
    ) + weighted_match(
        concepts,
        &[
            "financingreceivablerecordedinvestment",
            "us-gaap:financingreceivablerecordedinvestment",
            "loansreceivablenetreportedamount",
            "us-gaap:loansreceivablenetreportedamount",
        ],
        8,
    ) + weighted_match(
        concepts,
        &[
            "allowanceforcreditlosses",
            "allowanceforloanlosses",
            "provisionforcreditlosses",
            "loanlossprovision",
            "netinterestincome",
            "interestexpense",
            "interestanddividendincomeoperating",
        ],
        4,
    ) + weighted_role_match(roles, &["deposit", "loan", "credit", "netinterest"], 2)
}
/// Scores insurance signatures: premium concepts are the strongest signal (8),
/// policy-benefit/claim/underwriting concepts are secondary (6), and
/// insurance-flavored role URIs add a small boost (2).
///
/// Fix: the secondary candidate list contained
/// `"liabilityforfuturepolicybenefits"` twice; the duplicate is removed
/// (behavior is unchanged since `weighted_match` uses `any`).
fn score_insurance(concepts: &HashSet<String>, roles: &HashSet<String>) -> i64 {
    let mut score = 0;
    score += weighted_match(
        concepts,
        &[
            "premiums",
            "premiumswritten",
            "premiumsearned",
            "premiumswrittennet",
            "premiumsearnednet",
            "us-gaap:premiums",
        ],
        8,
    );
    score += weighted_match(
        concepts,
        &[
            "policyholderbenefitsandclaimsincurrednet",
            "futurepolicybenefits",
            "liabilityforfuturepolicybenefits",
            "liabilityforunpaidlossesandlossadjustmentexpenses",
            "liabilityforunpaidclaimsandclaimsadjustmentexpense",
            "deferredpolicyacquisitioncosts",
            "deferredpolicyacquisitioncostsamortizationexpense",
            "netinvestmentincome",
            "underwritingincomeloss",
            "unearnedpremiumsnet",
        ],
        6,
    );
    score += weighted_role_match(roles, &["insurance", "premium", "policy", "claims"], 2);
    score
}
/// Scores REIT/real-estate signatures: investment-property and lease-income
/// concepts are primary (8), property-count/depreciation concepts secondary
/// (6), real-estate role URIs add a small boost (2).
fn score_reit_real_estate(concepts: &HashSet<String>, roles: &HashSet<String>) -> i64 {
    weighted_match(
        concepts,
        &[
            "leaseincome",
            "realestateinvestmentpropertynet",
            "realestategrossatcarryingvalue",
            "realestateinvestmentpropertyatcost",
        ],
        8,
    ) + weighted_match(
        concepts,
        &[
            "numberofrealestateproperties",
            "directcostsofleasedandrentedpropertyorequipment",
            "depreciationdepletionandamortization",
            "realestateaccumulateddepreciation",
            "paymentstoacquirecommercialrealestate",
        ],
        6,
    ) + weighted_role_match(roles, &["realestate", "property", "lease", "rental"], 2)
}
/// Scores broker / asset-manager signatures: AUM concepts are primary (8),
/// fee-revenue and management-contract concepts secondary (6 each), and
/// advisory/management role URIs add a small boost (2).
fn score_broker_asset_manager(concepts: &HashSet<String>, roles: &HashSet<String>) -> i64 {
    weighted_match(
        concepts,
        &[
            "assetsundermanagementcarryingamount",
            "feepayingassetundermanagement",
        ],
        8,
    ) + weighted_match(
        concepts,
        &[
            "performancefeerevenuerecognized",
            "subadvisoryandother",
            "sponsorfees",
        ],
        6,
    ) + weighted_match(
        concepts,
        &[
            "totalsalesassetandaccountexpense",
            "estimatedannualfixedminimumfeesforcurrentlyoutstandingcontracts",
            "reductioninthevalueofmanagementcontract",
        ],
        6,
    ) + weighted_role_match(roles, &["advis", "management", "asset", "distribution"], 2)
}
/// Returns `weight` if any candidate (lowercased) is present in `concepts`,
/// otherwise 0.
fn weighted_match(concepts: &HashSet<String>, candidates: &[&str], weight: i64) -> i64 {
    let hit = candidates
        .iter()
        .any(|candidate| concepts.contains(&candidate.to_ascii_lowercase()));
    if hit { weight } else { 0 }
}
/// Returns `weight` if any role URI contains any candidate substring,
/// otherwise 0.
fn weighted_role_match(roles: &HashSet<String>, candidates: &[&str], weight: i64) -> i64 {
    for role in roles {
        if candidates.iter().any(|candidate| role.contains(candidate)) {
            return weight;
        }
    }
    0
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{StatementRowOutput, StatementRowMap};
    use std::collections::BTreeMap;
    // Minimal dimensionless statement-row fixture for one us-gaap concept.
    fn row(local_name: &str, statement: &str) -> StatementRowOutput {
        StatementRowOutput {
            key: local_name.to_string(),
            label: local_name.to_string(),
            concept_key: format!("http://fasb.org/us-gaap/2024#{local_name}"),
            qname: format!("us-gaap:{local_name}"),
            namespace_uri: "http://fasb.org/us-gaap/2024".to_string(),
            local_name: local_name.to_string(),
            is_extension: false,
            statement: statement.to_string(),
            role_uri: Some(statement.to_string()),
            order: 1,
            depth: 0,
            parent_key: None,
            values: BTreeMap::from([("p1".to_string(), Some(1.0))]),
            units: BTreeMap::from([("p1".to_string(), Some("iso4217:USD".to_string()))]),
            has_dimensions: false,
            source_fact_ids: vec![1],
        }
    }
    // Statement map with all five standard statement buckets, each empty.
    fn empty_map() -> StatementRowMap {
        BTreeMap::from([
            ("income".to_string(), Vec::new()),
            ("balance".to_string(), Vec::new()),
            ("cash_flow".to_string(), Vec::new()),
            ("equity".to_string(), Vec::new()),
            ("comprehensive_income".to_string(), Vec::new()),
        ])
    }
    #[test]
    fn chooses_bank_lender_from_loan_and_deposit_signatures() {
        let mut rows = empty_map();
        rows.get_mut("balance").unwrap().extend([
            row("DepositsLiabilities", "balance"),
            row("FinancingReceivableRecordedInvestment", "balance"),
            row("AllowanceForCreditLosses", "balance"),
        ]);
        let selection = select_fiscal_pack(&rows, &[]);
        assert_eq!(selection.pack, FiscalPack::BankLender);
        assert!(selection.warnings.is_empty());
    }
    #[test]
    fn chooses_insurance_from_premium_and_claim_signatures() {
        let mut rows = empty_map();
        rows.get_mut("income").unwrap().extend([
            row("Premiums", "income"),
            row("PolicyholderBenefitsAndClaimsIncurredNet", "income"),
        ]);
        rows.get_mut("balance").unwrap().push(row("FuturePolicyBenefits", "balance"));
        let selection = select_fiscal_pack(&rows, &[]);
        assert_eq!(selection.pack, FiscalPack::Insurance);
        assert!(selection.warnings.is_empty());
    }
    // A single weak signal must not clear the confidence gate; the selector
    // falls back to Core and flags the low-confidence default.
    #[test]
    fn defaults_to_core_on_low_confidence() {
        let mut rows = empty_map();
        rows.get_mut("income").unwrap().push(row("InterestExpense", "income"));
        let selection = select_fiscal_pack(&rows, &[]);
        assert_eq!(selection.pack, FiscalPack::Core);
        assert_eq!(selection.warnings, vec!["fiscal_pack_defaulted_to_core"]);
    }
    #[test]
    fn chooses_reit_from_property_and_lease_signatures() {
        let mut rows = empty_map();
        rows.get_mut("income").unwrap().push(row("LeaseIncome", "income"));
        rows.get_mut("balance").unwrap().push(row("RealEstateInvestmentPropertyNet", "balance"));
        rows.get_mut("balance").unwrap().push(row("NumberOfRealEstateProperties", "balance"));
        let selection = select_fiscal_pack(&rows, &[]);
        assert_eq!(selection.pack, FiscalPack::ReitRealEstate);
    }
    #[test]
    fn chooses_broker_asset_manager_from_aum_and_fee_signatures() {
        let mut rows = empty_map();
        rows.get_mut("income").unwrap().push(row("PerformanceFeeRevenueRecognized", "income"));
        rows.get_mut("balance").unwrap().push(row("AssetsUnderManagementCarryingAmount", "balance"));
        rows.get_mut("balance").unwrap().push(row("FeePayingAssetUnderManagement", "balance"));
        let selection = select_fiscal_pack(&rows, &[]);
        assert_eq!(selection.pack, FiscalPack::BrokerAssetManager);
    }
}

View File

@@ -0,0 +1,667 @@
use anyhow::Result;
use std::collections::{BTreeMap, HashMap, HashSet};
use crate::pack_selector::FiscalPack;
use crate::taxonomy_loader::{load_crosswalk, load_surface_pack, CrosswalkFile, SurfaceDefinition};
use crate::{
ConceptOutput, DetailRowOutput, DetailRowStatementMap, FactOutput, NormalizationSummaryOutput,
PeriodOutput, StatementRowMap, StatementRowOutput, SurfaceRowMap, SurfaceRowOutput,
};
/// How a source concept was mapped into the compact surface model.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MappingMethod {
    AuthoritativeDirect,
    DirectSourceConcept,
    AggregateChildren,
    TaxonomyKpi,
    UnmappedResidual,
}
impl MappingMethod {
    /// Stable snake_case identifier for serialization and logging.
    pub fn as_str(&self) -> &'static str {
        match self {
            MappingMethod::AuthoritativeDirect => "authoritative_direct",
            MappingMethod::DirectSourceConcept => "direct_source_concept",
            MappingMethod::AggregateChildren => "aggregate_children",
            MappingMethod::TaxonomyKpi => "taxonomy_kpi",
            MappingMethod::UnmappedResidual => "unmapped_residual",
        }
    }
}
/// Mapping outcome for one source concept: which authoritative concept,
/// surface/detail row, or KPI it was attached to, and by what method.
#[derive(Debug, Clone, Default)]
pub struct MappingAssignment {
    pub authoritative_concept_key: Option<String>,
    pub mapping_method: Option<MappingMethod>,
    pub surface_key: Option<String>,
    pub detail_parent_surface_key: Option<String>,
    pub kpi_key: Option<String>,
    // True when the concept could not be mapped and remains residual.
    pub residual_flag: bool,
}
/// The assembled compact surface model: surface rows, per-statement detail
/// rows, a normalization summary, and the per-concept mapping assignments.
#[derive(Debug, Default)]
pub struct CompactSurfaceModel {
    pub surface_rows: SurfaceRowMap,
    pub detail_rows: DetailRowStatementMap,
    pub normalization_summary: NormalizationSummaryOutput,
    pub concept_mappings: HashMap<String, MappingAssignment>,
}
/// Whether a matched statement row feeds a surface row directly or acts as a
/// detail component beneath one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MatchRole {
    Surface,
    Detail,
}
/// A statement row paired with how it matched a surface definition.
#[derive(Debug, Clone)]
struct MatchedStatementRow<'a> {
    row: &'a StatementRowOutput,
    // See `match_statement_row`: crosswalk target, or own qname for non-extensions.
    authoritative_concept_key: Option<String>,
    mapping_method: MappingMethod,
    match_role: MatchRole,
    // 0 = authoritative match, 1 = source-concept match; lower wins in pick_best_match.
    rank: i64,
}
/// Build the per-statement compact surface model.
///
/// For each statement, every surface definition in the fiscal pack claims
/// matching statement rows (a single best direct match, otherwise aggregated
/// child components when the definition's rollup policy allows), producing one
/// surface row plus optional grouped detail rows. Rows that stay unclaimed but
/// carry values are collected under the "unmapped" residual bucket. Mapping
/// assignments and roll-up counters are recorded along the way; `warnings` is
/// passed through into the normalization summary unchanged.
pub fn build_compact_surface_model(
    periods: &[PeriodOutput],
    statement_rows: &StatementRowMap,
    taxonomy_regime: &str,
    fiscal_pack: FiscalPack,
    warnings: Vec<String>,
) -> Result<CompactSurfaceModel> {
    let pack = load_surface_pack(fiscal_pack)?;
    let crosswalk = load_crosswalk(taxonomy_regime)?;
    let mut surface_rows = empty_surface_row_map();
    let mut detail_rows = empty_detail_row_map();
    let mut concept_mappings = HashMap::<String, MappingAssignment>::new();
    // Counters feeding NormalizationSummaryOutput.
    let mut surface_row_count = 0usize;
    let mut detail_row_count = 0usize;
    let mut unmapped_row_count = 0usize;
    let mut material_unmapped_row_count = 0usize;
    for statement in statement_keys() {
        let rows = statement_rows
            .get(statement)
            .cloned()
            .unwrap_or_default();
        // Definitions scoped to this statement, kept in pack order.
        let statement_definitions = pack
            .surfaces
            .iter()
            .filter(|definition| definition.statement == statement)
            .collect::<Vec<_>>();
        // Rows claimed by an earlier definition are excluded from later matching.
        let mut used_row_keys = HashSet::<String>::new();
        let mut statement_surface_rows = Vec::<SurfaceRowOutput>::new();
        let mut statement_detail_rows = BTreeMap::<String, Vec<DetailRowOutput>>::new();
        for definition in statement_definitions {
            let matches = rows
                .iter()
                .filter(|row| !used_row_keys.contains(&row.key))
                .filter_map(|row| match_statement_row(row, definition, crosswalk.as_ref()))
                .collect::<Vec<_>>();
            if matches.is_empty() {
                continue;
            }
            // Split by role: direct surface candidates vs child components.
            let direct_surface_matches = matches
                .iter()
                .filter(|matched| matched.match_role == MatchRole::Surface)
                .cloned()
                .collect::<Vec<_>>();
            let detail_component_matches = matches
                .iter()
                .filter(|matched| matched.match_role == MatchRole::Detail)
                .cloned()
                .collect::<Vec<_>>();
            // Prefer a single best direct row; otherwise fall back to summing
            // components when the definition aggregates its children.
            let mut surface_source_matches = if !direct_surface_matches.is_empty() {
                vec![pick_best_match(&direct_surface_matches).clone()]
            } else if definition.rollup_policy == "aggregate_children" {
                detail_component_matches.clone()
            } else {
                Vec::new()
            };
            if surface_source_matches.is_empty() {
                continue;
            }
            let detail_matches = if definition.detail_grouping_policy == "group_all_children" {
                if detail_component_matches.is_empty() && definition.rollup_policy == "aggregate_children" {
                    Vec::new()
                } else {
                    detail_component_matches.clone()
                }
            } else {
                Vec::new()
            };
            // When the surface value came only from aggregation, re-label the
            // sources so provenance shows the roll-up.
            if definition.rollup_policy == "aggregate_children"
                && direct_surface_matches.is_empty()
                && !surface_source_matches.is_empty()
            {
                for matched in &mut surface_source_matches {
                    matched.mapping_method = MappingMethod::AggregateChildren;
                }
            }
            let values = build_surface_values(periods, &surface_source_matches);
            if !has_any_value(&values) {
                continue;
            }
            // Per-period source row key; resolvable only for single-source surfaces.
            let resolved_source_row_keys = periods
                .iter()
                .map(|period| {
                    let resolved = if surface_source_matches.len() == 1 {
                        surface_source_matches
                            .first()
                            .and_then(|matched| matched.row.values.get(&period.id).copied().flatten().map(|_| matched.row.key.clone()))
                    } else {
                        None
                    };
                    (period.id.clone(), resolved)
                })
                .collect::<BTreeMap<_, _>>();
            let source_concepts = unique_sorted_strings(
                surface_source_matches
                    .iter()
                    .map(|matched| matched.row.qname.clone())
                    .collect::<Vec<_>>(),
            );
            let source_row_keys = unique_sorted_strings(
                surface_source_matches
                    .iter()
                    .map(|matched| matched.row.key.clone())
                    .collect::<Vec<_>>(),
            );
            let source_fact_ids = unique_sorted_i64(
                surface_source_matches
                    .iter()
                    .flat_map(|matched| matched.row.source_fact_ids.clone())
                    .collect::<Vec<_>>(),
            );
            let has_dimensions = surface_source_matches.iter().any(|matched| matched.row.has_dimensions);
            // Claim each source row and record its mapping assignment.
            for matched in &surface_source_matches {
                used_row_keys.insert(matched.row.key.clone());
                concept_mappings.insert(
                    matched.row.concept_key.clone(),
                    MappingAssignment {
                        authoritative_concept_key: matched.authoritative_concept_key.clone(),
                        mapping_method: Some(matched.mapping_method),
                        surface_key: Some(definition.surface_key.clone()),
                        detail_parent_surface_key: None,
                        kpi_key: None,
                        residual_flag: false,
                    },
                );
            }
            // Detail rows also claim their source rows and group under this surface.
            let details = detail_matches
                .iter()
                .map(|matched| {
                    used_row_keys.insert(matched.row.key.clone());
                    concept_mappings.insert(
                        matched.row.concept_key.clone(),
                        MappingAssignment {
                            authoritative_concept_key: matched.authoritative_concept_key.clone(),
                            mapping_method: Some(matched.mapping_method),
                            surface_key: Some(definition.surface_key.clone()),
                            detail_parent_surface_key: Some(definition.surface_key.clone()),
                            kpi_key: None,
                            residual_flag: false,
                        },
                    );
                    build_detail_row(matched.row, &definition.surface_key, false)
                })
                .collect::<Vec<_>>();
            if !details.is_empty() {
                detail_row_count += details.len();
                statement_detail_rows.insert(definition.surface_key.clone(), details);
            }
            statement_surface_rows.push(SurfaceRowOutput {
                key: definition.surface_key.clone(),
                label: definition.label.clone(),
                category: definition.category.clone(),
                template_section: definition.category.clone(),
                order: definition.order,
                unit: definition.unit.clone(),
                values,
                source_concepts,
                source_row_keys,
                source_fact_ids,
                // The surface's own key doubles as the formula key when a fallback exists.
                formula_key: definition.formula_fallback.as_ref().map(|_| definition.surface_key.clone()),
                has_dimensions,
                resolved_source_row_keys,
                statement: Some(definition.statement.clone()),
                // Details were inserted just above, so this lookup sees them.
                detail_count: statement_detail_rows
                    .get(&definition.surface_key)
                    .map(|rows| rows.len() as i64),
                resolution_method: None,
                confidence: None,
                warning_codes: vec![],
            });
            surface_row_count += 1;
            // materiality_policy is not consulted here; touched to silence dead-code lints.
            let _ = &definition.materiality_policy;
        }
        statement_surface_rows.sort_by(|left, right| left.order.cmp(&right.order).then_with(|| left.label.cmp(&right.label)));
        // Materiality threshold is anchored on the statement's headline surface row.
        let baseline = baseline_for_statement(statement, &statement_surface_rows);
        let threshold = materiality_threshold(statement, baseline);
        // Everything valued but unclaimed becomes a residual "unmapped" detail row.
        let residual_rows = rows
            .iter()
            .filter(|row| !used_row_keys.contains(&row.key))
            .filter(|row| has_any_value(&row.values))
            .map(|row| {
                concept_mappings.insert(
                    row.concept_key.clone(),
                    MappingAssignment {
                        authoritative_concept_key: None,
                        mapping_method: Some(MappingMethod::UnmappedResidual),
                        surface_key: None,
                        detail_parent_surface_key: Some("unmapped".to_string()),
                        kpi_key: None,
                        residual_flag: true,
                    },
                );
                build_detail_row(row, "unmapped", true)
            })
            .collect::<Vec<_>>();
        if !residual_rows.is_empty() {
            unmapped_row_count += residual_rows.len();
            material_unmapped_row_count += residual_rows
                .iter()
                .filter(|row| max_abs_value(&row.values) >= threshold)
                .count();
            detail_row_count += residual_rows.len();
            statement_detail_rows.insert("unmapped".to_string(), residual_rows);
        }
        surface_rows.insert(statement.to_string(), statement_surface_rows);
        detail_rows.insert(statement.to_string(), statement_detail_rows);
    }
    Ok(CompactSurfaceModel {
        surface_rows,
        detail_rows,
        normalization_summary: NormalizationSummaryOutput {
            surface_row_count,
            detail_row_count,
            // KPI rows are produced by a separate pipeline; always 0 here.
            kpi_row_count: 0,
            unmapped_row_count,
            material_unmapped_row_count,
            warnings,
        },
        concept_mappings,
    })
}
/// Merge secondary mapping assignments (e.g. from the KPI pipeline) into the
/// primary map.
///
/// Already-populated fields win; an absent or `UnmappedResidual` method may be
/// replaced by the incoming one, and a concept stays residual only when both
/// sides flag it residual.
pub fn merge_mapping_assignments(
    primary: &mut HashMap<String, MappingAssignment>,
    secondary: HashMap<String, MappingAssignment>,
) {
    for (concept_key, incoming) in secondary {
        let merged = primary.entry(concept_key).or_default();
        if merged.authoritative_concept_key.is_none() {
            merged.authoritative_concept_key = incoming.authoritative_concept_key;
        }
        // A residual classification is the weakest method and may be upgraded.
        if matches!(merged.mapping_method, None | Some(MappingMethod::UnmappedResidual)) {
            merged.mapping_method = incoming.mapping_method;
        }
        if merged.surface_key.is_none() {
            merged.surface_key = incoming.surface_key;
        }
        if merged.detail_parent_surface_key.is_none() {
            merged.detail_parent_surface_key = incoming.detail_parent_surface_key;
        }
        if merged.kpi_key.is_none() {
            merged.kpi_key = incoming.kpi_key;
        }
        merged.residual_flag &= incoming.residual_flag;
    }
}
/// Copy resolved mapping metadata onto concept and fact outputs in place.
///
/// Entries whose concept key has no recorded assignment are left untouched.
pub fn apply_mapping_assignments(
    concepts: &mut [ConceptOutput],
    facts: &mut [FactOutput],
    mappings: &HashMap<String, MappingAssignment>,
) {
    for concept in concepts.iter_mut() {
        let Some(mapping) = mappings.get(&concept.concept_key) else {
            continue;
        };
        concept.authoritative_concept_key = mapping.authoritative_concept_key.clone();
        concept.mapping_method = mapping.mapping_method.map(|method| method.as_str().to_string());
        concept.surface_key = mapping.surface_key.clone();
        concept.detail_parent_surface_key = mapping.detail_parent_surface_key.clone();
        concept.kpi_key = mapping.kpi_key.clone();
        concept.residual_flag = mapping.residual_flag;
    }
    for fact in facts.iter_mut() {
        let Some(mapping) = mappings.get(&fact.concept_key) else {
            continue;
        };
        fact.authoritative_concept_key = mapping.authoritative_concept_key.clone();
        fact.mapping_method = mapping.mapping_method.map(|method| method.as_str().to_string());
        fact.surface_key = mapping.surface_key.clone();
        fact.detail_parent_surface_key = mapping.detail_parent_surface_key.clone();
        fact.kpi_key = mapping.kpi_key.clone();
        fact.residual_flag = mapping.residual_flag;
    }
}
/// Try to match one statement row against a surface definition.
///
/// Authoritative matches (crosswalk target or allowed authoritative concept)
/// win with rank 0 and always act as direct surface rows; allowed
/// source-concept matches get rank 1 and become detail components when the
/// definition aggregates its children.
fn match_statement_row<'a>(
    row: &'a StatementRowOutput,
    definition: &SurfaceDefinition,
    crosswalk: Option<&CrosswalkFile>,
) -> Option<MatchedStatementRow<'a>> {
    // Crosswalk entry for this row's qname, when a crosswalk was loaded.
    let crosswalk_entry = crosswalk.and_then(|file| file.mappings.get(&row.qname));
    // Canonical concept: the crosswalk wins; otherwise a non-extension qname
    // stands as its own authority.
    let authoritative_concept_key = match crosswalk_entry {
        Some(entry) => Some(entry.authoritative_concept_key.clone()),
        None if !row.is_extension => Some(row.qname.clone()),
        None => None,
    };
    let concept_is_allowed = authoritative_concept_key
        .as_ref()
        .map_or(false, |concept| {
            definition
                .allowed_authoritative_concepts
                .iter()
                .any(|candidate| candidate_matches(candidate, concept))
        });
    let crosswalk_targets_surface = crosswalk_entry
        .map_or(false, |entry| entry.surface_key == definition.surface_key);
    if concept_is_allowed || crosswalk_targets_surface {
        return Some(MatchedStatementRow {
            row,
            authoritative_concept_key,
            mapping_method: MappingMethod::AuthoritativeDirect,
            match_role: MatchRole::Surface,
            rank: 0,
        });
    }
    // Fall back to the raw source-concept allow list (full qname or bare local name).
    let source_allowed = definition.allowed_source_concepts.iter().any(|candidate| {
        candidate_matches(candidate, &row.qname) || candidate_matches(candidate, &row.local_name)
    });
    if !source_allowed {
        return None;
    }
    // Under aggregate_children the row is only a component, not a direct surface.
    let match_role = if definition.rollup_policy == "aggregate_children" {
        MatchRole::Detail
    } else {
        MatchRole::Surface
    };
    Some(MatchedStatementRow {
        row,
        authoritative_concept_key,
        mapping_method: MappingMethod::DirectSourceConcept,
        match_role,
        rank: 1,
    })
}
/// Select the single best row among candidate matches.
///
/// Ordering, earlier criteria winning:
/// 1. lower `rank` (authoritative before source-concept matches),
/// 2. dimensionless rows before dimensional ones,
/// 3. lower presentation `order`,
/// 4. larger maximum absolute value,
/// 5. label as a stable tiebreak.
///
/// # Panics
/// Panics if `matches` is empty.
fn pick_best_match<'a>(matches: &'a [MatchedStatementRow<'a>]) -> &'a MatchedStatementRow<'a> {
    matches
        .iter()
        .min_by(|left, right| {
            left.rank
                .cmp(&right.rank)
                .then_with(|| {
                    // Rows without dimensional qualifiers rank ahead (0 < 1).
                    let left_dimension_rank = if left.row.has_dimensions { 1 } else { 0 };
                    let right_dimension_rank = if right.row.has_dimensions { 1 } else { 0 };
                    left_dimension_rank.cmp(&right_dimension_rank)
                })
                .then_with(|| left.row.order.cmp(&right.row.order))
                .then_with(|| {
                    // Operands swapped on purpose: larger magnitude compares
                    // as "less", so min_by prefers it. NaN comparisons fall
                    // back to Equal.
                    max_abs_value(&right.row.values)
                        .partial_cmp(&max_abs_value(&left.row.values))
                        .unwrap_or(std::cmp::Ordering::Equal)
                })
                .then_with(|| left.row.label.cmp(&right.row.label))
        })
        .expect("pick_best_match requires at least one match")
}
/// Resolve one value per period for a surface row.
///
/// A single source match passes its per-period value through untouched; with
/// zero or several matches the period value is the null-aware sum of all
/// components (see `sum_nullable_values`).
fn build_surface_values(
    periods: &[PeriodOutput],
    matches: &[MatchedStatementRow<'_>],
) -> BTreeMap<String, Option<f64>> {
    let mut values = BTreeMap::new();
    for period in periods {
        let resolved = match matches {
            // Exactly one source row: forward its value (possibly None) as-is.
            [single] => single.row.values.get(&period.id).copied().flatten(),
            // Otherwise sum components, keeping None only when all are None.
            _ => sum_nullable_values(
                matches
                    .iter()
                    .map(|matched| matched.row.values.get(&period.id).copied().flatten())
                    .collect::<Vec<_>>(),
            ),
        };
        values.insert(period.id.clone(), resolved);
    }
    values
}
/// Null-aware sum: `None` only when every input (or an empty input list) is
/// `None`; otherwise missing entries contribute 0.0 to the total.
fn sum_nullable_values(values: Vec<Option<f64>>) -> Option<f64> {
    let has_concrete = values.iter().any(Option::is_some);
    has_concrete.then(|| values.into_iter().flatten().sum())
}
/// Project a statement row into a detail row grouped under `parent_surface_key`.
fn build_detail_row(
    row: &StatementRowOutput,
    parent_surface_key: &str,
    residual_flag: bool,
) -> DetailRowOutput {
    // First non-null unit across periods (period-key order), if any.
    let unit = row.units.values().find_map(Clone::clone);
    // Dimensional rows carry a marker so downstream consumers can filter them.
    let dimensions_summary = if row.has_dimensions {
        vec!["has_dimensions".to_string()]
    } else {
        Vec::new()
    };
    DetailRowOutput {
        key: row.key.clone(),
        parent_surface_key: parent_surface_key.to_string(),
        label: row.label.clone(),
        concept_key: row.concept_key.clone(),
        qname: row.qname.clone(),
        namespace_uri: row.namespace_uri.clone(),
        local_name: row.local_name.clone(),
        unit,
        values: row.values.clone(),
        source_fact_ids: row.source_fact_ids.clone(),
        is_extension: row.is_extension,
        dimensions_summary,
        residual_flag,
    }
}
/// True when at least one period carries a concrete (non-null) value.
fn has_any_value(values: &BTreeMap<String, Option<f64>>) -> bool {
    values.values().flatten().next().is_some()
}
/// Largest absolute value across all periods; nulls are ignored, and an
/// empty or all-null map yields 0.0.
fn max_abs_value(values: &BTreeMap<String, Option<f64>>) -> f64 {
    values
        .values()
        .filter_map(Option::as_ref)
        .fold(0.0_f64, |largest, value| largest.max(value.abs()))
}
/// Materiality baseline for a statement: the peak magnitude of its anchor
/// surface row (total assets for the balance sheet, revenue otherwise), or
/// 0.0 when the anchor row is absent.
fn baseline_for_statement(statement: &str, surface_rows: &[SurfaceRowOutput]) -> f64 {
    let anchor_key = match statement {
        "balance" => "total_assets",
        _ => "revenue",
    };
    surface_rows
        .iter()
        .find_map(|row| (row.key == anchor_key).then(|| max_abs_value(&row.values)))
        .unwrap_or(0.0)
}
/// Absolute materiality threshold for residual rows in a statement.
///
/// Balance sheets use 0.5% of the baseline with a 5M floor; every other
/// statement uses 1% with a 1M floor.
fn materiality_threshold(statement: &str, baseline: f64) -> f64 {
    let (rate, floor) = if statement == "balance" {
        (0.005, 5_000_000.0)
    } else {
        (0.01, 1_000_000.0)
    };
    (baseline * rate).max(floor)
}
/// Deduplicate and sort string values ascending.
fn unique_sorted_strings(mut values: Vec<String>) -> Vec<String> {
    values.sort_unstable();
    values.dedup();
    values
}
/// Deduplicate and sort i64 values ascending.
fn unique_sorted_i64(mut values: Vec<i64>) -> Vec<i64> {
    values.sort_unstable();
    values.dedup();
    values
}
/// Case-insensitive concept comparison that also accepts a match between one
/// side's bare local name (the part after the last ':') and the other side's
/// full string.
///
/// Note: when both sides carry prefixes, only the full-string comparison
/// applies — local names are never compared against each other.
fn candidate_matches(candidate: &str, actual: &str) -> bool {
    if candidate.eq_ignore_ascii_case(actual) {
        return true;
    }
    if let Some((_, local_name)) = candidate.rsplit_once(':') {
        if local_name.eq_ignore_ascii_case(actual) {
            return true;
        }
    }
    match actual.rsplit_once(':') {
        Some((_, local_name)) => local_name.eq_ignore_ascii_case(candidate),
        None => false,
    }
}
/// The five statement keys every surface/detail map is built around, in
/// canonical processing order.
fn statement_keys() -> [&'static str; 5] {
    ["income", "balance", "cash_flow", "equity", "comprehensive_income"]
}
/// Surface-row map pre-seeded with an empty vector for every statement key.
fn empty_surface_row_map() -> SurfaceRowMap {
    statement_keys()
        .into_iter()
        .map(|key| (key.to_string(), Vec::new()))
        .collect()
}
/// Detail-row map pre-seeded with an empty per-surface map for every statement key.
fn empty_detail_row_map() -> DetailRowStatementMap {
    statement_keys()
        .into_iter()
        .map(|key| (key.to_string(), BTreeMap::new()))
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::pack_selector::FiscalPack;
    use crate::{PeriodOutput, StatementRowOutput};
    /// Minimal single-filing period fixture with the given id.
    fn period(id: &str) -> PeriodOutput {
        PeriodOutput {
            id: id.to_string(),
            filing_id: 1,
            accession_number: "0000000000-00-000001".to_string(),
            filing_date: "2025-12-31".to_string(),
            period_start: Some("2025-01-01".to_string()),
            period_end: Some("2025-12-31".to_string()),
            filing_type: "10-K".to_string(),
            period_label: id.to_string(),
        }
    }
    /// US-GAAP statement row fixture carrying one USD value for period "p1".
    fn row(key: &str, qname: &str, statement: &str, value: f64) -> StatementRowOutput {
        StatementRowOutput {
            key: key.to_string(),
            label: key.to_string(),
            // Concept key mirrors the namespace#local convention used upstream.
            concept_key: format!("http://fasb.org/us-gaap/2024#{}", qname.split(':').nth(1).unwrap_or(key)),
            qname: qname.to_string(),
            namespace_uri: "http://fasb.org/us-gaap/2024".to_string(),
            local_name: qname.split(':').nth(1).unwrap_or(key).to_string(),
            is_extension: false,
            statement: statement.to_string(),
            role_uri: Some(statement.to_string()),
            order: 1,
            depth: 0,
            parent_key: None,
            values: BTreeMap::from([("p1".to_string(), Some(value))]),
            units: BTreeMap::from([("p1".to_string(), Some("iso4217:USD".to_string()))]),
            has_dimensions: false,
            source_fact_ids: vec![1],
        }
    }
    /// Statement-row map with an empty row list per statement key.
    fn empty_map() -> StatementRowMap {
        BTreeMap::from([
            ("income".to_string(), Vec::new()),
            ("balance".to_string(), Vec::new()),
            ("cash_flow".to_string(), Vec::new()),
            ("equity".to_string(), Vec::new()),
            ("comprehensive_income".to_string(), Vec::new()),
        ])
    }
    #[test]
    fn prefers_direct_authoritative_row_over_child_aggregation() {
        // A reported OperatingExpenses total should win over summing its
        // components, while the components still group as detail rows.
        let mut rows = empty_map();
        rows.get_mut("income").unwrap().extend([
            row("op-expenses", "us-gaap:OperatingExpenses", "income", 40.0),
            row("sga", "us-gaap:SellingGeneralAndAdministrativeExpense", "income", 25.0),
            row("rd", "us-gaap:ResearchAndDevelopmentExpense", "income", 15.0),
        ]);
        let model = build_compact_surface_model(
            &[period("p1")],
            &rows,
            "us-gaap",
            FiscalPack::Core,
            vec![],
        )
        .expect("compact model should build");
        let op_expenses = model
            .surface_rows
            .get("income")
            .unwrap()
            .iter()
            .find(|row| row.key == "operating_expenses")
            .unwrap();
        assert_eq!(op_expenses.values.get("p1").copied().flatten(), Some(40.0));
        assert_eq!(op_expenses.detail_count, Some(2));
    }
    #[test]
    fn emits_unmapped_residual_rows() {
        // A valued extension concept with no surface match must land in the
        // "unmapped" bucket with its residual flag set.
        let mut rows = empty_map();
        rows.get_mut("income").unwrap().push(row("custom", "company:CustomMetric", "income", 12.0));
        let model = build_compact_surface_model(
            &[period("p1")],
            &rows,
            "us-gaap",
            FiscalPack::Core,
            vec![],
        )
        .expect("compact model should build");
        let residual_rows = model.detail_rows.get("income").unwrap().get("unmapped").unwrap();
        assert_eq!(residual_rows.len(), 1);
        assert!(residual_rows[0].residual_flag);
    }
}

View File

@@ -0,0 +1,249 @@
use anyhow::{anyhow, Context, Result};
use serde::Deserialize;
use std::env;
use std::fs;
use std::collections::HashMap;
use std::path::PathBuf;
use crate::pack_selector::FiscalPack;
/// Parsed `{pack}.surface.json`: the surface definitions for one fiscal pack.
#[derive(Debug, Deserialize, Clone)]
pub struct SurfacePackFile {
    // Schema version string.
    pub version: String,
    // Pack identifier (e.g. "core").
    pub pack: String,
    pub surfaces: Vec<SurfaceDefinition>,
}
/// One surface row definition within a pack's surface file.
#[derive(Debug, Deserialize, Clone)]
pub struct SurfaceDefinition {
    pub surface_key: String,
    // One of the statement keys ("income", "balance", ...).
    pub statement: String,
    pub label: String,
    pub category: String,
    // Presentation ordering within the statement.
    pub order: i64,
    pub unit: String,
    // Observed value: "aggregate_children" enables summing component rows.
    pub rollup_policy: String,
    // Raw qnames / local names accepted as direct sources.
    pub allowed_source_concepts: Vec<String>,
    // Canonical concept keys accepted as authoritative matches.
    pub allowed_authoritative_concepts: Vec<String>,
    // Opaque formula spec; presence alone drives formula_key emission.
    pub formula_fallback: Option<serde_json::Value>,
    // Observed value: "group_all_children" groups components as detail rows.
    pub detail_grouping_policy: String,
    // Declared but not yet consulted by the surface mapper.
    pub materiality_policy: String,
}
#[derive(Debug, Deserialize, Clone)]
pub struct CrosswalkFile {
pub version: String,
pub regime: String,
pub mappings: std::collections::HashMap<String, CrosswalkMapping>,
}
/// One crosswalk entry: where a reported qname maps to.
#[derive(Debug, Deserialize, Clone)]
pub struct CrosswalkMapping {
    // Surface row this concept feeds directly.
    pub surface_key: String,
    // Canonical concept identifier.
    pub authoritative_concept_key: String,
}
/// Parsed `{pack}.kpis.json`: the KPI definitions for one fiscal pack.
#[derive(Debug, Deserialize, Clone)]
pub struct KpiPackFile {
    // Schema version string.
    pub version: String,
    // Pack identifier.
    pub pack: String,
    pub kpis: Vec<KpiDefinition>,
}
/// One KPI definition; `key` selects the computation in the KPI extractor.
#[derive(Debug, Deserialize, Clone)]
pub struct KpiDefinition {
    pub key: String,
    pub label: String,
    pub unit: String,
}
/// Parsed `universal_income.surface.json` configuration.
#[derive(Debug, Deserialize, Clone)]
pub struct UniversalIncomeFile {
    // Schema version string.
    pub version: String,
    pub rows: Vec<UniversalIncomeDefinition>,
}
/// One universal income-statement row definition.
#[derive(Debug, Deserialize, Clone)]
pub struct UniversalIncomeDefinition {
    pub key: String,
    pub statement: String,
    pub label: String,
    pub category: String,
    // Presentation ordering.
    pub order: i64,
    pub unit: String,
}
/// Parsed `{pack}.income-bridge.json`: per-row bridge rules for one fiscal pack.
#[derive(Debug, Deserialize, Clone)]
pub struct IncomeBridgeFile {
    // Schema version string.
    pub version: String,
    // Pack identifier.
    pub pack: String,
    // Keyed by bridge row key.
    pub rows: HashMap<String, IncomeBridgeRow>,
}
/// Surface keys contributing to a bridge row, split by sign; both lists
/// default to empty when absent from the JSON.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct IncomeBridgeComponents {
    #[serde(default)]
    pub positive: Vec<String>,
    #[serde(default)]
    pub negative: Vec<String>,
}
/// Named concept groups contributing to a bridge row, split by sign; both
/// lists default to empty when absent from the JSON.
#[derive(Debug, Deserialize, Clone, Default)]
pub struct IncomeBridgeConceptGroups {
    #[serde(default)]
    pub positive: Vec<IncomeBridgeConceptGroup>,
    #[serde(default)]
    pub negative: Vec<IncomeBridgeConceptGroup>,
}
/// A named list of concepts used as one bridge component group.
#[derive(Debug, Deserialize, Clone)]
pub struct IncomeBridgeConceptGroup {
    pub name: String,
    pub concepts: Vec<String>,
}
/// Resolution rules for one income-bridge row; most fields are optional in
/// the JSON and default to empty.
#[derive(Debug, Deserialize, Clone)]
pub struct IncomeBridgeRow {
    // Canonical concepts usable as a direct source for this row.
    #[serde(default)]
    pub direct_authoritative_concepts: Vec<String>,
    // Raw source concepts usable directly.
    #[serde(default)]
    pub direct_source_concepts: Vec<String>,
    // Signed surface-key components for derived values.
    #[serde(default)]
    pub component_surfaces: IncomeBridgeComponents,
    // Signed concept-group components for derived values.
    #[serde(default)]
    pub component_concept_groups: IncomeBridgeConceptGroups,
    // Formula expression consumed by the bridge evaluator (required field).
    pub formula: String,
    // Marks rows that carry no meaning for this pack.
    #[serde(default)]
    pub not_meaningful_for_pack: bool,
    // Warning codes to attach when this row's fallback is actually used.
    #[serde(default)]
    pub warning_codes_when_used: Vec<String>,
}
/// Locate the runtime taxonomy directory.
///
/// Candidate order (first existing directory wins):
/// 1. the `FISCAL_TAXONOMY_DIR` environment override (trimmed, non-empty),
/// 2. `<cwd>/rust/taxonomy`, then `<cwd>/taxonomy`,
/// 3. `<manifest dir>/../taxonomy`,
/// 4. `<exe dir>/../rust/taxonomy`, then `<exe dir>/../taxonomy`.
///
/// # Errors
/// Returns an error when none of the candidates is an existing directory.
pub fn resolve_taxonomy_dir() -> Result<PathBuf> {
    let mut candidates: Vec<PathBuf> = Vec::new();
    if let Ok(raw) = env::var("FISCAL_TAXONOMY_DIR") {
        let trimmed = raw.trim();
        if !trimmed.is_empty() {
            candidates.push(PathBuf::from(trimmed));
        }
    }
    if let Ok(current_dir) = env::current_dir() {
        candidates.push(current_dir.join("rust").join("taxonomy"));
        candidates.push(current_dir.join("taxonomy"));
    }
    candidates.push(PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../taxonomy"));
    if let Ok(executable) = env::current_exe() {
        if let Some(parent) = executable.parent() {
            candidates.push(parent.join("../rust/taxonomy"));
            candidates.push(parent.join("../taxonomy"));
        }
    }
    for candidate in candidates {
        if candidate.is_dir() {
            return Ok(candidate);
        }
    }
    Err(anyhow!("taxonomy resolution failed: unable to locate runtime taxonomy directory"))
}
/// Load and parse `fiscal/v1/{pack}.surface.json` from the taxonomy directory.
///
/// # Errors
/// Fails when the taxonomy directory cannot be resolved, or the file cannot be
/// read or parsed.
pub fn load_surface_pack(pack: FiscalPack) -> Result<SurfacePackFile> {
    let path = resolve_taxonomy_dir()?
        .join("fiscal")
        .join("v1")
        .join(format!("{}.surface.json", pack.as_str()));
    let raw = fs::read_to_string(&path)
        .with_context(|| format!("taxonomy resolution failed: unable to read {}", path.display()))?;
    let file: SurfacePackFile = serde_json::from_str(&raw)
        .with_context(|| format!("taxonomy resolution failed: unable to parse {}", path.display()))?;
    // Touch metadata fields so the dead-code lint stays quiet.
    let _ = (&file.version, &file.pack);
    Ok(file)
}
/// Load the concept crosswalk for a taxonomy regime.
///
/// Returns `Ok(None)` for regimes without a shipped crosswalk file.
///
/// # Errors
/// Fails when the taxonomy directory cannot be resolved, or the file cannot be
/// read or parsed.
pub fn load_crosswalk(regime: &str) -> Result<Option<CrosswalkFile>> {
    let file_name = match regime {
        "us-gaap" => "us-gaap.json",
        "ifrs-full" => "ifrs.json",
        _ => return Ok(None),
    };
    let path = resolve_taxonomy_dir()?.join("crosswalk").join(file_name);
    let raw = fs::read_to_string(&path)
        .with_context(|| format!("taxonomy resolution failed: unable to read {}", path.display()))?;
    let file: CrosswalkFile = serde_json::from_str(&raw)
        .with_context(|| format!("taxonomy resolution failed: unable to parse {}", path.display()))?;
    // Touch metadata fields so the dead-code lint stays quiet.
    let _ = (&file.version, &file.regime);
    Ok(Some(file))
}
/// Load and parse `fiscal/v1/kpis/{pack}.kpis.json` from the taxonomy directory.
///
/// # Errors
/// Fails when the taxonomy directory cannot be resolved, or the file cannot be
/// read or parsed.
pub fn load_kpi_pack(pack: FiscalPack) -> Result<KpiPackFile> {
    let path = resolve_taxonomy_dir()?
        .join("fiscal")
        .join("v1")
        .join("kpis")
        .join(format!("{}.kpis.json", pack.as_str()));
    let raw = fs::read_to_string(&path)
        .with_context(|| format!("taxonomy resolution failed: unable to read {}", path.display()))?;
    let file: KpiPackFile = serde_json::from_str(&raw)
        .with_context(|| format!("taxonomy resolution failed: unable to parse {}", path.display()))?;
    // Touch metadata fields so the dead-code lint stays quiet.
    let _ = (&file.version, &file.pack);
    Ok(file)
}
/// Load and parse `fiscal/v1/universal_income.surface.json`.
///
/// # Errors
/// Fails when the taxonomy directory cannot be resolved, or the file cannot be
/// read or parsed.
pub fn load_universal_income_definitions() -> Result<UniversalIncomeFile> {
    let path = resolve_taxonomy_dir()?
        .join("fiscal")
        .join("v1")
        .join("universal_income.surface.json");
    let raw = fs::read_to_string(&path)
        .with_context(|| format!("taxonomy resolution failed: unable to read {}", path.display()))?;
    let file: UniversalIncomeFile = serde_json::from_str(&raw)
        .with_context(|| format!("taxonomy resolution failed: unable to parse {}", path.display()))?;
    // Touch the version field so the dead-code lint stays quiet.
    let _ = &file.version;
    Ok(file)
}
/// Load and parse `fiscal/v1/{pack}.income-bridge.json` from the taxonomy directory.
///
/// # Errors
/// Fails when the taxonomy directory cannot be resolved, or the file cannot be
/// read or parsed.
pub fn load_income_bridge(pack: FiscalPack) -> Result<IncomeBridgeFile> {
    let path = resolve_taxonomy_dir()?
        .join("fiscal")
        .join("v1")
        .join(format!("{}.income-bridge.json", pack.as_str()));
    let raw = fs::read_to_string(&path)
        .with_context(|| format!("taxonomy resolution failed: unable to read {}", path.display()))?;
    let file: IncomeBridgeFile = serde_json::from_str(&raw)
        .with_context(|| format!("taxonomy resolution failed: unable to parse {}", path.display()))?;
    // Touch metadata fields so the dead-code lint stays quiet.
    let _ = (&file.version, &file.pack);
    Ok(file)
}
#[cfg(test)]
mod tests {
    use super::*;
    /// Smoke test: requires the real taxonomy directory to be resolvable in
    /// the test environment, then loads every core configuration file and
    /// checks basic invariants (pack names, non-empty row lists).
    #[test]
    fn resolves_taxonomy_dir_and_loads_core_pack() {
        let taxonomy_dir = resolve_taxonomy_dir().expect("taxonomy dir should resolve during tests");
        assert!(taxonomy_dir.exists());
        let surface_pack = load_surface_pack(FiscalPack::Core).expect("core surface pack should load");
        assert_eq!(surface_pack.pack, "core");
        assert!(!surface_pack.surfaces.is_empty());
        let kpi_pack = load_kpi_pack(FiscalPack::Core).expect("core kpi pack should load");
        assert_eq!(kpi_pack.pack, "core");
        let universal_income = load_universal_income_definitions().expect("universal income config should load");
        assert!(!universal_income.rows.is_empty());
        let core_bridge = load_income_bridge(FiscalPack::Core).expect("core bridge should load");
        assert_eq!(core_bridge.pack, "core");
    }
}

File diff suppressed because it is too large Load Diff