diff --git a/rust/fiscal-xbrl-core/src/universal_income.rs b/rust/fiscal-xbrl-core/src/universal_income.rs index f857476..f0effe0 100644 --- a/rust/fiscal-xbrl-core/src/universal_income.rs +++ b/rust/fiscal-xbrl-core/src/universal_income.rs @@ -12,6 +12,37 @@ use crate::{ SurfaceRowOutput, }; +const RESIDUAL_TOLERANCE: f64 = 1.0; +const EXCLUDED_OPERATING_RESIDUAL_KEYS: &[&str] = &[ + "revenue", + "cost_of_revenue", + "gross_profit", + "operating_expenses", + "selling_general_and_administrative", + "sales_and_marketing", + "general_and_administrative", + "other_operating_expense", + "operating_income", + "income_tax_expense", + "net_income", +]; +const COMMON_OPERATING_COMPONENT_KEYS: &[&str] = &[ + "research_and_development", + "depreciation_and_amortization", + "depreciation", + "amortization", + "restructuring", + "asset_impairment", + "stock_based_compensation", + "labor_and_related_expense", + "occupancy_net", + "data_processing_expense", + "claims_and_benefits", + "underwriting_expenses", + "property_operating_expense", + "administrative_expense", +]; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum ResolutionMethod { Direct, @@ -154,6 +185,7 @@ pub fn apply_universal_income_rows( resolve_universal_row( definition, bridge_row, + &income_bridge.rows, periods, &income_statement_rows, facts, @@ -237,6 +269,7 @@ pub fn apply_universal_income_rows( fn resolve_universal_row( definition: &UniversalIncomeDefinition, bridge_row: &IncomeBridgeRow, + bridge_rows: &HashMap, periods: &[PeriodOutput], income_statement_rows: &[StatementRowOutput], facts: &[FactOutput], @@ -313,6 +346,46 @@ fn resolve_universal_row( }; } + if definition.key == "selling_general_and_administrative" { + let formula_row = build_formula_row( + definition, + bridge_row, + periods, + income_statement_rows, + income_surface_rows, + crosswalk, + ); + if has_any_value(&formula_row.row.values) { + return formula_row; + } + + if let Some(residual_row) = build_residual_sga_row( + definition, + bridge_rows, + periods, + income_statement_rows, + facts, + income_surface_rows, + crosswalk, + ) { + return residual_row; + } + + return formula_row; + } + + if definition.key == "other_operating_expense" { + return build_residual_other_operating_expense_row( + definition, + bridge_rows, + periods, + income_statement_rows, + facts, + income_surface_rows, + crosswalk, + ); + } + build_formula_row( definition, bridge_row, @@ -501,6 +574,247 @@ fn build_formula_row( } } +fn build_residual_sga_row( + definition: &UniversalIncomeDefinition, + bridge_rows: &HashMap, + periods: &[PeriodOutput], + income_statement_rows: &[StatementRowOutput], + facts: &[FactOutput], + income_surface_rows: &[SurfaceRowOutput], + crosswalk: Option<&CrosswalkFile>, +) -> Option { + let operating_expenses = find_surface_row(income_surface_rows, "operating_expenses")?; + let research_source = resolve_component_surface_source( + "research_and_development", + income_statement_rows, + income_surface_rows, + crosswalk, + ) + .filter(|source| non_formula_value_source_for_any_period(source)); + let explicit_other_bridge = bridge_rows.get("other_operating_expense"); + + let mut values = BTreeMap::>::new(); + let mut negative_residual = false; + let mut used_sources = Vec::::new(); + + for period in periods { + let mut period_value = None; + + if !has_explicit_direct_match_for_period( + period, + income_statement_rows, + facts, + explicit_other_bridge, + crosswalk, + ) { + if let Some(operating_value) = + residual_anchor_value_for_period(operating_expenses, &period.id) + { + let research_value = research_source.as_ref().and_then(|source| { + non_formula_value_from_source_for_period(source, &period.id) + }); + + if let Some(research_value) = research_value { + let component_total = research_value; + let inferred = operating_value - component_total; + + if inferred.abs() <= RESIDUAL_TOLERANCE { + period_value = Some(0.0); + } else if inferred.is_sign_negative() { + negative_residual = true; + } else { + period_value = Some(inferred); + } + + if period_value.is_some() { + used_sources.push(surface_source(operating_expenses)); + if let Some(source) = research_source.as_ref() { + used_sources.push(source.clone()); + } + } + } + } + } + + values.insert(period.id.clone(), period_value); + } + + if !has_any_value(&values) { + if negative_residual { + return Some(unresolved_row( + definition, + periods, + &[ + "selling_general_and_administrative_negative_residual_invalid".to_string(), + "selling_general_and_administrative_unresolved".to_string(), + ], + )); + } + + return None; + } + + let mut warning_codes = + vec!["selling_general_and_administrative_residual_from_operating_expenses".to_string()]; + if negative_residual { + warning_codes + .push("selling_general_and_administrative_negative_residual_invalid".to_string()); + } + + Some(build_formula_surface_row( + definition, + periods, + values, + &used_sources, + warning_codes, + )) +} + +fn build_residual_other_operating_expense_row( + definition: &UniversalIncomeDefinition, + bridge_rows: &HashMap, + periods: &[PeriodOutput], + _income_statement_rows: &[StatementRowOutput], + _facts: &[FactOutput], + income_surface_rows: &[SurfaceRowOutput], + _crosswalk: Option<&CrosswalkFile>, +) -> ResolvedUniversalRow { + let Some(operating_expenses) = find_surface_row(income_surface_rows, "operating_expenses") + else { + return unresolved_row( + definition, + periods, + &[format!("{}_unresolved", definition.key)], + ); + }; + let Some(sga_row) = find_surface_row(income_surface_rows, "selling_general_and_administrative") + else { + return unresolved_row( + definition, + periods, + &[format!("{}_unresolved", definition.key)], + ); + }; + let component_rows = collect_residual_component_rows( + income_surface_rows, + bridge_rows, + "other_operating_expense", + ); + + let mut values = BTreeMap::>::new(); + let mut suppressed_zero_residual = false; + let mut used_sources = Vec::::new(); + + for period in periods { + let Some(operating_value) = + residual_anchor_value_for_period(operating_expenses, &period.id) + else { + values.insert(period.id.clone(), None); + continue; + }; + let Some(sga_value) = surface_row_value_for_period(sga_row, &period.id) else { + values.insert(period.id.clone(), None); + continue; + }; + + let period_components = component_rows + .iter() + .filter(|row| non_formula_value_for_period(row, &period.id).is_some()) + .copied() + .collect::>(); + let component_total = period_components + .iter() + .filter_map(|row| non_formula_value_for_period(row, &period.id)) + .sum::(); + let residual = operating_value - sga_value - component_total; + + let period_value = if residual.abs() <= RESIDUAL_TOLERANCE || residual <= 0.0 { + if residual.abs() <= RESIDUAL_TOLERANCE { + suppressed_zero_residual = true; + } + None + } else { + used_sources.push(surface_source(operating_expenses)); + used_sources.push(surface_source(sga_row)); + used_sources.extend(period_components.into_iter().map(surface_source)); + Some(residual) + }; + + values.insert(period.id.clone(), period_value); + } + + if !has_any_value(&values) { + if suppressed_zero_residual { + return ResolvedUniversalRow { + row: null_surface_row( + definition, + periods, + None, + Some(Confidence::Low), + vec!["other_operating_expense_suppressed_zero_residual".to_string()], + ), + detail_rows: vec![], + mapping_assignments: HashMap::new(), + warning_codes: vec!["other_operating_expense_suppressed_zero_residual".to_string()], + consumed_sources: ConsumedSources::default(), + }; + } + + return unresolved_row( + definition, + periods, + &[format!("{}_unresolved", definition.key)], + ); + } + + let mut warning_codes = vec!["other_operating_expense_formula_derived".to_string()]; + if suppressed_zero_residual { + warning_codes.push("other_operating_expense_suppressed_zero_residual".to_string()); + } + + build_formula_surface_row(definition, periods, values, &used_sources, warning_codes) +} + +fn build_formula_surface_row( + definition: &UniversalIncomeDefinition, + periods: &[PeriodOutput], + values: BTreeMap>, + source_rows: &[ValueSource], + warning_codes: Vec, +) -> ResolvedUniversalRow { + let merged_source = merge_value_sources(source_rows); + + ResolvedUniversalRow { + row: SurfaceRowOutput { + key: definition.key.clone(), + label: definition.label.clone(), + category: definition.category.clone(), + template_section: definition.category.clone(), + order: definition.order, + unit: definition.unit.clone(), + values, + source_concepts: merged_source.source_concepts, + source_row_keys: merged_source.source_row_keys, + source_fact_ids: merged_source.source_fact_ids, + formula_key: Some(definition.key.clone()), + has_dimensions: merged_source.has_dimensions, + resolved_source_row_keys: periods + .iter() + .map(|period| (period.id.clone(), None)) + .collect(), + statement: Some(definition.statement.clone()), + detail_count: Some(0), + resolution_method: Some(ResolutionMethod::FormulaDerived.as_str().to_string()), + confidence: Some(Confidence::Medium.as_str().to_string()), + warning_codes: warning_codes.clone(), + }, + detail_rows: vec![], + mapping_assignments: HashMap::new(), + warning_codes, + consumed_sources: ConsumedSources::default(), + } +} + fn build_direct_row( definition: &UniversalIncomeDefinition, periods: &[PeriodOutput], @@ -814,6 +1128,112 @@ fn collect_group_sources<'a>( (sources, rows) } +fn find_surface_row<'a>( + income_surface_rows: &'a [SurfaceRowOutput], + key: &str, +) -> Option<&'a SurfaceRowOutput> { + income_surface_rows.iter().find(|row| row.key == key) +} + +fn has_explicit_direct_match_for_period( + period: &PeriodOutput, + income_statement_rows: &[StatementRowOutput], + facts: &[FactOutput], + bridge_row: Option<&IncomeBridgeRow>, + crosswalk: Option<&CrosswalkFile>, +) -> bool { + let Some(bridge_row) = bridge_row else { + return false; + }; + + income_statement_rows.iter().any(|row| { + row.values.get(&period.id).copied().flatten().is_some() + && (match_direct_authoritative( + row, + &bridge_row.direct_authoritative_concepts, + crosswalk, + ) + .is_some() + || match_direct_source(row, &bridge_row.direct_source_concepts, crosswalk) + .is_some()) + }) || facts.iter().any(|fact| { + fact_matches_period(fact, period) + && (match_direct_fact_authoritative( + fact, + &bridge_row.direct_authoritative_concepts, + crosswalk, + ) + .is_some() + || match_direct_fact_source(fact, &bridge_row.direct_source_concepts, crosswalk) + .is_some()) + }) +} + +fn collect_residual_component_rows<'a>( + income_surface_rows: &'a [SurfaceRowOutput], + bridge_rows: &HashMap, + target_key: &str, +) -> Vec<&'a SurfaceRowOutput> { + let mut allowed_surface_keys = COMMON_OPERATING_COMPONENT_KEYS + .iter() + .map(|key| key.to_string()) + .collect::>(); + + if let Some(operating_bridge) = bridge_rows.get("operating_expenses") { + allowed_surface_keys.extend(operating_bridge.component_surfaces.positive.iter().cloned()); + } + + income_surface_rows + .iter() + .filter(|row| row.key != target_key) + .filter(|row| !EXCLUDED_OPERATING_RESIDUAL_KEYS.contains(&row.key.as_str())) + .filter(|row| allowed_surface_keys.contains(&row.key)) + .filter(|row| !surface_row_is_formula_derived(row)) + .collect() +} + +fn residual_anchor_value_for_period(row: &SurfaceRowOutput, period_id: &str) -> Option { + let confidence = row.confidence.as_deref().unwrap_or_default(); + let resolution_method = row.resolution_method.as_deref().unwrap_or_default(); + if resolution_method == ResolutionMethod::NotMeaningful.as_str() { + return None; + } + if resolution_method == ResolutionMethod::FormulaDerived.as_str() + && confidence != Confidence::High.as_str() + { + return None; + } + + surface_row_value_for_period(row, period_id) +} + +fn surface_row_value_for_period(row: &SurfaceRowOutput, period_id: &str) -> Option { + row.values.get(period_id).copied().flatten() +} + +fn non_formula_value_for_period(row: &SurfaceRowOutput, period_id: &str) -> Option { + if surface_row_is_formula_derived(row) { + return None; + } + + surface_row_value_for_period(row, period_id) +} + +fn non_formula_value_source_for_any_period(source: &ValueSource) -> bool { + source + .values + .keys() + .any(|period_id| non_formula_value_from_source_for_period(source, period_id).is_some()) +} + +fn non_formula_value_from_source_for_period(source: &ValueSource, period_id: &str) -> Option { + source.values.get(period_id).copied().flatten() +} + +fn surface_row_is_formula_derived(row: &SurfaceRowOutput) -> bool { + row.resolution_method.as_deref() == Some(ResolutionMethod::FormulaDerived.as_str()) +} + fn resolve_component_surface_source( surface_key: &str, income_statement_rows: &[StatementRowOutput], @@ -1628,12 +2048,17 @@ mod tests { } #[test] - fn derives_other_operating_expense_from_operating_expenses_minus_sga_and_missing_rnd() { - let rows = empty_rows(); + fn prefers_direct_sga_over_residual_inference() { + let mut rows = empty_rows(); + rows.get_mut("income").unwrap().push(row( + "sga-direct", + "us-gaap:SellingGeneralAndAdministrativeExpense", + 70.0, + )); let mut model = empty_model(); model.surface_rows.get_mut("income").unwrap().extend([ surface_row("operating_expenses", 100.0), - surface_row("selling_general_and_administrative", 60.0), + surface_row("research_and_development", 30.0), ]); apply_universal_income_rows( @@ -1646,6 +2071,47 @@ mod tests { ) .expect("universal income rows should build"); + let sga = model + .surface_rows + .get("income") + .unwrap() + .iter() + .find(|row| row.key == "selling_general_and_administrative") + .unwrap(); + + assert_eq!(sga.values.get("p1").copied().flatten(), Some(70.0)); + assert_eq!(sga.resolution_method.as_deref(), Some("direct")); + assert!(!sga.warning_codes.contains( + &"selling_general_and_administrative_residual_from_operating_expenses".to_string() + )); + } + + #[test] + fn derives_sga_from_operating_expenses_minus_research_and_development() { + let rows = empty_rows(); + let mut model = empty_model(); + model.surface_rows.get_mut("income").unwrap().extend([ + surface_row("operating_expenses", 100.0), + surface_row("research_and_development", 40.0), + ]); + + apply_universal_income_rows( + &[period("p1")], + &rows, + &[], + "us-gaap", + FiscalPack::Core, + &mut model, + ) + .expect("universal income rows should build"); + + let sga = model + .surface_rows + .get("income") + .unwrap() + .iter() + .find(|row| row.key == "selling_general_and_administrative") + .unwrap(); let other = model .surface_rows .get("income") @@ -1654,12 +2120,134 @@ mod tests { .find(|row| row.key == "other_operating_expense") .unwrap(); - assert_eq!(other.values.get("p1").copied().flatten(), Some(40.0)); + assert_eq!(sga.values.get("p1").copied().flatten(), Some(60.0)); assert_eq!( - other.formula_key.as_deref(), - Some("other_operating_expense") + sga.warning_codes, + vec!["selling_general_and_administrative_residual_from_operating_expenses".to_string()] ); - assert_eq!(other.resolution_method.as_deref(), Some("formula_derived")); + assert_eq!(other.values.get("p1").copied().flatten(), None); + assert!(other + .warning_codes + .contains(&"other_operating_expense_suppressed_zero_residual".to_string())); + } + + #[test] + fn derives_sga_from_operating_expenses_before_other_operating_expense_residual() { + let rows = empty_rows(); + let mut model = empty_model(); + model.surface_rows.get_mut("income").unwrap().extend([ + surface_row("operating_expenses", 100.0), + surface_row("research_and_development", 25.0), + surface_row("depreciation_and_amortization", 15.0), + ]); + + apply_universal_income_rows( + &[period("p1")], + &rows, + &[], + "us-gaap", + FiscalPack::Core, + &mut model, + ) + .expect("universal income rows should build"); + + let sga = model + .surface_rows + .get("income") + .unwrap() + .iter() + .find(|row| row.key == "selling_general_and_administrative") + .unwrap(); + let other = model + .surface_rows + .get("income") + .unwrap() + .iter() + .find(|row| row.key == "other_operating_expense") + .unwrap(); + + assert_eq!(sga.values.get("p1").copied().flatten(), Some(75.0)); + assert_eq!(other.values.get("p1").copied().flatten(), None); + } + + #[test] + fn leaves_sga_unresolved_when_operating_expense_residual_is_negative() { + let rows = empty_rows(); + let mut model = empty_model(); + model.surface_rows.get_mut("income").unwrap().extend([ + surface_row("operating_expenses", 100.0), + surface_row("research_and_development", 120.0), + ]); + + apply_universal_income_rows( + &[period("p1")], + &rows, + &[], + "us-gaap", + FiscalPack::Core, + &mut model, + ) + .expect("universal income rows should build"); + + let sga = model + .surface_rows + .get("income") + .unwrap() + .iter() + .find(|row| row.key == "selling_general_and_administrative") + .unwrap(); + + assert_eq!(sga.values.get("p1").copied().flatten(), None); + assert!(sga + .warning_codes + .contains(&"selling_general_and_administrative_negative_residual_invalid".to_string())); + assert!(sga + .warning_codes + .contains(&"selling_general_and_administrative_unresolved".to_string())); + } + + #[test] + fn keeps_explicit_other_operating_expense_direct() { + let mut rows = empty_rows(); + rows.get_mut("income").unwrap().push(row( + "other-expense", + "us-gaap:OtherOperatingExpense", + 12.0, + )); + let mut model = empty_model(); + model.surface_rows.get_mut("income").unwrap().extend([ + surface_row("operating_expenses", 100.0), + surface_row("research_and_development", 40.0), + ]); + + apply_universal_income_rows( + &[period("p1")], + &rows, + &[], + "us-gaap", + FiscalPack::Core, + &mut model, + ) + .expect("universal income rows should build"); + + let sga = model + .surface_rows + .get("income") + .unwrap() + .iter() + .find(|row| row.key == "selling_general_and_administrative") + .unwrap(); + let other = model + .surface_rows + .get("income") + .unwrap() + .iter() + .find(|row| row.key == "other_operating_expense") + .unwrap(); + + assert_eq!(sga.values.get("p1").copied().flatten(), None); + assert_eq!(other.values.get("p1").copied().flatten(), Some(12.0)); + assert_eq!(other.resolution_method.as_deref(), Some("direct")); } #[test] diff --git a/scripts/backfill-taxonomy-snapshots.ts b/scripts/backfill-taxonomy-snapshots.ts index 3010d64..8d62d15 100644 --- a/scripts/backfill-taxonomy-snapshots.ts +++ b/scripts/backfill-taxonomy-snapshots.ts @@ -5,6 +5,7 @@ import { normalizeFilingTaxonomySnapshotPayload, upsertFilingTaxonomySnapshot } from '@/lib/server/repos/filing-taxonomy'; +import { getIssuerOverlay } from '@/lib/server/repos/issuer-overlays'; type ScriptOptions = { apply: boolean; @@ -22,6 +23,17 @@ type ScriptSummary = { failed: number; }; +type ActiveOverlayState = { + definition: Awaited> extends infer T + ? T extends { active_revision: infer R | null } + ? R extends { definition_json: infer D } + ? D | null + : null + : null + : null; + revisionId: number | null; +}; + type FilingRow = { id: number; ticker: string; @@ -36,6 +48,28 @@ type FilingRow = { const REQUEST_DELAY_MS = 120; +async function loadActiveOverlayState( + ticker: string, + cache: Map>, +) { + const normalizedTicker = ticker.trim().toUpperCase(); + const existing = cache.get(normalizedTicker); + if (existing) { + return await existing; + } + + const pending = (async (): Promise => { + const overlay = await getIssuerOverlay(normalizedTicker); + return { + definition: overlay?.active_revision?.definition_json ?? null, + revisionId: overlay?.active_revision_id ?? null + }; + })(); + cache.set(normalizedTicker, pending); + + return await pending; +} + function parseOptions(argv: string[]): ScriptOptions { const options: ScriptOptions = { apply: false, @@ -147,6 +181,7 @@ async function loadFilings(options: ScriptOptions): Promise { async function runBackfill(options: ScriptOptions): Promise { const rows = await loadFilings(options); + const overlayCache = new Map>(); const summary: ScriptSummary = { scanned: 0, wouldWrite: 0, @@ -164,8 +199,13 @@ async function runBackfill(options: ScriptOptions): Promise { summary.scanned += 1; console.log(`[backfill-taxonomy-snapshots] [${index + 1}/${rows.length}] ${row.ticker} ${row.filingType} ${row.filingDate} ${row.accessionNumber}`); + const activeOverlay = await loadActiveOverlayState(row.ticker, overlayCache); const existing = await getFilingTaxonomySnapshotByFilingId(row.id); - const isFresh = existing && Date.parse(existing.updated_at) >= Date.parse(row.updatedAt); + const isFresh = Boolean( + existing + && Date.parse(existing.updated_at) >= Date.parse(row.updatedAt) + && (existing.issuer_overlay_revision_id ?? null) === activeOverlay.revisionId + ); if (isFresh && !options.refresh) { summary.skippedFresh += 1; @@ -181,7 +221,8 @@ async function runBackfill(options: ScriptOptions): Promise { filingDate: row.filingDate, filingType: row.filingType, filingUrl: row.filingUrl, - primaryDocument: row.primaryDocument + primaryDocument: row.primaryDocument, + issuerOverlay: activeOverlay.definition }); summary.wouldWrite += 1; @@ -189,6 +230,7 @@ async function runBackfill(options: ScriptOptions): Promise { if (options.apply) { const normalizedSnapshot = { ...snapshot, + issuer_overlay_revision_id: activeOverlay.revisionId, ...normalizeFilingTaxonomySnapshotPayload(snapshot) }; await upsertFilingTaxonomySnapshot(normalizedSnapshot);