Attempting to fix the Taxonomy. Stashing changes so far for worktree merge

This commit is contained in:
2026-03-21 00:52:54 -04:00
parent 391d6d34ce
commit 3e09e38dfa
2 changed files with 639 additions and 9 deletions

View File

@@ -12,6 +12,37 @@ use crate::{
SurfaceRowOutput,
};
/// Maximum absolute residual (in reporting-currency units) that is treated as
/// zero when deriving a row by subtraction.
const RESIDUAL_TOLERANCE: f64 = 1.0;
/// Surface keys that must never be subtracted as components when deriving an
/// operating residual — aggregates, totals, and non-operating lines whose
/// inclusion would double-count or mix statement sections.
const EXCLUDED_OPERATING_RESIDUAL_KEYS: &[&str] = &[
    "revenue",
    "cost_of_revenue",
    "gross_profit",
    "operating_expenses",
    "selling_general_and_administrative",
    "sales_and_marketing",
    "general_and_administrative",
    "other_operating_expense",
    "operating_income",
    "income_tax_expense",
    "net_income",
];
/// Surface keys commonly reported as individual operating-expense components;
/// these are allowed as subtrahends in residual derivations even when the
/// `operating_expenses` bridge row does not list them explicitly.
const COMMON_OPERATING_COMPONENT_KEYS: &[&str] = &[
    "research_and_development",
    "depreciation_and_amortization",
    "depreciation",
    "amortization",
    "restructuring",
    "asset_impairment",
    "stock_based_compensation",
    "labor_and_related_expense",
    "occupancy_net",
    "data_processing_expense",
    "claims_and_benefits",
    "underwriting_expenses",
    "property_operating_expense",
    "administrative_expense",
];
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ResolutionMethod {
Direct,
@@ -154,6 +185,7 @@ pub fn apply_universal_income_rows(
resolve_universal_row(
definition,
bridge_row,
&income_bridge.rows,
periods,
&income_statement_rows,
facts,
@@ -237,6 +269,7 @@ pub fn apply_universal_income_rows(
fn resolve_universal_row(
definition: &UniversalIncomeDefinition,
bridge_row: &IncomeBridgeRow,
bridge_rows: &HashMap<String, IncomeBridgeRow>,
periods: &[PeriodOutput],
income_statement_rows: &[StatementRowOutput],
facts: &[FactOutput],
@@ -313,6 +346,46 @@ fn resolve_universal_row(
};
}
if definition.key == "selling_general_and_administrative" {
let formula_row = build_formula_row(
definition,
bridge_row,
periods,
income_statement_rows,
income_surface_rows,
crosswalk,
);
if has_any_value(&formula_row.row.values) {
return formula_row;
}
if let Some(residual_row) = build_residual_sga_row(
definition,
bridge_rows,
periods,
income_statement_rows,
facts,
income_surface_rows,
crosswalk,
) {
return residual_row;
}
return formula_row;
}
if definition.key == "other_operating_expense" {
return build_residual_other_operating_expense_row(
definition,
bridge_rows,
periods,
income_statement_rows,
facts,
income_surface_rows,
crosswalk,
);
}
build_formula_row(
definition,
bridge_row,
@@ -501,6 +574,247 @@ fn build_formula_row(
}
}
/// Infers selling, general and administrative (SG&A) as the residual of the
/// `operating_expenses` surface row minus directly-reported research &
/// development, for periods with no explicit other-operating-expense match.
///
/// Returns `None` when no period yields a usable residual (the caller then
/// falls back to the formula row). When the only residuals found were
/// negative — an invalid inference — an unresolved row carrying explanatory
/// warning codes is returned instead.
fn build_residual_sga_row(
    definition: &UniversalIncomeDefinition,
    bridge_rows: &HashMap<String, IncomeBridgeRow>,
    periods: &[PeriodOutput],
    income_statement_rows: &[StatementRowOutput],
    facts: &[FactOutput],
    income_surface_rows: &[SurfaceRowOutput],
    crosswalk: Option<&CrosswalkFile>,
) -> Option<ResolvedUniversalRow> {
    // The residual is anchored on operating_expenses; without that row there
    // is nothing to subtract from.
    let operating_expenses = find_surface_row(income_surface_rows, "operating_expenses")?;
    // Only keep R&D as a subtrahend when it has at least one non-formula
    // (directly reported) value; formula-derived values would compound
    // inference on inference.
    let research_source = resolve_component_surface_source(
        "research_and_development",
        income_statement_rows,
        income_surface_rows,
        crosswalk,
    )
    .filter(|source| non_formula_value_source_for_any_period(source));
    let explicit_other_bridge = bridge_rows.get("other_operating_expense");
    let mut values = BTreeMap::<String, Option<f64>>::new();
    let mut negative_residual = false;
    let mut used_sources = Vec::<ValueSource>::new();
    for period in periods {
        let mut period_value = None;
        // Skip inference for any period where an explicit other-operating-expense
        // fact/row exists — the filer already broke the line items out directly.
        if !has_explicit_direct_match_for_period(
            period,
            income_statement_rows,
            facts,
            explicit_other_bridge,
            crosswalk,
        ) {
            if let Some(operating_value) =
                residual_anchor_value_for_period(operating_expenses, &period.id)
            {
                let research_value = research_source.as_ref().and_then(|source| {
                    non_formula_value_from_source_for_period(source, &period.id)
                });
                if let Some(research_value) = research_value {
                    let component_total = research_value;
                    let inferred = operating_value - component_total;
                    // Within tolerance => treat as exactly zero; negative =>
                    // record the invalid inference; otherwise keep the residual.
                    if inferred.abs() <= RESIDUAL_TOLERANCE {
                        period_value = Some(0.0);
                    } else if inferred.is_sign_negative() {
                        negative_residual = true;
                    } else {
                        period_value = Some(inferred);
                    }
                    // Track provenance only for periods that produced a value.
                    if period_value.is_some() {
                        used_sources.push(surface_source(operating_expenses));
                        if let Some(source) = research_source.as_ref() {
                            used_sources.push(source.clone());
                        }
                    }
                }
            }
        }
        values.insert(period.id.clone(), period_value);
    }
    if !has_any_value(&values) {
        // Nothing usable: surface negative residuals as an unresolved row so
        // the caller does not silently fall back, otherwise defer entirely.
        if negative_residual {
            return Some(unresolved_row(
                definition,
                periods,
                &[
                    "selling_general_and_administrative_negative_residual_invalid".to_string(),
                    "selling_general_and_administrative_unresolved".to_string(),
                ],
            ));
        }
        return None;
    }
    let mut warning_codes =
        vec!["selling_general_and_administrative_residual_from_operating_expenses".to_string()];
    // A mix of valid and negative periods still resolves, but flags the
    // negative ones for downstream review.
    if negative_residual {
        warning_codes
            .push("selling_general_and_administrative_negative_residual_invalid".to_string());
    }
    Some(build_formula_surface_row(
        definition,
        periods,
        values,
        &used_sources,
        warning_codes,
    ))
}
/// Derives `other_operating_expense` as the residual of `operating_expenses`
/// minus SG&A minus every allowed, directly-reported operating component.
///
/// Per period: residuals within `RESIDUAL_TOLERANCE` of zero (or non-positive)
/// are suppressed rather than emitted; a near-zero suppression is remembered so
/// the resulting null row can carry a low-confidence explanatory warning.
/// When the anchor rows are missing entirely, an unresolved row is returned.
fn build_residual_other_operating_expense_row(
    definition: &UniversalIncomeDefinition,
    bridge_rows: &HashMap<String, IncomeBridgeRow>,
    periods: &[PeriodOutput],
    _income_statement_rows: &[StatementRowOutput],
    _facts: &[FactOutput],
    income_surface_rows: &[SurfaceRowOutput],
    _crosswalk: Option<&CrosswalkFile>,
) -> ResolvedUniversalRow {
    // Both anchors are mandatory: the operating-expense total ...
    let Some(operating_expenses) = find_surface_row(income_surface_rows, "operating_expenses")
    else {
        return unresolved_row(
            definition,
            periods,
            &[format!("{}_unresolved", definition.key)],
        );
    };
    // ... and the (possibly previously derived) SG&A row.
    let Some(sga_row) = find_surface_row(income_surface_rows, "selling_general_and_administrative")
    else {
        return unresolved_row(
            definition,
            periods,
            &[format!("{}_unresolved", definition.key)],
        );
    };
    let component_rows = collect_residual_component_rows(
        income_surface_rows,
        bridge_rows,
        "other_operating_expense",
    );
    let mut values = BTreeMap::<String, Option<f64>>::new();
    let mut suppressed_zero_residual = false;
    let mut used_sources = Vec::<ValueSource>::new();
    for period in periods {
        // A period with no anchor value cannot produce a residual.
        let Some(operating_value) =
            residual_anchor_value_for_period(operating_expenses, &period.id)
        else {
            values.insert(period.id.clone(), None);
            continue;
        };
        let Some(sga_value) = surface_row_value_for_period(sga_row, &period.id) else {
            values.insert(period.id.clone(), None);
            continue;
        };
        // Only components with a directly-reported value in THIS period are
        // subtracted (and later credited as sources).
        let period_components = component_rows
            .iter()
            .filter(|row| non_formula_value_for_period(row, &period.id).is_some())
            .copied()
            .collect::<Vec<_>>();
        let component_total = period_components
            .iter()
            .filter_map(|row| non_formula_value_for_period(row, &period.id))
            .sum::<f64>();
        let residual = operating_value - sga_value - component_total;
        let period_value = if residual.abs() <= RESIDUAL_TOLERANCE || residual <= 0.0 {
            // Near-zero suppressions are remembered so the null row can explain
            // itself; strictly negative residuals are dropped silently here.
            if residual.abs() <= RESIDUAL_TOLERANCE {
                suppressed_zero_residual = true;
            }
            None
        } else {
            used_sources.push(surface_source(operating_expenses));
            used_sources.push(surface_source(sga_row));
            used_sources.extend(period_components.into_iter().map(surface_source));
            Some(residual)
        };
        values.insert(period.id.clone(), period_value);
    }
    if !has_any_value(&values) {
        // All periods suppressed: emit an explicit low-confidence null row when
        // at least one suppression was a genuine zero residual.
        if suppressed_zero_residual {
            return ResolvedUniversalRow {
                row: null_surface_row(
                    definition,
                    periods,
                    None,
                    Some(Confidence::Low),
                    vec!["other_operating_expense_suppressed_zero_residual".to_string()],
                ),
                detail_rows: vec![],
                mapping_assignments: HashMap::new(),
                warning_codes: vec!["other_operating_expense_suppressed_zero_residual".to_string()],
                consumed_sources: ConsumedSources::default(),
            };
        }
        return unresolved_row(
            definition,
            periods,
            &[format!("{}_unresolved", definition.key)],
        );
    }
    let mut warning_codes = vec!["other_operating_expense_formula_derived".to_string()];
    if suppressed_zero_residual {
        warning_codes.push("other_operating_expense_suppressed_zero_residual".to_string());
    }
    build_formula_surface_row(definition, periods, values, &used_sources, warning_codes)
}
/// Packages residual-derived `values` into a formula-derived surface row,
/// merging the provenance of every contributing source.
///
/// The resulting row always carries `ResolutionMethod::FormulaDerived` with
/// medium confidence, has no detail rows, and resolves no concrete source row
/// for any period.
fn build_formula_surface_row(
    definition: &UniversalIncomeDefinition,
    periods: &[PeriodOutput],
    values: BTreeMap<String, Option<f64>>,
    source_rows: &[ValueSource],
    warning_codes: Vec<String>,
) -> ResolvedUniversalRow {
    // Collapse all contributing sources into one provenance record.
    let merged = merge_value_sources(source_rows);
    // Formula rows never map to a concrete source row per period.
    let resolved_source_row_keys = periods
        .iter()
        .map(|period| (period.id.clone(), None))
        .collect();
    let row = SurfaceRowOutput {
        key: definition.key.clone(),
        label: definition.label.clone(),
        category: definition.category.clone(),
        template_section: definition.category.clone(),
        order: definition.order,
        unit: definition.unit.clone(),
        values,
        source_concepts: merged.source_concepts,
        source_row_keys: merged.source_row_keys,
        source_fact_ids: merged.source_fact_ids,
        formula_key: Some(definition.key.clone()),
        has_dimensions: merged.has_dimensions,
        resolved_source_row_keys,
        statement: Some(definition.statement.clone()),
        detail_count: Some(0),
        resolution_method: Some(ResolutionMethod::FormulaDerived.as_str().to_string()),
        confidence: Some(Confidence::Medium.as_str().to_string()),
        // Warnings appear both on the row and on the resolution wrapper.
        warning_codes: warning_codes.clone(),
    };
    ResolvedUniversalRow {
        row,
        detail_rows: Vec::new(),
        mapping_assignments: HashMap::new(),
        warning_codes,
        consumed_sources: ConsumedSources::default(),
    }
}
fn build_direct_row(
definition: &UniversalIncomeDefinition,
periods: &[PeriodOutput],
@@ -814,6 +1128,112 @@ fn collect_group_sources<'a>(
(sources, rows)
}
/// Returns the first surface row whose key equals `key`, if any.
fn find_surface_row<'a>(
    income_surface_rows: &'a [SurfaceRowOutput],
    key: &str,
) -> Option<&'a SurfaceRowOutput> {
    income_surface_rows
        .iter()
        .find(|candidate| candidate.key.as_str() == key)
}
/// Reports whether the given period has an explicitly-filed match for the
/// bridge row's direct concepts — either a statement row with a value in that
/// period, or a standalone fact covering it.
///
/// Returns `false` when no bridge row is supplied.
fn has_explicit_direct_match_for_period(
    period: &PeriodOutput,
    income_statement_rows: &[StatementRowOutput],
    facts: &[FactOutput],
    bridge_row: Option<&IncomeBridgeRow>,
    crosswalk: Option<&CrosswalkFile>,
) -> bool {
    let Some(bridge) = bridge_row else {
        return false;
    };
    let authoritative = &bridge.direct_authoritative_concepts;
    let source = &bridge.direct_source_concepts;
    // First pass: statement rows that actually carry a value for this period.
    let row_match = income_statement_rows.iter().any(|row| {
        let has_period_value = row.values.get(&period.id).copied().flatten().is_some();
        has_period_value
            && (match_direct_authoritative(row, authoritative, crosswalk).is_some()
                || match_direct_source(row, source, crosswalk).is_some())
    });
    if row_match {
        return true;
    }
    // Second pass: loose facts whose reporting period covers this one.
    facts.iter().any(|fact| {
        fact_matches_period(fact, period)
            && (match_direct_fact_authoritative(fact, authoritative, crosswalk).is_some()
                || match_direct_fact_source(fact, source, crosswalk).is_some())
    })
}
/// Collects the surface rows that may legitimately be subtracted from
/// `operating_expenses` when deriving `target_key` as a residual.
///
/// A row qualifies when it is (a) not the target itself, (b) not one of the
/// aggregate/excluded keys, (c) either a common operating component or a
/// declared positive component surface of the `operating_expenses` bridge row,
/// and (d) not itself formula-derived, so residual math never consumes values
/// that were themselves inferred.
fn collect_residual_component_rows<'a>(
    income_surface_rows: &'a [SurfaceRowOutput],
    bridge_rows: &HashMap<String, IncomeBridgeRow>,
    target_key: &str,
) -> Vec<&'a SurfaceRowOutput> {
    // Borrow the key strings instead of cloning each into an owned String;
    // the set only lives for the duration of this call.
    let mut allowed_surface_keys: HashSet<&str> =
        COMMON_OPERATING_COMPONENT_KEYS.iter().copied().collect();
    if let Some(operating_bridge) = bridge_rows.get("operating_expenses") {
        allowed_surface_keys.extend(
            operating_bridge
                .component_surfaces
                .positive
                .iter()
                .map(String::as_str),
        );
    }
    income_surface_rows
        .iter()
        .filter(|row| row.key != target_key)
        .filter(|row| !EXCLUDED_OPERATING_RESIDUAL_KEYS.contains(&row.key.as_str()))
        .filter(|row| allowed_surface_keys.contains(row.key.as_str()))
        .filter(|row| !surface_row_is_formula_derived(row))
        .collect()
}
/// Returns the row's value for `period_id` if the row is trustworthy enough to
/// anchor a residual computation.
///
/// Not-meaningful rows never qualify; formula-derived rows qualify only when
/// their confidence is high.
fn residual_anchor_value_for_period(row: &SurfaceRowOutput, period_id: &str) -> Option<f64> {
    let method = row.resolution_method.as_deref().unwrap_or_default();
    if method == ResolutionMethod::NotMeaningful.as_str() {
        return None;
    }
    let confidence = row.confidence.as_deref().unwrap_or_default();
    let low_trust_formula = method == ResolutionMethod::FormulaDerived.as_str()
        && confidence != Confidence::High.as_str();
    if low_trust_formula {
        return None;
    }
    surface_row_value_for_period(row, period_id)
}
/// Looks up the row's value for one period; `None` when the period is absent
/// or carries no value.
fn surface_row_value_for_period(row: &SurfaceRowOutput, period_id: &str) -> Option<f64> {
    row.values.get(period_id).and_then(|value| *value)
}
/// Like `surface_row_value_for_period`, but refuses formula-derived rows so
/// residual math never consumes values that were themselves inferred.
fn non_formula_value_for_period(row: &SurfaceRowOutput, period_id: &str) -> Option<f64> {
    if surface_row_is_formula_derived(row) {
        None
    } else {
        surface_row_value_for_period(row, period_id)
    }
}
/// True when the source reports a concrete value for at least one period.
fn non_formula_value_source_for_any_period(source: &ValueSource) -> bool {
    source
        .values
        .iter()
        .any(|(period_id, _)| non_formula_value_from_source_for_period(source, period_id).is_some())
}
/// Extracts the source's value for one period, flattening the absent-period
/// and absent-value cases to `None`.
fn non_formula_value_from_source_for_period(source: &ValueSource, period_id: &str) -> Option<f64> {
    match source.values.get(period_id) {
        Some(Some(value)) => Some(*value),
        _ => None,
    }
}
/// True when the row was produced by formula derivation rather than resolved
/// from filed data.
fn surface_row_is_formula_derived(row: &SurfaceRowOutput) -> bool {
    matches!(
        row.resolution_method.as_deref(),
        Some(method) if method == ResolutionMethod::FormulaDerived.as_str()
    )
}
fn resolve_component_surface_source(
surface_key: &str,
income_statement_rows: &[StatementRowOutput],
@@ -1628,12 +2048,17 @@ mod tests {
}
#[test]
fn derives_other_operating_expense_from_operating_expenses_minus_sga_and_missing_rnd() {
let rows = empty_rows();
fn prefers_direct_sga_over_residual_inference() {
let mut rows = empty_rows();
rows.get_mut("income").unwrap().push(row(
"sga-direct",
"us-gaap:SellingGeneralAndAdministrativeExpense",
70.0,
));
let mut model = empty_model();
model.surface_rows.get_mut("income").unwrap().extend([
surface_row("operating_expenses", 100.0),
surface_row("selling_general_and_administrative", 60.0),
surface_row("research_and_development", 30.0),
]);
apply_universal_income_rows(
@@ -1646,6 +2071,47 @@ mod tests {
)
.expect("universal income rows should build");
let sga = model
.surface_rows
.get("income")
.unwrap()
.iter()
.find(|row| row.key == "selling_general_and_administrative")
.unwrap();
assert_eq!(sga.values.get("p1").copied().flatten(), Some(70.0));
assert_eq!(sga.resolution_method.as_deref(), Some("direct"));
assert!(!sga.warning_codes.contains(
&"selling_general_and_administrative_residual_from_operating_expenses".to_string()
));
}
#[test]
fn derives_sga_from_operating_expenses_minus_research_and_development() {
let rows = empty_rows();
let mut model = empty_model();
model.surface_rows.get_mut("income").unwrap().extend([
surface_row("operating_expenses", 100.0),
surface_row("research_and_development", 40.0),
]);
apply_universal_income_rows(
&[period("p1")],
&rows,
&[],
"us-gaap",
FiscalPack::Core,
&mut model,
)
.expect("universal income rows should build");
let sga = model
.surface_rows
.get("income")
.unwrap()
.iter()
.find(|row| row.key == "selling_general_and_administrative")
.unwrap();
let other = model
.surface_rows
.get("income")
@@ -1654,12 +2120,134 @@ mod tests {
.find(|row| row.key == "other_operating_expense")
.unwrap();
assert_eq!(other.values.get("p1").copied().flatten(), Some(40.0));
assert_eq!(sga.values.get("p1").copied().flatten(), Some(60.0));
assert_eq!(
other.formula_key.as_deref(),
Some("other_operating_expense")
sga.warning_codes,
vec!["selling_general_and_administrative_residual_from_operating_expenses".to_string()]
);
assert_eq!(other.resolution_method.as_deref(), Some("formula_derived"));
assert_eq!(other.values.get("p1").copied().flatten(), None);
assert!(other
.warning_codes
.contains(&"other_operating_expense_suppressed_zero_residual".to_string()));
}
#[test]
// SG&A is derived first (operating 100 - R&D 25 = 75); the subsequent
// other_operating_expense residual (100 - 75 - 25 - 15 = -15) is non-positive
// and must be suppressed rather than emitted.
fn derives_sga_from_operating_expenses_before_other_operating_expense_residual() {
    let rows = empty_rows();
    let mut model = empty_model();
    model.surface_rows.get_mut("income").unwrap().extend([
        surface_row("operating_expenses", 100.0),
        surface_row("research_and_development", 25.0),
        surface_row("depreciation_and_amortization", 15.0),
    ]);
    apply_universal_income_rows(
        &[period("p1")],
        &rows,
        &[],
        "us-gaap",
        FiscalPack::Core,
        &mut model,
    )
    .expect("universal income rows should build");
    let sga = model
        .surface_rows
        .get("income")
        .unwrap()
        .iter()
        .find(|row| row.key == "selling_general_and_administrative")
        .unwrap();
    let other = model
        .surface_rows
        .get("income")
        .unwrap()
        .iter()
        .find(|row| row.key == "other_operating_expense")
        .unwrap();
    assert_eq!(sga.values.get("p1").copied().flatten(), Some(75.0));
    assert_eq!(other.values.get("p1").copied().flatten(), None);
}
#[test]
// R&D (120) exceeds operating expenses (100), so the SG&A residual is negative:
// no value may be emitted, and both the invalid-residual and unresolved
// warning codes must be present.
fn leaves_sga_unresolved_when_operating_expense_residual_is_negative() {
    let rows = empty_rows();
    let mut model = empty_model();
    model.surface_rows.get_mut("income").unwrap().extend([
        surface_row("operating_expenses", 100.0),
        surface_row("research_and_development", 120.0),
    ]);
    apply_universal_income_rows(
        &[period("p1")],
        &rows,
        &[],
        "us-gaap",
        FiscalPack::Core,
        &mut model,
    )
    .expect("universal income rows should build");
    let sga = model
        .surface_rows
        .get("income")
        .unwrap()
        .iter()
        .find(|row| row.key == "selling_general_and_administrative")
        .unwrap();
    assert_eq!(sga.values.get("p1").copied().flatten(), None);
    assert!(sga
        .warning_codes
        .contains(&"selling_general_and_administrative_negative_residual_invalid".to_string()));
    assert!(sga
        .warning_codes
        .contains(&"selling_general_and_administrative_unresolved".to_string()));
}
#[test]
// When the filer explicitly reports OtherOperatingExpense, the direct fact must
// win: no SG&A residual is inferred for that period, and other_operating_expense
// resolves directly to the filed value.
fn keeps_explicit_other_operating_expense_direct() {
    let mut rows = empty_rows();
    rows.get_mut("income").unwrap().push(row(
        "other-expense",
        "us-gaap:OtherOperatingExpense",
        12.0,
    ));
    let mut model = empty_model();
    model.surface_rows.get_mut("income").unwrap().extend([
        surface_row("operating_expenses", 100.0),
        surface_row("research_and_development", 40.0),
    ]);
    apply_universal_income_rows(
        &[period("p1")],
        &rows,
        &[],
        "us-gaap",
        FiscalPack::Core,
        &mut model,
    )
    .expect("universal income rows should build");
    let sga = model
        .surface_rows
        .get("income")
        .unwrap()
        .iter()
        .find(|row| row.key == "selling_general_and_administrative")
        .unwrap();
    let other = model
        .surface_rows
        .get("income")
        .unwrap()
        .iter()
        .find(|row| row.key == "other_operating_expense")
        .unwrap();
    assert_eq!(sga.values.get("p1").copied().flatten(), None);
    assert_eq!(other.values.get("p1").copied().flatten(), Some(12.0));
    assert_eq!(other.resolution_method.as_deref(), Some("direct"));
}
#[test]

View File

@@ -5,6 +5,7 @@ import {
normalizeFilingTaxonomySnapshotPayload,
upsertFilingTaxonomySnapshot
} from '@/lib/server/repos/filing-taxonomy';
import { getIssuerOverlay } from '@/lib/server/repos/issuer-overlays';
type ScriptOptions = {
apply: boolean;
@@ -22,6 +23,17 @@ type ScriptSummary = {
failed: number;
};
// Per-ticker overlay state cached by the backfill loop: the active revision's
// definition JSON (null when the issuer has no active overlay) plus the
// revision id used to detect stale snapshots.
// NOTE(review): the nested conditional type unwraps `definition_json` out of
// `getIssuerOverlay`'s `active_revision` — confirm it resolves to the intended
// payload type rather than collapsing to `null`.
type ActiveOverlayState = {
  definition: Awaited<ReturnType<typeof getIssuerOverlay>> extends infer T
    ? T extends { active_revision: infer R | null }
      ? R extends { definition_json: infer D }
        ? D | null
        : null
      : null
    : null;
  revisionId: number | null;
};
type FilingRow = {
id: number;
ticker: string;
@@ -36,6 +48,28 @@ type FilingRow = {
const REQUEST_DELAY_MS = 120;
/**
 * Loads (and memoizes per normalized ticker) the active issuer-overlay state.
 *
 * The in-flight promise itself is cached so concurrent callers for the same
 * ticker share one lookup. If the lookup rejects, the cache entry is evicted
 * so a later filing for the same ticker can retry instead of replaying the
 * same cached rejection for the rest of the run.
 */
async function loadActiveOverlayState(
  ticker: string,
  cache: Map<string, Promise<ActiveOverlayState>>,
): Promise<ActiveOverlayState> {
  const normalizedTicker = ticker.trim().toUpperCase();
  const existing = cache.get(normalizedTicker);
  if (existing) {
    return await existing;
  }
  const pending = (async (): Promise<ActiveOverlayState> => {
    const overlay = await getIssuerOverlay(normalizedTicker);
    return {
      definition: overlay?.active_revision?.definition_json ?? null,
      revisionId: overlay?.active_revision_id ?? null
    };
  })();
  cache.set(normalizedTicker, pending);
  // Evict on failure so a transient error does not poison the cache; callers
  // awaiting `pending` still observe the original rejection.
  pending.catch(() => cache.delete(normalizedTicker));
  return await pending;
}
function parseOptions(argv: string[]): ScriptOptions {
const options: ScriptOptions = {
apply: false,
@@ -147,6 +181,7 @@ async function loadFilings(options: ScriptOptions): Promise<FilingRow[]> {
async function runBackfill(options: ScriptOptions): Promise<ScriptSummary> {
const rows = await loadFilings(options);
const overlayCache = new Map<string, Promise<ActiveOverlayState>>();
const summary: ScriptSummary = {
scanned: 0,
wouldWrite: 0,
@@ -164,8 +199,13 @@ async function runBackfill(options: ScriptOptions): Promise<ScriptSummary> {
summary.scanned += 1;
console.log(`[backfill-taxonomy-snapshots] [${index + 1}/${rows.length}] ${row.ticker} ${row.filingType} ${row.filingDate} ${row.accessionNumber}`);
const activeOverlay = await loadActiveOverlayState(row.ticker, overlayCache);
const existing = await getFilingTaxonomySnapshotByFilingId(row.id);
const isFresh = existing && Date.parse(existing.updated_at) >= Date.parse(row.updatedAt);
const isFresh = Boolean(
existing
&& Date.parse(existing.updated_at) >= Date.parse(row.updatedAt)
&& (existing.issuer_overlay_revision_id ?? null) === activeOverlay.revisionId
);
if (isFresh && !options.refresh) {
summary.skippedFresh += 1;
@@ -181,7 +221,8 @@ async function runBackfill(options: ScriptOptions): Promise<ScriptSummary> {
filingDate: row.filingDate,
filingType: row.filingType,
filingUrl: row.filingUrl,
primaryDocument: row.primaryDocument
primaryDocument: row.primaryDocument,
issuerOverlay: activeOverlay.definition
});
summary.wouldWrite += 1;
@@ -189,6 +230,7 @@ async function runBackfill(options: ScriptOptions): Promise<ScriptSummary> {
if (options.apply) {
const normalizedSnapshot = {
...snapshot,
issuer_overlay_revision_id: activeOverlay.revisionId,
...normalizeFilingTaxonomySnapshotPayload(snapshot)
};
await upsertFilingTaxonomySnapshot(normalizedSnapshot);