Integrate crabrl parser into taxonomy hydration

This commit is contained in:
2026-03-16 15:18:01 -04:00
parent cf084793ed
commit a58b07456e
23 changed files with 4696 additions and 2466 deletions

View File

@@ -0,0 +1,231 @@
use anyhow::{Context, Result};
use crabrl::{Document, FactValue, Measure, Parser, Period, UnitType};
use serde_json::json;
use crate::{
is_xbrl_infrastructure_prefix, ContextOutput, DimensionOutput, ParsedFact, ParsedInstance,
};
/// Parse a raw XBRL instance document with the crabrl parser and convert the
/// result into this crate's `ParsedInstance` representation.
///
/// `source_file` is threaded through to every emitted fact for provenance.
///
/// # Errors
/// Returns an error when crabrl cannot parse `raw`.
pub(crate) fn parse_xbrl_instance(
    raw: &str,
    source_file: Option<String>,
) -> Result<ParsedInstance> {
    let parser = Parser::new();
    let document = parser
        .parse_bytes(raw.as_bytes())
        .context("crabrl failed to parse XBRL instance")?;
    let contexts = build_contexts(&document);
    let facts = build_facts(&document, source_file);
    Ok(ParsedInstance { contexts, facts })
}
/// Convert every crabrl context in the document into a `ContextOutput` row.
fn build_contexts(document: &Document) -> Vec<ContextOutput> {
    let mut outputs = Vec::with_capacity(document.contexts.len());
    for context in &document.contexts {
        let (period_start, period_end, period_instant) = convert_period(&context.period);
        outputs.push(ContextOutput {
            context_id: context.id.to_string(),
            entity_identifier: Some(context.entity.identifier.to_string()),
            entity_scheme: Some(context.entity.scheme.to_string()),
            period_start,
            period_end,
            period_instant,
            segment_json: context.entity.segment.as_ref().map(segment_to_json),
            scenario_json: context.scenario.as_ref().map(scenario_to_json),
        });
    }
    outputs
}
/// Convert crabrl's columnar fact storage into `ParsedFact` rows.
///
/// A fact is emitted only when:
/// - its concept QName has a `prefix:local` shape,
/// - the prefix is not XBRL infrastructure (xbrli, link, ...),
/// - its value is numeric (see `numeric_fact_value`), and
/// - its context id resolves to a known context.
fn build_facts(document: &Document, source_file: Option<String>) -> Vec<ParsedFact> {
document
.facts
.concept_ids
.iter()
.enumerate()
.filter_map(|(index, concept_id)| {
// `index` addresses the parallel per-fact arrays
// (values, context_ids, decimals, unit_ids).
let qname = document
.concept_names
.get(*concept_id as usize)?
.to_string();
let (prefix, local_name) = split_qname(&qname)?;
if is_xbrl_infrastructure_prefix(&prefix) {
return None;
}
// Non-numeric fact values are skipped entirely.
let value = numeric_fact_value(document.facts.values.get(index)?)?;
let context = document
.contexts
.get(*document.facts.context_ids.get(index)? as usize)?;
// Fall back to a synthetic URN when the prefix has no declared namespace.
let namespace_uri = document
.namespaces
.get(prefix.as_str())
.map(|value| value.to_string())
.unwrap_or_else(|| format!("urn:unknown:{prefix}"));
let (period_start, period_end, period_instant) = convert_period(&context.period);
let dimensions = context_dimensions(context);
Some(ParsedFact {
concept_key: format!("{namespace_uri}#{local_name}"),
qname,
namespace_uri,
local_name,
data_type: None,
context_id: context.id.to_string(),
unit: unit_for_fact(document, index),
decimals: document
.facts
.decimals
.get(index)
.and_then(|value| value.map(|entry| entry.to_string())),
precision: None,
// NOTE(review): `numeric_fact_value` returns None for `FactValue::Nil`,
// so nil facts are filtered out above and this flag appears to always
// be false here — confirm whether nil facts should be emitted at all.
nil: matches!(document.facts.values.get(index), Some(FactValue::Nil)),
value,
period_start,
period_end,
period_instant,
is_dimensionless: dimensions.is_empty(),
dimensions,
source_file: source_file.clone(),
})
})
.collect()
}
/// Extract a numeric value from a fact, widening integers to `f64`.
/// Every other variant (including nil/text values) yields `None`.
fn numeric_fact_value(value: &FactValue) -> Option<f64> {
    if let FactValue::Decimal(number) = value {
        return Some(*number);
    }
    if let FactValue::Integer(number) = value {
        return Some(*number as f64);
    }
    None
}
/// Split a `prefix:localName` QName into trimmed, owned `(prefix, local)`
/// parts. Returns `None` when there is no colon, or when either side is
/// blank after trimming. Only the first colon splits; any later colons stay
/// in the local name.
fn split_qname(qname: &str) -> Option<(String, String)> {
    match qname.split_once(':') {
        Some((raw_prefix, raw_local)) => {
            let prefix = raw_prefix.trim();
            let local_name = raw_local.trim();
            if prefix.is_empty() || local_name.is_empty() {
                None
            } else {
                Some((prefix.to_string(), local_name.to_string()))
            }
        }
        None => None,
    }
}
/// Flatten a crabrl period into `(start, end, instant)` string options.
/// `Forever` carries no dates and maps to three `None`s.
fn convert_period(period: &Period) -> (Option<String>, Option<String>, Option<String>) {
    let mut start = None;
    let mut end = None;
    let mut instant = None;
    match period {
        Period::Instant { date } => instant = Some(date.to_string()),
        Period::Duration { start: from, end: to } => {
            start = Some(from.to_string());
            end = Some(to.to_string());
        }
        Period::Forever => {}
    }
    (start, end, instant)
}
/// Collect the explicit dimension members from a context's segment and
/// scenario (segment members first) as `DimensionOutput` pairs.
fn context_dimensions(context: &crabrl::Context) -> Vec<DimensionOutput> {
    let segment_members = context
        .entity
        .segment
        .as_ref()
        .into_iter()
        .flat_map(|segment| segment.explicit_members.iter());
    let scenario_members = context
        .scenario
        .as_ref()
        .into_iter()
        .flat_map(|scenario| scenario.explicit_members.iter());
    segment_members
        .chain(scenario_members)
        .map(|member| DimensionOutput {
            axis: member.dimension.to_string(),
            member: member.member.to_string(),
        })
        .collect()
}
/// Resolve the unit string for the fact at `fact_index`, if it has one.
fn unit_for_fact(document: &Document, fact_index: usize) -> Option<String> {
let unit_id = *document.facts.unit_ids.get(fact_index)?;
// NOTE(review): assumes crabrl uses unit id 0 as a "no unit" sentinel and
// stores 1-based ids into `document.units` — confirm against crabrl docs.
if unit_id == 0 {
return None;
}
document
.units
.get((unit_id - 1) as usize)
.map(|unit| unit_type_to_string(&unit.unit_type))
}
/// Render a crabrl unit as a compact string, e.g. `usd`, `usd/shares`, or
/// `a*b/c` for divide units with multiple measures on either side.
fn unit_type_to_string(unit_type: &UnitType) -> String {
    match unit_type {
        UnitType::Simple(measures) => join_measures(measures, "/"),
        UnitType::Multiply(measures) => join_measures(measures, "*"),
        UnitType::Divide { numerator, denominator } => {
            let top = join_measures(numerator, "*");
            let bottom = join_measures(denominator, "*");
            format!("{top}/{bottom}")
        }
    }
}
/// Render each measure and join the results with `separator`.
fn join_measures(measures: &[Measure], separator: &str) -> String {
    let rendered: Vec<String> = measures.iter().map(measure_to_string).collect();
    rendered.join(separator)
}
/// Render a measure as `namespace:name`, or just `name` when the namespace
/// is empty.
fn measure_to_string(measure: &Measure) -> String {
    if measure.namespace.is_empty() {
        return measure.name.to_string();
    }
    format!("{}:{}", measure.namespace, measure.name)
}
/// Serialize a segment's explicit and typed members into the JSON shape
/// stored on `ContextOutput::segment_json`.
fn segment_to_json(segment: &crabrl::Segment) -> serde_json::Value {
    let explicit: Vec<serde_json::Value> = segment
        .explicit_members
        .iter()
        .map(|member| {
            json!({
                "axis": member.dimension.to_string(),
                "member": member.member.to_string(),
            })
        })
        .collect();
    let typed: Vec<serde_json::Value> = segment
        .typed_members
        .iter()
        .map(|member| {
            json!({
                "axis": member.dimension.to_string(),
                "value": member.value.to_string(),
            })
        })
        .collect();
    json!({
        "explicitMembers": explicit,
        "typedMembers": typed,
    })
}
/// Serialize a scenario's explicit and typed members into the JSON shape
/// stored on `ContextOutput::scenario_json`.
fn scenario_to_json(scenario: &crabrl::Scenario) -> serde_json::Value {
    let explicit: Vec<serde_json::Value> = scenario
        .explicit_members
        .iter()
        .map(|member| {
            json!({
                "axis": member.dimension.to_string(),
                "member": member.member.to_string(),
            })
        })
        .collect();
    let typed: Vec<serde_json::Value> = scenario
        .typed_members
        .iter()
        .map(|member| {
            json!({
                "axis": member.dimension.to_string(),
                "value": member.value.to_string(),
            })
        })
        .collect();
    json!({
        "explicitMembers": explicit,
        "typedMembers": typed,
    })
}

View File

@@ -9,6 +9,7 @@ use std::collections::{BTreeMap, HashMap, HashSet};
use std::sync::Mutex;
use std::time::{Duration, Instant};
mod crabrl_adapter;
mod kpi_mapper;
mod metrics;
mod pack_selector;
@@ -54,44 +55,6 @@ where
fetch_fn()
}
static CONTEXT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?context>"#).unwrap()
});
static UNIT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?unit\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?unit>"#).unwrap()
});
static FACT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<([a-zA-Z0-9_\-]+):([a-zA-Z0-9_\-.]+)\b([^>]*\bcontextRef=["'][^"']+["'][^>]*)>(.*?)</[a-zA-Z0-9_\-]+:[a-zA-Z0-9_\-.]+>"#).unwrap()
});
static EXPLICIT_MEMBER_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?explicitMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?explicitMember>"#).unwrap()
});
static TYPED_MEMBER_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?typedMember\b[^>]*\bdimension=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?typedMember>"#).unwrap()
});
static IDENTIFIER_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?identifier\b[^>]*\bscheme=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?identifier>"#).unwrap()
});
static SEGMENT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?segment\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?segment>"#)
.unwrap()
});
static SCENARIO_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?scenario\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?scenario>"#)
.unwrap()
});
static START_DATE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?startDate>(.*?)</(?:[a-z0-9_\-]+:)?startDate>"#).unwrap()
});
static END_DATE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?endDate>(.*?)</(?:[a-z0-9_\-]+:)?endDate>"#).unwrap()
});
static INSTANT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?instant>(.*?)</(?:[a-z0-9_\-]+:)?instant>"#).unwrap()
});
static MEASURE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?measure>(.*?)</(?:[a-z0-9_\-]+:)?measure>"#).unwrap()
});
static LABEL_LINK_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?labelLink>"#)
.unwrap()
@@ -465,25 +428,7 @@ pub type SurfaceRowMap = BTreeMap<String, Vec<SurfaceRowOutput>>;
pub type DetailRowStatementMap = BTreeMap<String, BTreeMap<String, Vec<DetailRowOutput>>>;
#[derive(Debug, Clone)]
struct ParsedContext {
id: String,
entity_identifier: Option<String>,
entity_scheme: Option<String>,
period_start: Option<String>,
period_end: Option<String>,
period_instant: Option<String>,
dimensions: Vec<DimensionOutput>,
segment: Option<serde_json::Value>,
scenario: Option<serde_json::Value>,
}
#[derive(Debug, Clone)]
struct ParsedUnit {
measure: Option<String>,
}
#[derive(Debug, Clone)]
struct ParsedFact {
pub(crate) struct ParsedFact {
concept_key: String,
qname: String,
namespace_uri: String,
@@ -593,7 +538,8 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
);
}
let parsed_instance = parse_xbrl_instance(&instance_text, Some(instance_asset.name.clone()));
let parsed_instance = parse_xbrl_instance(&instance_text, Some(instance_asset.name.clone()))
.context("parse failed for XBRL instance")?;
let mut label_by_concept = HashMap::new();
let mut presentation = Vec::new();
@@ -1144,114 +1090,13 @@ fn validate_xbrl_structure(xml: &str, source_file: Option<&str>) -> XbrlValidati
}
}
struct ParsedInstance {
pub(crate) struct ParsedInstance {
contexts: Vec<ContextOutput>,
facts: Vec<ParsedFact>,
}
fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> ParsedInstance {
let namespaces = parse_namespace_map(raw, "xbrl");
let context_by_id = parse_contexts(raw);
let unit_by_id = parse_units(raw);
let mut facts = Vec::new();
for captures in FACT_RE.captures_iter(raw) {
let prefix = captures
.get(1)
.map(|value| value.as_str().trim())
.unwrap_or_default();
let local_name = captures
.get(2)
.map(|value| value.as_str().trim())
.unwrap_or_default();
let attrs = captures
.get(3)
.map(|value| value.as_str())
.unwrap_or_default();
let body = decode_xml_entities(
captures
.get(4)
.map(|value| value.as_str())
.unwrap_or_default()
.trim(),
);
if prefix.is_empty() || local_name.is_empty() || is_xbrl_infrastructure_prefix(prefix) {
continue;
}
let attr_map = parse_attrs(attrs);
let Some(context_id) = attr_map
.get("contextRef")
.cloned()
.or_else(|| attr_map.get("contextref").cloned())
else {
continue;
};
let Some(value) = parse_number(&body) else {
continue;
};
let namespace_uri = namespaces
.get(prefix)
.cloned()
.unwrap_or_else(|| format!("urn:unknown:{prefix}"));
let context = context_by_id.get(&context_id);
let unit_ref = attr_map
.get("unitRef")
.cloned()
.or_else(|| attr_map.get("unitref").cloned());
let unit = unit_ref
.as_ref()
.and_then(|unit_ref| unit_by_id.get(unit_ref))
.and_then(|unit| unit.measure.clone())
.or(unit_ref);
facts.push(ParsedFact {
concept_key: format!("{namespace_uri}#{local_name}"),
qname: format!("{prefix}:{local_name}"),
namespace_uri,
local_name: local_name.to_string(),
data_type: None,
context_id: context_id.clone(),
unit,
decimals: attr_map.get("decimals").cloned(),
precision: attr_map.get("precision").cloned(),
nil: attr_map
.get("xsi:nil")
.or_else(|| attr_map.get("nil"))
.map(|value| value.eq_ignore_ascii_case("true"))
.unwrap_or(false),
value,
period_start: context.and_then(|value| value.period_start.clone()),
period_end: context.and_then(|value| value.period_end.clone()),
period_instant: context.and_then(|value| value.period_instant.clone()),
dimensions: context
.map(|value| value.dimensions.clone())
.unwrap_or_default(),
is_dimensionless: context
.map(|value| value.dimensions.is_empty())
.unwrap_or(true),
source_file: source_file.clone(),
});
}
let contexts = context_by_id
.values()
.map(|context| ContextOutput {
context_id: context.id.clone(),
entity_identifier: context.entity_identifier.clone(),
entity_scheme: context.entity_scheme.clone(),
period_start: context.period_start.clone(),
period_end: context.period_end.clone(),
period_instant: context.period_instant.clone(),
segment_json: context.segment.clone(),
scenario_json: context.scenario.clone(),
})
.collect::<Vec<_>>();
ParsedInstance { contexts, facts }
fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> Result<ParsedInstance> {
crabrl_adapter::parse_xbrl_instance(raw, source_file)
}
fn parse_namespace_map(raw: &str, root_tag_hint: &str) -> HashMap<String, String> {
@@ -1277,173 +1122,7 @@ fn parse_namespace_map(raw: &str, root_tag_hint: &str) -> HashMap<String, String
map
}
fn parse_contexts(raw: &str) -> HashMap<String, ParsedContext> {
let mut contexts = HashMap::new();
for captures in CONTEXT_RE.captures_iter(raw) {
let Some(context_id) = captures
.get(1)
.map(|value| value.as_str().trim().to_string())
else {
continue;
};
let block = captures
.get(2)
.map(|value| value.as_str())
.unwrap_or_default();
let (entity_identifier, entity_scheme) = IDENTIFIER_RE
.captures(block)
.map(|captures| {
(
captures
.get(2)
.map(|value| decode_xml_entities(value.as_str().trim())),
captures
.get(1)
.map(|value| decode_xml_entities(value.as_str().trim())),
)
})
.unwrap_or((None, None));
let period_start = START_DATE_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| decode_xml_entities(value.as_str().trim()));
let period_end = END_DATE_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| decode_xml_entities(value.as_str().trim()));
let period_instant = INSTANT_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| decode_xml_entities(value.as_str().trim()));
let segment = SEGMENT_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| parse_dimension_container(value.as_str()));
let scenario = SCENARIO_RE
.captures(block)
.and_then(|captures| captures.get(1))
.map(|value| parse_dimension_container(value.as_str()));
let mut dimensions = Vec::new();
if let Some(segment_value) = segment.as_ref() {
if let Some(members) = segment_value
.get("explicitMembers")
.and_then(|value| value.as_array())
{
for member in members {
if let (Some(axis), Some(member_value)) = (
member.get("axis").and_then(|value| value.as_str()),
member.get("member").and_then(|value| value.as_str()),
) {
dimensions.push(DimensionOutput {
axis: axis.to_string(),
member: member_value.to_string(),
});
}
}
}
}
if let Some(scenario_value) = scenario.as_ref() {
if let Some(members) = scenario_value
.get("explicitMembers")
.and_then(|value| value.as_array())
{
for member in members {
if let (Some(axis), Some(member_value)) = (
member.get("axis").and_then(|value| value.as_str()),
member.get("member").and_then(|value| value.as_str()),
) {
dimensions.push(DimensionOutput {
axis: axis.to_string(),
member: member_value.to_string(),
});
}
}
}
}
contexts.insert(
context_id.clone(),
ParsedContext {
id: context_id,
entity_identifier,
entity_scheme,
period_start,
period_end,
period_instant,
dimensions,
segment,
scenario,
},
);
}
contexts
}
fn parse_dimension_container(raw: &str) -> serde_json::Value {
let explicit_members = EXPLICIT_MEMBER_RE
.captures_iter(raw)
.filter_map(|captures| {
Some(serde_json::json!({
"axis": decode_xml_entities(captures.get(1)?.as_str().trim()),
"member": decode_xml_entities(captures.get(2)?.as_str().trim())
}))
})
.collect::<Vec<_>>();
let typed_members = TYPED_MEMBER_RE
.captures_iter(raw)
.filter_map(|captures| {
Some(serde_json::json!({
"axis": decode_xml_entities(captures.get(1)?.as_str().trim()),
"value": decode_xml_entities(captures.get(2)?.as_str().trim())
}))
})
.collect::<Vec<_>>();
serde_json::json!({
"explicitMembers": explicit_members,
"typedMembers": typed_members
})
}
fn parse_units(raw: &str) -> HashMap<String, ParsedUnit> {
let mut units = HashMap::new();
for captures in UNIT_RE.captures_iter(raw) {
let Some(id) = captures
.get(1)
.map(|value| value.as_str().trim().to_string())
else {
continue;
};
let block = captures
.get(2)
.map(|value| value.as_str())
.unwrap_or_default();
let measures = MEASURE_RE
.captures_iter(block)
.filter_map(|captures| captures.get(1))
.map(|value| decode_xml_entities(value.as_str().trim()))
.filter(|value| !value.is_empty())
.collect::<Vec<_>>();
let measure = if measures.len() == 1 {
measures.first().cloned()
} else if measures.len() > 1 {
Some(measures.join("/"))
} else {
None
};
units.insert(id, ParsedUnit { measure });
}
units
}
fn is_xbrl_infrastructure_prefix(prefix: &str) -> bool {
pub(crate) fn is_xbrl_infrastructure_prefix(prefix: &str) -> bool {
matches!(
prefix.to_ascii_lowercase().as_str(),
"xbrli" | "xlink" | "link" | "xbrldi" | "xbrldt"
@@ -1474,25 +1153,6 @@ fn decode_xml_entities(value: &str) -> String {
.replace("&nbsp;", " ")
}
fn parse_number(raw: &str) -> Option<f64> {
let trimmed = raw.trim();
if trimmed.is_empty() || trimmed.chars().all(|char| char == '-') {
return None;
}
let negative = trimmed.starts_with('(') && trimmed.ends_with(')');
let normalized = Regex::new(r#"<[^>]+>"#)
.unwrap()
.replace_all(trimmed, " ")
.replace(',', "")
.replace('$', "")
.replace(['(', ')'], "")
.replace('\u{2212}', "-")
.split_whitespace()
.collect::<String>();
let parsed = normalized.parse::<f64>().ok()?;
Some(if negative { -parsed.abs() } else { parsed })
}
fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
let namespaces = parse_namespace_map(raw, "linkbase");
let mut preferred = HashMap::<String, (String, i64)>::new();
@@ -2543,7 +2203,8 @@ mod tests {
</xbrli:xbrl>
"#;
let parsed = parse_xbrl_instance(raw, Some("test.xml".to_string()));
let parsed = parse_xbrl_instance(raw, Some("test.xml".to_string()))
.expect("crabrl parser should parse test instance");
assert_eq!(parsed.facts.len(), 1);
assert_eq!(
parsed.facts[0].qname,

View File

@@ -3,8 +3,8 @@ use std::collections::{BTreeMap, HashMap, HashSet};
use crate::pack_selector::FiscalPack;
use crate::taxonomy_loader::{
load_crosswalk, load_surface_pack, CrosswalkFile, SurfaceDefinition, SurfaceFormula,
SurfaceFormulaOp, SurfaceSignTransform,
load_crosswalk, load_income_bridge, load_surface_pack, CrosswalkFile, IncomeBridgeFile,
IncomeBridgeRow, SurfaceDefinition, SurfaceFormula, SurfaceFormulaOp, SurfaceSignTransform,
};
use crate::{
ConceptOutput, DetailRowOutput, DetailRowStatementMap, FactOutput, NormalizationSummaryOutput,
@@ -114,6 +114,7 @@ pub fn build_compact_surface_model(
) -> Result<CompactSurfaceModel> {
let pack = load_surface_pack(fiscal_pack)?;
let crosswalk = load_crosswalk(taxonomy_regime)?;
let income_bridge = load_income_bridge(fiscal_pack).ok();
let mut surface_rows = empty_surface_row_map();
let mut detail_rows = empty_detail_row_map();
let mut concept_mappings = HashMap::<String, MappingAssignment>::new();
@@ -157,14 +158,20 @@ pub fn build_compact_surface_model(
.filter(|matched| matched.match_role == MatchRole::Detail)
.cloned()
.collect::<Vec<_>>();
let bridge_detail_matches = collect_income_bridge_detail_matches(
definition,
&rows,
crosswalk.as_ref(),
income_bridge.as_ref(),
);
let detail_matches = if definition.detail_grouping_policy == "group_all_children" {
if detail_component_matches.is_empty()
&& definition.rollup_policy == "aggregate_children"
{
let detail_matches =
merge_detail_matches(&detail_component_matches, &bridge_detail_matches);
if detail_matches.is_empty() && definition.rollup_policy == "aggregate_children" {
Vec::new()
} else {
detail_component_matches.clone()
detail_matches
}
} else {
Vec::new()
@@ -758,28 +765,123 @@ fn match_statement_row<'a>(
None
}
/// Find statement rows that belong to a surface row via the income-bridge
/// component groups, supplementing the direct crosswalk detail matches.
///
/// Applies only to income-statement surfaces configured with the
/// `aggregate_children` rollup and `group_all_children` detail-grouping
/// policies; otherwise returns an empty list. Rows with no value in any
/// period are skipped.
fn collect_income_bridge_detail_matches<'a>(
definition: &SurfaceDefinition,
rows: &'a [StatementRowOutput],
crosswalk: Option<&CrosswalkFile>,
income_bridge: Option<&IncomeBridgeFile>,
) -> Vec<MatchedStatementRow<'a>> {
if definition.statement != "income"
|| definition.rollup_policy != "aggregate_children"
|| definition.detail_grouping_policy != "group_all_children"
{
return Vec::new();
}
// No bridge file loaded, or no bridge row for this surface key: nothing to add.
let Some(bridge_row) =
income_bridge.and_then(|bridge| bridge.rows.get(&definition.surface_key))
else {
return Vec::new();
};
rows.iter()
.filter(|row| has_any_value(&row.values))
.filter_map(|row| match_income_bridge_detail_row(row, bridge_row, crosswalk))
.collect()
}
/// Match a single statement row against the bridge row's component concept
/// groups (positive and negative alike).
///
/// Candidates are compared against the row's QName, its local name, and —
/// when available — its authoritative concept key: the crosswalk mapping if
/// one exists, else the raw QName for non-extension rows.
/// Returns a detail-role match with rank 2, or `None` when nothing matches.
fn match_income_bridge_detail_row<'a>(
row: &'a StatementRowOutput,
bridge_row: &IncomeBridgeRow,
crosswalk: Option<&CrosswalkFile>,
) -> Option<MatchedStatementRow<'a>> {
let authoritative_concept_key = crosswalk
.and_then(|crosswalk| crosswalk.mappings.get(&row.qname))
.map(|mapping| mapping.authoritative_concept_key.clone())
.or_else(|| {
// Extension (company-specific) concepts get no authoritative key
// unless the crosswalk supplies one.
if !row.is_extension {
Some(row.qname.clone())
} else {
None
}
});
let matches_group = bridge_row
.component_concept_groups
.positive
.iter()
.chain(bridge_row.component_concept_groups.negative.iter())
.any(|group| {
group.concepts.iter().any(|candidate| {
candidate_matches(candidate, &row.qname)
|| candidate_matches(candidate, &row.local_name)
|| authoritative_concept_key
.as_ref()
.map(|concept| candidate_matches(candidate, concept))
.unwrap_or(false)
})
});
if !matches_group {
return None;
}
Some(MatchedStatementRow {
row,
authoritative_concept_key,
mapping_method: MappingMethod::AggregateChildren,
match_role: MatchRole::Detail,
rank: 2,
})
}
fn merge_detail_matches<'a>(
direct_matches: &[MatchedStatementRow<'a>],
bridge_matches: &[MatchedStatementRow<'a>],
) -> Vec<MatchedStatementRow<'a>> {
let mut merged = HashMap::<String, MatchedStatementRow<'a>>::new();
for matched in direct_matches.iter().chain(bridge_matches.iter()) {
merged
.entry(matched.row.key.clone())
.and_modify(|existing| {
if compare_statement_matches(matched, existing).is_lt() {
*existing = matched.clone();
}
})
.or_insert_with(|| matched.clone());
}
merged.into_values().collect()
}
fn pick_best_match<'a>(matches: &'a [MatchedStatementRow<'a>]) -> &'a MatchedStatementRow<'a> {
matches
.iter()
.min_by(|left, right| {
left.rank
.cmp(&right.rank)
.then_with(|| {
let left_dimension_rank = if left.row.has_dimensions { 1 } else { 0 };
let right_dimension_rank = if right.row.has_dimensions { 1 } else { 0 };
left_dimension_rank.cmp(&right_dimension_rank)
})
.then_with(|| left.row.order.cmp(&right.row.order))
.then_with(|| {
max_abs_value(&right.row.values)
.partial_cmp(&max_abs_value(&left.row.values))
.unwrap_or(std::cmp::Ordering::Equal)
})
.then_with(|| left.row.label.cmp(&right.row.label))
})
.min_by(|left, right| compare_statement_matches(left, right))
.expect("pick_best_match requires at least one match")
}
/// Ordering used to pick the best statement-row match (lowest wins).
///
/// Preference, in order: lower rank, dimensionless rows before dimensional
/// ones, lower presentation order, larger maximum absolute value (note the
/// reversed operands), then label as the final tie-break.
fn compare_statement_matches(
left: &MatchedStatementRow<'_>,
right: &MatchedStatementRow<'_>,
) -> std::cmp::Ordering {
left.rank
.cmp(&right.rank)
.then_with(|| {
let left_dimension_rank = if left.row.has_dimensions { 1 } else { 0 };
let right_dimension_rank = if right.row.has_dimensions { 1 } else { 0 };
left_dimension_rank.cmp(&right_dimension_rank)
})
.then_with(|| left.row.order.cmp(&right.row.order))
.then_with(|| {
// Operands reversed (right vs left) so larger magnitudes sort first;
// incomparable floats (NaN) are treated as equal.
max_abs_value(&right.row.values)
.partial_cmp(&max_abs_value(&left.row.values))
.unwrap_or(std::cmp::Ordering::Equal)
})
.then_with(|| left.row.label.cmp(&right.row.label))
}
fn build_surface_values(
periods: &[PeriodOutput],
matches: &[MatchedStatementRow<'_>],

View File

@@ -336,22 +336,26 @@ fn build_formula_row(
.positive
.iter()
.filter_map(|surface_key| {
income_surface_rows
.iter()
.find(|row| row.key == *surface_key)
resolve_component_surface_source(
surface_key,
income_statement_rows,
income_surface_rows,
crosswalk,
)
})
.map(surface_source)
.collect::<Vec<_>>();
let negative_surface_sources = bridge_row
.component_surfaces
.negative
.iter()
.filter_map(|surface_key| {
income_surface_rows
.iter()
.find(|row| row.key == *surface_key)
resolve_component_surface_source(
surface_key,
income_statement_rows,
income_surface_rows,
crosswalk,
)
})
.map(surface_source)
.collect::<Vec<_>>();
let (positive_group_sources, positive_group_rows) = collect_group_sources(
@@ -810,6 +814,44 @@ fn collect_group_sources<'a>(
(sources, rows)
}
/// Resolve a bridge component surface key to a value source.
///
/// Prefers an existing surface row with that key; otherwise falls back to
/// merging every income-statement row whose crosswalk mapping points at the
/// surface key. Returns `None` when neither path yields anything.
fn resolve_component_surface_source(
surface_key: &str,
income_statement_rows: &[StatementRowOutput],
income_surface_rows: &[SurfaceRowOutput],
crosswalk: Option<&CrosswalkFile>,
) -> Option<ValueSource> {
if let Some(surface_row) = income_surface_rows
.iter()
.find(|row| row.key == surface_key)
{
return Some(surface_source(surface_row));
}
// Fallback: derive the source directly from matching statement rows that
// actually carry values.
let matches = income_statement_rows
.iter()
.filter(|row| has_any_value(&row.values))
.filter(|row| row_matches_surface_key(row, surface_key, crosswalk))
.map(statement_row_source)
.collect::<Vec<_>>();
if matches.is_empty() {
return None;
}
Some(merge_value_sources(&matches))
}
/// Whether `row` maps, via the crosswalk, to `surface_key` (compared
/// case-insensitively). Rows without a crosswalk mapping never match.
fn row_matches_surface_key(
    row: &StatementRowOutput,
    surface_key: &str,
    crosswalk: Option<&CrosswalkFile>,
) -> bool {
    let Some(crosswalk) = crosswalk else {
        return false;
    };
    match crosswalk.mappings.get(&row.qname) {
        Some(mapping) => mapping.surface_key.eq_ignore_ascii_case(surface_key),
        None => false,
    }
}
fn match_direct_authoritative<'a>(
row: &'a StatementRowOutput,
candidates: &[String],
@@ -1024,6 +1066,52 @@ fn surface_source(row: &SurfaceRowOutput) -> ValueSource {
}
}
/// Combine several value sources into one by summing values per period and
/// unioning their provenance sets.
///
/// For a given period, the merged value is `None` only when every source is
/// missing that period; otherwise missing entries contribute 0.0 to the sum.
fn merge_value_sources(sources: &[ValueSource]) -> ValueSource {
let mut values = BTreeMap::<String, Option<f64>>::new();
for period_id in sources.iter().flat_map(|source| source.values.keys()) {
// `or_insert_with` computes each period only once, even when several
// sources carry the same period id.
values.entry(period_id.clone()).or_insert_with(|| {
let period_values = sources
.iter()
.map(|source| source.values.get(period_id).copied().flatten())
.collect::<Vec<_>>();
if period_values.iter().all(|value| value.is_none()) {
None
} else {
Some(
period_values
.into_iter()
.map(|value| value.unwrap_or(0.0))
.sum(),
)
}
});
}
ValueSource {
values,
source_concepts: unique_sorted_strings(
sources
.iter()
.flat_map(|source| source.source_concepts.clone())
.collect(),
),
source_row_keys: unique_sorted_strings(
sources
.iter()
.flat_map(|source| source.source_row_keys.clone())
.collect(),
),
source_fact_ids: unique_sorted_i64(
sources
.iter()
.flat_map(|source| source.source_fact_ids.clone())
.collect(),
),
has_dimensions: sources.iter().any(|source| source.has_dimensions),
}
}
fn fact_matches_period(fact: &FactOutput, period: &PeriodOutput) -> bool {
if fact.period_end != period.period_end {
return false;