Expand backend financial statement parsers

This commit is contained in:
2026-03-12 21:15:54 -04:00
parent 33ce48f53c
commit 7a7a78340f
13 changed files with 4398 additions and 456 deletions

View File

@@ -37,10 +37,12 @@ static IDENTIFIER_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?identifier\b[^>]*\bscheme=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?identifier>"#).unwrap()
});
/// Matches a `segment` element, with or without a namespace prefix, and captures its inner
/// content in group 1.
///
/// `(?is)` makes the pattern case-insensitive and lets `.` span newlines; the lazy `(.*?)`
/// stops at the first closing `segment` tag.
static SEGMENT_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?segment\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?segment>"#)
        .unwrap()
});
/// Matches a `scenario` element, with or without a namespace prefix, and captures its inner
/// content in group 1.
///
/// `(?is)` makes the pattern case-insensitive and lets `.` span newlines; the lazy `(.*?)`
/// stops at the first closing `scenario` tag.
static SCENARIO_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?scenario\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?scenario>"#)
        .unwrap()
});
static START_DATE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?startDate>(.*?)</(?:[a-z0-9_\-]+:)?startDate>"#).unwrap()
@@ -55,7 +57,8 @@ static MEASURE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?measure>(.*?)</(?:[a-z0-9_\-]+:)?measure>"#).unwrap()
});
/// Matches a `labelLink` element, with or without a namespace prefix, and captures its inner
/// content in group 1.
///
/// `(?is)` makes the pattern case-insensitive and lets `.` span newlines; the lazy `(.*?)`
/// stops at the first closing `labelLink` tag.
static LABEL_LINK_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelLink\b[^>]*>(.*?)</(?:[a-z0-9_\-]+:)?labelLink>"#)
        .unwrap()
});
static PRESENTATION_LINK_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?presentationLink\b([^>]*)>(.*?)</(?:[a-z0-9_\-]+:)?presentationLink>"#).unwrap()
@@ -67,12 +70,14 @@ static LABEL_RESOURCE_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?label\b([^>]*)>(.*?)</(?:[a-z0-9_\-]+:)?label>"#).unwrap()
});
/// Matches a `labelArc` tag, with or without a namespace prefix, and captures its raw
/// attribute text in group 1.
///
/// Both the self-closing form and an opening tag immediately followed by its closing tag are
/// accepted. Because `[^>]*` is greedy, the `/` of a self-closing tag ends up inside group 1.
static LABEL_ARC_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?labelArc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?labelArc>)?"#)
        .unwrap()
});
/// Matches a `presentationArc` tag, with or without a namespace prefix, and captures its raw
/// attribute text in group 1.
///
/// Both the self-closing form and an opening tag immediately followed by its closing tag are
/// accepted. Because `[^>]*` is greedy, the `/` of a self-closing tag ends up inside group 1.
static PRESENTATION_ARC_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r#"(?is)<(?:[a-z0-9_\-]+:)?presentationArc\b([^>]*)/?>(?:</(?:[a-z0-9_\-]+:)?presentationArc>)?"#;
    Regex::new(pattern).unwrap()
});
/// Matches one `name="value"` or `name='value'` attribute pair.
///
/// Group 1 is the attribute name (letters, digits, `:`, `_`, `-`); group 2 is the value, which
/// must be non-empty and contain neither quote character. The opening and closing quote are not
/// required to be the same character.
static ATTR_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"([a-zA-Z0-9:_\-]+)=["']([^"']+)["']"#).unwrap());
#[derive(Debug, Deserialize)]
#[serde(rename_all = "camelCase")]
@@ -451,7 +456,8 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
});
};
let instance_text = fetch_text(&client, &instance_asset.url).context("fetch request failed for XBRL instance")?;
let instance_text = fetch_text(&client, &instance_asset.url)
.context("fetch request failed for XBRL instance")?;
let parsed_instance = parse_xbrl_instance(&instance_text, Some(instance_asset.name.clone()));
let mut label_by_concept = HashMap::new();
@@ -459,11 +465,9 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
let mut source = "xbrl_instance".to_string();
let mut parse_error = None;
for asset in discovered
.assets
.iter()
.filter(|asset| asset.is_selected && (asset.asset_type == "presentation" || asset.asset_type == "label"))
{
for asset in discovered.assets.iter().filter(|asset| {
asset.is_selected && (asset.asset_type == "presentation" || asset.asset_type == "label")
}) {
match fetch_text(&client, &asset.url) {
Ok(content) => {
if asset.asset_type == "presentation" {
@@ -515,10 +519,15 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
pack_selection.pack,
&mut compact_model,
)?;
let kpi_result = kpi_mapper::build_taxonomy_kpis(&materialized.periods, &facts, pack_selection.pack)?;
let kpi_result =
kpi_mapper::build_taxonomy_kpis(&materialized.periods, &facts, pack_selection.pack)?;
compact_model.normalization_summary.kpi_row_count = kpi_result.rows.len();
for warning in kpi_result.warnings {
if !compact_model.normalization_summary.warnings.contains(&warning) {
if !compact_model
.normalization_summary
.warnings
.contains(&warning)
{
compact_model.normalization_summary.warnings.push(warning);
}
}
@@ -526,7 +535,11 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
&mut compact_model.concept_mappings,
kpi_result.mapping_assignments,
);
surface_mapper::apply_mapping_assignments(&mut concepts, &mut facts, &compact_model.concept_mappings);
surface_mapper::apply_mapping_assignments(
&mut concepts,
&mut facts,
&compact_model.concept_mappings,
);
let has_rows = materialized
.statement_rows
@@ -572,7 +585,11 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
concepts_count: concepts.len(),
dimensions_count: facts
.iter()
.flat_map(|fact| fact.dimensions.iter().map(|dimension| format!("{}::{}", dimension.axis, dimension.member)))
.flat_map(|fact| {
fact.dimensions
.iter()
.map(|dimension| format!("{}::{}", dimension.axis, dimension.member))
})
.collect::<HashSet<_>>()
.len(),
assets: discovered.assets,
@@ -622,7 +639,10 @@ struct DiscoveredAssets {
assets: Vec<AssetOutput>,
}
fn discover_filing_assets(input: &HydrateFilingRequest, client: &Client) -> Result<DiscoveredAssets> {
fn discover_filing_assets(
input: &HydrateFilingRequest,
client: &Client,
) -> Result<DiscoveredAssets> {
let Some(directory_url) = resolve_filing_directory_url(
input.filing_url.as_deref(),
&input.cik,
@@ -631,12 +651,19 @@ fn discover_filing_assets(input: &HydrateFilingRequest, client: &Client) -> Resu
return Ok(DiscoveredAssets { assets: vec![] });
};
let payload = fetch_json::<FilingDirectoryPayload>(client, &format!("{directory_url}index.json")).ok();
let payload =
fetch_json::<FilingDirectoryPayload>(client, &format!("{directory_url}index.json")).ok();
let mut discovered = Vec::new();
if let Some(items) = payload.and_then(|payload| payload.directory.and_then(|directory| directory.item)) {
if let Some(items) =
payload.and_then(|payload| payload.directory.and_then(|directory| directory.item))
{
for item in items {
let Some(name) = item.name.map(|name| name.trim().to_string()).filter(|name| !name.is_empty()) else {
let Some(name) = item
.name
.map(|name| name.trim().to_string())
.filter(|name| !name.is_empty())
else {
continue;
};
@@ -683,12 +710,19 @@ fn discover_filing_assets(input: &HydrateFilingRequest, client: &Client) -> Resu
score_instance(&asset.name, input.primary_document.as_deref()),
)
})
.max_by(|left, right| left.1.partial_cmp(&right.1).unwrap_or(std::cmp::Ordering::Equal))
.max_by(|left, right| {
left.1
.partial_cmp(&right.1)
.unwrap_or(std::cmp::Ordering::Equal)
})
.map(|entry| entry.0);
for asset in &mut discovered {
asset.score = if asset.asset_type == "instance" {
Some(score_instance(&asset.name, input.primary_document.as_deref()))
Some(score_instance(
&asset.name,
input.primary_document.as_deref(),
))
} else if asset.asset_type == "pdf" {
Some(score_pdf(&asset.name, asset.size_bytes))
} else {
@@ -708,7 +742,11 @@ fn discover_filing_assets(input: &HydrateFilingRequest, client: &Client) -> Resu
Ok(DiscoveredAssets { assets: discovered })
}
fn resolve_filing_directory_url(filing_url: Option<&str>, cik: &str, accession_number: &str) -> Option<String> {
fn resolve_filing_directory_url(
filing_url: Option<&str>,
cik: &str,
accession_number: &str,
) -> Option<String> {
if let Some(filing_url) = filing_url.map(str::trim).filter(|value| !value.is_empty()) {
if let Some(last_slash) = filing_url.rfind('/') {
if last_slash > "https://".len() {
@@ -725,7 +763,10 @@ fn resolve_filing_directory_url(filing_url: Option<&str>, cik: &str, accession_n
}
fn normalize_cik_for_path(value: &str) -> Option<String> {
let digits = value.chars().filter(|char| char.is_ascii_digit()).collect::<String>();
let digits = value
.chars()
.filter(|char| char.is_ascii_digit())
.collect::<String>();
if digits.is_empty() {
return None;
}
@@ -741,16 +782,25 @@ fn classify_asset_type(name: &str) -> &'static str {
return "schema";
}
if lower.ends_with(".xml") {
if lower.ends_with("_pre.xml") || lower.ends_with("-pre.xml") || lower.contains("presentation") {
if lower.ends_with("_pre.xml")
|| lower.ends_with("-pre.xml")
|| lower.contains("presentation")
{
return "presentation";
}
if lower.ends_with("_lab.xml") || lower.ends_with("-lab.xml") || lower.contains("label") {
return "label";
}
if lower.ends_with("_cal.xml") || lower.ends_with("-cal.xml") || lower.contains("calculation") {
if lower.ends_with("_cal.xml")
|| lower.ends_with("-cal.xml")
|| lower.contains("calculation")
{
return "calculation";
}
if lower.ends_with("_def.xml") || lower.ends_with("-def.xml") || lower.contains("definition") {
if lower.ends_with("_def.xml")
|| lower.ends_with("-def.xml")
|| lower.contains("definition")
{
return "definition";
}
return "instance";
@@ -779,7 +829,11 @@ fn score_instance(name: &str, primary_document: Option<&str>) -> f64 {
score += 5.0;
}
}
if lower.contains("cal") || lower.contains("def") || lower.contains("lab") || lower.contains("pre") {
if lower.contains("cal")
|| lower.contains("def")
|| lower.contains("lab")
|| lower.contains("pre")
{
score -= 3.0;
}
score
@@ -819,7 +873,9 @@ fn fetch_text(client: &Client, url: &str) -> Result<String> {
if !response.status().is_success() {
return Err(anyhow!("request failed for {url} ({})", response.status()));
}
response.text().with_context(|| format!("unable to read response body for {url}"))
response
.text()
.with_context(|| format!("unable to read response body for {url}"))
}
fn fetch_json<T: for<'de> Deserialize<'de>>(client: &Client, url: &str) -> Result<T> {
@@ -847,17 +903,36 @@ fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> ParsedInstance
let mut facts = Vec::new();
for captures in FACT_RE.captures_iter(raw) {
let prefix = captures.get(1).map(|value| value.as_str().trim()).unwrap_or_default();
let local_name = captures.get(2).map(|value| value.as_str().trim()).unwrap_or_default();
let attrs = captures.get(3).map(|value| value.as_str()).unwrap_or_default();
let body = decode_xml_entities(captures.get(4).map(|value| value.as_str()).unwrap_or_default().trim());
let prefix = captures
.get(1)
.map(|value| value.as_str().trim())
.unwrap_or_default();
let local_name = captures
.get(2)
.map(|value| value.as_str().trim())
.unwrap_or_default();
let attrs = captures
.get(3)
.map(|value| value.as_str())
.unwrap_or_default();
let body = decode_xml_entities(
captures
.get(4)
.map(|value| value.as_str())
.unwrap_or_default()
.trim(),
);
if prefix.is_empty() || local_name.is_empty() || is_xbrl_infrastructure_prefix(prefix) {
continue;
}
let attr_map = parse_attrs(attrs);
let Some(context_id) = attr_map.get("contextRef").cloned().or_else(|| attr_map.get("contextref").cloned()) else {
let Some(context_id) = attr_map
.get("contextRef")
.cloned()
.or_else(|| attr_map.get("contextref").cloned())
else {
continue;
};
@@ -870,7 +945,10 @@ fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> ParsedInstance
.cloned()
.unwrap_or_else(|| format!("urn:unknown:{prefix}"));
let context = context_by_id.get(&context_id);
let unit_ref = attr_map.get("unitRef").cloned().or_else(|| attr_map.get("unitref").cloned());
let unit_ref = attr_map
.get("unitRef")
.cloned()
.or_else(|| attr_map.get("unitref").cloned());
let unit = unit_ref
.as_ref()
.and_then(|unit_ref| unit_by_id.get(unit_ref))
@@ -896,8 +974,12 @@ fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> ParsedInstance
period_start: context.and_then(|value| value.period_start.clone()),
period_end: context.and_then(|value| value.period_end.clone()),
period_instant: context.and_then(|value| value.period_instant.clone()),
dimensions: context.map(|value| value.dimensions.clone()).unwrap_or_default(),
is_dimensionless: context.map(|value| value.dimensions.is_empty()).unwrap_or(true),
dimensions: context
.map(|value| value.dimensions.clone())
.unwrap_or_default(),
is_dimensionless: context
.map(|value| value.dimensions.is_empty())
.unwrap_or(true),
source_file: source_file.clone(),
});
}
@@ -916,10 +998,7 @@ fn parse_xbrl_instance(raw: &str, source_file: Option<String>) -> ParsedInstance
})
.collect::<Vec<_>>();
ParsedInstance {
contexts,
facts,
}
ParsedInstance { contexts, facts }
}
fn parse_namespace_map(raw: &str, root_tag_hint: &str) -> HashMap<String, String> {
@@ -935,7 +1014,10 @@ fn parse_namespace_map(raw: &str, root_tag_hint: &str) -> HashMap<String, String
.captures_iter(&root_start)
{
if let (Some(prefix), Some(uri)) = (captures.get(1), captures.get(2)) {
map.insert(prefix.as_str().trim().to_string(), uri.as_str().trim().to_string());
map.insert(
prefix.as_str().trim().to_string(),
uri.as_str().trim().to_string(),
);
}
}
@@ -946,16 +1028,26 @@ fn parse_contexts(raw: &str) -> HashMap<String, ParsedContext> {
let mut contexts = HashMap::new();
for captures in CONTEXT_RE.captures_iter(raw) {
let Some(context_id) = captures.get(1).map(|value| value.as_str().trim().to_string()) else {
let Some(context_id) = captures
.get(1)
.map(|value| value.as_str().trim().to_string())
else {
continue;
};
let block = captures.get(2).map(|value| value.as_str()).unwrap_or_default();
let block = captures
.get(2)
.map(|value| value.as_str())
.unwrap_or_default();
let (entity_identifier, entity_scheme) = IDENTIFIER_RE
.captures(block)
.map(|captures| {
(
captures.get(2).map(|value| decode_xml_entities(value.as_str().trim())),
captures.get(1).map(|value| decode_xml_entities(value.as_str().trim())),
captures
.get(2)
.map(|value| decode_xml_entities(value.as_str().trim())),
captures
.get(1)
.map(|value| decode_xml_entities(value.as_str().trim())),
)
})
.unwrap_or((None, None));
@@ -984,7 +1076,10 @@ fn parse_contexts(raw: &str) -> HashMap<String, ParsedContext> {
let mut dimensions = Vec::new();
if let Some(segment_value) = segment.as_ref() {
if let Some(members) = segment_value.get("explicitMembers").and_then(|value| value.as_array()) {
if let Some(members) = segment_value
.get("explicitMembers")
.and_then(|value| value.as_array())
{
for member in members {
if let (Some(axis), Some(member_value)) = (
member.get("axis").and_then(|value| value.as_str()),
@@ -999,7 +1094,10 @@ fn parse_contexts(raw: &str) -> HashMap<String, ParsedContext> {
}
}
if let Some(scenario_value) = scenario.as_ref() {
if let Some(members) = scenario_value.get("explicitMembers").and_then(|value| value.as_array()) {
if let Some(members) = scenario_value
.get("explicitMembers")
.and_then(|value| value.as_array())
{
for member in members {
if let (Some(axis), Some(member_value)) = (
member.get("axis").and_then(|value| value.as_str()),
@@ -1062,10 +1160,16 @@ fn parse_dimension_container(raw: &str) -> serde_json::Value {
fn parse_units(raw: &str) -> HashMap<String, ParsedUnit> {
let mut units = HashMap::new();
for captures in UNIT_RE.captures_iter(raw) {
let Some(id) = captures.get(1).map(|value| value.as_str().trim().to_string()) else {
let Some(id) = captures
.get(1)
.map(|value| value.as_str().trim().to_string())
else {
continue;
};
let block = captures.get(2).map(|value| value.as_str()).unwrap_or_default();
let block = captures
.get(2)
.map(|value| value.as_str())
.unwrap_or_default();
let measures = MEASURE_RE
.captures_iter(block)
.filter_map(|captures| captures.get(1))
@@ -1097,7 +1201,10 @@ fn parse_attrs(raw: &str) -> HashMap<String, String> {
let mut map = HashMap::new();
for captures in ATTR_RE.captures_iter(raw) {
if let (Some(name), Some(value)) = (captures.get(1), captures.get(2)) {
map.insert(name.as_str().to_string(), decode_xml_entities(value.as_str()));
map.insert(
name.as_str().to_string(),
decode_xml_entities(value.as_str()),
);
}
}
map
@@ -1138,12 +1245,20 @@ fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
let mut preferred = HashMap::<String, (String, i64)>::new();
for captures in LABEL_LINK_RE.captures_iter(raw) {
let block = captures.get(1).map(|value| value.as_str()).unwrap_or_default();
let block = captures
.get(1)
.map(|value| value.as_str())
.unwrap_or_default();
let mut loc_by_label = HashMap::<String, String>::new();
let mut resource_by_label = HashMap::<String, (String, Option<String>)>::new();
for captures in LOC_RE.captures_iter(block) {
let attrs = parse_attrs(captures.get(1).map(|value| value.as_str()).unwrap_or_default());
let attrs = parse_attrs(
captures
.get(1)
.map(|value| value.as_str())
.unwrap_or_default(),
);
let Some(label) = attrs.get("xlink:label").cloned() else {
continue;
};
@@ -1160,14 +1275,24 @@ fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
}
for captures in LABEL_RESOURCE_RE.captures_iter(block) {
let attrs = parse_attrs(captures.get(1).map(|value| value.as_str()).unwrap_or_default());
let attrs = parse_attrs(
captures
.get(1)
.map(|value| value.as_str())
.unwrap_or_default(),
);
let Some(label) = attrs.get("xlink:label").cloned() else {
continue;
};
let body = decode_xml_entities(captures.get(2).map(|value| value.as_str()).unwrap_or_default())
.split_whitespace()
.collect::<Vec<_>>()
.join(" ");
let body = decode_xml_entities(
captures
.get(2)
.map(|value| value.as_str())
.unwrap_or_default(),
)
.split_whitespace()
.collect::<Vec<_>>()
.join(" ");
if body.is_empty() {
continue;
}
@@ -1175,7 +1300,12 @@ fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
}
for captures in LABEL_ARC_RE.captures_iter(block) {
let attrs = parse_attrs(captures.get(1).map(|value| value.as_str()).unwrap_or_default());
let attrs = parse_attrs(
captures
.get(1)
.map(|value| value.as_str())
.unwrap_or_default(),
);
let Some(from) = attrs.get("xlink:from").cloned() else {
continue;
};
@@ -1190,7 +1320,11 @@ fn parse_label_linkbase(raw: &str) -> HashMap<String, String> {
};
let priority = label_priority(role.as_deref());
let current = preferred.get(concept_key).cloned();
if current.as_ref().map(|(_, current_priority)| priority > *current_priority).unwrap_or(true) {
if current
.as_ref()
.map(|(_, current_priority)| priority > *current_priority)
.unwrap_or(true)
{
preferred.insert(concept_key.clone(), (label.clone(), priority));
}
}
@@ -1207,18 +1341,31 @@ fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
let mut rows = Vec::new();
for captures in PRESENTATION_LINK_RE.captures_iter(raw) {
let link_attrs = parse_attrs(captures.get(1).map(|value| value.as_str()).unwrap_or_default());
let link_attrs = parse_attrs(
captures
.get(1)
.map(|value| value.as_str())
.unwrap_or_default(),
);
let Some(role_uri) = link_attrs.get("xlink:role").cloned() else {
continue;
};
let block = captures.get(2).map(|value| value.as_str()).unwrap_or_default();
let block = captures
.get(2)
.map(|value| value.as_str())
.unwrap_or_default();
let mut loc_by_label = HashMap::<String, (String, String, bool)>::new();
let mut children_by_label = HashMap::<String, Vec<(String, f64)>>::new();
let mut incoming = HashSet::<String>::new();
let mut all_referenced = HashSet::<String>::new();
for captures in LOC_RE.captures_iter(block) {
let attrs = parse_attrs(captures.get(1).map(|value| value.as_str()).unwrap_or_default());
let attrs = parse_attrs(
captures
.get(1)
.map(|value| value.as_str())
.unwrap_or_default(),
);
let Some(label) = attrs.get("xlink:label").cloned() else {
continue;
};
@@ -1228,14 +1375,27 @@ fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
let Some(qname) = qname_from_href(&href) else {
continue;
};
let Some((concept_key, qname, local_name)) = concept_from_qname(&qname, &namespaces) else {
let Some((concept_key, qname, local_name)) = concept_from_qname(&qname, &namespaces)
else {
continue;
};
loc_by_label.insert(label, (concept_key, qname, local_name.to_ascii_lowercase().contains("abstract")));
loc_by_label.insert(
label,
(
concept_key,
qname,
local_name.to_ascii_lowercase().contains("abstract"),
),
);
}
for captures in PRESENTATION_ARC_RE.captures_iter(block) {
let attrs = parse_attrs(captures.get(1).map(|value| value.as_str()).unwrap_or_default());
let attrs = parse_attrs(
captures
.get(1)
.map(|value| value.as_str())
.unwrap_or_default(),
);
let Some(from) = attrs.get("xlink:from").cloned() else {
continue;
};
@@ -1248,8 +1408,16 @@ fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
let order = attrs
.get("order")
.and_then(|value| value.parse::<f64>().ok())
.unwrap_or_else(|| children_by_label.get(&from).map(|children| children.len() as f64 + 1.0).unwrap_or(1.0));
children_by_label.entry(from.clone()).or_default().push((to.clone(), order));
.unwrap_or_else(|| {
children_by_label
.get(&from)
.map(|children| children.len() as f64 + 1.0)
.unwrap_or(1.0)
});
children_by_label
.entry(from.clone())
.or_default()
.push((to.clone(), order));
incoming.insert(to.clone());
all_referenced.insert(from);
all_referenced.insert(to);
@@ -1281,7 +1449,11 @@ fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
return;
}
let parent_concept_key = parent_label.and_then(|parent| loc_by_label.get(parent).map(|(concept_key, _, _)| concept_key.clone()));
let parent_concept_key = parent_label.and_then(|parent| {
loc_by_label
.get(parent)
.map(|(concept_key, _, _)| concept_key.clone())
});
rows.push(PresentationNode {
concept_key: concept_key.clone(),
role_uri: role_uri.to_string(),
@@ -1292,7 +1464,11 @@ fn parse_presentation_linkbase(raw: &str) -> Vec<PresentationNode> {
});
let mut children = children_by_label.get(label).cloned().unwrap_or_default();
children.sort_by(|left, right| left.1.partial_cmp(&right.1).unwrap_or(std::cmp::Ordering::Equal));
children.sort_by(|left, right| {
left.1
.partial_cmp(&right.1)
.unwrap_or(std::cmp::Ordering::Equal)
});
for (index, (child_label, _)) in children.into_iter().enumerate() {
dfs(
&child_label,
@@ -1400,7 +1576,10 @@ fn materialize_taxonomy_statements(
.clone()
.or_else(|| fact.period_instant.clone())
.unwrap_or_else(|| filing_date.to_string());
let id = format!("{date}-{compact_accession}-{}", period_by_signature.len() + 1);
let id = format!(
"{date}-{compact_accession}-{}",
period_by_signature.len() + 1
);
let period_label = if fact.period_instant.is_some() && fact.period_start.is_none() {
"Instant".to_string()
} else if fact.period_start.is_some() && fact.period_end.is_some() {
@@ -1420,7 +1599,10 @@ fn materialize_taxonomy_statements(
accession_number: accession_number.to_string(),
filing_date: filing_date.to_string(),
period_start: fact.period_start.clone(),
period_end: fact.period_end.clone().or_else(|| fact.period_instant.clone()),
period_end: fact
.period_end
.clone()
.or_else(|| fact.period_instant.clone()),
filing_type: filing_type.to_string(),
period_label,
},
@@ -1429,9 +1611,17 @@ fn materialize_taxonomy_statements(
let mut periods = period_by_signature.values().cloned().collect::<Vec<_>>();
periods.sort_by(|left, right| {
let left_key = left.period_end.clone().unwrap_or_else(|| left.filing_date.clone());
let right_key = right.period_end.clone().unwrap_or_else(|| right.filing_date.clone());
left_key.cmp(&right_key).then_with(|| left.id.cmp(&right.id))
let left_key = left
.period_end
.clone()
.unwrap_or_else(|| left.filing_date.clone());
let right_key = right
.period_end
.clone()
.unwrap_or_else(|| right.filing_date.clone());
left_key
.cmp(&right_key)
.then_with(|| left.id.cmp(&right.id))
});
let period_id_by_signature = period_by_signature
.iter()
@@ -1440,7 +1630,10 @@ fn materialize_taxonomy_statements(
let mut presentation_by_concept = HashMap::<String, Vec<&PresentationNode>>::new();
for node in presentation {
presentation_by_concept.entry(node.concept_key.clone()).or_default().push(node);
presentation_by_concept
.entry(node.concept_key.clone())
.or_default()
.push(node);
}
let mut grouped_by_statement = empty_parsed_fact_map();
@@ -1502,9 +1695,13 @@ fn materialize_taxonomy_statements(
let mut concepts = Vec::<ConceptOutput>::new();
for statement_kind in statement_keys() {
let concept_groups = grouped_by_statement.remove(statement_kind).unwrap_or_default();
let concept_groups = grouped_by_statement
.remove(statement_kind)
.unwrap_or_default();
let mut concept_keys = HashSet::<String>::new();
for node in presentation.iter().filter(|node| classify_statement_role(&node.role_uri).as_deref() == Some(statement_kind)) {
for node in presentation.iter().filter(|node| {
classify_statement_role(&node.role_uri).as_deref() == Some(statement_kind)
}) {
concept_keys.insert(node.concept_key.clone());
}
for concept_key in concept_groups.keys() {
@@ -1516,12 +1713,21 @@ fn materialize_taxonomy_statements(
.map(|concept_key| {
let nodes = presentation
.iter()
.filter(|node| node.concept_key == concept_key && classify_statement_role(&node.role_uri).as_deref() == Some(statement_kind))
.filter(|node| {
node.concept_key == concept_key
&& classify_statement_role(&node.role_uri).as_deref()
== Some(statement_kind)
})
.collect::<Vec<_>>();
let order = nodes.iter().map(|node| node.order).fold(f64::INFINITY, f64::min);
let order = nodes
.iter()
.map(|node| node.order)
.fold(f64::INFINITY, f64::min);
let depth = nodes.iter().map(|node| node.depth).min().unwrap_or(0);
let role_uri = nodes.first().map(|node| node.role_uri.clone());
let parent_concept_key = nodes.first().and_then(|node| node.parent_concept_key.clone());
let parent_concept_key = nodes
.first()
.and_then(|node| node.parent_concept_key.clone());
(concept_key, order, depth, role_uri, parent_concept_key)
})
.collect::<Vec<_>>();
@@ -1532,8 +1738,13 @@ fn materialize_taxonomy_statements(
.then_with(|| left.0.cmp(&right.0))
});
for (concept_key, presentation_order, depth, role_uri, parent_concept_key) in ordered_concepts {
let fact_group = concept_groups.get(&concept_key).cloned().unwrap_or_default();
for (concept_key, presentation_order, depth, role_uri, parent_concept_key) in
ordered_concepts
{
let fact_group = concept_groups
.get(&concept_key)
.cloned()
.unwrap_or_default();
let (namespace_uri, local_name) = split_concept_key(&concept_key);
let qname = fact_group
.first()
@@ -1672,7 +1883,13 @@ fn empty_detail_row_map() -> DetailRowStatementMap {
}
/// Returns the five statement keys in a fixed order: income, balance, cash flow, equity, and
/// comprehensive income.
fn statement_keys() -> [&'static str; 5] {
    [
        "income",
        "balance",
        "cash_flow",
        "equity",
        "comprehensive_income",
    ]
}
fn statement_key_ref(value: &str) -> Option<&'static str> {
@@ -1709,7 +1926,13 @@ fn pick_preferred_fact(grouped_facts: &[(i64, ParsedFact)]) -> Option<&(i64, Par
.unwrap_or_default();
left_date.cmp(&right_date)
})
.then_with(|| left.1.value.abs().partial_cmp(&right.1.value.abs()).unwrap_or(std::cmp::Ordering::Equal))
.then_with(|| {
left.1
.value
.abs()
.partial_cmp(&right.1.value.abs())
.unwrap_or(std::cmp::Ordering::Equal)
})
})
}
@@ -1779,12 +2002,6 @@ fn classify_statement_role(role_uri: &str) -> Option<String> {
fn concept_statement_fallback(local_name: &str) -> Option<String> {
let normalized = local_name.to_ascii_lowercase();
if Regex::new(r#"cash|operatingactivities|investingactivities|financingactivities"#)
.unwrap()
.is_match(&normalized)
{
return Some("cash_flow".to_string());
}
if Regex::new(r#"equity|retainedearnings|additionalpaidincapital"#)
.unwrap()
.is_match(&normalized)
@@ -1794,6 +2011,22 @@ fn concept_statement_fallback(local_name: &str) -> Option<String> {
if normalized.contains("comprehensiveincome") {
return Some("comprehensive_income".to_string());
}
if Regex::new(
r#"deferredpolicyacquisitioncosts(andvalueofbusinessacquired)?$|supplementaryinsuranceinformationdeferredpolicyacquisitioncosts$|deferredacquisitioncosts$"#,
)
.unwrap()
.is_match(&normalized)
{
return Some("balance".to_string());
}
if Regex::new(
r#"netcashprovidedbyusedin.*activities|increasedecreasein|paymentstoacquire|paymentsforcapitalimprovements$|paymentsfordepositsonrealestateacquisitions$|paymentsforrepurchase|paymentsofdividends|dividendscommonstockcash$|proceedsfrom|repaymentsofdebt|sharebasedcompensation$|allocatedsharebasedcompensationexpense$|depreciationdepletionandamortization$|depreciationamortizationandaccretionnet$|depreciationandamortization$|depreciationamortizationandother$|otheradjustmentstoreconcilenetincomelosstocashprovidedbyusedinoperatingactivities"#,
)
.unwrap()
.is_match(&normalized)
{
return Some("cash_flow".to_string());
}
if Regex::new(
r#"asset|liabilit|debt|financingreceivable|loansreceivable|deposits|allowanceforcreditloss|futurepolicybenefits|policyholderaccountbalances|unearnedpremiums|realestateinvestmentproperty|grossatcarryingvalue|investmentproperty"#,
)
@@ -1967,7 +2200,10 @@ mod tests {
vec![],
)
.expect("core pack should load and map");
let income_surface_rows = model.surface_rows.get("income").expect("income surface rows");
let income_surface_rows = model
.surface_rows
.get("income")
.expect("income surface rows");
let op_expenses = income_surface_rows
.iter()
.find(|row| row.key == "operating_expenses")
@@ -1978,7 +2214,10 @@ mod tests {
.expect("revenue surface row");
assert_eq!(revenue.values.get("2025").copied().flatten(), Some(120.0));
assert_eq!(op_expenses.values.get("2024").copied().flatten(), Some(40.0));
assert_eq!(
op_expenses.values.get("2024").copied().flatten(),
Some(40.0)
);
assert_eq!(op_expenses.detail_count, Some(2));
let operating_expense_details = model
@@ -1987,8 +2226,12 @@ mod tests {
.and_then(|groups| groups.get("operating_expenses"))
.expect("operating expenses details");
assert_eq!(operating_expense_details.len(), 2);
assert!(operating_expense_details.iter().any(|row| row.key == "sga-row"));
assert!(operating_expense_details.iter().any(|row| row.key == "rd-row"));
assert!(operating_expense_details
.iter()
.any(|row| row.key == "sga-row"));
assert!(operating_expense_details
.iter()
.any(|row| row.key == "rd-row"));
let residual_rows = model
.detail_rows
@@ -2003,17 +2246,26 @@ mod tests {
.concept_mappings
.get("http://fasb.org/us-gaap/2024#ResearchAndDevelopmentExpense")
.expect("rd mapping");
assert_eq!(rd_mapping.detail_parent_surface_key.as_deref(), Some("operating_expenses"));
assert_eq!(rd_mapping.surface_key.as_deref(), Some("operating_expenses"));
assert_eq!(
rd_mapping.detail_parent_surface_key.as_deref(),
Some("operating_expenses")
);
assert_eq!(
rd_mapping.surface_key.as_deref(),
Some("operating_expenses")
);
let residual_mapping = model
.concept_mappings
.get("urn:company#OtherOperatingCharges")
.expect("residual mapping");
assert!(residual_mapping.residual_flag);
assert_eq!(residual_mapping.detail_parent_surface_key.as_deref(), Some("unmapped"));
assert_eq!(
residual_mapping.detail_parent_surface_key.as_deref(),
Some("unmapped")
);
assert_eq!(model.normalization_summary.surface_row_count, 5);
assert_eq!(model.normalization_summary.surface_row_count, 6);
assert_eq!(model.normalization_summary.detail_row_count, 3);
assert_eq!(model.normalization_summary.unmapped_row_count, 1);
}
@@ -2051,18 +2303,60 @@ mod tests {
/// Checks that `concept_statement_fallback` assigns the expected statement key to concept
/// local names, using only the name itself as input.
#[test]
fn classifies_pack_specific_concepts_without_presentation_roles() {
    // (concept local name, expected statement key)
    let cases: [(&str, &str); 13] = [
        (
            "FinancingReceivableExcludingAccruedInterestAfterAllowanceForCreditLoss",
            "balance",
        ),
        ("Deposits", "balance"),
        ("RealEstateInvestmentPropertyNet", "balance"),
        ("DeferredPolicyAcquisitionCosts", "balance"),
        (
            "DeferredPolicyAcquisitionCostsAndValueOfBusinessAcquired",
            "balance",
        ),
        ("IncreaseDecreaseInAccountsReceivable", "cash_flow"),
        ("PaymentsOfDividends", "cash_flow"),
        ("RepaymentsOfDebt", "cash_flow"),
        ("ShareBasedCompensation", "cash_flow"),
        ("PaymentsForCapitalImprovements", "cash_flow"),
        ("PaymentsForDepositsOnRealEstateAcquisitions", "cash_flow"),
        ("LeaseIncome", "income"),
        ("DirectCostsOfLeasedAndRentedPropertyOrEquipment", "income"),
    ];

    for (local_name, expected) in cases {
        // The message names the concept so a failing case is identifiable at a glance.
        assert_eq!(
            concept_statement_fallback(local_name).as_deref(),
            Some(expected),
            "unexpected statement classification for {local_name}"
        );
    }
}