Fix P0 issues in financial ingestion architecture
- Wrap snapshot updates in transactions with error context for each child table
- Add sidecar retry with exponential backoff (3 attempts, 2s base, 10s max, 30% jitter)
- Add HTTP timeout (30s per request) and SEC rate limiting (10 req/s) in Rust
- Add XBRL validation with status reporting (checks root element, tag balance)
This commit is contained in:
@@ -4,6 +4,8 @@ use regex::Regex;
|
||||
use reqwest::blocking::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::sync::Mutex;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
mod kpi_mapper;
|
||||
mod metrics;
|
||||
@@ -20,6 +22,39 @@ use crabrl as _;
|
||||
pub const PARSER_ENGINE: &str = "fiscal-xbrl";
|
||||
pub const PARSER_VERSION: &str = env!("CARGO_PKG_VERSION");
|
||||
|
||||
/// Fallback spacing between rate-limited requests, in milliseconds, used when the
/// `SEC_RATE_LIMIT_MS` environment variable is unset or unparsable
/// (100 ms between requests is at most 10 requests per second).
const DEFAULT_SEC_RATE_LIMIT_MS: u64 = 100;
|
||||
/// Timeout, in seconds, applied to each individual request issued by `fetch_text` and `fetch_json`.
const HTTP_TIMEOUT_SECS: u64 = 30;
|
||||
|
||||
/// Timestamp of the most recent rate-limited request, shared by all callers of
/// `rate_limited_fetch`; it starts out as the instant at which the static is initialized.
static RATE_LIMITER: Lazy<Mutex<Instant>> = Lazy::new(|| Mutex::new(Instant::now()));
|
||||
|
||||
fn sec_rate_limit_delay() -> u64 {
|
||||
std::env::var("SEC_RATE_LIMIT_MS")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.unwrap_or(DEFAULT_SEC_RATE_LIMIT_MS)
|
||||
}
|
||||
|
||||
fn rate_limited_fetch<T, F>(fetch_fn: F) -> Result<T>
|
||||
where
|
||||
F: FnOnce() -> Result<T>,
|
||||
{
|
||||
let delay_ms = sec_rate_limit_delay();
|
||||
|
||||
{
|
||||
let mut last_request = RATE_LIMITER.lock().unwrap();
|
||||
let elapsed = last_request.elapsed();
|
||||
let min_delay = Duration::from_millis(delay_ms);
|
||||
|
||||
if elapsed < min_delay {
|
||||
std::thread::sleep(min_delay - elapsed);
|
||||
}
|
||||
|
||||
*last_request = Instant::now();
|
||||
}
|
||||
|
||||
fetch_fn()
|
||||
}
|
||||
|
||||
static CONTEXT_RE: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(r#"(?is)<(?:[a-z0-9_\-]+:)?context\b[^>]*\bid=["']([^"']+)["'][^>]*>(.*?)</(?:[a-z0-9_\-]+:)?context>"#).unwrap()
|
||||
});
|
||||
@@ -118,6 +153,7 @@ pub struct HydrateFilingResponse {
|
||||
pub contexts: Vec<ContextOutput>,
|
||||
pub derived_metrics: FilingMetrics,
|
||||
pub validation_result: ValidationResultOutput,
|
||||
pub xbrl_validation: XbrlValidationResult,
|
||||
pub facts_count: usize,
|
||||
pub concepts_count: usize,
|
||||
pub dimensions_count: usize,
|
||||
@@ -481,6 +517,7 @@ struct PresentationNode {
|
||||
pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingResponse> {
|
||||
let client = Client::builder()
|
||||
.user_agent("Fiscal Clone <support@fiscal.local>")
|
||||
.timeout(Duration::from_secs(120))
|
||||
.build()
|
||||
.context("unable to build HTTP client")?;
|
||||
|
||||
@@ -521,7 +558,11 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
|
||||
computed_definitions: vec![],
|
||||
contexts: vec![],
|
||||
derived_metrics: FilingMetrics::default(),
|
||||
validation_result,
|
||||
validation_result: validation_result.clone(),
|
||||
xbrl_validation: XbrlValidationResult {
|
||||
status: "error".to_string(),
|
||||
message: Some("No XBRL instance found".to_string()),
|
||||
},
|
||||
facts_count: 0,
|
||||
concepts_count: 0,
|
||||
dimensions_count: 0,
|
||||
@@ -542,6 +583,17 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
|
||||
|
||||
let instance_text = fetch_text(&client, &instance_asset.url)
|
||||
.context("fetch request failed for XBRL instance")?;
|
||||
|
||||
let xbrl_validation = validate_xbrl_structure(&instance_text, Some(&instance_asset.name));
|
||||
if xbrl_validation.status != "passed" {
|
||||
eprintln!(
|
||||
"[xbrl] Validation {} for {:?}: {}",
|
||||
xbrl_validation.status,
|
||||
instance_asset.name,
|
||||
xbrl_validation.message.as_deref().unwrap_or("unknown")
|
||||
);
|
||||
}
|
||||
|
||||
let parsed_instance = parse_xbrl_instance(&instance_text, Some(instance_asset.name.clone()));
|
||||
|
||||
let mut label_by_concept = HashMap::new();
|
||||
@@ -678,6 +730,7 @@ pub fn hydrate_filing(input: HydrateFilingRequest) -> Result<HydrateFilingRespon
|
||||
contexts: parsed_instance.contexts,
|
||||
derived_metrics: metrics::derive_metrics(&facts),
|
||||
validation_result,
|
||||
xbrl_validation,
|
||||
facts_count: facts.len(),
|
||||
concepts_count: concepts.len(),
|
||||
dimensions_count: facts
|
||||
@@ -963,29 +1016,78 @@ fn parse_size(value: Option<&serde_json::Value>) -> Option<i64> {
|
||||
}
|
||||
|
||||
fn fetch_text(client: &Client, url: &str) -> Result<String> {
|
||||
let response = client
|
||||
.get(url)
|
||||
.send()
|
||||
.with_context(|| format!("request failed for {url}"))?;
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow!("request failed for {url} ({})", response.status()));
|
||||
}
|
||||
response
|
||||
.text()
|
||||
.with_context(|| format!("unable to read response body for {url}"))
|
||||
rate_limited_fetch(|| {
|
||||
let response = client
|
||||
.get(url)
|
||||
.timeout(Duration::from_secs(HTTP_TIMEOUT_SECS))
|
||||
.send()
|
||||
.with_context(|| format!("request failed for {url}"))?;
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow!("request failed for {url} ({})", response.status()));
|
||||
}
|
||||
response
|
||||
.text()
|
||||
.with_context(|| format!("unable to read response body for {url}"))
|
||||
})
|
||||
}
|
||||
|
||||
fn fetch_json<T: for<'de> Deserialize<'de>>(client: &Client, url: &str) -> Result<T> {
|
||||
let response = client
|
||||
.get(url)
|
||||
.send()
|
||||
.with_context(|| format!("request failed for {url}"))?;
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow!("request failed for {url} ({})", response.status()));
|
||||
rate_limited_fetch(|| {
|
||||
let response = client
|
||||
.get(url)
|
||||
.timeout(Duration::from_secs(HTTP_TIMEOUT_SECS))
|
||||
.send()
|
||||
.with_context(|| format!("request failed for {url}"))?;
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow!("request failed for {url} ({})", response.status()));
|
||||
}
|
||||
response
|
||||
.json::<T>()
|
||||
.with_context(|| format!("unable to parse JSON response for {url}"))
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
pub struct XbrlValidationResult {
|
||||
pub status: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub message: Option<String>,
|
||||
}
|
||||
|
||||
fn validate_xbrl_structure(xml: &str, source_file: Option<&str>) -> XbrlValidationResult {
|
||||
if xml.is_empty() {
|
||||
return XbrlValidationResult {
|
||||
status: "error".to_string(),
|
||||
message: Some("XBRL content is empty".to_string()),
|
||||
};
|
||||
}
|
||||
|
||||
if !xml.contains("<xbrl") && !xml.contains("<xbrli:xbrl") {
|
||||
return XbrlValidationResult {
|
||||
status: "error".to_string(),
|
||||
message: Some("Invalid XBRL: missing root element".to_string()),
|
||||
};
|
||||
}
|
||||
|
||||
let open_count = xml.matches('<').count();
|
||||
let close_count = xml.matches('>').count();
|
||||
|
||||
if open_count != close_count {
|
||||
return XbrlValidationResult {
|
||||
status: "warning".to_string(),
|
||||
message: Some(format!(
|
||||
"Malformed XML detected in {:?} ({} open, {} close tags)",
|
||||
source_file.unwrap_or("unknown"),
|
||||
open_count,
|
||||
close_count
|
||||
)),
|
||||
};
|
||||
}
|
||||
|
||||
XbrlValidationResult {
|
||||
status: "passed".to_string(),
|
||||
message: None,
|
||||
}
|
||||
response
|
||||
.json::<T>()
|
||||
.with_context(|| format!("unable to parse JSON response for {url}"))
|
||||
}
|
||||
|
||||
struct ParsedInstance {
|
||||
|
||||
Reference in New Issue
Block a user