feat(taxonomy): add rust sidecar compact surface pipeline

This commit is contained in:
2026-03-12 15:23:10 -04:00
parent f2c25fb9c6
commit 58061af006
84 changed files with 19350 additions and 265 deletions

View File

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
Download real SEC XBRL filings from various companies to use as test fixtures.
These will be used for benchmarking and testing the parser.
"""
import os
import time
import urllib.request
from pathlib import Path
# Root folder for downloaded fixtures. The path is relative, so it resolves
# against the current working directory; it is created eagerly at import time.
fixtures_dir = Path("fixtures")
fixtures_dir.mkdir(exist_ok=True)

# Real SEC XBRL documents to fetch, grouped by filer.
# Every entry is a 4-tuple: (company_name, ticker, description, url).
filings = [
    # Apple
    (
        "apple",
        "AAPL",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_htm.xml",
    ),
    (
        "apple",
        "AAPL",
        "10-K 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_lab.xml",
    ),
    (
        "apple",
        "AAPL",
        "10-K 2023 Calculation",
        "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_cal.xml",
    ),
    # Microsoft
    (
        "microsoft",
        "MSFT",
        "10-Q 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_htm.xml",
    ),
    (
        "microsoft",
        "MSFT",
        "10-Q 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_lab.xml",
    ),
    (
        "microsoft",
        "MSFT",
        "10-Q 2023 Presentation",
        "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_pre.xml",
    ),
    # Tesla
    (
        "tesla",
        "TSLA",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_htm.xml",
    ),
    (
        "tesla",
        "TSLA",
        "10-K 2023 Definition",
        "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_def.xml",
    ),
    # Amazon
    (
        "amazon",
        "AMZN",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_htm.xml",
    ),
    (
        "amazon",
        "AMZN",
        "10-K 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_lab.xml",
    ),
    # Google/Alphabet
    (
        "alphabet",
        "GOOGL",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_htm.xml",
    ),
    (
        "alphabet",
        "GOOGL",
        "10-K 2023 Calculation",
        "https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_cal.xml",
    ),
    # JPMorgan Chase
    (
        "jpmorgan",
        "JPM",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_htm.xml",
    ),
    (
        "jpmorgan",
        "JPM",
        "10-K 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_lab.xml",
    ),
    # Walmart
    (
        "walmart",
        "WMT",
        "10-K 2024 Instance",
        "https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_htm.xml",
    ),
    (
        "walmart",
        "WMT",
        "10-K 2024 Presentation",
        "https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_pre.xml",
    ),
    # Johnson & Johnson
    (
        "jnj",
        "JNJ",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/200406/000020040624000016/jnj-20231231_htm.xml",
    ),
    # ExxonMobil
    (
        "exxon",
        "XOM",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/34088/000003408824000013/xom-20231231_htm.xml",
    ),
    # Berkshire Hathaway
    (
        "berkshire",
        "BRK",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1067983/000095017024021825/brka-20231231_htm.xml",
    ),
]
def download_file(url, filepath, timeout=30):
    """Download ``url`` and store the response body at ``filepath``.

    The body is first written to a sibling ``<name>.part`` file and then moved
    into place with ``os.replace``. ``filepath`` therefore only appears once
    the complete payload is on disk, so an interrupted run cannot leave behind
    a truncated file that a later "already exists" check would accept.

    Args:
        url: Address to fetch; passed to ``urllib.request.Request``.
        filepath: Destination path (``str`` or ``os.PathLike``).
        timeout: Seconds to wait on blocking network operations before giving
            up, so a stalled connection cannot hang the caller forever.

    Returns:
        ``True`` if the file was saved, ``False`` if any error occurred. The
        error is printed rather than raised (best-effort download).
    """
    partial = None
    try:
        target = Path(filepath)
        partial = target.with_name(target.name + ".part")

        # Send an identifying User-Agent to avoid being blocked by the server.
        request = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'crabrl-test-fixtures/1.0 (testing@example.com)'
            },
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            content = response.read()

        with open(partial, 'wb') as f:
            f.write(content)
        # Atomic rename: the final name never refers to a half-written file.
        os.replace(partial, target)
        return True
    except Exception as e:
        # Deliberately broad: callers rely on a boolean result, not exceptions.
        print(f" Error: {e}")
        return False
    finally:
        # Remove a leftover temp file (failed or interrupted write). After a
        # successful rename it no longer exists, which is fine.
        if partial is not None:
            try:
                partial.unlink()
            except OSError:
                pass
def main():
    """Fetch every entry of ``filings`` into ``fixtures_dir`` and print a report.

    Each filing is saved under ``fixtures_dir/<company>/`` using the last path
    segment of its URL as the file name. Files already on disk are skipped.
    Afterwards a summary and a short per-company listing (at most three XML
    files each) are printed.
    """
    banner = "=" * 60
    print("Downloading SEC XBRL fixtures from various companies...")
    print(banner)

    ok_count = 0
    error_count = 0

    for company, ticker, description, url in filings:
        # One sub-directory per company; file name is the URL's last segment.
        target_dir = fixtures_dir / company
        target_dir.mkdir(exist_ok=True)
        target = target_dir / url.rsplit('/', 1)[-1]

        print(f"\n[{ticker}] {description}")
        print(f" URL: {url}")
        print(f" Saving to: {target}")

        if target.exists():
            print(" ✓ Already exists, skipping")
            continue

        if not download_file(url, target):
            print(" ✗ Failed to download")
            error_count += 1
        else:
            byte_count = os.path.getsize(target)
            print(f" ✓ Downloaded ({byte_count:,} bytes)")
            ok_count += 1

        # Pause between requests to be polite to SEC servers.
        time.sleep(0.5)

    print("\n" + banner)
    print(f"Download complete: {ok_count} downloaded, {error_count} failed")
    print(f"Fixtures saved to: {fixtures_dir.absolute()}")

    # Summarise what is on disk, company by company.
    print("\nFixture structure:")
    for entry in sorted(fixtures_dir.iterdir()):
        if not entry.is_dir():
            continue
        xml_files = list(entry.glob("*.xml"))
        if not xml_files:
            continue

        print(f" {entry.name}/")
        # Only the first three files (sorted) are listed individually.
        for xml_path in sorted(xml_files)[:3]:
            print(f" - {xml_path.name} ({os.path.getsize(xml_path):,} bytes)")
        remaining = len(xml_files) - 3
        if remaining > 0:
            print(f" ... and {remaining} more files")
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()