Files
crabrl/scripts/download_fixtures.py
Stefano Amorelli 46ecbd2635 feat: implement CLI for XBRL parsing and validation
- Parse command with optional stats flag
- Validate command with SEC EDGAR profile support
- Benchmark command for performance testing
- Colored output for better UX
2025-08-16 17:25:06 +03:00

151 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Download real SEC XBRL filings from various companies to use as test fixtures.
These will be used for benchmarking and testing the parser.
"""
import os
import time
import urllib.request
from pathlib import Path
# Every fixture is stored under "fixtures", a path relative to the current
# working directory. The directory is created as soon as this module is
# loaded; an already-existing directory is left untouched.
fixtures_dir = Path("fixtures")
os.makedirs(fixtures_dir, exist_ok=True)
# Real SEC XBRL filings from various companies.
#
# ``filings`` is a list of (company_name, ticker, description, url) tuples.
# Every URL has the shape
#     <archive root>/<CIK>/<accession number>/<file stem>_<suffix>.xml
# so the list is generated from one compact row per filing instead of
# spelling out each URL by hand.
_EDGAR_ARCHIVE_URL = "https://www.sec.gov/Archives/edgar/data"

# Maps the document kind used in the description to the file-name suffix.
_DOCUMENT_SUFFIX = {
    "Instance": "htm",
    "Labels": "lab",
    "Calculation": "cal",
    "Presentation": "pre",
    "Definition": "def",
}

# One row per filing:
# (company_name, ticker, form, "<CIK>/<accession>", file stem, document kinds)
_FILING_SETS = [
    ("apple", "AAPL", "10-K 2023",
     "320193/000032019323000106", "aapl-20230930",
     ("Instance", "Labels", "Calculation")),
    ("microsoft", "MSFT", "10-Q 2023",
     "789019/000095017023064280", "msft-20230930",
     ("Instance", "Labels", "Presentation")),
    ("tesla", "TSLA", "10-K 2023",
     "1318605/000162828024002390", "tsla-20231231",
     ("Instance", "Definition")),
    ("amazon", "AMZN", "10-K 2023",
     "1018724/000101872424000006", "amzn-20231231",
     ("Instance", "Labels")),
    # Google/Alphabet
    ("alphabet", "GOOGL", "10-K 2023",
     "1652044/000165204424000022", "goog-20231231",
     ("Instance", "Calculation")),
    # JPMorgan Chase
    ("jpmorgan", "JPM", "10-K 2023",
     "19617/000001961724000198", "jpm-20231231",
     ("Instance", "Labels")),
    ("walmart", "WMT", "10-K 2024",
     "104169/000010416924000012", "wmt-20240131",
     ("Instance", "Presentation")),
    # Johnson & Johnson
    ("jnj", "JNJ", "10-K 2023",
     "200406/000020040624000016", "jnj-20231231",
     ("Instance",)),
    # ExxonMobil
    ("exxon", "XOM", "10-K 2023",
     "34088/000003408824000013", "xom-20231231",
     ("Instance",)),
    # Berkshire Hathaway
    ("berkshire", "BRK", "10-K 2023",
     "1067983/000095017024021825", "brka-20231231",
     ("Instance",)),
]

filings = [
    (
        company,
        ticker,
        f"{form} {document}",
        f"{_EDGAR_ARCHIVE_URL}/{folder}/{stem}_{_DOCUMENT_SUFFIX[document]}.xml",
    )
    for company, ticker, form, folder, stem, documents in _FILING_SETS
    for document in documents
]
def download_file(url, filepath, timeout=30):
    """Download the resource at *url* and store it at *filepath*.

    The payload is first written to a sibling ``<name>.part`` file and only
    moved onto *filepath* once it has been written completely, so a failed
    run never leaves a truncated file behind that a later run would mistake
    for a finished download.

    Args:
        url: Address of the resource to fetch.
        filepath: Destination path (``str`` or ``pathlib.Path``).
        timeout: Seconds to wait on blocking network operations before
            giving up; without it a stalled connection would hang forever.

    Returns:
        ``True`` when the file was downloaded and saved, ``False`` when any
        step failed (the error is printed, never raised).
    """
    partial = None
    try:
        target = Path(filepath)
        partial = target.with_name(target.name + ".part")

        # Add headers to avoid being blocked
        request = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'crabrl-test-fixtures/1.0 (testing@example.com)'
            },
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            content = response.read()

        partial.write_bytes(content)
        # os.replace overwrites the destination in one step, so *filepath*
        # either does not exist or holds the complete payload.
        os.replace(partial, target)
        return True
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # count the failure instead of aborting the whole batch.
        print(f" Error: {e}")
        if partial is not None:
            try:
                partial.unlink()
            except OSError:
                # Nothing to clean up (or it cannot be removed); ignore.
                pass
        return False
def main():
    """Fetch every entry of ``filings`` into ``fixtures_dir`` and report.

    For each (company, ticker, description, url) entry a per-company
    sub-directory is created and the file is saved under the last path
    segment of its URL. Files already on disk are skipped. Afterwards a
    summary and a short listing of the fixture tree are printed.
    """
    print("Downloading SEC XBRL fixtures from various companies...")
    print("=" * 60)

    ok_count = 0
    error_count = 0

    for company, ticker, description, url in filings:
        # Each company gets its own sub-directory.
        target_dir = fixtures_dir / company
        target_dir.mkdir(exist_ok=True)

        # The file keeps the name it has on the server.
        target = target_dir / url.rsplit('/', 1)[-1]

        print(f"\n[{ticker}] {description}")
        print(f" URL: {url}")
        print(f" Saving to: {target}")

        if target.exists():
            print(" ✓ Already exists, skipping")
            continue

        if not download_file(url, target):
            print(" ✗ Failed to download")
            error_count += 1
        else:
            byte_count = os.path.getsize(target)
            print(f" ✓ Downloaded ({byte_count:,} bytes)")
            ok_count += 1

        # Be polite to SEC servers: pause after every attempted download.
        time.sleep(0.5)

    print("\n" + "=" * 60)
    print(f"Download complete: {ok_count} downloaded, {error_count} failed")
    print(f"Fixtures saved to: {fixtures_dir.absolute()}")

    # Show directory structure
    print("\nFixture structure:")
    for entry in sorted(fixtures_dir.iterdir()):
        if not entry.is_dir():
            continue
        xml_files = sorted(entry.glob("*.xml"))
        if not xml_files:
            continue
        print(f" {entry.name}/")
        # Only the first three files per company are listed.
        for xml_file in xml_files[:3]:
            print(f" - {xml_file.name} ({os.path.getsize(xml_file):,} bytes)")
        remaining = len(xml_files) - 3
        if remaining > 0:
            print(f" ... and {remaining} more files")
# Start the download only when this file is executed as a script; importing
# it as a module does not call main().
if __name__ == "__main__":
    main()