mirror of
https://github.com/stefanoamorelli/crabrl.git
synced 2026-04-18 07:10:42 +00:00
feat: implement CLI for XBRL parsing and validation
- Parse command with optional stats flag - Validate command with SEC EDGAR profile support - Benchmark command for performance testing - Colored output for better UX
This commit is contained in:
151
scripts/download_fixtures.py
Normal file
151
scripts/download_fixtures.py
Normal file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download real SEC XBRL filings from various companies to use as test fixtures.
|
||||
These will be used for benchmarking and testing the parser.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
# Create fixtures directory
|
||||
fixtures_dir = Path("fixtures")
|
||||
fixtures_dir.mkdir(exist_ok=True)
|
||||
|
||||
# List of real SEC XBRL filings from various companies
|
||||
# Format: (company_name, ticker, description, url)
|
||||
filings = [
|
||||
# Apple filings
|
||||
("apple", "AAPL", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_htm.xml"),
|
||||
("apple", "AAPL", "10-K 2023 Labels",
|
||||
"https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_lab.xml"),
|
||||
("apple", "AAPL", "10-K 2023 Calculation",
|
||||
"https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_cal.xml"),
|
||||
|
||||
# Microsoft filings
|
||||
("microsoft", "MSFT", "10-Q 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_htm.xml"),
|
||||
("microsoft", "MSFT", "10-Q 2023 Labels",
|
||||
"https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_lab.xml"),
|
||||
("microsoft", "MSFT", "10-Q 2023 Presentation",
|
||||
"https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_pre.xml"),
|
||||
|
||||
# Tesla filings
|
||||
("tesla", "TSLA", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_htm.xml"),
|
||||
("tesla", "TSLA", "10-K 2023 Definition",
|
||||
"https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_def.xml"),
|
||||
|
||||
# Amazon filings
|
||||
("amazon", "AMZN", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_htm.xml"),
|
||||
("amazon", "AMZN", "10-K 2023 Labels",
|
||||
"https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_lab.xml"),
|
||||
|
||||
# Google/Alphabet filings
|
||||
("alphabet", "GOOGL", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_htm.xml"),
|
||||
("alphabet", "GOOGL", "10-K 2023 Calculation",
|
||||
"https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_cal.xml"),
|
||||
|
||||
# JPMorgan Chase filings
|
||||
("jpmorgan", "JPM", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_htm.xml"),
|
||||
("jpmorgan", "JPM", "10-K 2023 Labels",
|
||||
"https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_lab.xml"),
|
||||
|
||||
# Walmart filings
|
||||
("walmart", "WMT", "10-K 2024 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_htm.xml"),
|
||||
("walmart", "WMT", "10-K 2024 Presentation",
|
||||
"https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_pre.xml"),
|
||||
|
||||
# Johnson & Johnson filings
|
||||
("jnj", "JNJ", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/200406/000020040624000016/jnj-20231231_htm.xml"),
|
||||
|
||||
# ExxonMobil filings
|
||||
("exxon", "XOM", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/34088/000003408824000013/xom-20231231_htm.xml"),
|
||||
|
||||
# Berkshire Hathaway filings
|
||||
("berkshire", "BRK", "10-K 2023 Instance",
|
||||
"https://www.sec.gov/Archives/edgar/data/1067983/000095017024021825/brka-20231231_htm.xml"),
|
||||
]
|
||||
|
||||
def download_file(url, filepath):
|
||||
"""Download a file from URL to filepath."""
|
||||
try:
|
||||
# Add headers to avoid being blocked
|
||||
request = urllib.request.Request(
|
||||
url,
|
||||
headers={
|
||||
'User-Agent': 'crabrl-test-fixtures/1.0 (testing@example.com)'
|
||||
}
|
||||
)
|
||||
|
||||
with urllib.request.urlopen(request) as response:
|
||||
content = response.read()
|
||||
with open(filepath, 'wb') as f:
|
||||
f.write(content)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f" Error: {e}")
|
||||
return False
|
||||
|
||||
def main():
|
||||
print("Downloading SEC XBRL fixtures from various companies...")
|
||||
print("=" * 60)
|
||||
|
||||
downloaded = 0
|
||||
failed = 0
|
||||
|
||||
for company, ticker, description, url in filings:
|
||||
# Create company directory
|
||||
company_dir = fixtures_dir / company
|
||||
company_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Generate filename from URL
|
||||
filename = url.split('/')[-1]
|
||||
filepath = company_dir / filename
|
||||
|
||||
print(f"\n[{ticker}] {description}")
|
||||
print(f" URL: {url}")
|
||||
print(f" Saving to: {filepath}")
|
||||
|
||||
if filepath.exists():
|
||||
print(" ✓ Already exists, skipping")
|
||||
continue
|
||||
|
||||
if download_file(url, filepath):
|
||||
file_size = os.path.getsize(filepath)
|
||||
print(f" ✓ Downloaded ({file_size:,} bytes)")
|
||||
downloaded += 1
|
||||
else:
|
||||
print(f" ✗ Failed to download")
|
||||
failed += 1
|
||||
|
||||
# Be polite to SEC servers
|
||||
time.sleep(0.5)
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print(f"Download complete: {downloaded} downloaded, {failed} failed")
|
||||
print(f"Fixtures saved to: {fixtures_dir.absolute()}")
|
||||
|
||||
# Show directory structure
|
||||
print("\nFixture structure:")
|
||||
for company_dir in sorted(fixtures_dir.iterdir()):
|
||||
if company_dir.is_dir():
|
||||
files = list(company_dir.glob("*.xml"))
|
||||
if files:
|
||||
print(f" {company_dir.name}/")
|
||||
for f in sorted(files)[:3]: # Show first 3 files
|
||||
size = os.path.getsize(f)
|
||||
print(f" - {f.name} ({size:,} bytes)")
|
||||
if len(files) > 3:
|
||||
print(f" ... and {len(files)-3} more files")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user