mirror of
https://github.com/stefanoamorelli/crabrl.git
synced 2026-04-18 07:10:42 +00:00
- Parse command with optional stats flag
- Validate command with SEC EDGAR profile support
- Benchmark command for performance testing
- Colored output for better UX
151 lines
5.7 KiB
Python
151 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Download real SEC XBRL filings from various companies to use as test fixtures.
|
|
These will be used for benchmarking and testing the parser.
|
|
"""
|
|
|
|
import os
|
|
import time
|
|
import urllib.request
|
|
from pathlib import Path
|
|
|
|
# Root directory for all downloaded fixtures. The path is relative, so it is
# resolved against the current working directory; it is created up front so
# later code can assume it exists.
fixtures_dir = Path("fixtures")
fixtures_dir.mkdir(exist_ok=True)
|
|
|
|
# Real SEC XBRL filings to fetch, grouped by company.
# Every entry is a 4-tuple: (company_name, ticker, description, url).
filings = [
    # Apple
    (
        "apple",
        "AAPL",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_htm.xml",
    ),
    (
        "apple",
        "AAPL",
        "10-K 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_lab.xml",
    ),
    (
        "apple",
        "AAPL",
        "10-K 2023 Calculation",
        "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_cal.xml",
    ),

    # Microsoft
    (
        "microsoft",
        "MSFT",
        "10-Q 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_htm.xml",
    ),
    (
        "microsoft",
        "MSFT",
        "10-Q 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_lab.xml",
    ),
    (
        "microsoft",
        "MSFT",
        "10-Q 2023 Presentation",
        "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_pre.xml",
    ),

    # Tesla
    (
        "tesla",
        "TSLA",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_htm.xml",
    ),
    (
        "tesla",
        "TSLA",
        "10-K 2023 Definition",
        "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_def.xml",
    ),

    # Amazon
    (
        "amazon",
        "AMZN",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_htm.xml",
    ),
    (
        "amazon",
        "AMZN",
        "10-K 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_lab.xml",
    ),

    # Google/Alphabet
    (
        "alphabet",
        "GOOGL",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_htm.xml",
    ),
    (
        "alphabet",
        "GOOGL",
        "10-K 2023 Calculation",
        "https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_cal.xml",
    ),

    # JPMorgan Chase
    (
        "jpmorgan",
        "JPM",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_htm.xml",
    ),
    (
        "jpmorgan",
        "JPM",
        "10-K 2023 Labels",
        "https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_lab.xml",
    ),

    # Walmart
    (
        "walmart",
        "WMT",
        "10-K 2024 Instance",
        "https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_htm.xml",
    ),
    (
        "walmart",
        "WMT",
        "10-K 2024 Presentation",
        "https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_pre.xml",
    ),

    # Johnson & Johnson
    (
        "jnj",
        "JNJ",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/200406/000020040624000016/jnj-20231231_htm.xml",
    ),

    # ExxonMobil
    (
        "exxon",
        "XOM",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/34088/000003408824000013/xom-20231231_htm.xml",
    ),

    # Berkshire Hathaway
    (
        "berkshire",
        "BRK",
        "10-K 2023 Instance",
        "https://www.sec.gov/Archives/edgar/data/1067983/000095017024021825/brka-20231231_htm.xml",
    ),
]
|
|
|
|
def download_file(url, filepath, timeout=30):
    """Download a file from URL to filepath.

    The response body is first written to a sibling ``<name>.part`` file and
    only moved onto ``filepath`` once it has been written completely, so a
    failed or interrupted write never leaves a truncated file at ``filepath``
    (which a later ``filepath.exists()`` check would mistake for a finished
    download).

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.
        filepath: Destination path (``str`` or ``Path``).
        timeout: Seconds to wait on blocking network operations before
            giving up. Defaults to 30.

    Returns:
        True if the file was saved. False if any step failed; the error is
        printed and ``filepath`` is left untouched.
    """
    partial = None
    try:
        destination = Path(filepath)
        partial = destination.with_name(destination.name + ".part")

        # Add headers to avoid being blocked
        request = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'crabrl-test-fixtures/1.0 (testing@example.com)'
            },
        )

        # Without an explicit timeout a stalled connection would block forever.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            content = response.read()

        partial.write_bytes(content)
        # Atomic on the same filesystem: the target is either absent/old or
        # complete, never half-written.
        os.replace(partial, destination)
        return True
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # carry on with the remaining downloads.
        print(f" Error: {e}")
        return False
    finally:
        # Remove a leftover staging file; after a successful replace it no
        # longer exists and the resulting OSError is ignored.
        if partial is not None:
            try:
                partial.unlink()
            except OSError:
                pass
|
|
|
|
def main():
    """Download every entry in ``filings`` and print a summary.

    Files that already exist under ``fixtures_dir/<company>/`` are skipped.
    After each attempted download the loop pauses briefly before moving on.
    Finally, up to three ``*.xml`` files per company directory are listed.
    """
    banner = "=" * 60

    print("Downloading SEC XBRL fixtures from various companies...")
    print(banner)

    downloaded = 0
    failed = 0

    for company, ticker, description, url in filings:
        # One sub-directory per company
        company_dir = fixtures_dir / company
        company_dir.mkdir(exist_ok=True)

        # The local file keeps the last path segment of the URL as its name
        filepath = company_dir / url.split('/')[-1]

        print(f"\n[{ticker}] {description}")
        print(f" URL: {url}")
        print(f" Saving to: {filepath}")

        if filepath.exists():
            print(" ✓ Already exists, skipping")
            continue

        if not download_file(url, filepath):
            print(f" ✗ Failed to download")
            failed += 1
        else:
            file_size = os.path.getsize(filepath)
            print(f" ✓ Downloaded ({file_size:,} bytes)")
            downloaded += 1

        # Be polite to SEC servers
        time.sleep(0.5)

    print("\n" + banner)
    print(f"Download complete: {downloaded} downloaded, {failed} failed")
    print(f"Fixtures saved to: {fixtures_dir.absolute()}")

    # Show directory structure
    print("\nFixture structure:")
    for company_dir in sorted(fixtures_dir.iterdir()):
        if not company_dir.is_dir():
            continue

        files = sorted(company_dir.glob("*.xml"))
        if not files:
            continue

        print(f" {company_dir.name}/")
        # Only the first three files are listed individually
        for f in files[:3]:
            size = os.path.getsize(f)
            print(f" - {f.name} ({size:,} bytes)")
        if len(files) > 3:
            print(f" ... and {len(files)-3} more files")


if __name__ == "__main__":
    main()