mirror of https://github.com/stefanoamorelli/crabrl.git (synced 2026-04-18 07:10:42 +00:00)
214 lines · 7.0 KiB · Python
#!/usr/bin/env python3
"""Compare performance between crabrl and Arelle."""

import subprocess
import statistics
import sys
import time
from pathlib import Path

from tabulate import tabulate
import matplotlib.pyplot as plt

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
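
# Layout assumed by the paths below (not verified at runtime): this script
# lives in benchmarks/, the crabrl repo root is one level up, Arelle is
# installed in ./venv, and test fixtures sit in ../test_data.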

def benchmark_arelle(file_path, runs=3):
    """Benchmark Arelle parsing performance."""
    times = []

    for _ in range(runs):
        start = time.perf_counter()

        # Run Arelle in subprocess to isolate memory
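        # NB: the timer wraps the whole subprocess, so each run also pays
        # Python interpreter startup and Arelle import cost; on tiny files
        # that overhead can dominate the measurement.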
        result = subprocess.run([
            sys.executable, "-c",
            f"""
import sys
sys.path.insert(0, 'venv/lib/python{sys.version_info.major}.{sys.version_info.minor}/site-packages')
from arelle import Cntlr
from arelle import ModelManager

# Suppress Arelle output
import logging
logging.getLogger("arelle").setLevel(logging.ERROR)

controller = Cntlr.Cntlr(logFileName=None)
controller.webCache.workOffline = True
modelManager = ModelManager.initialize(controller)

# Load and parse the XBRL file
modelXbrl = modelManager.load('{file_path}')
if modelXbrl:
    facts = len(modelXbrl.facts)
    contexts = len(modelXbrl.contexts)
    units = len(modelXbrl.units)
    print(f"{{facts}},{{contexts}},{{units}}")
    modelXbrl.close()
"""
        ], capture_output=True, text=True, cwd=Path(__file__).parent)

        end = time.perf_counter()

        if result.returncode == 0 and result.stdout:
            times.append(end - start)
            if len(times) == 1:  # Print counts on first run
                parts = result.stdout.strip().split(',')
                if len(parts) == 3:
                    print(f" Arelle found: {parts[0]} facts, {parts[1]} contexts, {parts[2]} units")
        else:
            print(f" Arelle error: {result.stderr}")

    if times:
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'stdev': statistics.stdev(times) if len(times) > 1 else 0,
            'min': min(times),
            'max': max(times),
            'runs': len(times)
        }
    return None

def benchmark_crabrl(file_path, runs=3):
    """Benchmark crabrl parsing performance."""
    times = []

    # Build the benchmark binary if needed; bail out early on a build
    # failure instead of producing confusing errors in the timed loop
    build = subprocess.run(["cargo", "build", "--release", "--example", "benchmark_single"],
                           capture_output=True, text=True, cwd=Path(__file__).parent.parent)
    if build.returncode != 0:
        print(f" cargo build failed: {build.stderr}")
        return None
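    # The build runs once, before the timed loop, so compilation cost is
    # excluded from the measurements below.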
    for _ in range(runs):
        start = time.perf_counter()

        result = subprocess.run([
            "../target/release/examples/benchmark_single",
            file_path
        ], capture_output=True, text=True, cwd=Path(__file__).parent)

        end = time.perf_counter()

        if result.returncode == 0:
            times.append(end - start)
            if len(times) == 1 and result.stdout:  # Print counts on first run
                print(f" crabrl output: {result.stdout.strip()}")
        else:
            print(f" crabrl error: {result.stderr}")

    if times:
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'stdev': statistics.stdev(times) if len(times) > 1 else 0,
            'min': min(times),
            'max': max(times),
            'runs': len(times)
        }
    return None

def main():
    """Run comparative benchmarks."""
    print("=" * 80)
    print("XBRL Parser Performance Comparison: crabrl vs Arelle")
    print("=" * 80)

    test_files = [
        ("Tiny (10 facts)", "../test_data/test_tiny.xbrl"),
        ("Small (100 facts)", "../test_data/test_small.xbrl"),
        ("Medium (1K facts)", "../test_data/test_medium.xbrl"),
        ("Large (10K facts)", "../test_data/test_large.xbrl"),
        ("Huge (100K facts)", "../test_data/test_huge.xbrl"),
    ]
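    # Fixtures are assumed to be generated beforehand in ../test_data;
    # any missing file is skipped below rather than treated as an error.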
    results = []

    for name, file_path in test_files:
        if not Path(file_path).exists():
            print(f"Skipping {name}: file not found")
            continue

        file_size_mb = Path(file_path).stat().st_size / (1024 * 1024)
        print(f"\nBenchmarking {name} ({file_size_mb:.2f} MB)...")

        # Benchmark Arelle
        print(" Running Arelle...")
        arelle_stats = benchmark_arelle(file_path, runs=5)

        # Benchmark crabrl
        print(" Running crabrl...")
        crabrl_stats = benchmark_crabrl(file_path, runs=5)
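        # Compare medians rather than means so one slow outlier run
        # (e.g. a cold file cache) does not skew the speedup figure.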
        if arelle_stats and crabrl_stats:
            speedup = arelle_stats['median'] / crabrl_stats['median']
            results.append({
                'File': name,
                'Size (MB)': f"{file_size_mb:.2f}",
                'Arelle (ms)': f"{arelle_stats['median']*1000:.1f}",
                'crabrl (ms)': f"{crabrl_stats['median']*1000:.1f}",
                'Speedup': f"{speedup:.1f}x",
                'arelle_raw': arelle_stats['median'],
                'crabrl_raw': crabrl_stats['median'],
            })

    # Print results table
    print("\n" + "=" * 80)
    print("RESULTS SUMMARY")
    print("=" * 80)

    if results:
        table_data = [{k: v for k, v in r.items() if not k.endswith('_raw')} for r in results]
        print(tabulate(table_data, headers="keys", tablefmt="grid"))

        # Calculate average speedup
        speedups = [r['arelle_raw'] / r['crabrl_raw'] for r in results]
        avg_speedup = statistics.mean(speedups)
        print(f"\nAverage speedup: {avg_speedup:.1f}x faster than Arelle")

        # Create performance chart
        create_performance_chart(results)
    else:
        print("No results to display")

def create_performance_chart(results):
    """Create a performance comparison chart."""
    labels = [r['File'].split('(')[0].strip() for r in results]
    arelle_times = [r['arelle_raw'] * 1000 for r in results]
    crabrl_times = [r['crabrl_raw'] * 1000 for r in results]

    x = range(len(labels))
    width = 0.35

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
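    # Left panel: absolute parse times; right panel: speedup factor per file size.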
    # Bar chart
    ax1.bar([i - width/2 for i in x], arelle_times, width, label='Arelle', color='#FF6B6B')
    ax1.bar([i + width/2 for i in x], crabrl_times, width, label='crabrl', color='#4ECDC4')
    ax1.set_xlabel('File Size')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Parsing Time Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=45)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Speedup chart
    speedups = [a/c for a, c in zip(arelle_times, crabrl_times)]
    ax2.bar(x, speedups, color='#95E77E')
    ax2.set_xlabel('File Size')
    ax2.set_ylabel('Speedup Factor')
    ax2.set_title('crabrl Speedup over Arelle')
    ax2.set_xticks(x)
    ax2.set_xticklabels(labels, rotation=45)
    ax2.grid(True, alpha=0.3)

    # Add value labels on bars
    for i, v in enumerate(speedups):
        ax2.text(i, v + 0.5, f'{v:.1f}x', ha='center', va='bottom')

    plt.tight_layout()
    # Save next to this script so the chart lands in benchmarks/ regardless of cwd
    out_path = Path(__file__).parent / 'benchmark_results.png'
    plt.savefig(out_path, dpi=150)
    print(f"\nPerformance chart saved to: {out_path}")

if __name__ == "__main__":
    main()