feat(taxonomy): add rust sidecar compact surface pipeline

This commit is contained in:
2026-03-12 15:23:10 -04:00
parent f2c25fb9c6
commit 58061af006
84 changed files with 19350 additions and 265 deletions

View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""
Compare crabrl performance with Arelle
"""
import subprocess
import time
import sys
from pathlib import Path
def run_crabrl(filepath):
    """Run the crabrl binary on *filepath* and measure wall-clock time.

    Returns:
        (elapsed_ms, fact_count) on success, or (None, 0) when the binary
        is missing or exits non-zero.
    """
    cmd = ["../target/release/crabrl", "parse", filepath]
    start = time.perf_counter()
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError:
        # Binary not built or not at the expected path: report failure the
        # same way as a non-zero exit instead of crashing the script.
        return None, 0
    elapsed = (time.perf_counter() - start) * 1000
    if result.returncode == 0:
        # Parse output for fact count (a line like "Facts: 123")
        facts = 0
        for line in result.stdout.split('\n'):
            if 'Facts:' in line:
                try:
                    facts = int(line.split(':')[1].strip())
                except ValueError:
                    pass  # malformed count: keep 0 rather than abort
                break
        return elapsed, facts
    return None, 0
def run_arelle(filepath):
    """Run Arelle's command line on *filepath* and measure wall-clock time.

    Returns:
        Elapsed milliseconds on success, or None when Arelle is not
        installed, the run exceeds the 30 s timeout, or it exits non-zero.
    """
    cmd = ["python3", "-m", "arelle.CntlrCmdLine",
           "--file", filepath, "--skipDTS", "--logLevel", "ERROR"]
    try:
        start = time.perf_counter()
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        elapsed = (time.perf_counter() - start) * 1000
    except (subprocess.TimeoutExpired, OSError):
        # A bare `except:` here would also swallow KeyboardInterrupt and
        # SystemExit; only timeouts and a missing interpreter mean "failed".
        return None
    if result.returncode == 0:
        return elapsed
    return None
def main():
    """Compare crabrl vs. Arelle parse time on a single XBRL file.

    Usage: compare.py <xbrl-file>
    """
    if len(sys.argv) < 2:
        print("Usage: compare.py <xbrl-file>")
        sys.exit(1)
    filepath = sys.argv[1]
    print(f"Comparing performance on: {filepath}\n")
    # Run crabrl
    crabrl_time, facts = run_crabrl(filepath)
    # Compare against None, not truthiness: a 0.0 ms measurement would
    # otherwise be misreported as a failure.
    if crabrl_time is not None:
        print(f"crabrl: {crabrl_time:.1f}ms ({facts} facts)")
    else:
        print("crabrl: Failed")
    # Run Arelle
    arelle_time = run_arelle(filepath)
    if arelle_time is not None:
        print(f"Arelle: {arelle_time:.1f}ms")
    else:
        print("Arelle: Failed or not installed")
    # Calculate speedup (truthiness here also guards the divide-by-zero)
    if crabrl_time and arelle_time:
        speedup = arelle_time / crabrl_time
        print(f"\nSpeedup: {speedup:.1f}x faster")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""Compare performance between crabrl and Arelle."""
import os
import sys
import time
import subprocess
import json
import statistics
from pathlib import Path
from tabulate import tabulate
import matplotlib.pyplot as plt
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
def benchmark_arelle(file_path, runs=3):
    """Benchmark Arelle parsing performance.

    Each run spawns a fresh Python subprocess (isolating Arelle's memory)
    that loads *file_path* and prints "facts,contexts,units" on success.

    Returns:
        Dict of timing stats in seconds (mean/median/stdev/min/max/runs),
        or None if every run failed.
    """
    times = []
    # Build the worker script once (loop-invariant). {file_path!r} embeds
    # the path as a proper Python literal, so quotes or backslashes in the
    # path can no longer break — or inject code into — the generated source.
    script = f"""
import sys
sys.path.insert(0, 'venv/lib/python{sys.version_info.major}.{sys.version_info.minor}/site-packages')
from arelle import Cntlr
from arelle import ModelManager
# Suppress Arelle output
import logging
logging.getLogger("arelle").setLevel(logging.ERROR)
controller = Cntlr.Cntlr(logFileName=None)
controller.webCache.workOffline = True
modelManager = ModelManager.initialize(controller)
# Load and parse the XBRL file
modelXbrl = modelManager.load({file_path!r})
if modelXbrl:
    facts = len(modelXbrl.facts)
    contexts = len(modelXbrl.contexts)
    units = len(modelXbrl.units)
    print(f"{{facts}},{{contexts}},{{units}}")
    modelXbrl.close()
"""
    for _ in range(runs):
        start = time.perf_counter()
        # Run Arelle in subprocess to isolate memory
        result = subprocess.run(
            [sys.executable, "-c", script],
            capture_output=True, text=True, cwd=Path(__file__).parent)
        end = time.perf_counter()
        if result.returncode == 0 and result.stdout:
            times.append(end - start)
            if len(times) == 1:  # Print counts on first run
                parts = result.stdout.strip().split(',')
                if len(parts) == 3:
                    print(f"  Arelle found: {parts[0]} facts, {parts[1]} contexts, {parts[2]} units")
        else:
            print(f"  Arelle error: {result.stderr}")
    if times:
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'stdev': statistics.stdev(times) if len(times) > 1 else 0,
            'min': min(times),
            'max': max(times),
            'runs': len(times)
        }
    return None
def benchmark_crabrl(file_path, runs=3):
    """Benchmark crabrl parsing performance.

    Builds the benchmark_single example (best effort) and times *runs*
    invocations of it on *file_path*.

    Returns:
        Dict of timing stats in seconds (mean/median/stdev/min/max/runs),
        or None if the binary is unavailable or every run failed.
    """
    times = []
    # Build the benchmark binary if needed. Best effort: a missing `cargo`
    # just means we fall back to a previously built binary, if any.
    try:
        subprocess.run(["cargo", "build", "--release", "--example", "benchmark_single"],
                       capture_output=True, cwd=Path(__file__).parent.parent)
    except OSError:
        pass
    for _ in range(runs):
        start = time.perf_counter()
        try:
            result = subprocess.run([
                "../target/release/examples/benchmark_single",
                file_path
            ], capture_output=True, text=True, cwd=Path(__file__).parent)
        except OSError as exc:
            # Previously an absent binary crashed the whole comparison with
            # an unhandled FileNotFoundError; report it like a failed run.
            print(f"  crabrl error: {exc}")
            return None
        end = time.perf_counter()
        if result.returncode == 0:
            times.append(end - start)
            if len(times) == 1 and result.stdout:  # Print counts on first run
                print(f"  crabrl output: {result.stdout.strip()}")
        else:
            print(f"  crabrl error: {result.stderr}")
    if times:
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'stdev': statistics.stdev(times) if len(times) > 1 else 0,
            'min': min(times),
            'max': max(times),
            'runs': len(times)
        }
    return None
def main():
    """Run comparative benchmarks across a ladder of XBRL file sizes."""
    divider = "=" * 80
    print(divider)
    print("XBRL Parser Performance Comparison: crabrl vs Arelle")
    print(divider)
    test_files = [
        ("Tiny (10 facts)", "../test_data/test_tiny.xbrl"),
        ("Small (100 facts)", "../test_data/test_small.xbrl"),
        ("Medium (1K facts)", "../test_data/test_medium.xbrl"),
        ("Large (10K facts)", "../test_data/test_large.xbrl"),
        ("Huge (100K facts)", "../test_data/test_huge.xbrl"),
    ]
    results = []
    for name, file_path in test_files:
        path = Path(file_path)
        if not path.exists():
            print(f"Skipping {name}: file not found")
            continue
        file_size_mb = path.stat().st_size / (1024 * 1024)
        print(f"\nBenchmarking {name} ({file_size_mb:.2f} MB)...")
        # Time each parser; downstream reporting uses the median of 5 runs.
        print("  Running Arelle...")
        arelle_stats = benchmark_arelle(file_path, runs=5)
        print("  Running crabrl...")
        crabrl_stats = benchmark_crabrl(file_path, runs=5)
        if not (arelle_stats and crabrl_stats):
            continue
        speedup = arelle_stats['median'] / crabrl_stats['median']
        results.append({
            'File': name,
            'Size (MB)': f"{file_size_mb:.2f}",
            'Arelle (ms)': f"{arelle_stats['median']*1000:.1f}",
            'crabrl (ms)': f"{crabrl_stats['median']*1000:.1f}",
            'Speedup': f"{speedup:.1f}x",
            'arelle_raw': arelle_stats['median'],
            'crabrl_raw': crabrl_stats['median'],
        })
    # Summary section: tabulated results plus the mean speedup and a chart.
    print("\n" + divider)
    print("RESULTS SUMMARY")
    print(divider)
    if not results:
        print("No results to display")
        return
    display_rows = [
        {key: val for key, val in row.items() if not key.endswith('_raw')}
        for row in results
    ]
    print(tabulate(display_rows, headers="keys", tablefmt="grid"))
    avg_speedup = statistics.mean(
        row['arelle_raw'] / row['crabrl_raw'] for row in results)
    print(f"\nAverage speedup: {avg_speedup:.1f}x faster than Arelle")
    create_performance_chart(results)
def create_performance_chart(results):
    """Render and save a two-panel comparison chart for the benchmark runs."""
    labels = []
    arelle_ms = []
    crabrl_ms = []
    for row in results:
        labels.append(row['File'].split('(')[0].strip())
        arelle_ms.append(row['arelle_raw'] * 1000)
        crabrl_ms.append(row['crabrl_raw'] * 1000)
    positions = list(range(len(labels)))
    bar_width = 0.35
    fig, (time_ax, speed_ax) = plt.subplots(1, 2, figsize=(14, 6))
    # Left panel: absolute parse times, side-by-side bars per file size.
    time_ax.bar([p - bar_width / 2 for p in positions], arelle_ms, bar_width,
                label='Arelle', color='#FF6B6B')
    time_ax.bar([p + bar_width / 2 for p in positions], crabrl_ms, bar_width,
                label='crabrl', color='#4ECDC4')
    time_ax.set_xlabel('File Size')
    time_ax.set_ylabel('Time (ms)')
    time_ax.set_title('Parsing Time Comparison')
    time_ax.set_xticks(positions)
    time_ax.set_xticklabels(labels, rotation=45)
    time_ax.legend()
    time_ax.grid(True, alpha=0.3)
    # Right panel: speedup factor per file, annotated above each bar.
    speedups = [a / c for a, c in zip(arelle_ms, crabrl_ms)]
    speed_ax.bar(positions, speedups, color='#95E77E')
    speed_ax.set_xlabel('File Size')
    speed_ax.set_ylabel('Speedup Factor')
    speed_ax.set_title('crabrl Speedup over Arelle')
    speed_ax.set_xticks(positions)
    speed_ax.set_xticklabels(labels, rotation=45)
    speed_ax.grid(True, alpha=0.3)
    for idx, factor in enumerate(speedups):
        speed_ax.text(idx, factor + 0.5, f'{factor:.1f}x', ha='center', va='bottom')
    plt.tight_layout()
    plt.savefig('benchmark_results.png', dpi=150)
    print(f"\nPerformance chart saved to: benchmarks/benchmark_results.png")

if __name__ == "__main__":
    main()