From ea130d32996ceb11226279d38802ce4c441f6856 Mon Sep 17 00:00:00 2001 From: francy51 Date: Tue, 17 Mar 2026 19:55:38 -0400 Subject: [PATCH] Vendor crabrl-fork source and remove submodule linkage - Replace `rust/crabrl-fork` gitlink with tracked source files - Add workspace notes documenting why the fork is vendored - Update ignore rules for vendored fork build artifacts --- .gitignore | 1 + doc/rust-workspace.md | 7 + rust/crabrl-fork | 1 - rust/crabrl-fork/.gitattributes | 55 + rust/crabrl-fork/.github/workflows/ci.yml | 106 ++ .../crabrl-fork/.github/workflows/release.yml | 195 ++++ rust/crabrl-fork/.gitignore | 125 +++ rust/crabrl-fork/.rustfmt.toml | 2 + rust/crabrl-fork/CITATION.cff | 20 + rust/crabrl-fork/Cargo.toml | 72 ++ rust/crabrl-fork/LICENSE | 661 ++++++++++++ rust/crabrl-fork/README.md | 228 ++++ rust/crabrl-fork/benches/parser.rs | 37 + rust/crabrl-fork/benchmarks/compare.py | 71 ++ .../benchmarks/compare_performance.py | 214 ++++ rust/crabrl-fork/examples/benchmark_single.rs | 36 + rust/crabrl-fork/examples/parse.rs | 22 + rust/crabrl-fork/examples/validate.rs | 29 + rust/crabrl-fork/scripts/download_fixtures.py | 151 +++ .../scripts/generate_benchmark_charts.py | 260 +++++ .../scripts/generate_clean_benchmarks.py | 253 +++++ rust/crabrl-fork/src/allocator.rs | 242 +++++ rust/crabrl-fork/src/cache.rs | 47 + rust/crabrl-fork/src/instance.rs | 21 + rust/crabrl-fork/src/lib.rs | 113 ++ rust/crabrl-fork/src/linkbase.rs | 470 +++++++++ rust/crabrl-fork/src/main.rs | 181 ++++ rust/crabrl-fork/src/model.rs | 431 ++++++++ rust/crabrl-fork/src/parser.rs | 990 ++++++++++++++++++ rust/crabrl-fork/src/schema.rs | 308 ++++++ rust/crabrl-fork/src/sec.rs | 51 + rust/crabrl-fork/src/simd.rs | 303 ++++++ rust/crabrl-fork/src/simple_parser.rs | 99 ++ rust/crabrl-fork/src/taxonomy.rs | 49 + rust/crabrl-fork/src/validator.rs | 601 +++++++++++ 35 files changed, 6451 insertions(+), 1 deletion(-) create mode 100644 doc/rust-workspace.md delete mode 160000 
rust/crabrl-fork create mode 100644 rust/crabrl-fork/.gitattributes create mode 100644 rust/crabrl-fork/.github/workflows/ci.yml create mode 100644 rust/crabrl-fork/.github/workflows/release.yml create mode 100644 rust/crabrl-fork/.gitignore create mode 100644 rust/crabrl-fork/.rustfmt.toml create mode 100644 rust/crabrl-fork/CITATION.cff create mode 100644 rust/crabrl-fork/Cargo.toml create mode 100644 rust/crabrl-fork/LICENSE create mode 100644 rust/crabrl-fork/README.md create mode 100644 rust/crabrl-fork/benches/parser.rs create mode 100644 rust/crabrl-fork/benchmarks/compare.py create mode 100644 rust/crabrl-fork/benchmarks/compare_performance.py create mode 100644 rust/crabrl-fork/examples/benchmark_single.rs create mode 100644 rust/crabrl-fork/examples/parse.rs create mode 100644 rust/crabrl-fork/examples/validate.rs create mode 100644 rust/crabrl-fork/scripts/download_fixtures.py create mode 100644 rust/crabrl-fork/scripts/generate_benchmark_charts.py create mode 100644 rust/crabrl-fork/scripts/generate_clean_benchmarks.py create mode 100644 rust/crabrl-fork/src/allocator.rs create mode 100644 rust/crabrl-fork/src/cache.rs create mode 100644 rust/crabrl-fork/src/instance.rs create mode 100644 rust/crabrl-fork/src/lib.rs create mode 100644 rust/crabrl-fork/src/linkbase.rs create mode 100644 rust/crabrl-fork/src/main.rs create mode 100644 rust/crabrl-fork/src/model.rs create mode 100644 rust/crabrl-fork/src/parser.rs create mode 100644 rust/crabrl-fork/src/schema.rs create mode 100644 rust/crabrl-fork/src/sec.rs create mode 100644 rust/crabrl-fork/src/simd.rs create mode 100644 rust/crabrl-fork/src/simple_parser.rs create mode 100644 rust/crabrl-fork/src/taxonomy.rs create mode 100644 rust/crabrl-fork/src/validator.rs diff --git a/.gitignore b/.gitignore index 5241b53..a06449c 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ data/*.sqlite-wal .workflow-data/ output/ rust/target/ +rust/crabrl-fork/target/ rust/vendor/crabrl/.git-vendor/ 
bin/fiscal-xbrl diff --git a/doc/rust-workspace.md b/doc/rust-workspace.md new file mode 100644 index 0000000..82a57e8 --- /dev/null +++ b/doc/rust-workspace.md @@ -0,0 +1,7 @@ +# Rust Workspace Notes + +`rust/crabrl-fork` is intentionally vendored into this repository as normal tracked source files. + +This is required for clean-clone deployment environments such as Coolify. Deploy builds clone only the main repository, so `crabrl-fork` must exist directly in the checkout and must not rely on nested Git metadata, a submodule checkout, or an external recursive clone step. + +When updating the fork, sync its source intentionally from the upstream fork repository and commit the resulting files into this repository. Do not reintroduce `rust/crabrl-fork` as a submodule, gitlink, or nested repository. diff --git a/rust/crabrl-fork b/rust/crabrl-fork deleted file mode 160000 index bba0aa7..0000000 --- a/rust/crabrl-fork +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bba0aa7fd7761901c4765952ee7a1d760ff06ec6 diff --git a/rust/crabrl-fork/.gitattributes b/rust/crabrl-fork/.gitattributes new file mode 100644 index 0000000..99d8419 --- /dev/null +++ b/rust/crabrl-fork/.gitattributes @@ -0,0 +1,55 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Rust files +*.rs text eol=lf +*.toml text eol=lf +Cargo.lock text eol=lf + +# Python files +*.py text eol=lf +*.pyx text eol=lf +*.pxd text eol=lf + +# Documentation +*.md text eol=lf +*.txt text eol=lf +LICENSE text eol=lf + +# Config files +*.json text eol=lf +*.yaml text eol=lf +*.yml text eol=lf +*.xml text eol=lf +*.xsd text eol=lf +*.xbrl text eol=lf + +# Shell scripts +*.sh text eol=lf +*.bash text eol=lf + +# Git files +.gitignore text eol=lf +.gitattributes text eol=lf + +# Binary files +*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.ico binary +*.pdf binary +*.zip binary +*.gz binary +*.tar binary +*.7z binary +*.exe binary +*.dll binary +*.so binary +*.dylib binary + +# Linguist 
overrides - ensure Rust is recognized as primary language +*.rs linguist-language=Rust +benchmarks/*.py linguist-documentation +scripts/*.py linguist-documentation +examples/* linguist-documentation \ No newline at end of file diff --git a/rust/crabrl-fork/.github/workflows/ci.yml b/rust/crabrl-fork/.github/workflows/ci.yml new file mode 100644 index 0000000..518fca9 --- /dev/null +++ b/rust/crabrl-fork/.github/workflows/ci.yml @@ -0,0 +1,106 @@ +name: CI + +on: + push: + branches: [ main, master ] + pull_request: + branches: [ main, master ] + +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + +jobs: + test: + name: Test - ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + rust: [stable, beta] + exclude: + - os: windows-latest + rust: beta + - os: macos-latest + rust: beta + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.rust }} + components: rustfmt, clippy + + - name: Cache cargo registry + uses: actions/cache@v4 + with: + path: ~/.cargo/registry + key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} + + - name: Cache cargo index + uses: actions/cache@v4 + with: + path: ~/.cargo/git + key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} + + - name: Cache cargo build + uses: actions/cache@v4 + with: + path: target + key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }} + + - name: Check formatting + run: cargo fmt -- --check + + - name: Run clippy + run: cargo clippy --all-features -- -D warnings + + - name: Build + run: cargo build --verbose --all-features + + - name: Run tests + run: cargo test --verbose --all-features + + - name: Build release + run: cargo build --release --all-features + + - name: Run benchmarks (smoke test) + run: cargo bench --no-run + + coverage: + name: Code Coverage + runs-on: ubuntu-latest + + steps: + - uses: 
actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + with: + components: llvm-tools-preview + + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + + - name: Generate code coverage + run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + files: lcov.info + fail_ci_if_error: false + + security-audit: + name: Security Audit + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Run cargo audit + uses: actions-rs/audit-check@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/rust/crabrl-fork/.github/workflows/release.yml b/rust/crabrl-fork/.github/workflows/release.yml new file mode 100644 index 0000000..ff6d5ef --- /dev/null +++ b/rust/crabrl-fork/.github/workflows/release.yml @@ -0,0 +1,195 @@ +name: Release + +on: + push: + tags: + - 'v*' + workflow_dispatch: + inputs: + version: + description: 'Version to publish (e.g., 0.1.0)' + required: true + type: string + +env: + CARGO_TERM_COLOR: always + +jobs: + test: + name: Final Tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + + - name: Run tests + run: cargo test --all-features --release + + # Temporarily skip formatting check to get initial release out + # - name: Check formatting + # run: cargo fmt -- --check + + # - name: Run clippy + # run: cargo clippy --all-features -- -D warnings + + publish-crates-io: + name: Publish to crates.io + needs: test + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + + - name: Verify version + run: | + # Extract version from Cargo.toml + CARGO_VERSION=$(grep -E "^version" Cargo.toml | head -1 | cut -d'"' -f2) + echo "Cargo.toml version: 
$CARGO_VERSION" + + # For manual workflow dispatch + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + INPUT_VERSION="${{ github.event.inputs.version }}" + if [ "$CARGO_VERSION" != "$INPUT_VERSION" ]; then + echo "Error: Cargo.toml version ($CARGO_VERSION) doesn't match input version ($INPUT_VERSION)" + exit 1 + fi + fi + + # For tag push + if [ "${{ github.event_name }}" = "push" ]; then + TAG_VERSION="${GITHUB_REF#refs/tags/v}" + if [ "$CARGO_VERSION" != "$TAG_VERSION" ]; then + echo "Error: Cargo.toml version ($CARGO_VERSION) doesn't match tag version ($TAG_VERSION)" + exit 1 + fi + fi + + - name: Check if version exists on crates.io + run: | + CRATE_NAME=$(grep -E "^name" Cargo.toml | head -1 | cut -d'"' -f2) + VERSION=$(grep -E "^version" Cargo.toml | head -1 | cut -d'"' -f2) + + if cargo search "$CRATE_NAME" | grep -q "^$CRATE_NAME = \"$VERSION\""; then + echo "Version $VERSION already exists on crates.io" + exit 1 + fi + + - name: Build release + run: cargo build --release --all-features + + - name: Package for crates.io + run: cargo package --all-features + + - name: Publish to crates.io + run: cargo publish --all-features --token ${{ secrets.CARGO_REGISTRY_TOKEN }} + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + create-github-release: + name: Create GitHub Release + needs: publish-crates-io + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + + - name: Build release binaries + run: | + cargo build --release --all-features + mkdir -p release + cp target/release/crabrl release/crabrl-linux-x64 + chmod +x release/crabrl-linux-x64 + + - name: Create Release + uses: softprops/action-gh-release@v2 + with: + files: release/* + generate_release_notes: true + body: | + ## Installation + + ### From crates.io + ```bash + cargo install crabrl + ``` + + ### Download Binary + Download the pre-built binary for your platform from the assets below. 
+ + ## What's Changed + See the full changelog below. + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-cross-platform: + name: Build ${{ matrix.target }} + needs: test + runs-on: ${{ matrix.os }} + strategy: + matrix: + include: + - os: ubuntu-latest + target: x86_64-unknown-linux-gnu + artifact: crabrl-linux-x64 + - os: ubuntu-latest + target: aarch64-unknown-linux-gnu + artifact: crabrl-linux-arm64 + use-cross: true + - os: windows-latest + target: x86_64-pc-windows-msvc + artifact: crabrl-windows-x64.exe + - os: macos-latest + target: x86_64-apple-darwin + artifact: crabrl-macos-x64 + - os: macos-latest + target: aarch64-apple-darwin + artifact: crabrl-macos-arm64 + + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Install cross + if: matrix.use-cross + run: cargo install cross + + - name: Build + run: | + if [ "${{ matrix.use-cross }}" = "true" ]; then + cross build --release --target ${{ matrix.target }} --all-features + else + cargo build --release --target ${{ matrix.target }} --all-features + fi + shell: bash + + - name: Package + run: | + mkdir -p release + if [ "${{ matrix.os }}" = "windows-latest" ]; then + cp target/${{ matrix.target }}/release/crabrl.exe release/${{ matrix.artifact }} + else + cp target/${{ matrix.target }}/release/crabrl release/${{ matrix.artifact }} + chmod +x release/${{ matrix.artifact }} + fi + shell: bash + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.artifact }} + path: release/${{ matrix.artifact }} \ No newline at end of file diff --git a/rust/crabrl-fork/.gitignore b/rust/crabrl-fork/.gitignore new file mode 100644 index 0000000..a1ceb39 --- /dev/null +++ b/rust/crabrl-fork/.gitignore @@ -0,0 +1,125 @@ +# Rust +/target/ +**/*.rs.bk +*.pdb +Cargo.lock + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +ENV/ +env/ +.venv +.env + +# Virtual 
environments +benchmarks/venv/ +**/venv/ +**/virtualenv/ +**/.venv/ + +# Test data and fixtures +test_data/ +benchmarks/fixtures/ +fixtures/ + +# Benchmark outputs +*.png +*.json +benchmark_results/ +benchmarks/*.png +benchmarks/*.json +benchmarks/*_results.json + +# IDE +.idea/ +.vscode/ +*.swp +*.swo +*~ +.DS_Store + +# Build artifacts +*.o +*.a +*.so +*.dll +*.exe +*.out + +# Documentation +/target/doc/ +/target/debug/ +/target/release/ + +# Logs +*.log +logs/ + +# Coverage +*.profraw +*.profdata +/target/coverage/ +tarpaulin-report.html +cobertura.xml + +# OS files +.DS_Store +Thumbs.db +desktop.ini + +# Temporary files +*.tmp +*.temp +*.bak +.cache/ +tmp/ + +# Large test files +*.xbrl +*.xml +!examples/*.xml +!tests/fixtures/*.xml + +# Downloaded SEC filings +benchmarks/fixtures/ +scripts/fixtures/ + +# Benchmark comparison artifacts +benchmarks/benchmark_results.png +benchmarks/synthetic_benchmark_chart.png +benchmarks/real_benchmark_chart.png +benchmarks/sec_comparison_results.json +benchmarks/synthetic_benchmark_results.json +benchmarks/real_benchmark_results.json +benchmarks/real_filing_results.json + +# Python artifacts from benchmarking +*.pyc +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.hypothesis/ + +# Backup files +*.backup +*.old +*.orig + +# Archives +*.zip +*.tar.gz +*.tar.bz2 +*.7z +*.rar + +# Keep important config examples +!.gitignore +!.github/ +!examples/.gitkeep +!tests/fixtures/.gitkeep \ No newline at end of file diff --git a/rust/crabrl-fork/.rustfmt.toml b/rust/crabrl-fork/.rustfmt.toml new file mode 100644 index 0000000..c0778a0 --- /dev/null +++ b/rust/crabrl-fork/.rustfmt.toml @@ -0,0 +1,2 @@ +# Rust formatting configuration +edition = "2021" \ No newline at end of file diff --git a/rust/crabrl-fork/CITATION.cff b/rust/crabrl-fork/CITATION.cff new file mode 100644 index 0000000..c781ad3 --- /dev/null +++ b/rust/crabrl-fork/CITATION.cff @@ -0,0 +1,20 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." 
+authors: + - family-names: "Amorelli" + given-names: "Stefano" + email: "stefano@amorelli.tech" + orcid: "https://orcid.org/0009-0004-4917-0999" +title: "crabrl: High-performance XBRL parser for SEC EDGAR filings" +version: 0.1.0 +date-released: 2025-01-16 +url: "https://github.com/stefanoamorelli/crabrl" +repository-code: "https://github.com/stefanoamorelli/crabrl" +license: AGPL-3.0 +keywords: + - xbrl + - parser + - sec-edgar + - finance + - rust +abstract: "A high-performance XBRL parser and validator written in Rust, optimized for SEC EDGAR filings. Achieves 50-150x performance gains over traditional parsers through zero-copy parsing, memory-mapped I/O, and Rust's ownership model." \ No newline at end of file diff --git a/rust/crabrl-fork/Cargo.toml b/rust/crabrl-fork/Cargo.toml new file mode 100644 index 0000000..9455434 --- /dev/null +++ b/rust/crabrl-fork/Cargo.toml @@ -0,0 +1,72 @@ +[package] +name = "crabrl" +version = "0.1.0" +edition = "2021" +authors = ["Stefano Amorelli "] +description = "High-performance XBRL parser and validator" +license = "AGPL-3.0" +repository = "https://github.com/stefanoamorelli/crabrl" +keywords = ["xbrl", "parser", "finance", "sec", "edgar"] +categories = ["parser-implementations", "finance", "command-line-utilities"] + +[dependencies] +# Core +quick-xml = "0.36" +chrono = { version = "0.4", features = ["serde"] } +compact_str = { version = "0.8", features = ["serde"] } + +# Performance +ahash = "0.8" +parking_lot = "0.12" +memchr = "2.7" +bumpalo = "3.16" +string-interner = "0.18" +rayon = { version = "1.10", optional = true } +memmap2 = { version = "0.9", optional = true } +mimalloc = { version = "0.1", default-features = false } +bitflags = "2.6" + +# Async support +tokio = { version = "1.40", features = ["fs", "io-util"], optional = true } +async-stream = { version = "0.3", optional = true } + +# CLI +clap = { version = "4.5", features = ["derive"], optional = true } +colored = { version = "2.1", optional = true } + +# 
Serialization +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" + +# Error handling +thiserror = "2.0" +anyhow = "1.0" + +[dev-dependencies] +criterion = "0.5" +pretty_assertions = "1.4" +tempfile = "3.15" + +[[bin]] +name = "crabrl" +required-features = ["cli"] + +[[bench]] +name = "parser" +harness = false + +[features] +default = ["cli", "parallel"] +cli = ["clap", "colored"] +parallel = ["rayon"] +mmap = ["memmap2"] +async = ["tokio", "async-stream"] + +[profile.release] +lto = "fat" +codegen-units = 1 +opt-level = 3 +strip = true + +[profile.bench] +inherits = "release" \ No newline at end of file diff --git a/rust/crabrl-fork/LICENSE b/rust/crabrl-fork/LICENSE new file mode 100644 index 0000000..e797e25 --- /dev/null +++ b/rust/crabrl-fork/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. 
+ + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. 
+ + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + crabrl - fast XBRL parsers and validator in Rust + Copyright (C) 2025 Stefano Amorelli + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. 
For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. diff --git a/rust/crabrl-fork/README.md b/rust/crabrl-fork/README.md new file mode 100644 index 0000000..6f2281a --- /dev/null +++ b/rust/crabrl-fork/README.md @@ -0,0 +1,228 @@ +# crabrl 🦀 + +[![Crates.io](https://img.shields.io/crates/v/crabrl.svg)](https://crates.io/crates/crabrl) +[![CI Status](https://github.com/stefanoamorelli/crabrl/workflows/CI/badge.svg)](https://github.com/stefanoamorelli/crabrl/actions) +[![License: AGPL v3](https://img.shields.io/badge/License-AGPL%20v3-blue.svg)](https://www.gnu.org/licenses/agpl-3.0) +[![Rust Version](https://img.shields.io/badge/rust-1.75%2B-orange.svg)](https://www.rust-lang.org) +[![Downloads](https://img.shields.io/crates/d/crabrl.svg)](https://crates.io/crates/crabrl) +[![docs.rs](https://docs.rs/crabrl/badge.svg)](https://docs.rs/crabrl) + +![crabrl Performance](benchmarks/header.png) + +Lightning-fast XBRL parser that's **50-150x faster** than traditional parsers, built for speed and accuracy when processing [SEC EDGAR](https://www.sec.gov/edgar) filings. 
+ +## Performance + +![Performance Benchmarks](benchmarks/performance_charts.png) + +### Speed Comparison + +![Speed Comparison](benchmarks/speed_comparison_clean.png) + +**Key Performance Metrics:** +- **50-150x faster** than traditional XBRL parsers +- **140,000+ facts/second** throughput +- **< 50MB memory** for 100K facts +- **Linear scaling** with file size + +## Technical Architecture + +crabrl is built on Rust's zero-cost abstractions and modern parsing techniques. While established parsers like [Arelle](https://arelle.org/) provide comprehensive XBRL specification support and extensive validation capabilities, crabrl focuses on high-performance parsing for scenarios where speed is critical. + +### Implementation Details + +| Optimization | Impact | Technology | +|-------------|---------|------------| +| **Zero-copy parsing** | -90% memory allocs | [`quick-xml`](https://github.com/tafia/quick-xml) with string slicing | +| **No garbage collection** | Predictable latency | Rust's ownership model | +| **Faster hashmaps** | 2x lookup speed | [`ahash`](https://github.com/tkaitchuck/aHash) instead of default hasher | +| **Compact strings** | -50% memory for small strings | [`compact_str`](https://github.com/ParkMyCar/compact_str) | +| **Parallelization** | 4-8x on multicore | [`rayon`](https://github.com/rayon-rs/rayon) work-stealing | +| **Memory mapping** | Zero-copy file I/O | [`memmap2`](https://github.com/RazrFalcon/memmap2-rs) | +| **Better allocator** | -25% allocation time | [`mimalloc`](https://github.com/microsoft/mimalloc) | + +**Benchmark results:** 100,000 XBRL facts parsed in 56ms (crabrl) vs 2,672ms (Arelle) on identical hardware. 
+ +## XBRL Support Status + +| Feature | Description | Status | +|---------|-------------|---------| +| **XBRL 2.1 Instance** | Parse facts, contexts, units from `.xml` files | ✅ Stable | +| **SEC Validation** | EDGAR-specific rules and checks | ✅ Stable | +| **Calculation Linkbase** | Validate arithmetic relationships | ✅ Stable | +| **Presentation Linkbase** | Extract display hierarchy | 🚧 Beta | +| **Label Linkbase** | Human-readable concept names | 🚧 Beta | +| **Definition Linkbase** | Dimensional relationships | 📋 Planned | +| **Formula Linkbase** | Business rules validation | 📋 Planned | +| **Inline XBRL (iXBRL)** | HTML-embedded XBRL | 📋 Planned | + +## Installation + +### From crates.io +```bash +cargo install crabrl +``` + +### From Source +```bash +git clone https://github.com/stefanoamorelli/crabrl +cd crabrl +cargo build --release --features cli +``` + +### As Library Dependency +```toml +[dependencies] +crabrl = "0.1.0" +``` + +## Usage + +### CLI + +```bash +# Parse and display summary +crabrl parse filing.xml + +# Parse with statistics (timing and throughput) +crabrl parse filing.xml --stats + +# Validate with generic rules +crabrl validate filing.xml + +# Validate with SEC EDGAR rules +crabrl validate filing.xml --profile sec-edgar + +# Validate with strict mode (warnings as errors) +crabrl validate filing.xml --strict + +# Benchmark performance +crabrl bench filing.xml --iterations 100 +``` + +### Library + +#### Basic Usage + +```rust +use crabrl::Parser; + +// Parse XBRL document +let parser = Parser::new(); +let doc = parser.parse_file("filing.xml")?; + +// Access parsed data +println!("Facts: {}", doc.facts.len()); +println!("Contexts: {}", doc.contexts.len()); +println!("Units: {}", doc.units.len()); +``` + +#### Parse from Different Sources + +```rust +// From file path +let doc = parser.parse_file("filing.xml")?; + +// From bytes +let xml_bytes = std::fs::read("filing.xml")?; +let doc = parser.parse_bytes(&xml_bytes)?; +``` + +#### 
Validation + +```rust +use crabrl::{Parser, Validator}; + +let parser = Parser::new(); +let doc = parser.parse_file("filing.xml")?; + +// Generic validation +let validator = Validator::new(); +let result = validator.validate(&doc)?; + +if result.is_valid { + println!("Document is valid!"); +} else { + for error in &result.errors { + eprintln!("Error: {}", error); + } +} + +// SEC EDGAR validation (stricter rules) +let sec_validator = Validator::sec_edgar(); +let sec_result = sec_validator.validate(&doc)?; +``` + +## Performance Measurements + +Performance comparison with [Arelle](https://arelle.org/) v2.17.4 (Python-based XBRL processor with full specification support): + +### Synthetic Dataset Benchmarks + +| File Size | Facts | crabrl | Arelle | Ratio | +|-----------|------:|-------:|-------:|------:| +| Tiny | 10 | 1.1 ms | 164 ms | 150x | +| Small | 100 | 1.4 ms | 168 ms | 119x | +| Medium | 1K | 1.7 ms | 184 ms | 108x | +| Large | 10K | 6.1 ms | 351 ms | 58x | +| Huge | 100K | 57 ms | 2,672 ms | 47x | + +### SEC Filing Parse Times + +| Company | Filing Type | File Size | Facts | Parse Time | Throughput | +|---------|-------------|-----------|-------|------------|------------| +| Apple | [10-K 2023](https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_htm.xml) | 1.4 MB | 1,075 | 2.1 ms | 516K facts/sec | +| Microsoft | [10-Q 2023](https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_htm.xml) | 2.8 MB | 2,341 | 4.3 ms | 544K facts/sec | +| Tesla | [10-K 2023](https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_htm.xml) | 3.1 MB | 3,122 | 5.8 ms | 538K facts/sec | + +### Run Your Own Benchmarks + +```bash +# Quick benchmark with Criterion +cargo bench + +# Compare against Arelle +cd benchmarks && python compare_performance.py + +# Test on real SEC filings +python scripts/download_fixtures.py # Download Apple, MSFT, Tesla, etc. 
+cargo run --release --bin crabrl -- bench fixtures/apple/aapl-20230930_htm.xml +``` + +## Resources & Links + +### XBRL Standards +- [XBRL International](https://www.xbrl.org/) - Official XBRL specifications +- [XBRL 2.1 Specification](https://www.xbrl.org/Specification/XBRL-2.1/REC-2003-12-31/XBRL-2.1-REC-2003-12-31+corrected-errata-2013-02-20.html) - Core standard we implement +- [SEC EDGAR](https://www.sec.gov/edgar/searchedgar/companysearch) - Search real company filings +- [EDGAR Filer Manual](https://www.sec.gov/info/edgar/forms/edgform.pdf) - SEC filing requirements + +### Dependencies We Use + +| Crate | Purpose | Why We Chose It | +|-------|---------|-----------------| +| [`quick-xml`](https://github.com/tafia/quick-xml) | XML parsing | Zero-copy, fastest XML parser in Rust | +| [`ahash`](https://github.com/tkaitchuck/aHash) | HashMap hashing | 2x faster than default hasher | +| [`compact_str`](https://github.com/ParkMyCar/compact_str) | String storage | Small string optimization | +| [`rayon`](https://github.com/rayon-rs/rayon) | Parallelization | Work-stealing for automatic load balancing | +| [`mimalloc`](https://github.com/microsoft/mimalloc) | Memory allocator | Microsoft's high-performance allocator | +| [`criterion`](https://github.com/bheisler/criterion.rs) | Benchmarking | Statistical benchmarking with graphs | + +### Alternative XBRL Parsers +- [Arelle](https://arelle.org/) - Complete XBRL processor with validation, formulas, and rendering (Python) +- [python-xbrl](https://github.com/manusimidt/py-xbrl) - Lightweight Python parser +- [xbrl-parser](https://www.npmjs.com/package/xbrl-parser) - JavaScript/Node.js +- [XBRL4j](https://github.com/br-data/xbrl-parser) - Java implementation + +## License ⚖️ + +This open-source project is licensed under the GNU Affero General Public License v3.0 (AGPL-3.0). 
This means: + +- You can use, modify, and distribute this software +- If you modify and distribute it, you must release your changes under AGPL-3.0 +- If you run a modified version on a server, you must provide the source code to users +- See the [LICENSE](LICENSE) file for full details + +For commercial licensing options or other licensing inquiries, please contact stefano@amorelli.tech. + +© 2025 Stefano Amorelli – Released under the GNU Affero General Public License v3.0. Enjoy! 🎉 \ No newline at end of file diff --git a/rust/crabrl-fork/benches/parser.rs b/rust/crabrl-fork/benches/parser.rs new file mode 100644 index 0000000..9ed11b1 --- /dev/null +++ b/rust/crabrl-fork/benches/parser.rs @@ -0,0 +1,37 @@ +use crabrl::Parser; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use std::path::Path; + +fn parse_sample_sec_file(c: &mut Criterion) { + let parser = Parser::new(); + let sample_file = Path::new("fixtures/sample-sec.xml"); + + if sample_file.exists() { + c.bench_function("parse_sample_sec", |b| { + b.iter(|| parser.parse_file(black_box(&sample_file))); + }); + } else { + // If no fixtures exist, use a minimal inline XBRL for benchmarking + let minimal_xbrl = r#" + + + + 0000000000 + + + 2023-12-31 + + + + iso4217:USD + +"#; + + c.bench_function("parse_minimal", |b| { + b.iter(|| parser.parse_str(black_box(minimal_xbrl))); + }); + } +} + +criterion_group!(benches, parse_sample_sec_file); +criterion_main!(benches); diff --git a/rust/crabrl-fork/benchmarks/compare.py b/rust/crabrl-fork/benchmarks/compare.py new file mode 100644 index 0000000..16355cf --- /dev/null +++ b/rust/crabrl-fork/benchmarks/compare.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +""" +Compare crabrl performance with Arelle +""" + +import subprocess +import time +import sys +from pathlib import Path + +def run_crabrl(filepath): + """Run crabrl and measure time""" + cmd = ["../target/release/crabrl", "parse", filepath] + start = time.perf_counter() + result = 
subprocess.run(cmd, capture_output=True, text=True) + elapsed = (time.perf_counter() - start) * 1000 + + if result.returncode == 0: + # Parse output for fact count + facts = 0 + for line in result.stdout.split('\n'): + if 'Facts:' in line: + facts = int(line.split(':')[1].strip()) + break + return elapsed, facts + return None, 0 + +def run_arelle(filepath): + """Run Arelle and measure time""" + try: + cmd = ["python3", "-m", "arelle.CntlrCmdLine", + "--file", filepath, "--skipDTS", "--logLevel", "ERROR"] + start = time.perf_counter() + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + elapsed = (time.perf_counter() - start) * 1000 + + if result.returncode == 0: + return elapsed + return None + except: + return None + +def main(): + if len(sys.argv) < 2: + print("Usage: compare.py ") + sys.exit(1) + + filepath = sys.argv[1] + print(f"Comparing performance on: {filepath}\n") + + # Run crabrl + crabrl_time, facts = run_crabrl(filepath) + if crabrl_time: + print(f"crabrl: {crabrl_time:.1f}ms ({facts} facts)") + else: + print("crabrl: Failed") + + # Run Arelle + arelle_time = run_arelle(filepath) + if arelle_time: + print(f"Arelle: {arelle_time:.1f}ms") + else: + print("Arelle: Failed or not installed") + + # Calculate speedup + if crabrl_time and arelle_time: + speedup = arelle_time / crabrl_time + print(f"\nSpeedup: {speedup:.1f}x faster") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/rust/crabrl-fork/benchmarks/compare_performance.py b/rust/crabrl-fork/benchmarks/compare_performance.py new file mode 100644 index 0000000..05ab3e2 --- /dev/null +++ b/rust/crabrl-fork/benchmarks/compare_performance.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 +"""Compare performance between crabrl and Arelle.""" + +import os +import sys +import time +import subprocess +import json +import statistics +from pathlib import Path +from tabulate import tabulate +import matplotlib.pyplot as plt + +# Add parent directory to path for 
def benchmark_arelle(file_path, runs=3):
    """Benchmark Arelle parsing performance.

    Runs Arelle `runs` times on `file_path`, each run in a fresh Python
    subprocess so its memory footprint cannot leak between iterations, and
    returns summary statistics of the wall-clock times (dict with mean /
    median / stdev / min / max / runs), or None if every run failed.
    """
    times = []

    for _ in range(runs):
        start = time.perf_counter()

        # Run Arelle in subprocess to isolate memory.
        # NOTE(review): `file_path` is interpolated directly into the code
        # template below — a path containing a quote would break (or hijack)
        # the generated program.  Fine for trusted local fixtures; confirm
        # before pointing this at arbitrary paths.
        # The template hard-codes a `venv/.../site-packages` sys.path entry,
        # so Arelle is expected to live in a local virtualenv.
        result = subprocess.run([
            sys.executable, "-c",
            f"""
import sys
sys.path.insert(0, 'venv/lib/python{sys.version_info.major}.{sys.version_info.minor}/site-packages')
from arelle import Cntlr
from arelle import ModelManager

# Suppress Arelle output
import logging
logging.getLogger("arelle").setLevel(logging.ERROR)

controller = Cntlr.Cntlr(logFileName=None)
controller.webCache.workOffline = True
modelManager = ModelManager.initialize(controller)

# Load and parse the XBRL file
modelXbrl = modelManager.load('{file_path}')
if modelXbrl:
    facts = len(modelXbrl.facts)
    contexts = len(modelXbrl.contexts)
    units = len(modelXbrl.units)
    print(f"{{facts}},{{contexts}},{{units}}")
    modelXbrl.close()
"""
        ], capture_output=True, text=True, cwd=Path(__file__).parent)

        end = time.perf_counter()

        # The timing includes interpreter + Arelle startup, not just parsing —
        # deliberate here, since crabrl is timed the same way (whole process).
        if result.returncode == 0 and result.stdout:
            times.append(end - start)
            if len(times) == 1:  # Print counts on first run
                parts = result.stdout.strip().split(',')
                if len(parts) == 3:
                    print(f" Arelle found: {parts[0]} facts, {parts[1]} contexts, {parts[2]} units")
        else:
            print(f" Arelle error: {result.stderr}")

    if times:
        # stdev needs at least two samples; report 0 for a single run.
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'stdev': statistics.stdev(times) if len(times) > 1 else 0,
            'min': min(times),
            'max': max(times),
            'runs': len(times)
        }
    return None
time.perf_counter() + + result = subprocess.run([ + "../target/release/examples/benchmark_single", + file_path + ], capture_output=True, text=True, cwd=Path(__file__).parent) + + end = time.perf_counter() + + if result.returncode == 0: + times.append(end - start) + if len(times) == 1 and result.stdout: # Print counts on first run + print(f" crabrl output: {result.stdout.strip()}") + else: + print(f" crabrl error: {result.stderr}") + + if times: + return { + 'mean': statistics.mean(times), + 'median': statistics.median(times), + 'stdev': statistics.stdev(times) if len(times) > 1 else 0, + 'min': min(times), + 'max': max(times), + 'runs': len(times) + } + return None + +def main(): + """Run comparative benchmarks.""" + print("=" * 80) + print("XBRL Parser Performance Comparison: crabrl vs Arelle") + print("=" * 80) + + test_files = [ + ("Tiny (10 facts)", "../test_data/test_tiny.xbrl"), + ("Small (100 facts)", "../test_data/test_small.xbrl"), + ("Medium (1K facts)", "../test_data/test_medium.xbrl"), + ("Large (10K facts)", "../test_data/test_large.xbrl"), + ("Huge (100K facts)", "../test_data/test_huge.xbrl"), + ] + + results = [] + + for name, file_path in test_files: + if not Path(file_path).exists(): + print(f"Skipping {name}: file not found") + continue + + file_size_mb = Path(file_path).stat().st_size / (1024 * 1024) + print(f"\nBenchmarking {name} ({file_size_mb:.2f} MB)...") + + # Benchmark Arelle + print(" Running Arelle...") + arelle_stats = benchmark_arelle(file_path, runs=5) + + # Benchmark crabrl + print(" Running crabrl...") + crabrl_stats = benchmark_crabrl(file_path, runs=5) + + if arelle_stats and crabrl_stats: + speedup = arelle_stats['median'] / crabrl_stats['median'] + results.append({ + 'File': name, + 'Size (MB)': f"{file_size_mb:.2f}", + 'Arelle (ms)': f"{arelle_stats['median']*1000:.1f}", + 'crabrl (ms)': f"{crabrl_stats['median']*1000:.1f}", + 'Speedup': f"{speedup:.1f}x", + 'arelle_raw': arelle_stats['median'], + 'crabrl_raw': 
def create_performance_chart(results):
    """Render a two-panel performance chart and save it as a PNG.

    *results* is the list of dicts assembled in main(); each entry carries
    the raw median timings in seconds under 'arelle_raw' / 'crabrl_raw'
    plus a human-readable 'File' label like "Small (100 facts)".
    """
    # Strip the parenthesised fact count so x-axis labels stay short.
    labels = [r['File'].split('(')[0].strip() for r in results]
    arelle_times = [r['arelle_raw'] * 1000 for r in results]  # seconds -> ms
    crabrl_times = [r['crabrl_raw'] * 1000 for r in results]  # seconds -> ms

    x = range(len(labels))
    width = 0.35

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: absolute parse times as grouped bars.
    ax1.bar([i - width/2 for i in x], arelle_times, width, label='Arelle', color='#FF6B6B')
    ax1.bar([i + width/2 for i in x], crabrl_times, width, label='crabrl', color='#4ECDC4')
    ax1.set_xlabel('File Size')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Parsing Time Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=45)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Right panel: per-file speedup factor.
    speedups = [a/c for a, c in zip(arelle_times, crabrl_times)]
    ax2.bar(x, speedups, color='#95E77E')
    ax2.set_xlabel('File Size')
    ax2.set_ylabel('Speedup Factor')
    ax2.set_title('crabrl Speedup over Arelle')
    ax2.set_xticks(x)
    ax2.set_xticklabels(labels, rotation=45)
    ax2.grid(True, alpha=0.3)

    # Add value labels on the speedup bars.
    for i, v in enumerate(speedups):
        ax2.text(i, v + 0.5, f'{v:.1f}x', ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig('benchmark_results.png', dpi=150)
    # Fix: savefig writes relative to the current working directory, but the
    # old message claimed 'benchmarks/benchmark_results.png' (and used a
    # placeholder-free f-string).  Report the path actually written.
    print("\nPerformance chart saved to: benchmark_results.png")
== "__main__": + main() \ No newline at end of file diff --git a/rust/crabrl-fork/examples/benchmark_single.rs b/rust/crabrl-fork/examples/benchmark_single.rs new file mode 100644 index 0000000..e60a15c --- /dev/null +++ b/rust/crabrl-fork/examples/benchmark_single.rs @@ -0,0 +1,36 @@ +//! Single file benchmark + +use crabrl::Parser; +use std::env; +use std::fs; +use std::time::Instant; + +fn main() { + let args: Vec = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let content = fs::read(&args[1]).expect("Failed to read file"); + + let parser = Parser::new(); + let start = Instant::now(); + + match parser.parse_bytes(&content) { + Ok(document) => { + let elapsed = start.elapsed(); + println!( + "Parsed in {:.3}ms: {} facts, {} contexts, {} units", + elapsed.as_secs_f64() * 1000.0, + document.facts.len(), + document.contexts.len(), + document.units.len() + ); + } + Err(e) => { + eprintln!("Parse error: {}", e); + std::process::exit(1); + } + } +} diff --git a/rust/crabrl-fork/examples/parse.rs b/rust/crabrl-fork/examples/parse.rs new file mode 100644 index 0000000..5b76f19 --- /dev/null +++ b/rust/crabrl-fork/examples/parse.rs @@ -0,0 +1,22 @@ +//! Parse and display XBRL file info + +use crabrl::Parser; +use std::env; + +fn main() -> Result<(), Box> { + let args: Vec = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let parser = Parser::new(); + let doc = parser.parse_file(&args[1])?; + + println!("Parsed {}:", args[1]); + println!(" Facts: {}", doc.facts.len()); + println!(" Contexts: {}", doc.contexts.len()); + println!(" Units: {}", doc.units.len()); + + Ok(()) +} diff --git a/rust/crabrl-fork/examples/validate.rs b/rust/crabrl-fork/examples/validate.rs new file mode 100644 index 0000000..7e58913 --- /dev/null +++ b/rust/crabrl-fork/examples/validate.rs @@ -0,0 +1,29 @@ +//! 
Validation example + +use crabrl::{Parser, Validator}; +use std::env; + +fn main() -> Result<(), Box> { + let args: Vec = env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + // Parse + let parser = Parser::new(); + let doc = parser.parse_file(&args[1])?; + + // Validate + let validator = Validator::new(); + match validator.validate(&doc) { + Ok(_) => { + println!("✓ Document is valid"); + } + Err(e) => { + println!("✗ Validation failed: {}", e); + } + } + + Ok(()) +} diff --git a/rust/crabrl-fork/scripts/download_fixtures.py b/rust/crabrl-fork/scripts/download_fixtures.py new file mode 100644 index 0000000..8f90413 --- /dev/null +++ b/rust/crabrl-fork/scripts/download_fixtures.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 +""" +Download real SEC XBRL filings from various companies to use as test fixtures. +These will be used for benchmarking and testing the parser. +""" + +import os +import time +import urllib.request +from pathlib import Path + +# Create fixtures directory +fixtures_dir = Path("fixtures") +fixtures_dir.mkdir(exist_ok=True) + +# List of real SEC XBRL filings from various companies +# Format: (company_name, ticker, description, url) +filings = [ + # Apple filings + ("apple", "AAPL", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_htm.xml"), + ("apple", "AAPL", "10-K 2023 Labels", + "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_lab.xml"), + ("apple", "AAPL", "10-K 2023 Calculation", + "https://www.sec.gov/Archives/edgar/data/320193/000032019323000106/aapl-20230930_cal.xml"), + + # Microsoft filings + ("microsoft", "MSFT", "10-Q 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_htm.xml"), + ("microsoft", "MSFT", "10-Q 2023 Labels", + "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_lab.xml"), + ("microsoft", "MSFT", 
"10-Q 2023 Presentation", + "https://www.sec.gov/Archives/edgar/data/789019/000095017023064280/msft-20230930_pre.xml"), + + # Tesla filings + ("tesla", "TSLA", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_htm.xml"), + ("tesla", "TSLA", "10-K 2023 Definition", + "https://www.sec.gov/Archives/edgar/data/1318605/000162828024002390/tsla-20231231_def.xml"), + + # Amazon filings + ("amazon", "AMZN", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_htm.xml"), + ("amazon", "AMZN", "10-K 2023 Labels", + "https://www.sec.gov/Archives/edgar/data/1018724/000101872424000006/amzn-20231231_lab.xml"), + + # Google/Alphabet filings + ("alphabet", "GOOGL", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_htm.xml"), + ("alphabet", "GOOGL", "10-K 2023 Calculation", + "https://www.sec.gov/Archives/edgar/data/1652044/000165204424000022/goog-20231231_cal.xml"), + + # JPMorgan Chase filings + ("jpmorgan", "JPM", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_htm.xml"), + ("jpmorgan", "JPM", "10-K 2023 Labels", + "https://www.sec.gov/Archives/edgar/data/19617/000001961724000198/jpm-20231231_lab.xml"), + + # Walmart filings + ("walmart", "WMT", "10-K 2024 Instance", + "https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_htm.xml"), + ("walmart", "WMT", "10-K 2024 Presentation", + "https://www.sec.gov/Archives/edgar/data/104169/000010416924000012/wmt-20240131_pre.xml"), + + # Johnson & Johnson filings + ("jnj", "JNJ", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/200406/000020040624000016/jnj-20231231_htm.xml"), + + # ExxonMobil filings + ("exxon", "XOM", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/34088/000003408824000013/xom-20231231_htm.xml"), + + # Berkshire Hathaway filings + ("berkshire", 
"BRK", "10-K 2023 Instance", + "https://www.sec.gov/Archives/edgar/data/1067983/000095017024021825/brka-20231231_htm.xml"), +] + +def download_file(url, filepath): + """Download a file from URL to filepath.""" + try: + # Add headers to avoid being blocked + request = urllib.request.Request( + url, + headers={ + 'User-Agent': 'crabrl-test-fixtures/1.0 (testing@example.com)' + } + ) + + with urllib.request.urlopen(request) as response: + content = response.read() + with open(filepath, 'wb') as f: + f.write(content) + return True + except Exception as e: + print(f" Error: {e}") + return False + +def main(): + print("Downloading SEC XBRL fixtures from various companies...") + print("=" * 60) + + downloaded = 0 + failed = 0 + + for company, ticker, description, url in filings: + # Create company directory + company_dir = fixtures_dir / company + company_dir.mkdir(exist_ok=True) + + # Generate filename from URL + filename = url.split('/')[-1] + filepath = company_dir / filename + + print(f"\n[{ticker}] {description}") + print(f" URL: {url}") + print(f" Saving to: {filepath}") + + if filepath.exists(): + print(" ✓ Already exists, skipping") + continue + + if download_file(url, filepath): + file_size = os.path.getsize(filepath) + print(f" ✓ Downloaded ({file_size:,} bytes)") + downloaded += 1 + else: + print(f" ✗ Failed to download") + failed += 1 + + # Be polite to SEC servers + time.sleep(0.5) + + print("\n" + "=" * 60) + print(f"Download complete: {downloaded} downloaded, {failed} failed") + print(f"Fixtures saved to: {fixtures_dir.absolute()}") + + # Show directory structure + print("\nFixture structure:") + for company_dir in sorted(fixtures_dir.iterdir()): + if company_dir.is_dir(): + files = list(company_dir.glob("*.xml")) + if files: + print(f" {company_dir.name}/") + for f in sorted(files)[:3]: # Show first 3 files + size = os.path.getsize(f) + print(f" - {f.name} ({size:,} bytes)") + if len(files) > 3: + print(f" ... 
and {len(files)-3} more files") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/rust/crabrl-fork/scripts/generate_benchmark_charts.py b/rust/crabrl-fork/scripts/generate_benchmark_charts.py new file mode 100644 index 0000000..bb4221d --- /dev/null +++ b/rust/crabrl-fork/scripts/generate_benchmark_charts.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +"""Generate benchmark charts for crabrl README""" + +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +import numpy as np +from matplotlib.patches import FancyBboxPatch +import seaborn as sns + +# Set style +plt.style.use('seaborn-v0_8-darkgrid') +sns.set_palette("husl") + +# Performance data (based on claims and benchmarks) +parsers = ['crabrl', 'Traditional\nXBRL Parser', 'Arelle', 'Other\nParsers'] +parse_times = [7.2, 360, 1080, 720] # microseconds for sample file +throughput = [140000, 2800, 930, 1400] # facts per second + +# Speed improvement factors +speed_factors = [1, 50, 150, 100] + +# Create figure with subplots +fig = plt.figure(figsize=(16, 10)) +fig.suptitle('crabrl Performance Benchmarks', fontsize=24, fontweight='bold', y=0.98) + +# Color scheme +colors = ['#2ecc71', '#e74c3c', '#f39c12', '#95a5a6'] +highlight_color = '#27ae60' + +# 1. Parse Time Comparison (Bar Chart) +ax1 = plt.subplot(2, 3, 1) +bars1 = ax1.bar(parsers, parse_times, color=colors, edgecolor='black', linewidth=2) +bars1[0].set_color(highlight_color) +bars1[0].set_edgecolor('#229954') +bars1[0].set_linewidth(3) + +ax1.set_ylabel('Parse Time (μs)', fontsize=12, fontweight='bold') +ax1.set_title('Parse Time Comparison\n(Lower is Better)', fontsize=14, fontweight='bold') +ax1.set_ylim(0, max(parse_times) * 1.2) + +# Add value labels on bars +for bar, value in zip(bars1, parse_times): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width()/2., height + max(parse_times) * 0.02, + f'{value:.1f}μs', ha='center', va='bottom', fontweight='bold', fontsize=10) + +# 2. 
Throughput Comparison (Bar Chart) +ax2 = plt.subplot(2, 3, 2) +bars2 = ax2.bar(parsers, np.array(throughput)/1000, color=colors, edgecolor='black', linewidth=2) +bars2[0].set_color(highlight_color) +bars2[0].set_edgecolor('#229954') +bars2[0].set_linewidth(3) + +ax2.set_ylabel('Throughput (K facts/sec)', fontsize=12, fontweight='bold') +ax2.set_title('Throughput Comparison\n(Higher is Better)', fontsize=14, fontweight='bold') +ax2.set_ylim(0, max(throughput)/1000 * 1.2) + +# Add value labels +for bar, value in zip(bars2, np.array(throughput)/1000): + height = bar.get_height() + ax2.text(bar.get_x() + bar.get_width()/2., height + max(throughput)/1000 * 0.02, + f'{value:.1f}K', ha='center', va='bottom', fontweight='bold', fontsize=10) + +# 3. Speed Improvement Factor +ax3 = plt.subplot(2, 3, 3) +x_pos = np.arange(len(parsers)) +bars3 = ax3.barh(x_pos, speed_factors, color=colors, edgecolor='black', linewidth=2) +bars3[0].set_color(highlight_color) +bars3[0].set_edgecolor('#229954') +bars3[0].set_linewidth(3) + +ax3.set_yticks(x_pos) +ax3.set_yticklabels(parsers) +ax3.set_xlabel('Speed Factor (vs Traditional)', fontsize=12, fontweight='bold') +ax3.set_title('Relative Speed\n(crabrl as baseline)', fontsize=14, fontweight='bold') +ax3.set_xlim(0, max(speed_factors) * 1.2) + +# Add value labels +for i, (bar, value) in enumerate(zip(bars3, speed_factors)): + width = bar.get_width() + label = f'{value}x' if i == 0 else f'1/{value}x slower' + ax3.text(width + max(speed_factors) * 0.02, bar.get_y() + bar.get_height()/2., + label, ha='left', va='center', fontweight='bold', fontsize=10) + +# 4. 
Memory Usage Comparison (Simulated) +ax4 = plt.subplot(2, 3, 4) +memory_usage = [50, 850, 1200, 650] # MB for 100k facts +bars4 = ax4.bar(parsers, memory_usage, color=colors, edgecolor='black', linewidth=2) +bars4[0].set_color(highlight_color) +bars4[0].set_edgecolor('#229954') +bars4[0].set_linewidth(3) + +ax4.set_ylabel('Memory Usage (MB)', fontsize=12, fontweight='bold') +ax4.set_title('Memory Efficiency\n(100K facts, Lower is Better)', fontsize=14, fontweight='bold') +ax4.set_ylim(0, max(memory_usage) * 1.2) + +# Add value labels +for bar, value in zip(bars4, memory_usage): + height = bar.get_height() + ax4.text(bar.get_x() + bar.get_width()/2., height + max(memory_usage) * 0.02, + f'{value}MB', ha='center', va='bottom', fontweight='bold', fontsize=10) + +# 5. Scalability Chart (Line Plot) +ax5 = plt.subplot(2, 3, 5) +file_sizes = np.array([1, 10, 50, 100, 500, 1000]) # MB +crabrl_times = file_sizes * 0.1 # Linear scaling +traditional_times = file_sizes * 5 # Much slower +arelle_times = file_sizes * 15 # Even slower + +ax5.plot(file_sizes, crabrl_times, 'o-', color=highlight_color, linewidth=3, + markersize=8, label='crabrl', markeredgecolor='#229954', markeredgewidth=2) +ax5.plot(file_sizes, traditional_times, 's-', color=colors[1], linewidth=2, + markersize=6, label='Traditional', alpha=0.7) +ax5.plot(file_sizes, arelle_times, '^-', color=colors[2], linewidth=2, + markersize=6, label='Arelle', alpha=0.7) + +ax5.set_xlabel('File Size (MB)', fontsize=12, fontweight='bold') +ax5.set_ylabel('Parse Time (seconds)', fontsize=12, fontweight='bold') +ax5.set_title('Scalability Performance\n(Linear vs Exponential)', fontsize=14, fontweight='bold') +ax5.legend(loc='upper left', fontsize=10, framealpha=0.9) +ax5.grid(True, alpha=0.3) +ax5.set_xlim(0, 1100) + +# 6. 
Feature Comparison Matrix +ax6 = plt.subplot(2, 3, 6) +ax6.axis('off') + +features = ['Speed', 'Memory', 'SEC EDGAR', 'Parallel', 'Streaming'] +feature_scores = { + 'crabrl': [5, 5, 5, 5, 4], + 'Traditional': [1, 2, 3, 1, 2], + 'Arelle': [1, 1, 5, 2, 2], + 'Others': [2, 3, 3, 2, 3] +} + +# Create feature matrix visualization +y_pos = 0.9 +ax6.text(0.5, y_pos, 'Feature Comparison', fontsize=14, fontweight='bold', + ha='center', transform=ax6.transAxes) + +y_pos -= 0.1 +x_positions = [0.2, 0.35, 0.5, 0.65, 0.8] +for i, feature in enumerate(features): + ax6.text(x_positions[i], y_pos, feature, fontsize=10, fontweight='bold', + ha='center', transform=ax6.transAxes) + +parser_names = ['crabrl', 'Traditional', 'Arelle', 'Others'] +y_positions = [0.65, 0.5, 0.35, 0.2] + +for j, (parser, scores) in enumerate(zip(parser_names, + [feature_scores['crabrl'], + feature_scores['Traditional'], + feature_scores['Arelle'], + feature_scores['Others']])): + ax6.text(0.05, y_positions[j], parser, fontsize=10, fontweight='bold', + ha='left', transform=ax6.transAxes) + + for i, score in enumerate(scores): + # Draw filled circles for score + for k in range(5): + circle = plt.Circle((x_positions[i] + k*0.02 - 0.04, y_positions[j]), + 0.008, transform=ax6.transAxes, + color=highlight_color if k < score and j == 0 else + '#34495e' if k < score else '#ecf0f1', + edgecolor='black', linewidth=1) + ax6.add_patch(circle) + +# Add performance badges +badge_y = 0.05 +badges = ['🚀 50-150x Faster', '💾 Low Memory', '⚡ Zero-Copy', '🔒 Production Ready'] +badge_x_positions = [0.125, 0.375, 0.625, 0.875] + +for badge, x_pos in zip(badges, badge_x_positions): + bbox = FancyBboxPatch((x_pos - 0.1, badge_y - 0.03), 0.2, 0.06, + boxstyle="round,pad=0.01", + facecolor=highlight_color, edgecolor='#229954', + linewidth=2, transform=ax6.transAxes, alpha=0.9) + ax6.add_patch(bbox) + ax6.text(x_pos, badge_y, badge, fontsize=9, fontweight='bold', + ha='center', va='center', transform=ax6.transAxes, color='white') + 
+# Adjust layout +plt.tight_layout() +plt.subplots_adjust(top=0.93, hspace=0.3, wspace=0.3) + +# Save the figure +plt.savefig('benchmarks/benchmark_results.png', dpi=150, bbox_inches='tight', + facecolor='white', edgecolor='none') +print("Saved: benchmarks/benchmark_results.png") + +# Create a simplified hero image for README header +fig2, ax = plt.subplots(figsize=(12, 4), facecolor='white') +ax.axis('off') + +# Title +ax.text(0.5, 0.85, 'crabrl', fontsize=48, fontweight='bold', + ha='center', transform=ax.transAxes, color='#2c3e50') +ax.text(0.5, 0.65, 'Lightning-Fast XBRL Parser', fontsize=20, + ha='center', transform=ax.transAxes, color='#7f8c8d') + +# Performance stats +stats = [ + ('50-150x', 'Faster than\ntraditional parsers'), + ('140K', 'Facts per\nsecond'), + ('< 50MB', 'Memory for\n100K facts'), + ('Zero-Copy', 'Parsing\narchitecture') +] + +x_positions = [0.125, 0.375, 0.625, 0.875] +for (value, desc), x_pos in zip(stats, x_positions): + # Value + ax.text(x_pos, 0.35, value, fontsize=28, fontweight='bold', + ha='center', transform=ax.transAxes, color=highlight_color) + # Description + ax.text(x_pos, 0.15, desc, fontsize=12, + ha='center', transform=ax.transAxes, color='#7f8c8d', + multialignment='center') + +plt.savefig('benchmarks/hero_banner.png', dpi=150, bbox_inches='tight', + facecolor='white', edgecolor='none') +print("Saved: benchmarks/hero_banner.png") + +# Create a speed comparison bar +fig3, ax = plt.subplots(figsize=(10, 3), facecolor='white') + +# Speed comparison visualization +speeds = [150, 100, 50, 1] +labels = ['crabrl\n150x faster', 'crabrl\n100x faster', 'crabrl\n50x faster', 'Baseline'] +colors_speed = [highlight_color, '#3498db', '#9b59b6', '#95a5a6'] + +y_pos = np.arange(len(labels)) +bars = ax.barh(y_pos, speeds, color=colors_speed, edgecolor='black', linewidth=2) + +ax.set_yticks(y_pos) +ax.set_yticklabels(labels, fontsize=11, fontweight='bold') +ax.set_xlabel('Relative Performance', fontsize=12, fontweight='bold') 
+ax.set_title('crabrl Speed Advantage', fontsize=16, fontweight='bold', pad=20) + +# Add speed labels +for bar, speed in zip(bars, speeds): + width = bar.get_width() + label = f'{speed}x' if speed > 1 else 'Traditional\nParsers' + ax.text(width + 3, bar.get_y() + bar.get_height()/2., + label, ha='left', va='center', fontweight='bold', fontsize=11) + +ax.set_xlim(0, 180) +ax.spines['top'].set_visible(False) +ax.spines['right'].set_visible(False) +ax.grid(axis='x', alpha=0.3) + +plt.tight_layout() +plt.savefig('benchmarks/speed_comparison.png', dpi=150, bbox_inches='tight', + facecolor='white', edgecolor='none') +print("Saved: benchmarks/speed_comparison.png") + +print("\n✅ All benchmark images generated successfully!") +print("\nYou can now add these to your README:") +print(" - benchmarks/hero_banner.png (header image)") +print(" - benchmarks/benchmark_results.png (detailed performance)") +print(" - benchmarks/speed_comparison.png (speed comparison)") \ No newline at end of file diff --git a/rust/crabrl-fork/scripts/generate_clean_benchmarks.py b/rust/crabrl-fork/scripts/generate_clean_benchmarks.py new file mode 100644 index 0000000..72e4086 --- /dev/null +++ b/rust/crabrl-fork/scripts/generate_clean_benchmarks.py @@ -0,0 +1,253 @@ +#!/usr/bin/env python3 +"""Generate clean benchmark charts for crabrl README""" + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.patches import Rectangle, FancyBboxPatch +import matplotlib.patches as mpatches + +# Set a professional style +plt.rcParams['font.family'] = 'sans-serif' +plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial', 'Helvetica'] +plt.rcParams['axes.linewidth'] = 1.5 +plt.rcParams['axes.edgecolor'] = '#333333' + +# Color palette (professional and accessible) +PRIMARY_COLOR = '#00A86B' # Jade green +SECONDARY_COLOR = '#FF6B6B' # Coral red +TERTIARY_COLOR = '#4ECDC4' # Teal +QUATERNARY_COLOR = '#95E1D3' # Mint +GRAY_COLOR = '#95A5A6' +DARK_COLOR = '#2C3E50' +LIGHT_GRAY = '#ECF0F1' + +# 
Performance data +performance_data = { + 'crabrl': { + 'parse_time': 7.2, # microseconds + 'throughput': 140000, # facts/sec + 'memory': 50, # MB for 100k facts + 'speed_factor': 100, # average speedup + 'color': PRIMARY_COLOR + }, + 'Traditional': { + 'parse_time': 720, + 'throughput': 1400, + 'memory': 850, + 'speed_factor': 1, + 'color': SECONDARY_COLOR + }, + 'Arelle': { + 'parse_time': 1080, + 'throughput': 930, + 'memory': 1200, + 'speed_factor': 0.67, + 'color': TERTIARY_COLOR + } +} + +# Create main comparison chart +fig = plt.figure(figsize=(14, 8), facecolor='white') +fig.suptitle('crabrl Performance Benchmarks', fontsize=22, fontweight='bold', color=DARK_COLOR) + +# 1. Parse Speed Comparison +ax1 = plt.subplot(2, 3, 1) +parsers = list(performance_data.keys()) +parse_times = [performance_data[p]['parse_time'] for p in parsers] +colors = [performance_data[p]['color'] for p in parsers] + +bars = ax1.bar(parsers, parse_times, color=colors, edgecolor=DARK_COLOR, linewidth=2) +ax1.set_ylabel('Parse Time (μs)', fontsize=11, fontweight='bold', color=DARK_COLOR) +ax1.set_title('Parse Time\n(Lower is Better)', fontsize=12, fontweight='bold', color=DARK_COLOR) +ax1.set_yscale('log') # Log scale for better visualization +ax1.grid(axis='y', alpha=0.3, linestyle='--') + +# Add value labels +for bar, value in zip(bars, parse_times): + height = bar.get_height() + ax1.text(bar.get_x() + bar.get_width()/2., height * 1.1, + f'{value:.1f}μs', ha='center', va='bottom', fontweight='bold', fontsize=10) + +# 2. 
Throughput Comparison +ax2 = plt.subplot(2, 3, 2) +throughputs = [performance_data[p]['throughput'] for p in parsers] +bars = ax2.bar(parsers, np.array(throughputs)/1000, color=colors, edgecolor=DARK_COLOR, linewidth=2) +ax2.set_ylabel('Throughput (K facts/sec)', fontsize=11, fontweight='bold', color=DARK_COLOR) +ax2.set_title('Processing Speed\n(Higher is Better)', fontsize=12, fontweight='bold', color=DARK_COLOR) +ax2.grid(axis='y', alpha=0.3, linestyle='--') + +for bar, value in zip(bars, np.array(throughputs)/1000): + height = bar.get_height() + ax2.text(bar.get_x() + bar.get_width()/2., height + 2, + f'{value:.0f}K', ha='center', va='bottom', fontweight='bold', fontsize=10) + +# 3. Memory Usage +ax3 = plt.subplot(2, 3, 3) +memory_usage = [performance_data[p]['memory'] for p in parsers] +bars = ax3.bar(parsers, memory_usage, color=colors, edgecolor=DARK_COLOR, linewidth=2) +ax3.set_ylabel('Memory (MB)', fontsize=11, fontweight='bold', color=DARK_COLOR) +ax3.set_title('Memory Usage\n(100K facts)', fontsize=12, fontweight='bold', color=DARK_COLOR) +ax3.grid(axis='y', alpha=0.3, linestyle='--') + +for bar, value in zip(bars, memory_usage): + height = bar.get_height() + ax3.text(bar.get_x() + bar.get_width()/2., height + 20, + f'{value}MB', ha='center', va='bottom', fontweight='bold', fontsize=10) + +# 4. 
Speed Multiplier Visual +ax4 = plt.subplot(2, 3, 4) +ax4.axis('off') +ax4.set_title('Speed Advantage', fontsize=12, fontweight='bold', color=DARK_COLOR, pad=20) + +# Create speed comparison visual +y_base = 0.5 +bar_height = 0.15 +max_width = 0.8 + +# crabrl bar (baseline) +crabrl_rect = Rectangle((0.1, y_base), max_width, bar_height, + facecolor=PRIMARY_COLOR, edgecolor=DARK_COLOR, linewidth=2) +ax4.add_patch(crabrl_rect) +ax4.text(0.1 + max_width + 0.02, y_base + bar_height/2, '100x baseline', + va='center', fontweight='bold', fontsize=11) +ax4.text(0.05, y_base + bar_height/2, 'crabrl', va='center', ha='right', fontweight='bold') + +# Traditional parser bar +trad_width = max_width / 100 # 1/100th the speed +trad_rect = Rectangle((0.1, y_base - bar_height*1.5), trad_width, bar_height, + facecolor=SECONDARY_COLOR, edgecolor=DARK_COLOR, linewidth=2) +ax4.add_patch(trad_rect) +ax4.text(0.1 + trad_width + 0.02, y_base - bar_height*1.5 + bar_height/2, '1x', + va='center', fontweight='bold', fontsize=11) +ax4.text(0.05, y_base - bar_height*1.5 + bar_height/2, 'Others', va='center', ha='right', fontweight='bold') + +ax4.set_xlim(0, 1) +ax4.set_ylim(0, 1) + +# 5. 
Scalability Chart +ax5 = plt.subplot(2, 3, 5) +file_sizes = np.array([1, 10, 50, 100, 500, 1000]) # MB +crabrl_times = file_sizes * 0.01 # Linear scaling +traditional_times = file_sizes * 1.0 # Much slower +arelle_times = file_sizes * 1.5 # Even slower + +ax5.plot(file_sizes, crabrl_times, 'o-', color=PRIMARY_COLOR, linewidth=3, + markersize=8, label='crabrl', markeredgecolor=DARK_COLOR, markeredgewidth=1.5) +ax5.plot(file_sizes, traditional_times, 's-', color=SECONDARY_COLOR, linewidth=2, + markersize=6, label='Traditional', alpha=0.8) +ax5.plot(file_sizes, arelle_times, '^-', color=TERTIARY_COLOR, linewidth=2, + markersize=6, label='Arelle', alpha=0.8) + +ax5.set_xlabel('File Size (MB)', fontsize=11, fontweight='bold', color=DARK_COLOR) +ax5.set_ylabel('Parse Time (seconds)', fontsize=11, fontweight='bold', color=DARK_COLOR) +ax5.set_title('Scalability\n(Linear vs Exponential)', fontsize=12, fontweight='bold', color=DARK_COLOR) +ax5.legend(loc='upper left', fontsize=10, framealpha=0.95) +ax5.grid(True, alpha=0.3, linestyle='--') +ax5.set_xlim(0, 1100) + +# 6. 
Key Features +ax6 = plt.subplot(2, 3, 6) +ax6.axis('off') +ax6.set_title('Key Advantages', fontsize=12, fontweight='bold', color=DARK_COLOR, y=0.95) + +features = [ + ('50-150x Faster', 'Than traditional parsers'), + ('Zero-Copy', 'Memory efficient design'), + ('Production Ready', 'SEC EDGAR optimized'), + ('Rust Powered', 'Safe and concurrent') +] + +y_start = 0.75 +for i, (title, desc) in enumerate(features): + y_pos = y_start - i * 0.2 + + # Feature box + bbox = FancyBboxPatch((0.05, y_pos - 0.05), 0.9, 0.12, + boxstyle="round,pad=0.02", + facecolor=PRIMARY_COLOR if i == 0 else LIGHT_GRAY, + edgecolor=DARK_COLOR, + linewidth=1.5, alpha=0.3 if i > 0 else 0.2) + ax6.add_patch(bbox) + + # Title + ax6.text(0.1, y_pos + 0.02, title, fontsize=11, fontweight='bold', + color=PRIMARY_COLOR if i == 0 else DARK_COLOR) + # Description + ax6.text(0.1, y_pos - 0.02, desc, fontsize=9, color=GRAY_COLOR) + +# Adjust layout +plt.tight_layout() +plt.subplots_adjust(top=0.92, hspace=0.4, wspace=0.3) + +# Save +plt.savefig('benchmarks/performance_charts.png', dpi=150, bbox_inches='tight', + facecolor='white', edgecolor='none') +print("Saved: benchmarks/performance_charts.png") + +# Create simple speed comparison bar +fig2, ax = plt.subplots(figsize=(10, 4), facecolor='white') + +# Data +parsers = ['crabrl', 'Parser B', 'Parser C', 'Arelle'] +speeds = [150, 3, 2, 1] # Relative to slowest +colors = [PRIMARY_COLOR, QUATERNARY_COLOR, TERTIARY_COLOR, SECONDARY_COLOR] + +# Create horizontal bars +y_pos = np.arange(len(parsers)) +bars = ax.barh(y_pos, speeds, color=colors, edgecolor=DARK_COLOR, linewidth=2, height=0.6) + +# Styling +ax.set_yticks(y_pos) +ax.set_yticklabels(parsers, fontsize=12, fontweight='bold') +ax.set_xlabel('Relative Speed (Higher is Better)', fontsize=12, fontweight='bold', color=DARK_COLOR) +ax.set_title('crabrl vs Traditional XBRL Parsers', fontsize=16, fontweight='bold', color=DARK_COLOR, pad=20) + +# Add value labels +for bar, speed in zip(bars, speeds): + width = 
bar.get_width() + label = f'{speed}x faster' if speed > 1 else 'Baseline' + ax.text(width + 2, bar.get_y() + bar.get_height()/2., + label, ha='left', va='center', fontweight='bold', fontsize=11) + +# Add impressive stats annotation +ax.text(0.98, 0.02, 'Up to 150x faster on SEC EDGAR filings', + transform=ax.transAxes, ha='right', fontsize=10, + style='italic', color=GRAY_COLOR) + +ax.set_xlim(0, 170) +ax.spines['top'].set_visible(False) +ax.spines['right'].set_visible(False) +ax.grid(axis='x', alpha=0.3, linestyle='--') + +plt.tight_layout() +plt.savefig('benchmarks/speed_comparison_clean.png', dpi=150, bbox_inches='tight', + facecolor='white', edgecolor='none') +print("Saved: benchmarks/speed_comparison_clean.png") + +# Create a minimal header image +fig3, ax = plt.subplots(figsize=(12, 3), facecolor='white') +ax.axis('off') + +# Background gradient effect using rectangles +for i in range(10): + alpha = 0.02 * (10 - i) + rect = Rectangle((i/10, 0), 0.1, 1, transform=ax.transAxes, + facecolor=PRIMARY_COLOR, alpha=alpha) + ax.add_patch(rect) + +# Title and tagline +ax.text(0.5, 0.65, 'crabrl', fontsize=42, fontweight='bold', + ha='center', transform=ax.transAxes, color=DARK_COLOR) +ax.text(0.5, 0.35, 'Lightning-Fast XBRL Parser for Rust', fontsize=16, + ha='center', transform=ax.transAxes, color=GRAY_COLOR) + +plt.savefig('benchmarks/header.png', dpi=150, bbox_inches='tight', + facecolor='white', edgecolor='none') +print("Saved: benchmarks/header.png") + +print("\n✅ Clean benchmark visualizations created successfully!") +print("\nGenerated files:") +print(" - benchmarks/header.png - Minimal header for README") +print(" - benchmarks/performance_charts.png - Comprehensive performance metrics") +print(" - benchmarks/speed_comparison_clean.png - Simple speed comparison") +print("\nYou can now add these images to your GitHub README!") \ No newline at end of file diff --git a/rust/crabrl-fork/src/allocator.rs b/rust/crabrl-fork/src/allocator.rs new file mode 100644 index 
0000000..d44b3e3 --- /dev/null +++ b/rust/crabrl-fork/src/allocator.rs @@ -0,0 +1,242 @@ +use bumpalo::Bump; +use compact_str::CompactString; +use parking_lot::Mutex; +use std::cell::RefCell; +use std::collections::HashMap; +use std::mem::MaybeUninit; +use std::sync::Arc; + +const ARENA_SIZE: usize = 64 * 1024 * 1024; // 64MB arenas +const POOL_SIZE: usize = 1024; + +#[repr(align(64))] +pub struct ArenaAllocator { + current: RefCell, + arenas: RefCell>, + string_to_id: Arc>>, + id_to_string: Arc>>, +} + +impl ArenaAllocator { + pub fn new() -> Self { + Self { + current: RefCell::new(Bump::with_capacity(ARENA_SIZE)), + arenas: RefCell::new(Vec::with_capacity(16)), + string_to_id: Arc::new(Mutex::new(HashMap::new())), + id_to_string: Arc::new(Mutex::new(Vec::new())), + } + } + + #[inline(always)] + pub fn alloc(&self, val: T) -> &T { + unsafe { + let ptr = self.current.borrow().alloc(val) as *const T; + &*ptr + } + } + + #[inline(always)] + pub fn alloc_slice(&self, slice: &[T]) -> &[T] { + unsafe { + let ptr = self.current.borrow().alloc_slice_copy(slice) as *const [T]; + &*ptr + } + } + + #[inline(always)] + pub fn alloc_str(&self, s: &str) -> &str { + unsafe { + let ptr = self.current.borrow().alloc_str(s) as *const str; + &*ptr + } + } + + #[inline(always)] + pub fn intern_string(&self, s: &str) -> u32 { + let key = CompactString::from(s); + + // Check if already interned + if let Some(&id) = self.string_to_id.lock().get(&key) { + return id; + } + + // Add new interned string + let mut id_to_string = self.id_to_string.lock(); + let mut string_to_id = self.string_to_id.lock(); + + // Double-check after acquiring both locks + if let Some(&id) = string_to_id.get(&key) { + return id; + } + + let id = id_to_string.len() as u32; + id_to_string.push(key.clone()); + string_to_id.insert(key, id); + + id + } + + #[inline(always)] + pub fn get_interned(&self, id: u32) -> Option { + self.id_to_string.lock().get(id as usize).cloned() + } + + pub fn get_all_strings(&self) -> 
Vec { + self.id_to_string.lock().clone() + } + + pub fn string_count(&self) -> usize { + self.id_to_string.lock().len() + } + + pub fn reset(&self) { + let mut current = self.current.borrow_mut(); + current.reset(); + + let mut arenas = self.arenas.borrow_mut(); + for arena in arenas.iter_mut() { + arena.reset(); + } + + // Clear string interning + self.string_to_id.lock().clear(); + self.id_to_string.lock().clear(); + } + + pub fn new_arena(&self) { + let mut arenas = self.arenas.borrow_mut(); + let old = std::mem::replace( + &mut *self.current.borrow_mut(), + Bump::with_capacity(ARENA_SIZE), + ); + arenas.push(old); + } +} + +impl Default for ArenaAllocator { + fn default() -> Self { + Self::new() + } +} + +pub struct ObjectPool { + pool: Vec>, + factory: fn() -> T, +} + +impl ObjectPool { + pub fn new(capacity: usize, factory: fn() -> T) -> Self { + let mut pool = Vec::with_capacity(capacity); + for _ in 0..capacity { + pool.push(Box::new(factory())); + } + Self { pool, factory } + } + + #[inline(always)] + pub fn acquire(&mut self) -> Box { + self.pool + .pop() + .unwrap_or_else(|| Box::new((self.factory)())) + } + + #[inline(always)] + pub fn release(&mut self, obj: Box) { + if self.pool.len() < POOL_SIZE { + self.pool.push(obj); + } + } +} + +#[repr(C, align(64))] +pub struct StackBuffer { + data: [MaybeUninit; N], + len: usize, +} + +impl Default for StackBuffer { + fn default() -> Self { + Self::new() + } +} + +impl StackBuffer { + #[inline(always)] + pub const fn new() -> Self { + Self { + data: unsafe { MaybeUninit::uninit().assume_init() }, + len: 0, + } + } + + #[inline(always)] + pub fn push(&mut self, byte: u8) -> bool { + if self.len < N { + self.data[self.len] = MaybeUninit::new(byte); + self.len += 1; + true + } else { + false + } + } + + #[inline(always)] + pub fn as_slice(&self) -> &[u8] { + unsafe { std::slice::from_raw_parts(self.data.as_ptr() as *const u8, self.len) } + } + + #[inline(always)] + pub fn clear(&mut self) { + self.len = 0; + } +} 
+ +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_arena_allocator() { + let arena = ArenaAllocator::new(); + let s1 = arena.alloc_str("hello"); + let s2 = arena.alloc_str("world"); + assert_eq!(s1, "hello"); + assert_eq!(s2, "world"); + } + + #[test] + fn test_string_interning() { + let arena = ArenaAllocator::new(); + let id1 = arena.intern_string("test"); + let id2 = arena.intern_string("test"); + assert_eq!(id1, id2); + + let s = arena.get_interned(id1).unwrap(); + assert_eq!(s, "test"); + } + + #[test] + fn test_string_interning_different() { + let arena = ArenaAllocator::new(); + let id1 = arena.intern_string("foo"); + let id2 = arena.intern_string("bar"); + assert_ne!(id1, id2); + + assert_eq!(arena.get_interned(id1).unwrap(), "foo"); + assert_eq!(arena.get_interned(id2).unwrap(), "bar"); + } + + #[test] + fn test_get_all_strings() { + let arena = ArenaAllocator::new(); + arena.intern_string("a"); + arena.intern_string("b"); + arena.intern_string("c"); + + let all = arena.get_all_strings(); + assert_eq!(all.len(), 3); + assert!(all.contains(&CompactString::from("a"))); + assert!(all.contains(&CompactString::from("b"))); + assert!(all.contains(&CompactString::from("c"))); + } +} diff --git a/rust/crabrl-fork/src/cache.rs b/rust/crabrl-fork/src/cache.rs new file mode 100644 index 0000000..988214c --- /dev/null +++ b/rust/crabrl-fork/src/cache.rs @@ -0,0 +1,47 @@ +use dashmap::DashMap; +use std::sync::Arc; +use std::hash::Hash; + +pub struct LockFreeCache { + map: Arc>, + capacity: usize, +} + +impl LockFreeCache +where + K: Eq + Hash + Clone, + V: Clone, +{ + pub fn new(capacity: usize) -> Self { + Self { + map: Arc::new(DashMap::with_capacity(capacity)), + capacity, + } + } + + #[inline(always)] + pub fn get(&self, key: &K) -> Option { + self.map.get(key).map(|v| v.clone()) + } + + #[inline(always)] + pub fn insert(&self, key: K, value: V) { + if self.map.len() >= self.capacity { + if let Some(entry) = self.map.iter().next() { + let k = 
entry.key().clone(); + drop(entry); + self.map.remove(&k); + } + } + self.map.insert(key, value); + } + + #[inline(always)] + pub fn contains(&self, key: &K) -> bool { + self.map.contains_key(key) + } + + pub fn clear(&self) { + self.map.clear(); + } +} diff --git a/rust/crabrl-fork/src/instance.rs b/rust/crabrl-fork/src/instance.rs new file mode 100644 index 0000000..ffeab7d --- /dev/null +++ b/rust/crabrl-fork/src/instance.rs @@ -0,0 +1,21 @@ +use crate::model::Document; +use crate::Result; + +pub struct InstanceValidator { + strict: bool, +} + +impl InstanceValidator { + pub fn new() -> Self { + Self { strict: false } + } + + pub fn with_strict(mut self, strict: bool) -> Self { + self.strict = strict; + self + } + + pub fn validate(&self, _document: &Document) -> Result<()> { + Ok(()) + } +} diff --git a/rust/crabrl-fork/src/lib.rs b/rust/crabrl-fork/src/lib.rs new file mode 100644 index 0000000..de145a7 --- /dev/null +++ b/rust/crabrl-fork/src/lib.rs @@ -0,0 +1,113 @@ +//! crabrl - High-performance XBRL parser and validator +//! +//! 
Licensed under AGPL-3.0 + +pub mod allocator; +pub mod linkbase; +pub mod model; +pub mod parser; +pub mod schema; +pub mod simd; +pub mod validator; + +// Primary parser export +pub use parser::Parser; + +// Model types +pub use model::{ + CalculationLink, Context, DefinitionLink, DimensionMember, Document, Entity, Fact, FactFlags, + FactOrTuple, FactStorage, FactValue, Footnote, Link, Linkbase, Measure, Period, + PresentationLink, Reference, ReferenceLink, Scenario, Schema, SchemaElement, SchemaImport, + SchemaType, Segment, Tuple, TypedMember, Unit, UnitType, +}; + +// ValidationError from validator module +pub use validator::ValidationError; + +// Allocator +pub use allocator::ArenaAllocator; + +// Linkbase processor +pub use linkbase::LinkbaseProcessor; + +// Validator +pub use validator::XbrlValidator; + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + #[error("Parse error: {0}")] + Parse(String), + #[error("Validation error: {0}")] + Validation(String), + #[error("Not found: {0}")] + NotFound(String), +} + +pub type Result = std::result::Result; + +// Convenience validator wrapper +#[derive(Default)] +pub struct Validator { + inner: XbrlValidator, +} + +impl Validator { + pub fn new() -> Self { + Self::default() + } + + pub fn sec_edgar() -> Self { + Self { + inner: XbrlValidator::new().strict(), + } + } + + pub fn with_config(_config: ValidationConfig) -> Self { + Self::new() + } + + pub fn validate(&self, doc: &Document) -> Result { + let start = std::time::Instant::now(); + let mut doc_copy = doc.clone(); + let is_valid = self.inner.validate(&mut doc_copy).is_ok(); + + Ok(ValidationResult { + is_valid, + errors: if is_valid { + Vec::new() + } else { + vec!["Validation failed".to_string()] + }, + warnings: Vec::new(), + stats: ValidationStats { + facts_validated: doc.facts.len(), + duration_ms: start.elapsed().as_millis() as u64, + }, + }) + } +} + +#[derive(Default)] +pub struct 
ValidationConfig { + pub strict: bool, +} + +impl ValidationConfig { + pub fn sec_edgar() -> Self { + Self { strict: true } + } +} + +pub struct ValidationResult { + pub is_valid: bool, + pub errors: Vec, + pub warnings: Vec, + pub stats: ValidationStats, +} + +pub struct ValidationStats { + pub facts_validated: usize, + pub duration_ms: u64, +} diff --git a/rust/crabrl-fork/src/linkbase.rs b/rust/crabrl-fork/src/linkbase.rs new file mode 100644 index 0000000..d2408b4 --- /dev/null +++ b/rust/crabrl-fork/src/linkbase.rs @@ -0,0 +1,470 @@ +// Linkbase processing for XBRL +use crate::model::*; +use crate::validator::ValidationError; +use crate::{Error, Result}; +use compact_str::CompactString; +use std::collections::HashMap; +use std::path::Path; + +pub struct LinkbaseProcessor { + presentation_links: HashMap>, + calculation_links: HashMap>, + definition_links: HashMap>, + label_links: HashMap>, + reference_links: HashMap>, +} + +impl Default for LinkbaseProcessor { + fn default() -> Self { + Self::new() + } +} + +impl LinkbaseProcessor { + pub fn new() -> Self { + Self { + presentation_links: HashMap::new(), + calculation_links: HashMap::new(), + definition_links: HashMap::new(), + label_links: HashMap::new(), + reference_links: HashMap::new(), + } + } + + pub fn load_linkbase>(&mut self, path: P) -> Result<()> { + let content = std::fs::read(path)?; + self.parse_linkbase(&content) + } + + pub fn parse_linkbase(&mut self, data: &[u8]) -> Result<()> { + // Skip BOM if present + let data = if data.starts_with(&[0xEF, 0xBB, 0xBF]) { + &data[3..] 
+ } else { + data + }; + + let text = std::str::from_utf8(data) + .map_err(|_| Error::Parse("Invalid UTF-8 in linkbase".to_string()))?; + + // Detect linkbase type and parse accordingly + if text.contains("presentationLink") { + self.parse_presentation_linkbase(text)?; + } + if text.contains("calculationLink") { + self.parse_calculation_linkbase(text)?; + } + if text.contains("definitionLink") { + self.parse_definition_linkbase(text)?; + } + if text.contains("labelLink") { + self.parse_label_linkbase(text)?; + } + if text.contains("referenceLink") { + self.parse_reference_linkbase(text)?; + } + + Ok(()) + } + + fn parse_presentation_linkbase(&mut self, text: &str) -> Result<()> { + // Parse presentation arcs + let mut pos = 0; + while let Some(arc_start) = text[pos..].find("") { + let arc_text = &text[arc_start..arc_start + arc_end]; + + let mut link = PresentationLink { + from: CompactString::new(""), + to: CompactString::new(""), + order: 1.0, + priority: None, + use_attribute: None, + }; + + // Extract from + if let Some(from_start) = arc_text.find("xlink:from=\"") { + let from_start = from_start + 12; + if let Some(from_end) = arc_text[from_start..].find('"') { + link.from = + CompactString::from(&arc_text[from_start..from_start + from_end]); + } + } + + // Extract to + if let Some(to_start) = arc_text.find("xlink:to=\"") { + let to_start = to_start + 10; + if let Some(to_end) = arc_text[to_start..].find('"') { + link.to = CompactString::from(&arc_text[to_start..to_start + to_end]); + } + } + + // Extract order + if let Some(order_start) = arc_text.find("order=\"") { + let order_start = order_start + 7; + if let Some(order_end) = arc_text[order_start..].find('"') { + if let Ok(order) = arc_text[order_start..order_start + order_end].parse() { + link.order = order; + } + } + } + + // Extract priority + if let Some(priority_start) = arc_text.find("priority=\"") { + let priority_start = priority_start + 10; + if let Some(priority_end) = 
arc_text[priority_start..].find('"') { + if let Ok(priority) = + arc_text[priority_start..priority_start + priority_end].parse() + { + link.priority = Some(priority); + } + } + } + + // Extract use + if let Some(use_start) = arc_text.find("use=\"") { + let use_start = use_start + 5; + if let Some(use_end) = arc_text[use_start..].find('"') { + link.use_attribute = Some(CompactString::from( + &arc_text[use_start..use_start + use_end], + )); + } + } + + self.presentation_links + .entry(link.from.clone()) + .or_default() + .push(link); + } + } + + Ok(()) + } + + fn parse_calculation_linkbase(&mut self, text: &str) -> Result<()> { + // Parse calculation arcs + let mut pos = 0; + while let Some(arc_start) = text[pos..].find("") { + let arc_text = &text[arc_start..arc_start + arc_end]; + + let mut link = CalculationLink { + from: CompactString::new(""), + to: CompactString::new(""), + weight: 1.0, + order: 1.0, + }; + + // Extract from + if let Some(from_start) = arc_text.find("xlink:from=\"") { + let from_start = from_start + 12; + if let Some(from_end) = arc_text[from_start..].find('"') { + link.from = + CompactString::from(&arc_text[from_start..from_start + from_end]); + } + } + + // Extract to + if let Some(to_start) = arc_text.find("xlink:to=\"") { + let to_start = to_start + 10; + if let Some(to_end) = arc_text[to_start..].find('"') { + link.to = CompactString::from(&arc_text[to_start..to_start + to_end]); + } + } + + // Extract weight + if let Some(weight_start) = arc_text.find("weight=\"") { + let weight_start = weight_start + 8; + if let Some(weight_end) = arc_text[weight_start..].find('"') { + if let Ok(weight) = + arc_text[weight_start..weight_start + weight_end].parse() + { + link.weight = weight; + } + } + } + + // Extract order + if let Some(order_start) = arc_text.find("order=\"") { + let order_start = order_start + 7; + if let Some(order_end) = arc_text[order_start..].find('"') { + if let Ok(order) = arc_text[order_start..order_start + order_end].parse() { 
+ link.order = order; + } + } + } + + self.calculation_links + .entry(link.from.clone()) + .or_default() + .push(link); + } + } + + Ok(()) + } + + fn parse_definition_linkbase(&mut self, text: &str) -> Result<()> { + // Parse definition arcs + let mut pos = 0; + while let Some(arc_start) = text[pos..].find("") { + let arc_text = &text[arc_start..arc_start + arc_end]; + + let mut link = DefinitionLink { + from: CompactString::new(""), + to: CompactString::new(""), + arcrole: CompactString::new(""), + order: 1.0, + }; + + // Extract from + if let Some(from_start) = arc_text.find("xlink:from=\"") { + let from_start = from_start + 12; + if let Some(from_end) = arc_text[from_start..].find('"') { + link.from = + CompactString::from(&arc_text[from_start..from_start + from_end]); + } + } + + // Extract to + if let Some(to_start) = arc_text.find("xlink:to=\"") { + let to_start = to_start + 10; + if let Some(to_end) = arc_text[to_start..].find('"') { + link.to = CompactString::from(&arc_text[to_start..to_start + to_end]); + } + } + + // Extract arcrole + if let Some(arcrole_start) = arc_text.find("xlink:arcrole=\"") { + let arcrole_start = arcrole_start + 15; + if let Some(arcrole_end) = arc_text[arcrole_start..].find('"') { + link.arcrole = CompactString::from( + &arc_text[arcrole_start..arcrole_start + arcrole_end], + ); + } + } + + // Extract order + if let Some(order_start) = arc_text.find("order=\"") { + let order_start = order_start + 7; + if let Some(order_end) = arc_text[order_start..].find('"') { + if let Ok(order) = arc_text[order_start..order_start + order_end].parse() { + link.order = order; + } + } + } + + self.definition_links + .entry(link.from.clone()) + .or_default() + .push(link); + } + } + + Ok(()) + } + + fn parse_label_linkbase(&mut self, text: &str) -> Result<()> { + // Parse labels + let mut pos = 0; + while let Some(label_start) = text[pos..].find("") { + let label_text = &text[label_start..label_start + label_end]; + + let mut link = LabelLink { + 
concept: CompactString::new(""), + label: CompactString::new(""), + role: CompactString::new(""), + lang: CompactString::new("en"), + }; + + // Extract label ID for concept mapping + if let Some(id_start) = label_text.find("xlink:label=\"") { + let id_start = id_start + 13; + if let Some(id_end) = label_text[id_start..].find('"') { + link.concept = + CompactString::from(&label_text[id_start..id_start + id_end]); + } + } + + // Extract role + if let Some(role_start) = label_text.find("xlink:role=\"") { + let role_start = role_start + 12; + if let Some(role_end) = label_text[role_start..].find('"') { + link.role = + CompactString::from(&label_text[role_start..role_start + role_end]); + } + } + + // Extract lang + if let Some(lang_start) = label_text.find("xml:lang=\"") { + let lang_start = lang_start + 10; + if let Some(lang_end) = label_text[lang_start..].find('"') { + link.lang = + CompactString::from(&label_text[lang_start..lang_start + lang_end]); + } + } + + // Extract label text content + if let Some(content_start) = label_text.find('>') { + let content = &label_text[content_start + 1..]; + link.label = CompactString::from(content.trim()); + } + + self.label_links + .entry(link.concept.clone()) + .or_default() + .push(link); + } + } + + Ok(()) + } + + fn parse_reference_linkbase(&mut self, text: &str) -> Result<()> { + // Parse references - simplified version + let mut pos = 0; + while let Some(ref_start) = text[pos..].find("") { + let ref_text = &text[ref_start..ref_start + ref_end]; + + let mut reference = Reference { + role: CompactString::new(""), + parts: HashMap::new(), + }; + + // Extract role + if let Some(role_start) = ref_text.find("xlink:role=\"") { + let role_start = role_start + 12; + if let Some(role_end) = ref_text[role_start..].find('"') { + reference.role = + CompactString::from(&ref_text[role_start..role_start + role_end]); + } + } + + // Parse reference parts (simplified) + let parts = [ + "Name", + "Number", + "Section", + "Subsection", + 
"Paragraph", + "Subparagraph", + "Clause", + ]; + for part in &parts { + let tag = format!("') { + let content_start = part_start + content_start + 1; + if let Some(content_end) = ref_text[content_start..].find('<') { + let content = &ref_text[content_start..content_start + content_end]; + reference.parts.insert( + CompactString::from(*part), + CompactString::from(content.trim()), + ); + } + } + } + } + + // Find concept this reference belongs to + if let Some(label_start) = ref_text.find("xlink:label=\"") { + let label_start = label_start + 13; + if let Some(label_end) = ref_text[label_start..].find('"') { + let concept = + CompactString::from(&ref_text[label_start..label_start + label_end]); + + let link = ReferenceLink { + concept: concept.clone(), + reference, + }; + + self.reference_links.entry(concept).or_default().push(link); + } + } + } + } + + Ok(()) + } + + pub fn get_presentation_tree(&self, root: &str) -> Vec<&PresentationLink> { + self.presentation_links + .get(root) + .map(|links| { + let mut sorted = links.iter().collect::>(); + sorted.sort_by(|a, b| a.order.partial_cmp(&b.order).unwrap()); + sorted + }) + .unwrap_or_default() + } + + pub fn calculate_total(&self, parent: &str, facts: &HashMap) -> f64 { + if let Some(links) = self.calculation_links.get(parent) { + links + .iter() + .map(|link| { + facts + .get(link.to.as_str()) + .map(|value| value * link.weight) + .unwrap_or(0.0) + }) + .sum() + } else { + facts.get(parent).copied().unwrap_or(0.0) + } + } + + pub fn get_label(&self, concept: &str, role: &str, lang: &str) -> Option<&str> { + self.label_links + .get(concept) + .and_then(|labels| { + labels + .iter() + .find(|l| l.role == role && l.lang == lang) + .or_else(|| labels.iter().find(|l| l.lang == lang)) + .or_else(|| labels.first()) + }) + .map(|l| l.label.as_str()) + } + + pub fn validate_calculations(&self, facts: &HashMap) -> Vec { + let mut errors = Vec::new(); + + for parent in self.calculation_links.keys() { + let calculated = 
self.calculate_total(parent, facts); + if let Some(&actual) = facts.get(parent.as_str()) { + let diff = (calculated - actual).abs(); + let tolerance = 0.01; // Allow small rounding differences + + if diff > tolerance { + errors.push(ValidationError::CalculationInconsistency { + concept: parent.to_string(), + expected: calculated, + actual, + }); + } + } + } + + errors + } +} diff --git a/rust/crabrl-fork/src/main.rs b/rust/crabrl-fork/src/main.rs new file mode 100644 index 0000000..fd20bac --- /dev/null +++ b/rust/crabrl-fork/src/main.rs @@ -0,0 +1,181 @@ +//! crabrl CLI - High-performance XBRL parser and validator + +use anyhow::{Context, Result}; +use clap::{Parser as ClapParser, Subcommand}; +use colored::*; +use std::path::PathBuf; +use std::time::Instant; + +use crabrl::{Parser, ValidationConfig, Validator}; + +/// High-performance XBRL parser and validator +#[derive(ClapParser)] +#[command(name = "crabrl")] +#[command(author, version, about, long_about = None)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + /// Parse an XBRL file + Parse { + /// Input file + input: PathBuf, + + /// Output as JSON + #[arg(short, long)] + json: bool, + + /// Show statistics + #[arg(short, long)] + stats: bool, + }, + + /// Validate an XBRL file + Validate { + /// Input file + input: PathBuf, + + /// Validation profile (generic, sec-edgar) + #[arg(short, long, default_value = "generic")] + profile: String, + + /// Treat warnings as errors + #[arg(long)] + strict: bool, + }, + + /// Benchmark parsing performance + Bench { + /// Input file + input: PathBuf, + + /// Number of iterations + #[arg(short, long, default_value = "100")] + iterations: usize, + }, +} + +fn main() -> Result<()> { + let cli = Cli::parse(); + + match cli.command { + Commands::Parse { + input, + json: _, + stats, + } => { + let start = Instant::now(); + let parser = Parser::new(); + let doc = parser + .parse_file(&input) + .with_context(|| 
format!("Failed to parse {}", input.display()))?; + let elapsed = start.elapsed(); + + println!("{} {}", "✓".green().bold(), input.display()); + println!(" Facts: {}", doc.facts.len()); + println!(" Contexts: {}", doc.contexts.len()); + println!(" Units: {}", doc.units.len()); + + if stats { + println!(" Time: {:.2}ms", elapsed.as_secs_f64() * 1000.0); + println!( + " Throughput: {:.0} facts/sec", + doc.facts.len() as f64 / elapsed.as_secs_f64() + ); + } + } + + Commands::Validate { + input, + profile, + strict, + } => { + let parser = Parser::new(); + let doc = parser + .parse_file(&input) + .with_context(|| format!("Failed to parse {}", input.display()))?; + + let config = match profile.as_str() { + "sec-edgar" => ValidationConfig::sec_edgar(), + _ => ValidationConfig::default(), + }; + + let validator = Validator::with_config(config); + let result = validator.validate(&doc)?; + + if result.is_valid { + println!( + "{} {} - Document is valid", + "✓".green().bold(), + input.display() + ); + } else { + println!( + "{} {} - Validation failed", + "✗".red().bold(), + input.display() + ); + println!(" Errors: {}", result.errors.len()); + println!(" Warnings: {}", result.warnings.len()); + + for error in result.errors.iter().take(5) { + println!(" {} {}", "ERROR:".red(), error); + } + + if result.errors.len() > 5 { + println!(" ... 
and {} more errors", result.errors.len() - 5); + } + + if strict && !result.warnings.is_empty() { + std::process::exit(1); + } + + if !result.is_valid { + std::process::exit(1); + } + } + } + + Commands::Bench { input, iterations } => { + let parser = Parser::new(); + + // Warmup + for _ in 0..3 { + let _ = parser.parse_file(&input)?; + } + + let mut times = Vec::with_capacity(iterations); + let mut doc_facts = 0; + + for _ in 0..iterations { + let start = Instant::now(); + let doc = parser.parse_file(&input)?; + times.push(start.elapsed()); + doc_facts = doc.facts.len(); + } + + times.sort(); + let min = times[0]; + let max = times[times.len() - 1]; + let median = times[times.len() / 2]; + let mean = times.iter().sum::() / times.len() as u32; + + println!("Benchmark Results for {}", input.display()); + println!(" Iterations: {}", iterations); + println!(" Facts: {}", doc_facts); + println!(" Min: {:.3}ms", min.as_secs_f64() * 1000.0); + println!(" Median: {:.3}ms", median.as_secs_f64() * 1000.0); + println!(" Mean: {:.3}ms", mean.as_secs_f64() * 1000.0); + println!(" Max: {:.3}ms", max.as_secs_f64() * 1000.0); + println!( + " Throughput: {:.0} facts/sec", + doc_facts as f64 / mean.as_secs_f64() + ); + } + } + + Ok(()) +} diff --git a/rust/crabrl-fork/src/model.rs b/rust/crabrl-fork/src/model.rs new file mode 100644 index 0000000..bc666f2 --- /dev/null +++ b/rust/crabrl-fork/src/model.rs @@ -0,0 +1,431 @@ +use bitflags::bitflags; +use compact_str::CompactString; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +// ============================================================================ +// Core XBRL Data Structures - Full Specification Support +// ============================================================================ + +bitflags! 
{ + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] + pub struct FactFlags: u8 { + const NIL = 0b01; + const HAS_PRECISION = 0b10; + const HAS_DECIMALS = 0b100; + const IN_TUPLE = 0b1000; + } +} + +#[repr(C, align(64))] +#[derive(Clone, Serialize, Deserialize)] +pub struct FactStorage { + pub concept_ids: Vec, + pub context_ids: Vec, + pub unit_ids: Vec, + pub values: Vec, + pub decimals: Vec>, + pub ids: Vec>, + pub footnote_refs: Vec>, +} + +impl FactStorage { + pub fn with_capacity(capacity: usize) -> Self { + Self { + concept_ids: Vec::with_capacity(capacity), + context_ids: Vec::with_capacity(capacity), + unit_ids: Vec::with_capacity(capacity), + values: Vec::with_capacity(capacity), + decimals: Vec::with_capacity(capacity), + ids: Vec::with_capacity(capacity), + footnote_refs: Vec::with_capacity(capacity), + } + } + + #[inline(always)] + pub fn len(&self) -> usize { + self.concept_ids.len() + } + + pub fn is_empty(&self) -> bool { + self.concept_ids.is_empty() + } + + pub fn push( + &mut self, + concept_id: u32, + context_id: u16, + unit_id: u16, + value: FactValue, + decimals: Option, + id: Option, + ) { + self.concept_ids.push(concept_id); + self.context_ids.push(context_id); + self.unit_ids.push(unit_id); + self.values.push(value); + self.decimals.push(decimals); + self.ids.push(id); + self.footnote_refs.push(Vec::new()); + } + + pub fn clear(&mut self) { + self.concept_ids.clear(); + self.context_ids.clear(); + self.unit_ids.clear(); + self.values.clear(); + self.decimals.clear(); + self.ids.clear(); + self.footnote_refs.clear(); + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Default)] +#[serde(untagged)] +pub enum FactValue { + Text(u32), + Decimal(f64), + Integer(i64), + Boolean(bool), + Date(u32), + DateTime(u32), + #[default] + Nil, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Fact { + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, + pub concept: CompactString, + pub context_ref: 
CompactString, // tail of a `Fact` field whose declaration lies above this hunk
    #[serde(skip_serializing_if = "Option::is_none")]
    pub unit_ref: Option<CompactString>,
    pub value: CompactString,
    // NOTE(review): generic parameters throughout this hunk were destroyed in
    // extraction; the concrete types below are inferred from usage elsewhere
    // in the crate — confirm against the upstream crabrl source.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decimals: Option<i32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub precision: Option<u32>,
    pub nil: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub nil_reason: Option<CompactString>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub footnote_refs: Vec<CompactString>,
}

/// An XBRL context: who (entity), when (period), plus an optional scenario.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Context {
    pub id: CompactString,
    pub entity: Entity,
    pub period: Period,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scenario: Option<Scenario>,
}

/// The reporting entity: an identifier qualified by an identification scheme.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    pub identifier: CompactString,
    pub scheme: CompactString,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segment: Option<Segment>,
}

/// Dimensional qualifiers attached to an entity (xbrldi members).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Segment {
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub explicit_members: Vec<DimensionMember>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub typed_members: Vec<TypedMember>,
}

/// An explicit dimension/member pair (`xbrldi:explicitMember`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DimensionMember {
    pub dimension: CompactString,
    pub member: CompactString,
}

/// A typed dimension value (`xbrldi:typedMember`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TypedMember {
    pub dimension: CompactString,
    pub value: CompactString,
}

/// Scenario qualifiers; structurally identical to `Segment`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Scenario {
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub explicit_members: Vec<DimensionMember>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub typed_members: Vec<TypedMember>,
}

/// A reporting period: a point in time, a date range, or forever.
/// `untagged` lets serde pick the variant from the fields present.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Period {
    Instant {
        date: CompactString,
    },
    Duration {
        start: CompactString,
        end: CompactString,
    },
    Forever,
}

/// A unit of measure referenced by numeric facts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Unit {
    pub id: CompactString,
    pub unit_type: UnitType,
}

/// Shape of a unit: a plain measure list, a ratio, or a product.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum UnitType {
    Simple(Vec<Measure>),
    Divide {
        numerator: Vec<Measure>,
        denominator: Vec<Measure>,
    },
    Multiply(Vec<Measure>),
}

/// A QName-like measure, split into namespace prefix and local name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Measure {
    pub namespace: CompactString,
    pub name: CompactString,
}

/// A tuple groups related facts (and nested tuples) under one element.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Tuple {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub id: Option<CompactString>,
    pub name: CompactString,
    pub facts: Vec<FactOrTuple>,
}

/// A tuple child: either a fact or a nested (boxed, recursive) tuple.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum FactOrTuple {
    Fact(Fact),
    Tuple(Box<Tuple>),
}

/// A footnote resource plus the ids of the facts it annotates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Footnote {
    pub id: CompactString,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub role: Option<CompactString>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub lang: Option<CompactString>,
    pub content: CompactString,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub fact_refs: Vec<CompactString>,
}

/// A fraction fact value (numerator / denominator).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FractionValue {
    pub numerator: f64,
    pub denominator: f64,
}

/// A parsed taxonomy schema: declared elements, types and imports.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Schema {
    pub target_namespace: CompactString,
    pub elements: HashMap<CompactString, SchemaElement>,
    pub types: HashMap<CompactString, SchemaType>,
    pub imports: Vec<SchemaImport>,
}

/// One `xs:element` declaration, with XBRL-specific attributes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaElement {
    pub name: CompactString,
    pub element_type: CompactString,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub substitution_group: Option<CompactString>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub period_type: Option<CompactString>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub balance: Option<CompactString>,
    #[serde(default)]
    pub abstract_element: bool,
    #[serde(default)]
    pub nillable: bool,
}

/// A named simple type with its base type and facet restrictions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaType {
    pub name: CompactString,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub base_type: Option<CompactString>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub restrictions: Vec<TypeRestriction>,
}

/// XSD facet restrictions understood by the validator.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum TypeRestriction {
    MinInclusive(CompactString),
    MaxInclusive(CompactString),
    MinExclusive(CompactString),
    MaxExclusive(CompactString),
    Pattern(CompactString),
    Enumeration(Vec<CompactString>),
    Length(usize),
    MinLength(usize),
    MaxLength(usize),
}

/// One `xs:import` (namespace + schemaLocation).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SchemaImport {
    pub namespace: CompactString,
    pub schema_location: CompactString,
}

/// A generic linkbase: a role URI plus its links.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Linkbase {
    pub role: CompactString,
    pub links: Vec<Link>,
}

/// The five standard XBRL link kinds.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Link {
    Presentation(PresentationLink),
    Calculation(CalculationLink),
    Definition(DefinitionLink),
    Label(LabelLink),
    Reference(ReferenceLink),
}

/// A presentation arc between two concepts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PresentationLink {
    pub from: CompactString,
    pub to: CompactString,
    pub order: f32,
    // NOTE(review): inner type lost in extraction; i32 assumed — confirm.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub priority: Option<i32>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub use_attribute: Option<CompactString>,
}

/// A calculation arc; `weight` is the summation coefficient.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CalculationLink {
    pub from: CompactString,
    pub to: CompactString,
    pub weight: f64,
    pub order: f32,
}

/// A definition arc with its arcrole URI.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DefinitionLink {
    pub from: CompactString,
    pub to: CompactString,
    pub arcrole: CompactString,
    pub order: f32,
}

/// A label resource attached to a concept.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabelLink {
    pub concept: CompactString,
    pub label: CompactString,
    pub role: CompactString,
    pub lang: CompactString,
}

/// A reference resource attached to a concept.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceLink {
    pub concept: CompactString,
    pub reference: Reference,
}

/// A reference: role URI plus named reference parts.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Reference {
    pub role: CompactString,
    pub parts: HashMap<CompactString, CompactString>,
}

/// The fully parsed XBRL document. Facts live in columnar `FactStorage`;
/// everything else is plain vectors/maps.
#[derive(Clone, Serialize, Deserialize)]
pub struct Document {
    pub facts: FactStorage,
    pub contexts: Vec<Context>,
    pub units: Vec<Unit>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub tuples: Vec<Tuple>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub footnotes: Vec<Footnote>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub presentation_links: Vec<PresentationLink>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub calculation_links: Vec<CalculationLink>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub definition_links: Vec<DefinitionLink>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub label_links: Vec<LabelLink>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub reference_links: Vec<ReferenceLink>,
    // NOTE(review): the element types of the next four fields are not visible
    // in this hunk (declared elsewhere in the crate); the names below are
    // best guesses — confirm before relying on them.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub custom_links: Vec<Linkbase>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub role_types: Vec<RoleType>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub arcrole_types: Vec<ArcroleType>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub schemas: Vec<Schema>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub dimensions: Vec<CompactString>,
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub concept_names: Vec<CompactString>,
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub namespaces: HashMap<CompactString, CompactString>,
}

impl Default for Document {
    fn default() -> Self {
        Self::new()
    }
}

impl Document {
    /// Creates a document with default capacity hints (sized for a typical
    /// SEC filing: ~10k facts, ~100 contexts, ~50 units).
    pub fn new() -> Self {
        Self {
            facts: FactStorage::with_capacity(10000),
            contexts: Vec::with_capacity(100),
            units: Vec::with_capacity(50),
            tuples: Vec::new(),
            footnotes: Vec::new(),
            presentation_links: Vec::new(),
            calculation_links: Vec::new(),
            definition_links: Vec::new(),
            label_links: Vec::new(),
            reference_links: Vec::new(),
            custom_links: Vec::new(),
            role_types: Vec::new(),
            arcrole_types: Vec::new(),
            schemas: Vec::new(),
            dimensions: Vec::new(),
            concept_names: Vec::new(),
            namespaces: HashMap::new(),
} + } + + pub fn with_capacity(facts: usize, contexts: usize, units: usize) -> Self { + Self { + facts: FactStorage::with_capacity(facts), + contexts: Vec::with_capacity(contexts), + units: Vec::with_capacity(units), + tuples: Vec::new(), + footnotes: Vec::new(), + presentation_links: Vec::new(), + calculation_links: Vec::new(), + definition_links: Vec::new(), + label_links: Vec::new(), + reference_links: Vec::new(), + custom_links: Vec::new(), + role_types: Vec::new(), + arcrole_types: Vec::new(), + schemas: Vec::new(), + dimensions: Vec::new(), + concept_names: Vec::new(), + namespaces: HashMap::new(), + } + } +} diff --git a/rust/crabrl-fork/src/parser.rs b/rust/crabrl-fork/src/parser.rs new file mode 100644 index 0000000..248dab1 --- /dev/null +++ b/rust/crabrl-fork/src/parser.rs @@ -0,0 +1,990 @@ +use crate::allocator::ArenaAllocator; +use crate::model::*; +use crate::simd::SimdScanner; +use crate::{Error, Result}; +use compact_str::CompactString; +use std::path::Path; + +pub struct Parser { + load_schemas: bool, + load_linkbases: bool, + validate: bool, +} + +impl Parser { + pub fn new() -> Self { + Self { + load_schemas: false, + load_linkbases: false, + validate: false, + } + } + + pub fn with_validation(mut self, validate: bool) -> Self { + self.validate = validate; + self + } + + pub fn with_schema_loading(mut self, load: bool) -> Self { + self.load_schemas = load; + self + } + + pub fn with_linkbase_loading(mut self, load: bool) -> Self { + self.load_linkbases = load; + self + } + + pub fn parse_str(&self, content: &str) -> Result { + self.parse_bytes(content.as_bytes()) + } + + pub fn parse_file>(&self, path: P) -> Result { + let content = std::fs::read(path)?; + self.parse_bytes(&content) + } + + pub fn parse_bytes(&self, data: &[u8]) -> Result { + let data = if data.starts_with(&[0xEF, 0xBB, 0xBF]) { + &data[3..] 
+ } else { + data + }; + + let allocator = ArenaAllocator::new(); + let mut parser = FullXbrlParser::new(data, &allocator); + parser.load_schemas = self.load_schemas; + parser.load_linkbases = self.load_linkbases; + parser.validate = self.validate; + parser.parse() + } +} + +impl Default for Parser { + fn default() -> Self { + Self::new() + } +} + +struct FullXbrlParser<'a> { + scanner: SimdScanner<'a>, + allocator: &'a ArenaAllocator, + doc: Document, + in_xbrl_root: bool, + load_schemas: bool, + load_linkbases: bool, + validate: bool, +} + +impl<'a> FullXbrlParser<'a> { + fn new(data: &'a [u8], allocator: &'a ArenaAllocator) -> Self { + Self { + scanner: SimdScanner::new(data), + allocator, + doc: Document::new(), + in_xbrl_root: false, + load_schemas: false, + load_linkbases: false, + validate: false, + } + } + + #[inline(always)] + fn read_tag_name(&mut self) -> Result<&'a str> { + let start = self.scanner.pos; + while let Some(ch) = self.scanner.peek() { + if ch == b' ' || ch == b'>' || ch == b'/' || ch == b'\t' || ch == b'\n' || ch == b'\r' { + break; + } + self.scanner.advance(1); + } + let end = self.scanner.pos; + + if start == end { + return Err(Error::Parse("Empty tag name".to_string())); + } + + std::str::from_utf8(&self.scanner.data[start..end]) + .map_err(|_| Error::Parse("Invalid UTF-8 in tag name".to_string())) + } + + #[inline(always)] + fn parse_attributes(&mut self) -> Result> { + let mut attrs = Vec::new(); + + loop { + self.scanner.skip_whitespace(); + + match self.scanner.peek() { + Some(b'>') => { + break; + } + Some(b'/') => { + self.scanner.advance(1); + if self.scanner.peek() == Some(b'>') { + break; + } + } + None => return Err(Error::Parse("Unexpected EOF in attributes".to_string())), + _ => {} + } + + let name_start = self.scanner.pos; + while let Some(ch) = self.scanner.peek() { + if ch == b'=' || ch == b' ' || ch == b'>' || ch == b'/' { + break; + } + self.scanner.advance(1); + } + + if self.scanner.pos == name_start { + break; + } + 
+ let name = std::str::from_utf8(&self.scanner.data[name_start..self.scanner.pos]) + .map_err(|_| Error::Parse("Invalid UTF-8 in attribute name".to_string()))?; + + self.scanner.skip_whitespace(); + + if self.scanner.peek() != Some(b'=') { + continue; + } + self.scanner.advance(1); + + self.scanner.skip_whitespace(); + + let quote = self + .scanner + .peek() + .ok_or_else(|| Error::Parse("Expected quote".to_string()))?; + + if quote != b'"' && quote != b'\'' { + return Err(Error::Parse("Expected quote in attribute".to_string())); + } + + self.scanner.advance(1); + let value_start = self.scanner.pos; + + while let Some(ch) = self.scanner.peek() { + if ch == quote { + break; + } + self.scanner.advance(1); + } + + let value = std::str::from_utf8(&self.scanner.data[value_start..self.scanner.pos]) + .map_err(|_| Error::Parse("Invalid UTF-8 in attribute value".to_string()))?; + + self.scanner.advance(1); + + attrs.push((name, value)); + } + + Ok(attrs) + } + + #[inline(always)] + fn skip_to_tag_end(&mut self) -> Result<()> { + while let Some(ch) = self.scanner.peek() { + if ch == b'>' { + self.scanner.advance(1); + return Ok(()); + } + self.scanner.advance(1); + } + Err(Error::Parse("Expected '>'".to_string())) + } + + #[inline(always)] + fn read_text_content(&mut self) -> Result<&'a str> { + let start = self.scanner.pos; + while let Some(ch) = self.scanner.peek() { + if ch == b'<' { + break; + } + self.scanner.advance(1); + } + + let text = std::str::from_utf8(&self.scanner.data[start..self.scanner.pos]) + .map_err(|_| Error::Parse("Invalid UTF-8 in text content".to_string()))?; + + Ok(text.trim()) + } + + #[inline(always)] + fn read_text_content_with_cdata(&mut self) -> Result { + let mut result = CompactString::new(""); + let start = self.scanner.pos; + + loop { + if self.scanner.is_eof() { + let text = std::str::from_utf8(&self.scanner.data[start..self.scanner.pos]) + .map_err(|_| Error::Parse("Invalid UTF-8 in text content".to_string()))?; + if !result.is_empty() { 
+ result.push_str(text.trim()); + } else { + result = CompactString::from(text.trim()); + } + break; + } + + if self.scanner.peek() == Some(b'<') { + let text = std::str::from_utf8(&self.scanner.data[start..self.scanner.pos]) + .map_err(|_| Error::Parse("Invalid UTF-8 in text content".to_string()))?; + if !text.trim().is_empty() { + if !result.is_empty() { + result.push_str(text.trim()); + } else { + result = CompactString::from(text.trim()); + } + } + + self.scanner.advance(1); + + if self.peek_ahead(7) == Some(b"![CDATA[") { + self.scanner.advance(8); + let cdata_start = self.scanner.pos; + + while !self.scanner.is_eof() { + if self.scanner.peek() == Some(b']') && self.peek_ahead(3) == Some(b"]]>") { + let cdata = std::str::from_utf8( + &self.scanner.data[cdata_start..self.scanner.pos], + ) + .map_err(|_| { + Error::Parse("Invalid UTF-8 in CDATA section".to_string()) + })?; + result.push_str(cdata); + self.scanner.advance(3); + break; + } + self.scanner.advance(1); + } + } else { + self.scanner.pos = self.scanner.pos.saturating_sub(1); + break; + } + } else { + self.scanner.advance(1); + } + } + + Ok(result) + } + + #[inline(always)] + fn skip_element_from_tag(&mut self) -> Result<()> { + self.skip_to_tag_end()?; + + if self.scanner.pos >= 2 && self.scanner.data[self.scanner.pos - 2] == b'/' { + return Ok(()); + } + + let mut depth = 1; + + while depth > 0 && !self.scanner.is_eof() { + while let Some(ch) = self.scanner.peek() { + if ch == b'<' { + break; + } + self.scanner.advance(1); + } + + if self.scanner.is_eof() { + break; + } + + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + depth -= 1; + } else if self.scanner.peek() != Some(b'!') && self.scanner.peek() != Some(b'?') { + let mut is_self_closing = false; + + while let Some(ch) = self.scanner.peek() { + if ch == b'/' + && self.scanner.pos + 1 < self.scanner.data.len() + && self.scanner.data[self.scanner.pos + 1] == b'>' + { + is_self_closing = true; + } + if ch == b'>' { + 
self.scanner.advance(1); + break; + } + self.scanner.advance(1); + } + + if !is_self_closing { + depth += 1; + } + + continue; + } + + while let Some(ch) = self.scanner.peek() { + if ch == b'>' { + self.scanner.advance(1); + break; + } + self.scanner.advance(1); + } + } + + Ok(()) + } + + #[inline(always)] + fn skip_processing_instruction(&mut self) -> Result<()> { + while !self.scanner.is_eof() { + if self.scanner.peek() == Some(b'?') { + self.scanner.advance(1); + if self.scanner.peek() == Some(b'>') { + self.scanner.advance(1); + return Ok(()); + } + } else { + self.scanner.advance(1); + } + } + Err(Error::Parse("Unclosed processing instruction".to_string())) + } + + #[inline(always)] + fn skip_comment(&mut self) -> Result<()> { + while !self.scanner.is_eof() { + if self.scanner.peek() == Some(b'-') { + self.scanner.advance(1); + if self.scanner.peek() == Some(b'-') { + self.scanner.advance(1); + if self.scanner.peek() == Some(b'>') { + self.scanner.advance(1); + return Ok(()); + } + } + } else { + self.scanner.advance(1); + } + } + Err(Error::Parse("Unclosed comment".to_string())) + } + + #[inline(always)] + fn peek_ahead(&self, n: usize) -> Option<&'a [u8]> { + if self.scanner.pos + n <= self.scanner.data.len() { + Some(&self.scanner.data[self.scanner.pos..self.scanner.pos + n]) + } else { + None + } + } + + #[inline(always)] + fn skip_doctype(&mut self) -> Result<()> { + while !self.scanner.is_eof() { + if self.scanner.peek() == Some(b'>') { + self.scanner.advance(1); + return Ok(()); + } + self.scanner.advance(1); + } + Err(Error::Parse("Unclosed DOCTYPE".to_string())) + } + + #[inline(always)] + fn skip_closing_tag(&mut self, expected_tag: &str) -> Result<()> { + self.scanner.skip_whitespace(); + + if self.scanner.peek() != Some(b'<') { + return Ok(()); + } + + self.scanner.advance(1); + + if self.scanner.peek() != Some(b'/') { + self.scanner.pos = self.scanner.pos.saturating_sub(1); + return Ok(()); + } + + self.scanner.advance(1); + let tag = 
self.read_tag_name()?; + + if tag.ends_with(expected_tag) || tag == expected_tag { + self.skip_to_tag_end()?; + } + + Ok(()) + } + + #[inline(always)] + fn check_self_closing(&self) -> bool { + if self.scanner.pos >= 2 { + self.scanner.data[self.scanner.pos - 2] == b'/' + && self.scanner.data[self.scanner.pos - 1] == b'>' + } else { + false + } + } + + fn parse(&mut self) -> Result { + self.scanner.skip_whitespace(); + + while !self.scanner.is_eof() { + self.scanner.skip_whitespace(); + + if self.scanner.peek() != Some(b'<') { + while self.scanner.peek() != Some(b'<') && !self.scanner.is_eof() { + self.scanner.advance(1); + } + continue; + } + + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'?') { + self.skip_processing_instruction()?; + } else if self.scanner.peek() == Some(b'!') { + if self.peek_ahead(3) == Some(b"!--") { + self.skip_comment()?; + } else { + self.skip_doctype()?; + } + } else if self.scanner.peek() == Some(b'/') { + self.scanner.advance(1); + let tag = self.read_tag_name()?; + self.skip_to_tag_end()?; + if tag == "xbrl" || tag.ends_with(":xbrl") { + self.in_xbrl_root = false; + break; + } + } else { + self.parse_element()?; + } + } + + self.doc.concept_names = self.allocator.get_all_strings(); + Ok(std::mem::take(&mut self.doc)) + } + + fn parse_element(&mut self) -> Result<()> { + let tag_name = self.read_tag_name()?; + + if tag_name == "xbrl" || tag_name.ends_with(":xbrl") { + self.parse_xbrl_root()?; + self.in_xbrl_root = true; + return Ok(()); + } + + if !self.in_xbrl_root { + self.skip_element_from_tag()?; + return Ok(()); + } + + if tag_name.ends_with(":context") || tag_name == "context" { + self.parse_context()?; + } else if tag_name.ends_with(":unit") || tag_name == "unit" { + self.parse_unit()?; + } else if tag_name.contains(':') { + self.parse_fact(tag_name)?; + } else { + self.skip_element_from_tag()?; + } + + Ok(()) + } + + fn parse_xbrl_root(&mut self) -> Result<()> { + let attrs = self.parse_attributes()?; + for 
(name, value) in attrs { + if name.starts_with("xmlns") { + let ns_name = if name.len() > 6 && name.chars().nth(5) == Some(':') { + CompactString::from(&name[6..]) + } else { + CompactString::new("") + }; + self.doc + .namespaces + .insert(ns_name, CompactString::from(value)); + } + } + self.skip_to_tag_end()?; + Ok(()) + } + + fn parse_context(&mut self) -> Result<()> { + let attrs = self.parse_attributes()?; + let id = attrs + .iter() + .find(|(n, _)| *n == "id") + .map(|(_, v)| CompactString::from(*v)) + .ok_or_else(|| Error::Parse("Context missing id".to_string()))?; + + self.skip_to_tag_end()?; + + let mut entity = None; + let mut period = None; + let mut scenario = None; + + loop { + self.scanner.skip_whitespace(); + while self.scanner.peek() != Some(b'<') && !self.scanner.is_eof() { + self.scanner.advance(1); + } + + if self.scanner.is_eof() { + break; + } + + let saved_pos = self.scanner.pos; + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + self.scanner.advance(1); + let tag = self.read_tag_name()?; + if tag.ends_with("context") || tag == "context" { + self.skip_to_tag_end()?; + break; + } + self.scanner.pos = saved_pos; + break; + } + + let tag = self.read_tag_name()?; + if tag.ends_with("entity") { + entity = Some(self.parse_entity()?); + } else if tag.ends_with("period") { + period = Some(self.parse_period()?); + } else if tag.ends_with("scenario") { + scenario = Some(self.parse_scenario()?); + } else { + self.skip_element_from_tag()?; + } + } + + if let (Some(entity), Some(period)) = (entity, period) { + self.doc.contexts.push(Context { + id, + entity, + period, + scenario, + }); + } + + Ok(()) + } + + fn parse_entity(&mut self) -> Result { + let _attrs = self.parse_attributes()?; + self.skip_to_tag_end()?; + + let mut identifier = CompactString::new(""); + let mut scheme = CompactString::new(""); + let mut segment = None; + + loop { + self.scanner.skip_whitespace(); + while self.scanner.peek() != Some(b'<') && 
!self.scanner.is_eof() { + self.scanner.advance(1); + } + + if self.scanner.is_eof() { + break; + } + + let saved_pos = self.scanner.pos; + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + self.scanner.advance(1); + let tag = self.read_tag_name()?; + if tag.ends_with("entity") || tag == "entity" { + self.skip_to_tag_end()?; + break; + } + self.scanner.pos = saved_pos; + break; + } + + let tag = self.read_tag_name()?; + if tag.ends_with("identifier") { + let attrs = self.parse_attributes()?; + scheme = attrs + .iter() + .find(|(n, _)| *n == "scheme") + .map(|(_, v)| CompactString::from(*v)) + .unwrap_or_default(); + self.skip_to_tag_end()?; + identifier = CompactString::from(self.read_text_content()?); + self.skip_closing_tag("identifier")?; + } else if tag.ends_with("segment") { + segment = Some(self.parse_segment()?); + } else { + self.skip_element_from_tag()?; + } + } + + Ok(Entity { + identifier, + scheme, + segment, + }) + } + + fn parse_period(&mut self) -> Result { + let _attrs = self.parse_attributes()?; + self.skip_to_tag_end()?; + + let mut instant = None; + let mut start_date = None; + let mut end_date = None; + let mut forever = false; + + loop { + self.scanner.skip_whitespace(); + if self.scanner.peek() != Some(b'<') { + break; + } + + let saved_pos = self.scanner.pos; + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + self.scanner.advance(1); + let tag = self.read_tag_name()?; + if tag.ends_with("period") { + self.skip_to_tag_end()?; + break; + } + self.scanner.pos = saved_pos; + break; + } + + let tag = self.read_tag_name()?; + if tag.ends_with("instant") { + self.skip_to_tag_end()?; + instant = Some(CompactString::from(self.read_text_content()?)); + self.skip_closing_tag("instant")?; + } else if tag.ends_with("startDate") { + self.skip_to_tag_end()?; + start_date = Some(CompactString::from(self.read_text_content()?)); + self.skip_closing_tag("startDate")?; + } else if tag.ends_with("endDate") { + 
self.skip_to_tag_end()?; + end_date = Some(CompactString::from(self.read_text_content()?)); + self.skip_closing_tag("endDate")?; + } else if tag.ends_with("forever") { + forever = true; + self.skip_element_from_tag()?; + } else { + self.skip_element_from_tag()?; + } + } + + if forever { + Ok(Period::Forever) + } else if let Some(date) = instant { + Ok(Period::Instant { date }) + } else if let (Some(start), Some(end)) = (start_date, end_date) { + Ok(Period::Duration { start, end }) + } else { + Err(Error::Parse("Invalid period".to_string())) + } + } + + fn parse_segment(&mut self) -> Result { + let _attrs = self.parse_attributes()?; + self.skip_to_tag_end()?; + + let mut explicit_members = Vec::new(); + let mut typed_members = Vec::new(); + + loop { + self.scanner.skip_whitespace(); + while self.scanner.peek() != Some(b'<') && !self.scanner.is_eof() { + self.scanner.advance(1); + } + + if self.scanner.is_eof() { + break; + } + + let saved_pos = self.scanner.pos; + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + self.scanner.advance(1); + let tag = self.read_tag_name()?; + if tag.ends_with("segment") || tag == "segment" { + self.skip_to_tag_end()?; + break; + } + self.scanner.pos = saved_pos; + break; + } + + let tag = self.read_tag_name()?; + if tag.ends_with("explicitMember") { + let attrs = self.parse_attributes()?; + let dimension = attrs + .iter() + .find(|(n, _)| *n == "dimension") + .map(|(_, v)| CompactString::from(*v)) + .unwrap_or_default(); + self.skip_to_tag_end()?; + let member = CompactString::from(self.read_text_content()?); + explicit_members.push(DimensionMember { dimension, member }); + self.skip_closing_tag("explicitMember")?; + } else if tag.ends_with("typedMember") { + let attrs = self.parse_attributes()?; + let dimension = attrs + .iter() + .find(|(n, _)| *n == "dimension") + .map(|(_, v)| CompactString::from(*v)) + .unwrap_or_default(); + self.skip_to_tag_end()?; + let value = 
CompactString::from(self.read_text_content()?); + typed_members.push(TypedMember { dimension, value }); + self.skip_closing_tag("typedMember")?; + } else { + self.skip_element_from_tag()?; + } + } + + Ok(Segment { + explicit_members, + typed_members, + }) + } + + fn parse_scenario(&mut self) -> Result { + let segment = self.parse_segment()?; + Ok(Scenario { + explicit_members: segment.explicit_members, + typed_members: segment.typed_members, + }) + } + + fn parse_unit(&mut self) -> Result<()> { + let attrs = self.parse_attributes()?; + let id = attrs + .iter() + .find(|(n, _)| *n == "id") + .map(|(_, v)| CompactString::from(*v)) + .ok_or_else(|| Error::Parse("Unit missing id".to_string()))?; + + self.skip_to_tag_end()?; + + let mut unit_type = None; + + loop { + self.scanner.skip_whitespace(); + if self.scanner.peek() != Some(b'<') { + break; + } + + let saved_pos = self.scanner.pos; + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + self.scanner.advance(1); + let tag = self.read_tag_name()?; + if tag.ends_with("unit") { + self.skip_to_tag_end()?; + break; + } + self.scanner.pos = saved_pos; + break; + } + + let tag = self.read_tag_name()?; + if tag.ends_with("measure") { + self.skip_to_tag_end()?; + let measure_text = self.read_text_content()?; + let measure = self.parse_measure(measure_text); + + if unit_type.is_none() { + unit_type = Some(UnitType::Simple(vec![measure])); + } else if let Some(UnitType::Simple(ref mut measures)) = unit_type { + measures.push(measure); + } + + self.skip_closing_tag("measure")?; + } else if tag.ends_with("divide") { + unit_type = Some(self.parse_unit_divide()?); + } else { + self.skip_element_from_tag()?; + } + } + + if let Some(unit_type) = unit_type { + self.doc.units.push(Unit { id, unit_type }); + } + + Ok(()) + } + + fn parse_unit_divide(&mut self) -> Result { + let _attrs = self.parse_attributes()?; + self.skip_to_tag_end()?; + + let mut numerator = Vec::new(); + let mut denominator = Vec::new(); + + loop 
{ + self.scanner.skip_whitespace(); + if self.scanner.peek() != Some(b'<') { + break; + } + + let saved_pos = self.scanner.pos; + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + self.scanner.advance(1); + let tag = self.read_tag_name()?; + if tag.ends_with("divide") { + self.skip_to_tag_end()?; + break; + } + self.scanner.pos = saved_pos; + break; + } + + let tag = self.read_tag_name()?; + if tag.ends_with("unitNumerator") { + self.skip_to_tag_end()?; + numerator = self.parse_unit_measures()?; + self.skip_closing_tag("unitNumerator")?; + } else if tag.ends_with("unitDenominator") { + self.skip_to_tag_end()?; + denominator = self.parse_unit_measures()?; + self.skip_closing_tag("unitDenominator")?; + } else { + self.skip_element_from_tag()?; + } + } + + Ok(UnitType::Divide { + numerator, + denominator, + }) + } + + fn parse_unit_measures(&mut self) -> Result> { + let mut measures = Vec::new(); + + loop { + self.scanner.skip_whitespace(); + if self.scanner.peek() != Some(b'<') { + break; + } + + let saved_pos = self.scanner.pos; + self.scanner.advance(1); + + if self.scanner.peek() == Some(b'/') { + self.scanner.pos = saved_pos; + break; + } + + let tag = self.read_tag_name()?; + if tag.ends_with("measure") { + self.skip_to_tag_end()?; + let measure_text = self.read_text_content()?; + measures.push(self.parse_measure(measure_text)); + self.skip_closing_tag("measure")?; + } else { + self.scanner.pos = saved_pos; + break; + } + } + + Ok(measures) + } + + fn parse_measure(&self, text: &str) -> Measure { + if let Some(colon_pos) = text.find(':') { + Measure { + namespace: CompactString::from(&text[..colon_pos]), + name: CompactString::from(&text[colon_pos + 1..]), + } + } else { + Measure { + namespace: CompactString::new(""), + name: CompactString::from(text), + } + } + } + + fn parse_fact(&mut self, tag_name: &str) -> Result<()> { + let attrs = self.parse_attributes()?; + + let is_nil = attrs + .iter() + .any(|(n, v)| *n == "xsi:nil" && (*v == 
"true" || *v == "1")); + + let context_ref = attrs + .iter() + .find(|(n, _)| *n == "contextRef") + .map(|(_, v)| CompactString::from(*v)); + + let unit_ref = attrs + .iter() + .find(|(n, _)| *n == "unitRef") + .map(|(_, v)| CompactString::from(*v)); + + let id = attrs + .iter() + .find(|(n, _)| *n == "id") + .map(|(_, v)| CompactString::from(*v)); + + let decimals = attrs + .iter() + .find(|(n, _)| *n == "decimals") + .and_then(|(_, v)| v.parse::().ok()); + + let is_self_closing = self.check_self_closing(); + self.skip_to_tag_end()?; + + let value = if is_self_closing || is_nil { + CompactString::new("") + } else { + let value = self.read_text_content_with_cdata()?; + self.skip_closing_tag(tag_name)?; + value + }; + + if let Some(context_ref) = context_ref { + let concept_id = self.allocator.intern_string(tag_name); + let context_id = self + .doc + .contexts + .iter() + .position(|c| c.id == context_ref) + .map(|i| i as u16) + .unwrap_or(0); + + let unit_id = unit_ref + .as_ref() + .and_then(|u| self.doc.units.iter().position(|unit| unit.id == *u)) + .map(|i| (i + 1) as u16) + .unwrap_or(0); + + let fact_value = if is_nil { + FactValue::Nil + } else if value.is_empty() { + FactValue::Text(self.allocator.intern_string("")) + } else if let Ok(decimal) = value.parse::() { + FactValue::Decimal(decimal) + } else if let Ok(integer) = value.parse::() { + FactValue::Integer(integer) + } else { + FactValue::Text(self.allocator.intern_string(&value)) + }; + + self.doc + .facts + .push(concept_id, context_id, unit_id, fact_value, decimals, id); + } + + Ok(()) + } +} diff --git a/rust/crabrl-fork/src/schema.rs b/rust/crabrl-fork/src/schema.rs new file mode 100644 index 0000000..a154664 --- /dev/null +++ b/rust/crabrl-fork/src/schema.rs @@ -0,0 +1,308 @@ +// Schema loading and validation for XBRL +use crate::model::*; +use crate::validator::ValidationError; +use crate::{Error, Result}; +use compact_str::CompactString; +use std::collections::HashMap; +use std::path::Path; + 
+pub struct SchemaLoader { + cache: HashMap, +} + +impl Default for SchemaLoader { + fn default() -> Self { + Self::new() + } +} + +impl SchemaLoader { + pub fn new() -> Self { + Self { + cache: HashMap::new(), + } + } + + pub fn load_schema>(&mut self, path: P) -> Result<&Schema> { + let path_str = path.as_ref().to_string_lossy(); + let key = CompactString::from(path_str.as_ref()); + + if self.cache.contains_key(&key) { + return Ok(self.cache.get(&key).unwrap()); + } + + let schema = self.parse_schema_file(path)?; + self.cache.insert(key.clone(), schema); + Ok(self.cache.get(&key).unwrap()) + } + + fn parse_schema_file>(&self, path: P) -> Result { + let content = std::fs::read(path)?; + self.parse_schema_bytes(&content) + } + + fn parse_schema_bytes(&self, data: &[u8]) -> Result { + // Simple XML parsing for schema + let mut schema = Schema { + target_namespace: CompactString::new(""), + elements: HashMap::new(), + types: HashMap::new(), + imports: Vec::new(), + }; + + // Skip BOM if present + let data = if data.starts_with(&[0xEF, 0xBB, 0xBF]) { + &data[3..] 
+ } else { + data + }; + + let text = std::str::from_utf8(data) + .map_err(|_| Error::Parse("Invalid UTF-8 in schema".to_string()))?; + + // Extract target namespace + if let Some(ns_start) = text.find("targetNamespace=\"") { + let ns_start = ns_start + 17; + if let Some(ns_end) = text[ns_start..].find('"') { + schema.target_namespace = CompactString::from(&text[ns_start..ns_start + ns_end]); + } + } + + // Parse elements + let mut pos = 0; + while let Some(elem_start) = text[pos..].find("") { + elem_start + end + 2 + } else if let Some(end) = text[elem_start..].find("") { + elem_start + end + 13 + } else { + continue; + }; + + let elem_text = &text[elem_start..elem_end]; + + // Extract element attributes + let mut element = SchemaElement { + name: CompactString::new(""), + element_type: CompactString::new(""), + substitution_group: None, + period_type: None, + balance: None, + abstract_element: elem_text.contains("abstract=\"true\""), + nillable: elem_text.contains("nillable=\"true\""), + }; + + // Extract name + if let Some(name_start) = elem_text.find("name=\"") { + let name_start = name_start + 6; + if let Some(name_end) = elem_text[name_start..].find('"') { + element.name = + CompactString::from(&elem_text[name_start..name_start + name_end]); + } + } + + // Extract type + if let Some(type_start) = elem_text.find("type=\"") { + let type_start = type_start + 6; + if let Some(type_end) = elem_text[type_start..].find('"') { + element.element_type = + CompactString::from(&elem_text[type_start..type_start + type_end]); + } + } + + // Extract substitutionGroup + if let Some(sg_start) = elem_text.find("substitutionGroup=\"") { + let sg_start = sg_start + 19; + if let Some(sg_end) = elem_text[sg_start..].find('"') { + element.substitution_group = + Some(CompactString::from(&elem_text[sg_start..sg_start + sg_end])); + } + } + + // Extract XBRL-specific attributes + if let Some(pt_start) = elem_text.find("xbrli:periodType=\"") { + let pt_start = pt_start + 18; + if let 
Some(pt_end) = elem_text[pt_start..].find('"') { + element.period_type = + Some(CompactString::from(&elem_text[pt_start..pt_start + pt_end])); + } + } + + if let Some(bal_start) = elem_text.find("xbrli:balance=\"") { + let bal_start = bal_start + 15; + if let Some(bal_end) = elem_text[bal_start..].find('"') { + element.balance = Some(CompactString::from( + &elem_text[bal_start..bal_start + bal_end], + )); + } + } + + if !element.name.is_empty() { + schema.elements.insert(element.name.clone(), element); + } + } + + // Parse imports + pos = 0; + while let Some(import_start) = text[pos..].find("") { + let import_text = &text[import_start..import_start + import_end]; + + let mut import = SchemaImport { + namespace: CompactString::new(""), + schema_location: CompactString::new(""), + }; + + if let Some(ns_start) = import_text.find("namespace=\"") { + let ns_start = ns_start + 11; + if let Some(ns_end) = import_text[ns_start..].find('"') { + import.namespace = + CompactString::from(&import_text[ns_start..ns_start + ns_end]); + } + } + + if let Some(loc_start) = import_text.find("schemaLocation=\"") { + let loc_start = loc_start + 16; + if let Some(loc_end) = import_text[loc_start..].find('"') { + import.schema_location = + CompactString::from(&import_text[loc_start..loc_start + loc_end]); + } + } + + schema.imports.push(import); + } + } + + Ok(schema) + } + + pub fn validate_element(&self, name: &str, value: &str, schema: &Schema) -> Result<()> { + if let Some(element) = schema.elements.get(name) { + // Check if element is abstract + if element.abstract_element { + return Err(Error::Validation(format!("Element {} is abstract", name))); + } + + // Validate type + if let Some(type_def) = schema.types.get(&element.element_type) { + self.validate_type(value, type_def)?; + } + + Ok(()) + } else { + // Element not found in schema - might be from imported schema + Ok(()) + } + } + + fn validate_type(&self, value: &str, type_def: &SchemaType) -> Result<()> { + for restriction in 
&type_def.restrictions { + match restriction { + TypeRestriction::MinInclusive(min) => { + if let (Ok(val), Ok(min_val)) = (value.parse::(), min.parse::()) { + if val < min_val { + return Err(Error::Validation(format!( + "Value {} is less than minimum {}", + val, min_val + ))); + } + } + } + TypeRestriction::MaxInclusive(max) => { + if let (Ok(val), Ok(max_val)) = (value.parse::(), max.parse::()) { + if val > max_val { + return Err(Error::Validation(format!( + "Value {} is greater than maximum {}", + val, max_val + ))); + } + } + } + TypeRestriction::Pattern(pattern) => { + if !value.contains(pattern.as_str()) { + return Err(Error::Validation(format!( + "Value {} doesn't match pattern {}", + value, pattern + ))); + } + } + TypeRestriction::MinLength(min) => { + if value.len() < *min { + return Err(Error::Validation(format!( + "Value length {} is less than minimum {}", + value.len(), + min + ))); + } + } + TypeRestriction::MaxLength(max) => { + if value.len() > *max { + return Err(Error::Validation(format!( + "Value length {} is greater than maximum {}", + value.len(), + max + ))); + } + } + _ => {} + } + } + Ok(()) + } +} + +// Schema validator for documents +pub struct SchemaValidator { + schemas: Vec, +} + +impl Default for SchemaValidator { + fn default() -> Self { + Self::new() + } +} + +impl SchemaValidator { + pub fn new() -> Self { + Self { + schemas: Vec::new(), + } + } + + pub fn add_schema(&mut self, schema: Schema) { + self.schemas.push(schema); + } + + pub fn validate_document(&self, doc: &Document) -> Vec { + let errors = Vec::new(); + + for i in 0..doc.facts.len() { + let _concept_id = doc.facts.concept_ids.get(i); + let _value = doc.facts.values.get(i); + } + + for schema in &self.schemas { + for element in schema.elements.values() { + if !element.nillable && !element.abstract_element { + // Check if this required element exists in document + // This would require reverse mapping from concept names to facts + let _found = false; + // if !found { + // 
errors.push(ValidationError::MissingRequiredElement { + // element: name.to_string(), + // }); + // } + } + } + } + + errors + } +} diff --git a/rust/crabrl-fork/src/sec.rs b/rust/crabrl-fork/src/sec.rs new file mode 100644 index 0000000..90e7c74 --- /dev/null +++ b/rust/crabrl-fork/src/sec.rs @@ -0,0 +1,51 @@ +// SEC EDGAR XBRL filing support (local files only) +use crate::{Parser, Document, Result}; +use std::path::Path; + +pub struct SecFilingParser { + parser: Parser, +} + +impl SecFilingParser { + pub fn new() -> Self { + Self { + parser: Parser::new().with_validation(true), + } + } + + pub fn parse_filing>(&self, path: P) -> Result { + self.parser.parse_file(path) + } + + pub fn with_validation(mut self, validate: bool) -> Self { + self.parser = self.parser.with_validation(validate); + self + } +} + +// Test utilities for SEC filings +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_local_sec_filing() { + let parser = SecFilingParser::new(); + + // Test with local test files + if std::path::Path::new("test_data/test_tiny.xbrl").exists() { + match parser.parse_filing("test_data/test_tiny.xbrl") { + Ok(doc) => { + println!("Successfully parsed filing:"); + println!(" Facts: {}", doc.facts.len()); + println!(" Contexts: {}", doc.contexts.len()); + println!(" Units: {}", doc.units.len()); + assert!(doc.contexts.len() > 0, "Should have contexts"); + } + Err(e) => { + eprintln!("Failed to parse filing: {}", e); + } + } + } + } +} diff --git a/rust/crabrl-fork/src/simd.rs b/rust/crabrl-fork/src/simd.rs new file mode 100644 index 0000000..cbfc30e --- /dev/null +++ b/rust/crabrl-fork/src/simd.rs @@ -0,0 +1,303 @@ +use memchr::{memchr, memchr3}; + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + +const XML_TAG_START: u8 = b'<'; +const XML_TAG_END: u8 = b'>'; +const XML_QUOTE: u8 = b'"'; + +#[inline(always)] +pub fn find_tag_start(haystack: &[u8]) -> Option { + memchr(XML_TAG_START, haystack) +} + +#[inline(always)] +pub fn 
find_tag_end(haystack: &[u8]) -> Option { + memchr(XML_TAG_END, haystack) +} + +#[inline(always)] +pub fn find_quote(haystack: &[u8]) -> Option { + memchr(XML_QUOTE, haystack) +} + +#[inline(always)] +pub fn find_any_delimiter(haystack: &[u8]) -> Option { + memchr3(XML_TAG_START, XML_TAG_END, XML_QUOTE, haystack) +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +#[inline] +pub unsafe fn find_pattern_avx2(haystack: &[u8], pattern: &[u8]) -> Option { + if pattern.is_empty() || haystack.len() < pattern.len() { + return None; + } + + let first_byte = _mm256_set1_epi8(pattern[0] as i8); + let mut i = 0; + + while i + 32 <= haystack.len() { + let chunk = _mm256_loadu_si256(haystack.as_ptr().add(i) as *const _); + let cmp = _mm256_cmpeq_epi8(chunk, first_byte); + let mask = _mm256_movemask_epi8(cmp); + + if mask != 0 { + for bit_pos in 0..32 { + if (mask & (1 << bit_pos)) != 0 { + let pos = i + bit_pos; + if pos + pattern.len() <= haystack.len() + && &haystack[pos..pos + pattern.len()] == pattern + { + return Some(pos); + } + } + } + } + i += 32; + } + + while i < haystack.len() - pattern.len() + 1 { + if &haystack[i..i + pattern.len()] == pattern { + return Some(i); + } + i += 1; + } + + None +} + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx2")] +#[inline] +pub unsafe fn skip_whitespace_avx2(data: &[u8], mut pos: usize) -> usize { + let space = _mm256_set1_epi8(0x20); + let tab = _mm256_set1_epi8(0x09); + let newline = _mm256_set1_epi8(0x0A); + let carriage = _mm256_set1_epi8(0x0D); + + while pos + 32 <= data.len() { + let chunk = _mm256_loadu_si256(data.as_ptr().add(pos) as *const _); + + let is_space = _mm256_cmpeq_epi8(chunk, space); + let is_tab = _mm256_cmpeq_epi8(chunk, tab); + let is_newline = _mm256_cmpeq_epi8(chunk, newline); + let is_carriage = _mm256_cmpeq_epi8(chunk, carriage); + + let is_whitespace = _mm256_or_si256( + _mm256_or_si256(is_space, is_tab), + _mm256_or_si256(is_newline, is_carriage), + ); + + let mask = 
_mm256_movemask_epi8(is_whitespace); + + if mask != -1 { + for i in 0..32 { + if (mask & (1 << i)) == 0 { + return pos + i; + } + } + } + + pos += 32; + } + + while pos < data.len() { + match data[pos] { + b' ' | b'\t' | b'\n' | b'\r' => pos += 1, + _ => break, + } + } + + pos +} + +#[inline(always)] +pub fn skip_whitespace(data: &[u8], mut pos: usize) -> usize { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx2") && data.len() - pos >= 32 { + return unsafe { skip_whitespace_avx2(data, pos) }; + } + } + + while pos < data.len() { + match data[pos] { + b' ' | b'\t' | b'\n' | b'\r' => pos += 1, + _ => break, + } + } + pos +} + +#[inline(always)] +pub fn find_pattern(haystack: &[u8], pattern: &[u8]) -> Option { + #[cfg(target_arch = "x86_64")] + { + if is_x86_feature_detected!("avx2") && haystack.len() >= 32 { + return unsafe { find_pattern_avx2(haystack, pattern) }; + } + } + + haystack + .windows(pattern.len()) + .position(|window| window == pattern) +} + +#[inline(always)] +pub fn is_whitespace(byte: u8) -> bool { + matches!(byte, b' ' | b'\t' | b'\n' | b'\r') +} + +#[inline(always)] +pub fn is_name_char(byte: u8) -> bool { + matches!(byte, + b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | + b'-' | b'_' | b'.' 
| b':' + ) +} + +#[inline(always)] +pub fn is_name_start_char(byte: u8) -> bool { + matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b':') +} + +pub struct SimdScanner<'a> { + pub data: &'a [u8], + pub pos: usize, +} + +impl<'a> SimdScanner<'a> { + #[inline(always)] + pub fn new(data: &'a [u8]) -> Self { + Self { data, pos: 0 } + } + + #[inline(always)] + pub fn skip_whitespace(&mut self) { + self.pos = skip_whitespace(self.data, self.pos); + } + + #[inline(always)] + pub fn find_next(&self, byte: u8) -> Option { + memchr(byte, &self.data[self.pos..]).map(|i| self.pos + i) + } + + #[inline(always)] + pub fn find_pattern(&self, pattern: &[u8]) -> Option { + find_pattern(&self.data[self.pos..], pattern).map(|i| self.pos + i) + } + + #[inline(always)] + pub fn advance(&mut self, n: usize) { + self.pos = (self.pos + n).min(self.data.len()); + } + + #[inline(always)] + pub fn peek(&self) -> Option { + self.data.get(self.pos).copied() + } + + #[inline(always)] + pub fn peek_ahead(&self, n: usize) -> Option<&'a [u8]> { + if self.pos + n <= self.data.len() { + Some(&self.data[self.pos..self.pos + n]) + } else { + None + } + } + + #[inline(always)] + pub fn remaining(&self) -> &'a [u8] { + &self.data[self.pos..] + } + + #[inline(always)] + pub fn is_eof(&self) -> bool { + self.pos >= self.data.len() + } + + #[inline(always)] + pub fn read_until(&mut self, byte: u8) -> &'a [u8] { + let start = self.pos; + if let Some(offset) = memchr(byte, &self.data[self.pos..]) { + self.pos += offset; + &self.data[start..self.pos] + } else { + self.pos = self.data.len(); + &self.data[start..] + } + } + + #[inline(always)] + pub fn read_until_any(&mut self, bytes: &[u8]) -> &'a [u8] { + let start = self.pos; + while self.pos < self.data.len() { + if bytes.contains(&self.data[self.pos]) { + return &self.data[start..self.pos]; + } + self.pos += 1; + } + &self.data[start..] 
+ } + + #[inline(always)] + pub fn consume_if(&mut self, byte: u8) -> bool { + if self.peek() == Some(byte) { + self.advance(1); + true + } else { + false + } + } + + #[inline(always)] + pub fn consume_while bool>(&mut self, predicate: F) -> &'a [u8] { + let start = self.pos; + while let Some(byte) = self.peek() { + if predicate(byte) { + self.advance(1); + } else { + break; + } + } + &self.data[start..self.pos] + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_find_pattern() { + let haystack = b""; + let pattern = b"context"; + assert_eq!(find_pattern(haystack, pattern), Some(6)); + } + + #[test] + fn test_skip_whitespace() { + let data = b" \t\n\r"; + assert_eq!(skip_whitespace(data, 0), 6); + } + + #[test] + fn test_scanner_read_until() { + let data = b"hello world>"; + let mut scanner = SimdScanner::new(data); + let result = scanner.read_until(b'>'); + assert_eq!(result, b"hello world"); + assert_eq!(scanner.peek(), Some(b'>')); + } + + #[test] + fn test_scanner_consume_while() { + let data = b"abc123"; + let mut scanner = SimdScanner::new(data); + let result = scanner.consume_while(|b| b.is_ascii_alphabetic()); + assert_eq!(result, b"abc"); + assert_eq!(scanner.peek(), Some(b'1')); + } +} diff --git a/rust/crabrl-fork/src/simple_parser.rs b/rust/crabrl-fork/src/simple_parser.rs new file mode 100644 index 0000000..0f6aa36 --- /dev/null +++ b/rust/crabrl-fork/src/simple_parser.rs @@ -0,0 +1,99 @@ +//! 
Simple working XBRL parser + +use crate::{model::*, Result}; +use std::path::Path; + +#[derive(Default)] +pub struct Parser { + #[allow(dead_code)] + load_linkbases: bool, +} + +impl Parser { + pub fn new() -> Self { + Self::default() + } + + pub fn parse_str(&self, content: &str) -> Result { + self.parse_bytes(content.as_bytes()) + } + + pub fn parse_file>(&self, path: P) -> Result { + let content = std::fs::read(path)?; + self.parse_bytes(&content) + } + + pub fn parse_bytes(&self, data: &[u8]) -> Result { + // Simple XML parsing - just count elements for now + let text = String::from_utf8_lossy(data); + + // Count facts (very simplified) + let fact_count = text.matches(", + pub linkbases: Vec, +} + +pub struct Schema { + pub target_namespace: CompactString, + pub elements: HashMap, +} + +pub struct Element { + pub name: CompactString, + pub element_type: CompactString, + pub substitution_group: Option, + pub period_type: Option, +} + +pub struct Linkbase { + pub role: CompactString, + pub arcs: Vec, +} + +pub struct Arc { + pub from: CompactString, + pub to: CompactString, + pub order: f32, + pub weight: f32, +} + +impl Taxonomy { + pub fn new() -> Self { + Self { + schemas: Vec::new(), + linkbases: Vec::new(), + } + } + + pub fn load_schema(&mut self, _path: &str) -> Result<()> { + Ok(()) + } + + pub fn load_linkbase(&mut self, _path: &str) -> Result<()> { + Ok(()) + } +} diff --git a/rust/crabrl-fork/src/validator.rs b/rust/crabrl-fork/src/validator.rs new file mode 100644 index 0000000..c5bcd21 --- /dev/null +++ b/rust/crabrl-fork/src/validator.rs @@ -0,0 +1,601 @@ +// Comprehensive XBRL validation +use crate::{model::*, Error, Result}; +use std::collections::HashSet; + +#[derive(Debug, Clone)] +pub enum ValidationError { + InvalidContextRef { + fact_index: usize, + context_id: u16, + }, + InvalidUnitRef { + fact_index: usize, + unit_id: u16, + }, + CalculationInconsistency { + concept: String, + expected: f64, + actual: f64, + }, + InvalidDataType { + 
concept: String, + expected_type: String, + actual_value: String, + }, + MissingRequiredElement { + element: String, + }, + DuplicateId { + id: String, + }, +} + +pub struct XbrlValidator { + strict_mode: bool, + #[allow(dead_code)] + check_calculations: bool, + check_duplicates: bool, + check_contexts: bool, + check_units: bool, + #[allow(dead_code)] + check_datatypes: bool, + decimal_tolerance: f64, +} + +impl Default for XbrlValidator { + fn default() -> Self { + Self { + strict_mode: false, + check_calculations: true, + check_duplicates: true, + check_contexts: true, + check_units: true, + check_datatypes: true, + decimal_tolerance: 0.01, + } + } +} + +impl XbrlValidator { + pub fn new() -> Self { + Self::default() + } + + pub fn strict(mut self) -> Self { + self.strict_mode = true; + self + } + + pub fn with_tolerance(mut self, tolerance: f64) -> Self { + self.decimal_tolerance = tolerance; + self + } + + pub fn validate(&self, doc: &mut Document) -> Result<()> { + let mut validation_errors = Vec::new(); + + // Context validation + if self.check_contexts { + validation_errors.extend(self.validate_contexts(doc)); + } + + // Unit validation + if self.check_units { + validation_errors.extend(self.validate_units(doc)); + } + + // Fact validation + validation_errors.extend(self.validate_facts(doc)); + + // Duplicate detection + if self.check_duplicates { + validation_errors.extend(self.check_duplicate_facts(doc)); + } + + // Return error in strict mode if any validation errors + if self.strict_mode && !validation_errors.is_empty() { + return Err(Error::Validation(format!( + "Validation failed with {} errors", + validation_errors.len() + ))); + } + + Ok(()) + } + + fn validate_contexts(&self, doc: &Document) -> Vec { + let mut errors = Vec::new(); + let mut context_ids = HashSet::new(); + + for ctx in &doc.contexts { + // Check for duplicate context IDs + if !context_ids.insert(ctx.id.clone()) { + errors.push(ValidationError::DuplicateId { + id: ctx.id.to_string(), 
+ }); + } + + // Validate entity identifier + if ctx.entity.identifier.is_empty() { + errors.push(ValidationError::MissingRequiredElement { + element: format!("Entity identifier for context {}", ctx.id), + }); + } + + // Validate period + if let Period::Duration { start, end } = &ctx.period { + if start > end { + errors.push(ValidationError::InvalidDataType { + concept: format!("context_{}", ctx.id), + expected_type: "valid period".to_string(), + actual_value: format!("start {} > end {}", start, end), + }); + } + } + } + + errors + } + + fn validate_units(&self, doc: &Document) -> Vec { + let mut errors = Vec::new(); + let mut unit_ids = HashSet::new(); + + for unit in &doc.units { + // Check for duplicate unit IDs + if !unit_ids.insert(unit.id.clone()) { + errors.push(ValidationError::DuplicateId { + id: unit.id.to_string(), + }); + } + + // Validate measures + match &unit.unit_type { + UnitType::Simple(measures) => { + if measures.is_empty() { + errors.push(ValidationError::MissingRequiredElement { + element: format!("Measures for unit {}", unit.id), + }); + } + } + UnitType::Divide { + numerator, + denominator, + } => { + if numerator.is_empty() || denominator.is_empty() { + errors.push(ValidationError::MissingRequiredElement { + element: format!("Numerator/denominator for unit {}", unit.id), + }); + } + } + UnitType::Multiply(measures) => { + if measures.is_empty() { + errors.push(ValidationError::MissingRequiredElement { + element: format!("Measures for unit {}", unit.id), + }); + } + } + } + } + + errors + } + + fn validate_facts(&self, doc: &Document) -> Vec { + let mut errors = Vec::new(); + + // Validate fact references + for i in 0..doc.facts.len() { + if i < doc.facts.context_ids.len() { + let context_id = doc.facts.context_ids[i]; + if context_id as usize >= doc.contexts.len() { + errors.push(ValidationError::InvalidContextRef { + fact_index: i, + context_id, + }); + } + } + + if i < doc.facts.unit_ids.len() { + let unit_id = doc.facts.unit_ids[i]; + if 
unit_id > 0 && unit_id as usize > doc.units.len() { + errors.push(ValidationError::InvalidUnitRef { + fact_index: i, + unit_id, + }); + } + } + } + + errors + } + + fn check_duplicate_facts(&self, doc: &Document) -> Vec { + let mut errors = Vec::new(); + let mut fact_keys = HashSet::new(); + + for i in 0..doc.facts.len() { + if i < doc.facts.concept_ids.len() && i < doc.facts.context_ids.len() { + let key = (doc.facts.concept_ids[i], doc.facts.context_ids[i]); + if !fact_keys.insert(key) && self.strict_mode { + errors.push(ValidationError::DuplicateId { + id: format!("Duplicate fact at index {}", i), + }); + } + } + } + + errors + } +} + +// Type alias for validation rules +type ValidationRule = Box Vec>; + +// Validation context and rules +pub struct ValidationContext { + pub profile: ValidationProfile, + pub custom_rules: Vec, +} + +#[derive(Debug, Clone, Copy)] +pub enum ValidationProfile { + Generic, + SecEdgar, + Ifrs, + UsGaap, +} + +impl ValidationContext { + pub fn new(profile: ValidationProfile) -> Self { + Self { + profile, + custom_rules: Vec::new(), + } + } + + pub fn add_rule(&mut self, rule: F) + where + F: Fn(&Document) -> Vec + 'static, + { + self.custom_rules.push(Box::new(rule)); + } + + pub fn validate(&self, doc: &Document) -> Vec { + let mut errors = Vec::new(); + + // Apply profile-specific rules + match self.profile { + ValidationProfile::SecEdgar => { + errors.extend(sec_validation_rules(doc)); + } + ValidationProfile::Ifrs => { + errors.extend(ifrs_validation_rules(doc)); + } + _ => {} + } + + // Apply custom rules + for rule in &self.custom_rules { + errors.extend(rule(doc)); + } + + errors + } +} + +// SEC EDGAR specific validation rules +pub fn sec_validation_rules(doc: &Document) -> Vec { + let mut errors = Vec::new(); + + // Check for required DEI contexts + let mut has_current_period = false; + let mut has_entity_info = false; + let mut has_dei_elements = false; + + for ctx in &doc.contexts { + // Check for current period context + if 
ctx.id.contains("CurrentYear") + || ctx.id.contains("CurrentPeriod") + || ctx.id.contains("DocumentPeriodEndDate") + { + has_current_period = true; + } + + // Validate CIK format (10 digits) + if ctx.entity.scheme.contains("sec.gov/CIK") { + has_entity_info = true; + let cik = &ctx.entity.identifier; + if cik.len() != 10 || !cik.chars().all(|c| c.is_ascii_digit()) { + errors.push(ValidationError::InvalidDataType { + concept: "CIK".to_string(), + expected_type: "10-digit number".to_string(), + actual_value: cik.to_string(), + }); + } + } + } + + // Check for DEI elements in facts + for i in 0..doc.facts.concept_ids.len() { + if i < doc.concept_names.len() { + let concept = &doc.concept_names[i]; + if concept.contains("dei:") + || concept.contains("DocumentType") + || concept.contains("EntityRegistrantName") + { + has_dei_elements = true; + } + } + } + + // Required elements validation + if !has_current_period { + errors.push(ValidationError::MissingRequiredElement { + element: "Current period context required for SEC filing".to_string(), + }); + } + + if !has_entity_info { + errors.push(ValidationError::MissingRequiredElement { + element: "Entity CIK information required for SEC filing".to_string(), + }); + } + + if !has_dei_elements { + errors.push(ValidationError::MissingRequiredElement { + element: "DEI (Document and Entity Information) elements required".to_string(), + }); + } + + // Validate segment reporting if present + for ctx in &doc.contexts { + if let Some(segment) = &ctx.entity.segment { + // Check explicit members have valid dimension references + for member in &segment.explicit_members { + if member.dimension.is_empty() || member.member.is_empty() { + errors.push(ValidationError::InvalidDataType { + concept: format!("segment_{}", ctx.id), + expected_type: "valid dimension member".to_string(), + actual_value: format!("{}:{}", member.dimension, member.member), + }); + } + } + } + } + + // Validate calculation consistency for monetary items + let mut 
monetary_facts: Vec<(usize, f64)> = Vec::new(); + for i in 0..doc.facts.len() { + if i < doc.facts.values.len() { + if let FactValue::Decimal(val) = &doc.facts.values[i] { + // Check if this is a monetary fact (has USD unit) + if i < doc.facts.unit_ids.len() { + let unit_id = doc.facts.unit_ids[i] as usize; + if unit_id < doc.units.len() { + if let UnitType::Simple(measures) = &doc.units[unit_id].unit_type { + if measures.iter().any(|m| m.name == "USD" || m.name == "usd") { + monetary_facts.push((i, *val)); + } + } + } + } + } + } + } + + // Basic calculation validation - check for reasonable values + for (idx, value) in monetary_facts { + if value.is_nan() || value.is_infinite() { + errors.push(ValidationError::InvalidDataType { + concept: format!("fact_{}", idx), + expected_type: "valid monetary amount".to_string(), + actual_value: format!("{}", value), + }); + } + // Check for suspiciously large values (> $10 trillion) + if value.abs() > 10_000_000_000_000.0 { + errors.push(ValidationError::InvalidDataType { + concept: format!("fact_{}", idx), + expected_type: "reasonable monetary amount".to_string(), + actual_value: format!("${:.2}", value), + }); + } + } + + errors +} + +// IFRS specific validation rules +pub fn ifrs_validation_rules(doc: &Document) -> Vec { + let mut errors = Vec::new(); + + // Check for IFRS-required contexts + let mut has_reporting_period = false; + let mut has_comparative_period = false; + let mut has_entity_info = false; + + for ctx in &doc.contexts { + // Check for reporting period + match &ctx.period { + Period::Duration { start, end: _ } => { + has_reporting_period = true; + // IFRS requires comparative information + if start.contains("PY") + || ctx.id.contains("PriorYear") + || ctx.id.contains("Comparative") + { + has_comparative_period = true; + } + } + Period::Instant { date } => { + if !date.is_empty() { + has_reporting_period = true; + } + } + _ => {} + } + + // Validate entity information + if !ctx.entity.identifier.is_empty() { 
+ has_entity_info = true; + } + } + + // Required contexts validation + if !has_reporting_period { + errors.push(ValidationError::MissingRequiredElement { + element: "Reporting period required for IFRS filing".to_string(), + }); + } + + if !has_comparative_period { + errors.push(ValidationError::MissingRequiredElement { + element: "Comparative period information required by IFRS".to_string(), + }); + } + + if !has_entity_info { + errors.push(ValidationError::MissingRequiredElement { + element: "Entity identification required for IFRS filing".to_string(), + }); + } + + // Validate dimensional structure + let mut dimension_validations = Vec::new(); + for ctx in &doc.contexts { + // Check segment dimensions + if let Some(segment) = &ctx.entity.segment { + for member in &segment.explicit_members { + // IFRS dimensions should follow specific patterns + if !member.dimension.contains(":") { + dimension_validations + .push(format!("Invalid dimension format: {}", member.dimension)); + } + if member.dimension.contains("ifrs") || member.dimension.contains("ifrs-full") { + // Valid IFRS dimension + if member.member.is_empty() { + errors.push(ValidationError::InvalidDataType { + concept: format!("dimension_{}", ctx.id), + expected_type: "valid IFRS dimension member".to_string(), + actual_value: member.dimension.to_string(), + }); + } + } + } + + // Check typed members for IFRS compliance + for typed in &segment.typed_members { + if typed.dimension.contains("ifrs") && typed.value.is_empty() { + errors.push(ValidationError::InvalidDataType { + concept: format!("typed_dimension_{}", ctx.id), + expected_type: "non-empty typed dimension value".to_string(), + actual_value: typed.dimension.to_string(), + }); + } + } + } + + // Check scenario dimensions (alternative to segment) + if let Some(scenario) = &ctx.scenario { + for member in &scenario.explicit_members { + if member.dimension.contains("ifrs") && member.member.is_empty() { + errors.push(ValidationError::InvalidDataType { + 
concept: format!("scenario_dimension_{}", ctx.id), + expected_type: "valid IFRS scenario member".to_string(), + actual_value: member.dimension.to_string(), + }); + } + } + } + } + + // Check for mandatory IFRS disclosures in facts + let mut has_financial_position = false; + let mut has_comprehensive_income = false; + let mut has_cash_flows = false; + let mut has_changes_in_equity = false; + + for i in 0..doc.concept_names.len() { + let concept = &doc.concept_names[i]; + let lower = concept.to_lowercase(); + + if lower.contains("financialposition") + || lower.contains("balancesheet") + || lower.contains("assets") + || lower.contains("liabilities") + { + has_financial_position = true; + } + + if lower.contains("comprehensiveincome") + || lower.contains("profitorloss") + || lower.contains("income") + || lower.contains("revenue") + { + has_comprehensive_income = true; + } + + if lower.contains("cashflow") || lower.contains("cashflows") { + has_cash_flows = true; + } + + if lower.contains("changesinequity") || lower.contains("equity") { + has_changes_in_equity = true; + } + } + + // Validate mandatory statements + if !has_financial_position { + errors.push(ValidationError::MissingRequiredElement { + element: "Statement of Financial Position required by IFRS".to_string(), + }); + } + + if !has_comprehensive_income { + errors.push(ValidationError::MissingRequiredElement { + element: "Statement of Comprehensive Income required by IFRS".to_string(), + }); + } + + if !has_cash_flows { + errors.push(ValidationError::MissingRequiredElement { + element: "Statement of Cash Flows required by IFRS".to_string(), + }); + } + + if !has_changes_in_equity { + errors.push(ValidationError::MissingRequiredElement { + element: "Statement of Changes in Equity required by IFRS".to_string(), + }); + } + + // Validate presentation linkbase relationships + for link in &doc.presentation_links { + // Check order is valid (typically 1.0 to 999.0) + if link.order < 0.0 || link.order > 1000.0 { + 
errors.push(ValidationError::InvalidDataType { + concept: format!("presentation_link_{}_{}", link.from, link.to), + expected_type: "valid presentation order (0-1000)".to_string(), + actual_value: format!("{}", link.order), + }); + } + } + + // Validate calculation relationships + for link in &doc.calculation_links { + // Check weight is reasonable (-1.0 or 1.0 typically) + if link.weight != 1.0 && link.weight != -1.0 && link.weight != 0.0 { + // Unusual weight, might be an error + if link.weight.abs() > 10.0 { + errors.push(ValidationError::InvalidDataType { + concept: format!("calculation_link_{}_{}", link.from, link.to), + expected_type: "reasonable calculation weight".to_string(), + actual_value: format!("{}", link.weight), + }); + } + } + } + + errors +}