# accidentals/explore_dataset.py
#
# Provenance (pasted commit metadata, commented out so the file parses):
#   dullfig ebc925482e Initial commit: accidental classifier (sharp/flat/natural)
#   CNN trained on PrIMuS crops achieves 100% on held-out test set.
#   Includes training pipeline, evaluation script, extraction tools,
#   and saved model weights.
#   Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#   2026-02-02 08:01:37 -08:00
#   102 lines / 3.7 KiB / Python

"""Explore the PrIMuS dataset to understand accidental distribution and image structure."""
import glob
import os
from collections import Counter
from PIL import Image
import numpy as np
# Root of the extracted PrIMuS dataset (raw string: Windows path with backslashes).
# Expected layout below this: package_aa/<id>/<id>.agnostic and .png, package_ab/...
DATASET_ROOT = r"C:\src\accidentals\dataset"
def parse_agnostic(path: str) -> list[str]:
    """Read *path* and return its tab-separated agnostic-encoding tokens."""
    with open(path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents.strip().split("\t")
def main():
    """Explore the dataset: print accidental-token statistics and image-size stats.

    Reads agnostic encoding files and PNG crops from DATASET_ROOT and prints
    several summary tables to stdout. No return value.
    """
    # Find all agnostic files across both dataset packages.
    patterns = [
        os.path.join(DATASET_ROOT, "package_aa", "*", "*.agnostic"),
        os.path.join(DATASET_ROOT, "package_ab", "*", "*.agnostic"),
    ]
    agnostic_files = []
    for pattern in patterns:
        agnostic_files.extend(glob.glob(pattern))
    print(f"Total incipits: {len(agnostic_files)}")
    if not agnostic_files:
        # Guard: the percentage lines below divide by len(agnostic_files).
        print("No agnostic files found under DATASET_ROOT; nothing to analyze.")
        return
    _report_accidental_stats(agnostic_files)
    _report_image_stats()


def _report_accidental_stats(agnostic_files):
    """Count accidental tokens and symbol types across incipits; print tables."""
    accidental_type_counts = Counter()  # type only, e.g. "accidental.sharp"
    accidental_full_counts = Counter()  # full token, e.g. "accidental.sharp-L5"
    incipits_with_accidentals = 0
    incipits_with_inline_accidentals = 0  # accidentals that aren't in the key sig
    all_symbol_types = Counter()
    total_accidentals = 0
    for path in agnostic_files:
        tokens = parse_agnostic(path)
        has_any_accidental = False
        has_inline = False
        past_time_sig = False
        for tok in tokens:
            # Track symbol types (just the prefix before the staff-position suffix).
            base = tok.split("-")[0] if "-" in tok else tok
            all_symbol_types[base] += 1
            if tok.startswith("digit."):
                # Time-signature digits end the clef/key-signature prelude, so any
                # accidental seen after this point is an inline (note) accidental.
                past_time_sig = True
            if tok.startswith("accidental."):
                has_any_accidental = True
                total_accidentals += 1
                # Extract type: accidental.sharp, accidental.flat, etc.
                acc_type = tok.split("-")[0]
                accidental_type_counts[acc_type] += 1
                accidental_full_counts[tok] += 1
                if past_time_sig:
                    has_inline = True
        if has_any_accidental:
            incipits_with_accidentals += 1
        if has_inline:
            incipits_with_inline_accidentals += 1
    n = len(agnostic_files)  # caller guarantees n > 0
    print("\n=== Accidental Statistics ===")
    print(f"Total accidental tokens: {total_accidentals}")
    print(f"Incipits with any accidentals: {incipits_with_accidentals} / {n} ({100*incipits_with_accidentals/n:.1f}%)")
    print(f"Incipits with inline accidentals: {incipits_with_inline_accidentals} / {n} ({100*incipits_with_inline_accidentals/n:.1f}%)")
    print("\n=== Accidental Type Counts ===")
    for acc_type, count in accidental_type_counts.most_common():
        print(f" {acc_type:25s} {count:7d}")
    print("\n=== Top 20 Accidental Positions ===")
    for tok, count in accidental_full_counts.most_common(20):
        print(f" {tok:30s} {count:7d}")
    print("\n=== Top 30 Symbol Types ===")
    for sym, count in all_symbol_types.most_common(30):
        print(f" {sym:30s} {count:7d}")


def _report_image_stats():
    """Sample up to 500 PNG crops and print width/height stats and image modes."""
    print("\n=== Image Statistics (sample of 500) ===")
    png_files = glob.glob(os.path.join(DATASET_ROOT, "package_aa", "*", "*.png"))[:500]
    if not png_files:
        # Guard: widths.min() etc. would fail on an empty array.
        print(" No PNG files found.")
        return
    widths, heights = [], []
    modes = Counter()
    for idx, path in enumerate(png_files):
        # Context manager closes the underlying file handle; the original left
        # every Image.open() handle open (up to 550 leaked descriptors).
        with Image.open(path) as im:
            widths.append(im.size[0])
            heights.append(im.size[1])
            if idx < 50:
                # Collect modes in this pass instead of re-opening the first
                # 50 files a second time, as the original did.
                modes[im.mode] += 1
    widths = np.array(widths)
    heights = np.array(heights)
    print(f" Width: min={widths.min()}, max={widths.max()}, mean={widths.mean():.0f}, std={widths.std():.0f}")
    print(f" Height: min={heights.min()}, max={heights.max()}, mean={heights.mean():.0f}, std={heights.std():.0f}")
    print(f" Modes: {modes}")


if __name__ == "__main__":
    main()