# accidentals/explore_dataset.py
#
# Provenance (pasted commit metadata, commented out so the file parses):
#   dullfig ebc925482e Initial commit: accidental classifier (sharp/flat/natural)
#   CNN trained on PrIMuS crops achieves 100% on held-out test set.
#   Includes training pipeline, evaluation script, extraction tools,
#   and saved model weights.
#   Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#   2026-02-02 08:01:37 -08:00
#   102 lines / 3.7 KiB / Python

"""Explore the PrIMuS dataset to understand accidental distribution and image structure."""
import glob
import os
from collections import Counter
from PIL import Image
import numpy as np
# Root of the extracted PrIMuS dataset (raw string: Windows path with backslashes).
# Expected layout below this: package_aa/<id>/<id>.agnostic and .png, package_ab/...
DATASET_ROOT = r"C:\src\accidentals\dataset"
def parse_agnostic(path: str) -> list[str]:
    """Read *path* and return its tab-separated agnostic-encoding tokens."""
    with open(path, "r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents.strip().split("\t")
def main():
    """Explore the dataset: print accidental-token statistics and image-size stats.

    Reads agnostic encoding files and PNG crops from DATASET_ROOT and prints
    several summary tables to stdout. No return value.
    """
    # Find all agnostic files across both dataset packages.
    patterns = [
        os.path.join(DATASET_ROOT, "package_aa", "*", "*.agnostic"),
        os.path.join(DATASET_ROOT, "package_ab", "*", "*.agnostic"),
    ]
    agnostic_files = []
    for pattern in patterns:
        agnostic_files.extend(glob.glob(pattern))
    print(f"Total incipits: {len(agnostic_files)}")
    if not agnostic_files:
        # Guard: the percentage lines below divide by len(agnostic_files).
        print("No agnostic files found under DATASET_ROOT; nothing to analyze.")
        return
    _report_accidental_stats(agnostic_files)
    _report_image_stats()


def _report_accidental_stats(agnostic_files):
    """Count accidental tokens and symbol types across incipits; print tables."""
    accidental_type_counts = Counter()  # type only, e.g. "accidental.sharp"
    accidental_full_counts = Counter()  # full token, e.g. "accidental.sharp-L5"
    incipits_with_accidentals = 0
    incipits_with_inline_accidentals = 0  # accidentals that aren't in the key sig
    all_symbol_types = Counter()
    total_accidentals = 0
    for path in agnostic_files:
        tokens = parse_agnostic(path)
        has_any_accidental = False
        has_inline = False
        past_time_sig = False
        for tok in tokens:
            # Track symbol types (just the prefix before the staff-position suffix).
            base = tok.split("-")[0] if "-" in tok else tok
            all_symbol_types[base] += 1
            if tok.startswith("digit."):
                # Time-signature digits end the clef/key-signature prelude, so any
                # accidental seen after this point is an inline (note) accidental.
                past_time_sig = True
            if tok.startswith("accidental."):
                has_any_accidental = True
                total_accidentals += 1
                # Extract type: accidental.sharp, accidental.flat, etc.
                acc_type = tok.split("-")[0]
                accidental_type_counts[acc_type] += 1
                accidental_full_counts[tok] += 1
                if past_time_sig:
                    has_inline = True
        if has_any_accidental:
            incipits_with_accidentals += 1
        if has_inline:
            incipits_with_inline_accidentals += 1
    n = len(agnostic_files)  # caller guarantees n > 0
    print("\n=== Accidental Statistics ===")
    print(f"Total accidental tokens: {total_accidentals}")
    print(f"Incipits with any accidentals: {incipits_with_accidentals} / {n} ({100*incipits_with_accidentals/n:.1f}%)")
    print(f"Incipits with inline accidentals: {incipits_with_inline_accidentals} / {n} ({100*incipits_with_inline_accidentals/n:.1f}%)")
    print("\n=== Accidental Type Counts ===")
    for acc_type, count in accidental_type_counts.most_common():
        print(f" {acc_type:25s} {count:7d}")
    print("\n=== Top 20 Accidental Positions ===")
    for tok, count in accidental_full_counts.most_common(20):
        print(f" {tok:30s} {count:7d}")
    print("\n=== Top 30 Symbol Types ===")
    for sym, count in all_symbol_types.most_common(30):
        print(f" {sym:30s} {count:7d}")


def _report_image_stats():
    """Sample up to 500 PNG crops and print width/height stats and image modes."""
    print("\n=== Image Statistics (sample of 500) ===")
    png_files = glob.glob(os.path.join(DATASET_ROOT, "package_aa", "*", "*.png"))[:500]
    if not png_files:
        # Guard: widths.min() etc. would fail on an empty array.
        print(" No PNG files found.")
        return
    widths, heights = [], []
    modes = Counter()
    for idx, path in enumerate(png_files):
        # Context manager closes the underlying file handle; the original left
        # every Image.open() handle open (up to 550 leaked descriptors).
        with Image.open(path) as im:
            widths.append(im.size[0])
            heights.append(im.size[1])
            if idx < 50:
                # Collect modes in this pass instead of re-opening the first
                # 50 files a second time, as the original did.
                modes[im.mode] += 1
    widths = np.array(widths)
    heights = np.array(heights)
    print(f" Width: min={widths.min()}, max={widths.max()}, mean={widths.mean():.0f}, std={widths.std():.0f}")
    print(f" Height: min={heights.min()}, max={heights.max()}, mean={heights.mean():.0f}, std={heights.std():.0f}")
    print(f" Modes: {modes}")


if __name__ == "__main__":
    main()