# Part of an accidental-classification project: a CNN trained on PrIMuS crops
# (training pipeline, evaluation script, extraction tools, saved model weights).
# This module is the dataset-exploration script.
"""Explore the PrIMuS dataset to understand accidental distribution and image structure."""
|
|
|
|
import glob
|
|
import os
|
|
from collections import Counter
|
|
from PIL import Image
|
|
import numpy as np
|
|
|
|
# Absolute root of the locally extracted PrIMuS dataset (Windows layout with
# "package_aa" / "package_ab" subdirectories) — adjust per machine.
DATASET_ROOT = r"C:\src\accidentals\dataset"
|
|
|
|
|
|
def parse_agnostic(path: str) -> list[str]:
    """Read the agnostic encoding file at *path* and return its tab-separated tokens.

    The file is a single line of tokens (e.g. ``clef.G-L2``) joined by tabs;
    surrounding whitespace is stripped before splitting.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return raw.strip().split("\t")
|
|
|
|
|
|
def main():
    """Explore the PrIMuS dataset: tally accidental tokens and sample image sizes.

    Scans every ``*.agnostic`` file under both dataset packages, counts
    accidental types/positions and all symbol-type prefixes, then reports
    width/height/mode statistics over a sample of the PNG crops.
    """
    # Find all agnostic files
    patterns = [
        os.path.join(DATASET_ROOT, "package_aa", "*", "*.agnostic"),
        os.path.join(DATASET_ROOT, "package_ab", "*", "*.agnostic"),
    ]
    agnostic_files = []
    for p in patterns:
        agnostic_files.extend(glob.glob(p))

    print(f"Total incipits: {len(agnostic_files)}")
    if not agnostic_files:
        # Guard: the percentage reports below would divide by zero.
        print("No agnostic files found; check DATASET_ROOT.")
        return

    # Count accidental tokens
    accidental_type_counts = Counter()  # sharp/flat/natural
    accidental_full_counts = Counter()  # full token like accidental.sharp-L5
    incipits_with_accidentals = 0
    incipits_with_inline_accidentals = 0  # accidentals that aren't in key sig
    all_symbol_types = Counter()
    total_accidentals = 0

    for path in agnostic_files:
        tokens = parse_agnostic(path)
        has_any_accidental = False
        has_inline = False
        # Heuristic: accidentals appearing after the first time-signature digit
        # are treated as "inline" (not part of the key signature).
        # NOTE(review): a non-numeric meter token (e.g. a common-time symbol)
        # would never flip this flag — confirm against the token vocabulary.
        past_time_sig = False

        for tok in tokens:
            # Track symbol types (just the prefix before the position suffix)
            base = tok.split("-")[0] if "-" in tok else tok
            all_symbol_types[base] += 1

            if tok.startswith("digit."):
                past_time_sig = True

            if tok.startswith("accidental."):
                has_any_accidental = True
                total_accidentals += 1
                # Extract type: accidental.sharp, accidental.flat, etc.
                acc_type = tok.split("-")[0]  # e.g. "accidental.sharp"
                accidental_type_counts[acc_type] += 1
                accidental_full_counts[tok] += 1

                if past_time_sig:
                    has_inline = True

        if has_any_accidental:
            incipits_with_accidentals += 1
        if has_inline:
            incipits_with_inline_accidentals += 1

    print("\n=== Accidental Statistics ===")
    print(f"Total accidental tokens: {total_accidentals}")
    print(f"Incipits with any accidentals: {incipits_with_accidentals} / {len(agnostic_files)} ({100*incipits_with_accidentals/len(agnostic_files):.1f}%)")
    print(f"Incipits with inline accidentals: {incipits_with_inline_accidentals} / {len(agnostic_files)} ({100*incipits_with_inline_accidentals/len(agnostic_files):.1f}%)")

    print("\n=== Accidental Type Counts ===")
    for acc_type, count in accidental_type_counts.most_common():
        print(f" {acc_type:25s} {count:7d}")

    print("\n=== Top 20 Accidental Positions ===")
    for tok, count in accidental_full_counts.most_common(20):
        print(f" {tok:30s} {count:7d}")

    print("\n=== Top 30 Symbol Types ===")
    for sym, count in all_symbol_types.most_common(30):
        print(f" {sym:30s} {count:7d}")

    # Image statistics from a sample
    print("\n=== Image Statistics (sample of 500) ===")
    png_files = glob.glob(os.path.join(DATASET_ROOT, "package_aa", "*", "*.png"))[:500]
    widths, heights = [], []
    mode_counts = Counter()
    for i, png_path in enumerate(png_files):
        # Image.open is lazy and keeps the file descriptor open; the context
        # manager closes it promptly instead of leaking up to 500 handles.
        # Modes are tallied here for the first 50 images, so the files are
        # not reopened a second time for the Modes report below.
        with Image.open(png_path) as im:
            widths.append(im.size[0])
            heights.append(im.size[1])
            if i < 50:
                mode_counts[im.mode] += 1

    if not png_files:
        # min()/max() on empty arrays would raise ValueError.
        print(" No PNG files found under package_aa.")
        return

    widths = np.array(widths)
    heights = np.array(heights)
    print(f" Width: min={widths.min()}, max={widths.max()}, mean={widths.mean():.0f}, std={widths.std():.0f}")
    print(f" Height: min={heights.min()}, max={heights.max()}, mean={heights.mean():.0f}, std={heights.std():.0f}")
    print(f" Modes: {mode_counts}")
|
|
|
|
|
|
# Entry point: run the exploration only when executed as a script, so the
# module can be imported (e.g. to reuse parse_agnostic) without side effects.
if __name__ == "__main__":
    main()
|