"""Explore the PrIMuS dataset to understand accidental distribution and image structure.""" import glob import os from collections import Counter from PIL import Image import numpy as np DATASET_ROOT = r"C:\src\accidentals\dataset" def parse_agnostic(path: str) -> list[str]: """Parse an agnostic encoding file into a list of tokens.""" with open(path, "r", encoding="utf-8") as f: return f.read().strip().split("\t") def main(): # Find all agnostic files patterns = [ os.path.join(DATASET_ROOT, "package_aa", "*", "*.agnostic"), os.path.join(DATASET_ROOT, "package_ab", "*", "*.agnostic"), ] agnostic_files = [] for p in patterns: agnostic_files.extend(glob.glob(p)) print(f"Total incipits: {len(agnostic_files)}") # Count accidental tokens accidental_type_counts = Counter() # sharp/flat/natural accidental_full_counts = Counter() # full token like accidental.sharp-L5 incipits_with_accidentals = 0 incipits_with_inline_accidentals = 0 # accidentals that aren't in key sig all_symbol_types = Counter() total_accidentals = 0 for path in agnostic_files: tokens = parse_agnostic(path) has_any_accidental = False has_inline = False past_time_sig = False for tok in tokens: # Track symbol types (just the prefix) base = tok.split("-")[0] if "-" in tok else tok all_symbol_types[base] += 1 if tok.startswith("digit."): past_time_sig = True if tok.startswith("accidental."): has_any_accidental = True total_accidentals += 1 # Extract type: accidental.sharp, accidental.flat, etc. acc_type = tok.split("-")[0] # e.g. "accidental.sharp" accidental_type_counts[acc_type] += 1 accidental_full_counts[tok] += 1 if past_time_sig: has_inline = True if has_any_accidental: incipits_with_accidentals += 1 if has_inline: incipits_with_inline_accidentals += 1 print(f"\n=== Accidental Statistics ===") print(f"Total accidental tokens: {total_accidentals}") print(f"Incipits with any accidentals: {incipits_with_accidentals} / {len(agnostic_files)} ({100*incipits_with_accidentals/len(agnostic_files):.1f}%)") print(f"Incipits with inline accidentals: {incipits_with_inline_accidentals} / {len(agnostic_files)} ({100*incipits_with_inline_accidentals/len(agnostic_files):.1f}%)") print(f"\n=== Accidental Type Counts ===") for acc_type, count in accidental_type_counts.most_common(): print(f" {acc_type:25s} {count:7d}") print(f"\n=== Top 20 Accidental Positions ===") for tok, count in accidental_full_counts.most_common(20): print(f" {tok:30s} {count:7d}") print(f"\n=== Top 30 Symbol Types ===") for sym, count in all_symbol_types.most_common(30): print(f" {sym:30s} {count:7d}") # Image statistics from a sample print(f"\n=== Image Statistics (sample of 500) ===") png_files = glob.glob(os.path.join(DATASET_ROOT, "package_aa", "*", "*.png"))[:500] widths, heights = [], [] for f in png_files: im = Image.open(f) widths.append(im.size[0]) heights.append(im.size[1]) widths = np.array(widths) heights = np.array(heights) print(f" Width: min={widths.min()}, max={widths.max()}, mean={widths.mean():.0f}, std={widths.std():.0f}") print(f" Height: min={heights.min()}, max={heights.max()}, mean={heights.mean():.0f}, std={heights.std():.0f}") print(f" Modes: {Counter(Image.open(f).mode for f in png_files[:50])}") if __name__ == "__main__": main()