From 5badd5e483d06794f05ef13bbff80c7e9bd38546 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 21 Mar 2025 13:28:49 -0500 Subject: [PATCH 01/28] Initial loading from js data experiments --- .gitignore | 1 + .pre-commit-config.yaml | 3 +- pyabc2/sources/_lzstring.py | 444 ++++++++++++++++++++++++++++++++++++ pyabc2/sources/eskin.py | 37 +++ pyproject.toml | 3 + 5 files changed, 487 insertions(+), 1 deletion(-) create mode 100644 pyabc2/sources/_lzstring.py create mode 100644 pyabc2/sources/eskin.py diff --git a/.gitignore b/.gitignore index d749ce2..e40cb3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .vscode pyabc2/sources/_* !pyabc2/sources/__init__.py +!pyabc2/sources/_lzstring.py poetry.lock venv*/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 89e1c73..e708ec5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,12 +14,13 @@ repos: rev: '6.0.1' hooks: - id: isort + exclude: ^pyabc2/sources/_lzstring.py - repo: https://github.com/psf/black rev: '25.1.0' hooks: - id: black - exclude: ^examples/ + exclude: ^examples/|^pyabc2/sources/_lzstring.py - repo: https://github.com/csachs/pyproject-flake8 rev: 'v7.0.0' diff --git a/pyabc2/sources/_lzstring.py b/pyabc2/sources/_lzstring.py new file mode 100644 index 0000000..bc7aacd --- /dev/null +++ b/pyabc2/sources/_lzstring.py @@ -0,0 +1,444 @@ +# https://github.com/gkovacs/lz-string-python/blob/f1c109544413c1ba910c4af99337c14da1680441/lzstring/__init__.py +# - Remove dep on future, fix 3.12 support (PR#6) +# - Remove unused imports +# - Fix UTF-16 decompress (PR#7) + +from __future__ import division +from __future__ import unicode_literals +from __future__ import print_function +from __future__ import absolute_import +from builtins import range +from builtins import chr +from builtins import object +import math + + +keyStrBase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" +keyStrUriSafe = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-$" +baseReverseDic = {}; + +class Object(object): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + +def getBaseValue(alphabet, character): + if alphabet not in baseReverseDic: + baseReverseDic[alphabet] = {} + for i in range(len(alphabet)): + baseReverseDic[alphabet][alphabet[i]] = i + return baseReverseDic[alphabet][character] + + +def _compress(uncompressed, bitsPerChar, getCharFromInt): + if (uncompressed is None): + return "" + + context_dictionary = {} + context_dictionaryToCreate= {} + context_c = "" + context_wc = "" + context_w = "" + context_enlargeIn = 2 # Compensate for the first entry which should not count + context_dictSize = 3 + context_numBits = 2 + context_data = [] + context_data_val = 0 + context_data_position = 0 + + for ii in range(len(uncompressed)): + if isinstance(uncompressed, (bytes)): + context_c = chr(uncompressed[ii]) + else: + context_c = uncompressed[ii] + if context_c not in context_dictionary: + context_dictionary[context_c] = context_dictSize + context_dictSize += 1 + context_dictionaryToCreate[context_c] = True + + context_wc = context_w + context_c + if context_wc in context_dictionary: + context_w = context_wc + else: + if context_w in context_dictionaryToCreate: + if ord(context_w[0]) < 256: + for i in range(context_numBits): + context_data_val = (context_data_val << 1) + if context_data_position == bitsPerChar-1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = ord(context_w[0]) + for i in range(8): + context_data_val = (context_data_val << 1) | (value & 1) + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = value >> 1 + + else: + value = 1 + for i in range(context_numBits): + context_data_val = (context_data_val << 1) | value + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = 0 + value = ord(context_w[0]) + for i in range(16): + context_data_val = (context_data_val << 1) | (value & 1) + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = value >> 1 + context_enlargeIn -= 1 + if context_enlargeIn == 0: + context_enlargeIn = math.pow(2, context_numBits) + context_numBits += 1 + del context_dictionaryToCreate[context_w] + else: + value = context_dictionary[context_w] + for i in range(context_numBits): + context_data_val = (context_data_val << 1) | (value & 1) + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = value >> 1 + + context_enlargeIn -= 1 + if context_enlargeIn == 0: + context_enlargeIn = math.pow(2, context_numBits) + context_numBits += 1 + + # Add wc to the dictionary. + context_dictionary[context_wc] = context_dictSize + context_dictSize += 1 + context_w = str(context_c) + + # Output the code for w. + if context_w != "": + if context_w in context_dictionaryToCreate: + if ord(context_w[0]) < 256: + for i in range(context_numBits): + context_data_val = (context_data_val << 1) + if context_data_position == bitsPerChar-1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = ord(context_w[0]) + for i in range(8): + context_data_val = (context_data_val << 1) | (value & 1) + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = value >> 1 + else: + value = 1 + for i in range(context_numBits): + context_data_val = (context_data_val << 1) | value + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = 0 + value = ord(context_w[0]) + for i in range(16): + context_data_val = (context_data_val << 1) | (value & 1) + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = value >> 1 + context_enlargeIn -= 1 + if context_enlargeIn == 0: + context_enlargeIn = math.pow(2, context_numBits) + context_numBits += 1 + del context_dictionaryToCreate[context_w] + else: + value = context_dictionary[context_w] + for i in range(context_numBits): + context_data_val = (context_data_val << 1) | (value & 1) + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = value >> 1 + + context_enlargeIn -= 1 + if context_enlargeIn == 0: + context_enlargeIn = math.pow(2, context_numBits) + context_numBits += 1 + + # Mark the end of the stream + value = 2 + for i in range(context_numBits): + context_data_val = (context_data_val << 1) | (value & 1) + if context_data_position == bitsPerChar - 1: + context_data_position = 0 + context_data.append(getCharFromInt(context_data_val)) + context_data_val = 0 + else: + context_data_position += 1 + value = value >> 1 + + # Flush the last char + while True: + context_data_val = (context_data_val << 1) + if context_data_position == bitsPerChar - 1: + context_data.append(getCharFromInt(context_data_val)) + break + else: + context_data_position += 1 + + return "".join(context_data) + + +def _decompress(length, resetValue, getNextValue): + dictionary = {} + enlargeIn = 4 + dictSize = 4 + numBits = 3 + entry = "" + result = [] + + data = Object( + val=getNextValue(0), + position=resetValue, + index=1 + ) + + for i in range(3): + dictionary[i] = i + + bits = 0 + maxpower = math.pow(2, 2) + power = 1 + + while power != maxpower: + resb = data.val & data.position + data.position >>= 1 + if data.position == 0: + data.position = resetValue + data.val = getNextValue(data.index) + data.index += 1 + + bits |= power if resb > 0 else 0 + power <<= 1; + + next = bits + if next == 0: + bits = 0 + maxpower = math.pow(2, 8) + power = 1 + while power != maxpower: + resb = data.val & data.position + data.position >>= 1 + if data.position == 0: + data.position = resetValue + data.val = getNextValue(data.index) + data.index += 1 + bits |= power if resb > 0 else 0 + power <<= 1 + c = chr(bits) + elif next == 1: + bits = 0 + maxpower = math.pow(2, 16) + power = 1 + while power != maxpower: + resb = data.val & data.position + data.position >>= 1 + if data.position == 0: + data.position = resetValue; + data.val = getNextValue(data.index) + data.index += 1 + bits |= power if resb > 0 else 0 + power <<= 1 + c = chr(bits) + elif next == 2: + return "" + + dictionary[3] = c + w = c + result.append(c) + counter = 0 + while True: + counter += 1 + if data.index > length: + return "" + + bits = 0 + maxpower = math.pow(2, numBits) + power = 1 + while power != maxpower: + resb = data.val & data.position + data.position >>= 1 + if data.position == 0: + data.position = resetValue; + data.val = getNextValue(data.index) + data.index += 1 + bits |= power if resb > 0 else 0 + power <<= 1 + + c = bits + if c == 0: + bits = 0 + maxpower = math.pow(2, 8) + power = 1 + while power != maxpower: + resb = data.val & data.position + data.position >>= 1 + if data.position == 0: + data.position = resetValue + data.val = getNextValue(data.index) + data.index += 1 + bits |= power if resb > 0 else 0 + power <<= 1 + + dictionary[dictSize] = chr(bits) + dictSize += 1 + c = dictSize - 1 + enlargeIn -= 1 + elif c == 1: + bits = 0 + maxpower = math.pow(2, 16) + power = 1 + while power != maxpower: + resb = data.val & data.position + data.position >>= 1 + if data.position == 0: + data.position = resetValue; + data.val = getNextValue(data.index) + data.index += 1 + bits |= power if resb > 0 else 0 + power <<= 1 + dictionary[dictSize] = chr(bits) + dictSize += 1 + c = dictSize - 1 + enlargeIn -= 1 + elif c == 2: + return "".join(result) + + + if enlargeIn == 0: + enlargeIn = math.pow(2, numBits) + numBits += 1 + + if c in dictionary: + entry = dictionary[c] + else: + if c == dictSize: + entry = w + w[0] + else: + return None + result.append(entry) + + # Add w+entry[0] to the dictionary. + dictionary[dictSize] = w + entry[0] + dictSize += 1 + enlargeIn -= 1 + + w = entry + if enlargeIn == 0: + enlargeIn = math.pow(2, numBits) + numBits += 1 + + +class LZString(object): + @staticmethod + def compress(uncompressed): + return _compress(uncompressed, 16, chr) + + @staticmethod + def compressToUint8Array(uncompressed): + return bytes([ord(x) for x in _compress(uncompressed, 8, chr)]) + + + @staticmethod + def compressToUTF16(uncompressed): + if uncompressed is None: + return "" + return _compress(uncompressed, 15, lambda a: chr(a+32)) + " " + + @staticmethod + def compressToBase64(uncompressed): + if uncompressed is None: + return "" + res = _compress(uncompressed, 6, lambda a: keyStrBase64[a]) + # To produce valid Base64 + end = len(res) % 4 + if end > 0: + res += "="*(4 - end) + return res + + @staticmethod + def compressToEncodedURIComponent(uncompressed): + if uncompressed is None: + return "" + return _compress(uncompressed, 6, lambda a: keyStrUriSafe[a]) + + @staticmethod + def decompress(compressed): + if compressed is None: + return "" + if compressed == "": + return None + return _decompress(len(compressed), 32768, lambda index: ord(compressed[index])) + + @staticmethod + def decompressFromUint8Array(compressed): + if compressed is None: + return "" + if compressed == "": + return None + return _decompress(len(compressed), 128, lambda index: compressed[index]) + + @staticmethod + def decompressFromUTF16(compressed): + if compressed is None: + return "" + if compressed == "": + return None + return _decompress(len(compressed), 16384, lambda index: ord(compressed[index]) - 32) + + @staticmethod + def decompressFromBase64(compressed): + if compressed is None: + return "" + if compressed == "": + return None + return _decompress(len(compressed), 32, lambda index: getBaseValue(keyStrBase64, compressed[index])) + + @staticmethod + def decompressFromEncodedURIComponent(compressed): + if compressed is None: + return "" + if compressed == "": + return None + compressed = compressed.replace(" ", "+") + return _decompress(len(compressed), 32, lambda index: getBaseValue(keyStrUriSafe, compressed[index])) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py new file mode 100644 index 0000000..d53fd04 --- /dev/null +++ b/pyabc2/sources/eskin.py @@ -0,0 +1,37 @@ +""" +Load data from the Eskin tunebook websites. +""" + +import json +from urllib.parse import parse_qs, urlsplit + +from pyabc2.sources._lzstring import LZString + +# https://michaeleskin.com/tunebook_websites/king_street_sessions_tunebook_17Jan2025.html +# reels line extract +s = """\ + const reels=[{"Name":"An Moinfheir","URL":"https://michaeleskin.com/abctools/abctools.html?lzw=BoLgBAjAUApDAuBLeAbApgMwPYDt5gAUBDFIpHLMAJithgGcBXAIyVU132NL0QsgAcdPtmx5CJMn0oQALFAAqIAII4wAWSwiAFmkQAnKACUQRtGhRQA8iACS+i0RwATKAGUQ6xvUQBjMBj6WAC2YPaOLmAAalgoAHRgAMxQAEKe3n4BQaHhPM7RsQnJAFogCvpO9L76iAAOSLjg6ogA1mhgADK4AOZQ6iAAwlAdIBAA9EIA0iAAIrAGiPTaAPpBKCj0y7iwRMy+AFab9FiMLmL4GGTMWACedOq2M7ZgtUHdFaEADPePz8xE9Horyw3TA3zgDyeYF82iw+mcwNB4JgkL+APoADdYmAAKzI1HQ2HwrEoXHfABEM3JzioYGUAHEwAAxZRgGZUAA+ygANDMwAA-dlgZy+Bkcynk5T5fk0sCU4KSgC8GDQznFA3Jit8tPp-gAor5dRyADpQCWyhnM5QzFLcrm8gVCkViqBSgUW5ws9Xk+mKgZ6xkSpkzdkgE1QDkgc20jD5IjOS7e7pa1Vy5TkgB6opSvnFVJltLdLJmTI5Zvp5P59NpLLlGq1tJz4YlboLdITasl5N8zkwYG6Me6HP5RFpaCI0KIaFz5IAFPSAJTU0WBqm10McoA&format=noten&ssp=10&name=An_Moinfheir&play=1"},{"Name":"Around the world","URL":"https://michaeleskin.com/abctools/abctools.html?lzw=BoLgBATAUApDAuBLeAbApgMwPYDt5gAUBDFIpHLSaOAZwFcAjJVTXfY0vRCsARgA5YMbtmx5CJMt0q8ALFAAqIAIIAnLHRwATMPAAWaMAHcsqlFsUgFB44hqGAtkUQWASiFdo0KKAHkQAJKq3kTaUADKgap2emAKqkRayIi4JGAAsnQ0iADGYOFoNNm4cZqFUABCUTFxCUlIqSgZWbn5hcU4pTjlAFpWCTg0OdEADg04ADRgRKoDAOZoDmh4Uzl6plo0IOmIANaGADK4c1DpIADCUAcgvAD0ggDSIAAisIjRNHoA+uooKDRfXCwIgMHIAKwBNA02jE+AwZAYWAAnkJ0gFngEwCN1HMEg4wAAGVHozEMIhFbFYOaE4kYsBrDaU6lEuBoulkooANywTQArCyYGzMQzVFpuXyiQAiZ6SgBiEFlAHEwLKAKLPVUAH1lymUFTAyjVz010slWggYAAFABmS2YOYASjAkuUkowWjQORNMq0FUNztdFVV6s1AB0oKb5UqVeqtTq9QajWGIz6LTa7RhHQG3R6vVAANq8Z2Ks1+5WR54akCa-MWyUl33Kcsy2XPHXhzUgU3mpsqtvKb1m+EYbNzDBoOaDjDKLQjl2StDKHJoTUpodoHTznK6iDJ+uSioQRX603KfuD2dEOeuscbk0l9MVHJaJ3L52W54O0tNmtFyOu11VVlZ0ZWeBVlGrWsQLlADJVVZtpQgDVNQAXSAA&format=noten&ssp=10&name=Around_the_world&play=1"}]; +""".strip().rstrip( + ";" +) + +_, s = s.split(" ", 1) +type_, s = s.split("=", 1) + +try: + data = json.loads(s) +except json.JSONDecodeError as e: + print(s[e.pos], "context:", s[e.pos - 10 : e.pos + 10]) + raise + +for d in data[:1]: + name = d["Name"] + abctools_url = d["URL"] + res = urlsplit(abctools_url) + assert res.netloc == "michaeleskin.com" + query_params = parse_qs(res.query) + (lzw,) = query_params["lzw"] # note `+` has been replaced with space + + # Example LZW string (made by js `LZString.compressToEncodedURIComponent()`): + # 'BoLgBAjAUApDAuBLeAbApgMwPYDt5gAUBDFIpHLMAJithgGcBXAIyVU132NL0QsgAcdPtmx5CJMn0oQALFAAqIAII4wAWSwiAFmkQAnKACUQRtGhRQA8iACS i0RwATKAGUQ6xvUQBjMBj6WAC2YPaOLmAAalgoAHRgAMxQAEKe3n4BQaHhPM7RsQnJAFogCvpO9L76iAAOSLjg6ogA1mhgADK4AOZQ6iAAwlAdIBAA9EIA0iAAIrAGiPTaAPpBKCj0y7iwRMy AFab9FiMLmL4GGTMWACedOq2M7ZgtUHdFaEADPePz8xE9Horyw3TA3zgDyeYF82iw mcwNB4JgkL APoADdYmAAKzI1HQ2HwrEoXHfABEM3JzioYGUAHEwAAxZRgGZUAA ygANDMwAA-dlgZy Bkcynk5T5fk0sCU4KSgC8GDQznFA3Jit8tPp-gAor5dRyADpQCWyhnM5QzFLcrm8gVCkViqBSgUW5ws9Xk mKgZ6xkSpkzdkgE1QDkgc20jD5IjOS7e7pa1Vy5TkgB6opSvnFVJltLdLJmTI5Zvp5P59NpLLlGq1tJz4YlboLdITasl5N8zkwYG6Me6HP5RFpaCI0KIaFz5IAFPSAJTU0WBqm10McoA' + text = LZString.decompressFromEncodedURIComponent(lzw) diff --git a/pyproject.toml b/pyproject.toml index 3c7cbd3..05a2c1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,9 @@ ignore = [ "W503", # line break before binary operator "E226", # missing whitespace around arithmetic operator - not always more readable imho ] +per-file-ignores = [ + "pyabc2/sources/_lzstring.py: E111,E225,E261,E302,E303,E703", +] [tool.isort] profile = "black" From d9c88960b736350fdfd45b9b21a6eb851767dbd0 Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 21 Mar 2025 13:57:42 -0500 Subject: [PATCH 02/28] Add some of the tunebook URLs --- pyabc2/sources/eskin.py | 16 ++++++++++++++++ tests/test_sources.py | 26 +++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index d53fd04..2a84f3f 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -7,6 +7,22 @@ from pyabc2.sources._lzstring import LZString +_TBW = "https://michaeleskin.com/tunebook_websites" +_CCE_SD = "https://michaeleskin.com/cce_sd" +_TBWS = { + # https://michaeleskin.com/tunebooks.html#websites_irish + "kss": f"{_TBW}/king_street_sessions_tunebook_17Jan2025.html", + "carp": f"{_TBW}/carp_celtic_jam_tunebook_17Jan2025.html", + "hardy": f"{_TBW}/paul_hardy_2024_8feb2025.html", + "cce_dublin_2001": f"{_CCE_SD}/cce_dublin_2001_tunebook_17Jan2025.html", + "cce_san_diego": f"{_CCE_SD}/cce_san_diego_tunes_31jan2025.html", + # https://michaeleskin.com/tunebooks.html#websites_18th_century_collections + "aird": f"{_TBW}/james_aird_campin_18jan2025.html", + "playford1": f"{_TBW}/playford_1_partington_17jan2025.html", + "playford2": f"{_TBW}/playford_2_partington_17jan2025.html", + "playford3": f"{_TBW}/playford_3_partington_20jan2025.html", +} + # https://michaeleskin.com/tunebook_websites/king_street_sessions_tunebook_17Jan2025.html # reels line extract s = """\ diff --git a/tests/test_sources.py b/tests/test_sources.py index 72d03ed..b0b24eb 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -5,7 +5,15 @@ from pyabc2 import Key from pyabc2.parse import Tune -from pyabc2.sources import examples, load_example, load_example_abc, load_url, norbeck, the_session +from pyabc2.sources import ( + eskin, + examples, + load_example, + load_example_abc, + load_url, + norbeck, + the_session, +) NORBECK_IRISH_COUNT = 2733 @@ -249,3 +257,19 @@ def test_load_url_norbeck(): def test_load_url_invalid_domain(): with pytest.raises(NotImplementedError): _ = load_url("https://www.google.com") + + +@pytest.mark.parametrize("key", eskin._TBWS) +def test_eskin_tunebook_url_exist(key): + import requests + + url = eskin._TBWS[key] + r = requests.head(url, timeout=5) + r.raise_for_status() + # Bad URLs seem to just redirect to his homepage, + # so we need to check the final URL + if ( + r.status_code == 302 + and r.headers.get("Location", "").rstrip("/") == "https://michaeleskin.com" + ): + raise ValueError(f"{key!r} URL {url} redirects to homepage") From a2e108e2031c31fb8f4e0fe0516b2777d8b4f37f Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 21 Mar 2025 14:30:17 -0500 Subject: [PATCH 03/28] Download/store data working for kss at least... --- pyabc2/sources/eskin.py | 79 ++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 2a84f3f..bd02e15 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -3,10 +3,15 @@ """ import json +from pathlib import Path from urllib.parse import parse_qs, urlsplit from pyabc2.sources._lzstring import LZString +HERE = Path(__file__).parent + +SAVE_TO = HERE / "_eskin" + _TBW = "https://michaeleskin.com/tunebook_websites" _CCE_SD = "https://michaeleskin.com/cce_sd" _TBWS = { @@ -31,23 +36,65 @@ ";" ) -_, s = s.split(" ", 1) -type_, s = s.split("=", 1) - -try: - data = json.loads(s) -except json.JSONDecodeError as e: - print(s[e.pos], "context:", s[e.pos - 10 : e.pos + 10]) - raise -for d in data[:1]: - name = d["Name"] - abctools_url = d["URL"] - res = urlsplit(abctools_url) +def abctools_url_to_abc(url: str) -> str: + """Extract the ABC from an abctools share URL.""" + res = urlsplit(url) assert res.netloc == "michaeleskin.com" + assert res.path.startswith("/abctools/") + query_params = parse_qs(res.query) - (lzw,) = query_params["lzw"] # note `+` has been replaced with space + (lzw,) = query_params["lzw"] + # Note `+` has been replaced with space by parse_qs + # js LZString.compressToEncodedURIComponent() is used to compress/encode the ABC + + # TODO: optionally remove the lines that start with % or at least the %% lines + + return LZString.decompressFromEncodedURIComponent(lzw) + + +def _download_data(tunebook_url: str): + """Extract and save the tune data from the tunebook webpage as JSON.""" + import gzip + import re + + import requests + + r = requests.get(tunebook_url, timeout=5) + r.raise_for_status() + html = r.text + + # First find the tune type options by searching for 'tunes = type;' + types = sorted(set(re.findall(r"tunes = (.*?);", html))) + if not types: + raise RuntimeError("Unable to detect tune types") + print(types) + + # Then the data are in like `const reels=[{...}, ...];` + all_data = {} + for type_ in types: + m = re.search(f"const {type_}=(.*?);", html, flags=re.DOTALL) + if m is None: + raise RuntimeError(f"Unable to find data for type {type_!r}") + s_data = m.group(1) + + try: + data = json.loads(s_data) + except json.JSONDecodeError as e: + print(s_data[e.pos], "context:", s_data[e.pos - 10 : e.pos + 10]) + raise + + for d in data: + assert d.keys() == {"Name", "URL"} + d["name"] = d.pop("Name") + d["abc"] = abctools_url_to_abc(d.pop("URL")) + + all_data[type_] = data + + SAVE_TO.mkdir(exist_ok=True) + stem = Path(urlsplit(tunebook_url).path).stem + with gzip.open(SAVE_TO / f"{stem}.json.gz", "wt") as f: + json.dump(all_data, f, indent=2) + - # Example LZW string (made by js `LZString.compressToEncodedURIComponent()`): - # 'BoLgBAjAUApDAuBLeAbApgMwPYDt5gAUBDFIpHLMAJithgGcBXAIyVU132NL0QsgAcdPtmx5CJMn0oQALFAAqIAII4wAWSwiAFmkQAnKACUQRtGhRQA8iACS i0RwATKAGUQ6xvUQBjMBj6WAC2YPaOLmAAalgoAHRgAMxQAEKe3n4BQaHhPM7RsQnJAFogCvpO9L76iAAOSLjg6ogA1mhgADK4AOZQ6iAAwlAdIBAA9EIA0iAAIrAGiPTaAPpBKCj0y7iwRMy AFab9FiMLmL4GGTMWACedOq2M7ZgtUHdFaEADPePz8xE9Horyw3TA3zgDyeYF82iw mcwNB4JgkL APoADdYmAAKzI1HQ2HwrEoXHfABEM3JzioYGUAHEwAAxZRgGZUAA ygANDMwAA-dlgZy Bkcynk5T5fk0sCU4KSgC8GDQznFA3Jit8tPp-gAor5dRyADpQCWyhnM5QzFLcrm8gVCkViqBSgUW5ws9Xk mKgZ6xkSpkzdkgE1QDkgc20jD5IjOS7e7pa1Vy5TkgB6opSvnFVJltLdLJmTI5Zvp5P59NpLLlGq1tJz4YlboLdITasl5N8zkwYG6Me6HP5RFpaCI0KIaFz5IAFPSAJTU0WBqm10McoA' - text = LZString.decompressFromEncodedURIComponent(lzw) +_download_data(_TBWS["kss"]) From c9252cf51b112c03ecaf056a4a5026ba037555ff Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 21 Mar 2025 15:03:32 -0500 Subject: [PATCH 04/28] Data loader; remove certain lines when downloading kss is %-free now --- pyabc2/sources/eskin.py | 55 ++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index bd02e15..2db97fa 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -3,7 +3,9 @@ """ import json +import re from pathlib import Path +from typing import Literal, Tuple from urllib.parse import parse_qs, urlsplit from pyabc2.sources._lzstring import LZString @@ -28,17 +30,33 @@ "playford3": f"{_TBW}/playford_3_partington_20jan2025.html", } -# https://michaeleskin.com/tunebook_websites/king_street_sessions_tunebook_17Jan2025.html -# reels line extract -s = """\ - const reels=[{"Name":"An Moinfheir","URL":"https://michaeleskin.com/abctools/abctools.html?lzw=BoLgBAjAUApDAuBLeAbApgMwPYDt5gAUBDFIpHLMAJithgGcBXAIyVU132NL0QsgAcdPtmx5CJMn0oQALFAAqIAII4wAWSwiAFmkQAnKACUQRtGhRQA8iACS+i0RwATKAGUQ6xvUQBjMBj6WAC2YPaOLmAAalgoAHRgAMxQAEKe3n4BQaHhPM7RsQnJAFogCvpO9L76iAAOSLjg6ogA1mhgADK4AOZQ6iAAwlAdIBAA9EIA0iAAIrAGiPTaAPpBKCj0y7iwRMy+AFab9FiMLmL4GGTMWACedOq2M7ZgtUHdFaEADPePz8xE9Horyw3TA3zgDyeYF82iw+mcwNB4JgkL+APoADdYmAAKzI1HQ2HwrEoXHfABEM3JzioYGUAHEwAAxZRgGZUAA+ygANDMwAA-dlgZy+Bkcynk5T5fk0sCU4KSgC8GDQznFA3Jit8tPp-gAor5dRyADpQCWyhnM5QzFLcrm8gVCkViqBSgUW5ws9Xk+mKgZ6xkSpkzdkgE1QDkgc20jD5IjOS7e7pa1Vy5TkgB6opSvnFVJltLdLJmTI5Zvp5P59NpLLlGq1tJz4YlboLdITasl5N8zkwYG6Me6HP5RFpaCI0KIaFz5IAFPSAJTU0WBqm10McoA&format=noten&ssp=10&name=An_Moinfheir&play=1"},{"Name":"Around the world","URL":"https://michaeleskin.com/abctools/abctools.html?lzw=BoLgBATAUApDAuBLeAbApgMwPYDt5gAUBDFIpHLSaOAZwFcAjJVTXfY0vRCsARgA5YMbtmx5CJMt0q8ALFAAqIAIIAnLHRwATMPAAWaMAHcsqlFsUgFB44hqGAtkUQWASiFdo0KKAHkQAJKq3kTaUADKgap2emAKqkRayIi4JGAAsnQ0iADGYOFoNNm4cZqFUABCUTFxCUlIqSgZWbn5hcU4pTjlAFpWCTg0OdEADg04ADRgRKoDAOZoDmh4Uzl6plo0IOmIANaGADK4c1DpIADCUAcgvAD0ggDSIAAisIjRNHoA+uooKDRfXCwIgMHIAKwBNA02jE+AwZAYWAAnkJ0gFngEwCN1HMEg4wAAGVHozEMIhFbFYOaE4kYsBrDaU6lEuBoulkooANywTQArCyYGzMQzVFpuXyiQAiZ6SgBiEFlAHEwLKAKLPVUAH1lymUFTAyjVz010slWggYAAFABmS2YOYASjAkuUkowWjQORNMq0FUNztdFVV6s1AB0oKb5UqVeqtTq9QajWGIz6LTa7RhHQG3R6vVAANq8Z2Ks1+5WR54akCa-MWyUl33Kcsy2XPHXhzUgU3mpsqtvKb1m+EYbNzDBoOaDjDKLQjl2StDKHJoTUpodoHTznK6iDJ+uSioQRX603KfuD2dEOeuscbk0l9MVHJaJ3L52W54O0tNmtFyOu11VVlZ0ZWeBVlGrWsQLlADJVVZtpQgDVNQAXSAA&format=noten&ssp=10&name=Around_the_world&play=1"}]; -""".strip().rstrip( - ";" -) +def abctools_url_to_abc( + url: str, + *, + remove_prefs: str | Tuple[str] | Literal[False] = ( + r"%%titlefont ", + r"%%subtitlefont ", + r"%%infofont ", + r"%irish_rolls_on", + r"%abcjs_", + r"%%MIDI ", + ), +) -> str: + """Extract the ABC from an Eskin abctools share URL. + + Parameters + ---------- + remove_prefs + Remove lines starting with these prefixes. + Use ``False`` or an empty iterable to keep all lines instead. + """ + + if not remove_prefs: + remove_prefs = () + elif isinstance(remove_prefs, str): + remove_prefs = (remove_prefs,) -def abctools_url_to_abc(url: str) -> str: - """Extract the ABC from an abctools share URL.""" res = urlsplit(url) assert res.netloc == "michaeleskin.com" assert res.path.startswith("/abctools/") @@ -48,15 +66,18 @@ def abctools_url_to_abc(url: str) -> str: # Note `+` has been replaced with space by parse_qs # js LZString.compressToEncodedURIComponent() is used to compress/encode the ABC - # TODO: optionally remove the lines that start with % or at least the %% lines + abc = LZString.decompressFromEncodedURIComponent(lzw) - return LZString.decompressFromEncodedURIComponent(lzw) + wanted_lines = [ + line.strip() for line in abc.splitlines() if not line.lstrip().startswith(remove_prefs) + ] + + return "\n".join(wanted_lines) def _download_data(tunebook_url: str): """Extract and save the tune data from the tunebook webpage as JSON.""" import gzip - import re import requests @@ -68,7 +89,6 @@ def _download_data(tunebook_url: str): types = sorted(set(re.findall(r"tunes = (.*?);", html))) if not types: raise RuntimeError("Unable to detect tune types") - print(types) # Then the data are in like `const reels=[{...}, ...];` all_data = {} @@ -97,4 +117,11 @@ def _download_data(tunebook_url: str): json.dump(all_data, f, indent=2) -_download_data(_TBWS["kss"]) +def _load_data(key: str): + """Load the data from the saved JSON.""" + import gzip + + stem = Path(urlsplit(_TBWS[key]).path).stem + fp = SAVE_TO / f"{stem}.json.gz" + with gzip.open(fp, "rt") as f: + return json.load(f) From f2997508d86ac1dff9184a0e009ac3ba010dd19b Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 21 Mar 2025 15:31:30 -0500 Subject: [PATCH 05/28] Make share URL from ABC string --- pyabc2/sources/eskin.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 2db97fa..e81588c 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -75,6 +75,18 @@ def abctools_url_to_abc( return "\n".join(wanted_lines) +def abc_to_abctools_url(abc: str) -> str: + """Create an Eskin abctools share URL for `abc`.""" + + # Must start with 'X:' (seems value is not required) + if not abc.lstrip().startswith("X"): + abc = "X:\n" + abc + + lzw = LZString.compressToEncodedURIComponent(abc) + + return f"https://michaeleskin.com/abctools/abctools.html?lzw={lzw}" + + def _download_data(tunebook_url: str): """Extract and save the tune data from the tunebook webpage as JSON.""" import gzip From 5d829e075366a620cf272be8c0aef6b90d1322bb Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 28 Mar 2025 10:58:57 -0500 Subject: [PATCH 06/28] Test out Eskin share URL gen --- pyabc2/sources/eskin.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index e81588c..a134691 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -76,7 +76,10 @@ def abctools_url_to_abc( def abc_to_abctools_url(abc: str) -> str: - """Create an Eskin abctools share URL for `abc`.""" + """Create an Eskin abctools share URL for `abc`. + + More info: https://michaeleskin.com/tools/generate_share_link.html + """ # Must start with 'X:' (seems value is not required) if not abc.lstrip().startswith("X"): @@ -137,3 +140,11 @@ def _load_data(key: str): fp = SAVE_TO / f"{stem}.json.gz" with gzip.open(fp, "rt") as f: return json.load(f) + + +if __name__ == "__main__": + from . import load_example_abc + + abc = load_example_abc("For the Love of Music") + url = abc_to_abctools_url(abc) + print(url) From 59e495da50ca5b89dca4150da5b787c6df91116a Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 9 Apr 2025 12:19:36 -0500 Subject: [PATCH 07/28] Get all current tunebooks loading fixes for a tune that had ; in the name and book that didn't have tune types --- pyabc2/sources/eskin.py | 63 ++++++++++++++++++++++++++++++++++------- tests/test_sources.py | 25 ++++++++++++++++ 2 files changed, 77 insertions(+), 11 deletions(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index a134691..08e97a4 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -5,7 +5,7 @@ import json import re from pathlib import Path -from typing import Literal, Tuple +from typing import Literal, NamedTuple, Tuple from urllib.parse import parse_qs, urlsplit from pyabc2.sources._lzstring import LZString @@ -90,28 +90,53 @@ def abc_to_abctools_url(abc: str) -> str: return f"https://michaeleskin.com/abctools/abctools.html?lzw={lzw}" -def _download_data(tunebook_url: str): +class EskinTunebookInfo(NamedTuple): + key: str + url: str + stem: str + path: Path + + +def get_tunebook_info(key: str) -> EskinTunebookInfo: + url = _TBWS[key] + stem = Path(urlsplit(url).path).stem + + return EskinTunebookInfo( + key=key, + url=url, + stem=stem, + path=SAVE_TO / f"{stem}.json.gz", + ) + + +def _download_data(key: str): """Extract and save the tune data from the tunebook webpage as JSON.""" import gzip import requests - r = requests.get(tunebook_url, timeout=5) + tb_info = get_tunebook_info(key) + + r = requests.get(tb_info.url, timeout=5) r.raise_for_status() html = r.text # First find the tune type options by searching for 'tunes = type;' types = sorted(set(re.findall(r"tunes = (.*?);", html))) - if not types: + if types: + pass + elif "const tunes=[" in html: # no types, just one list of tunes + types = ["tunes"] + else: raise RuntimeError("Unable to detect tune types") # Then the data are in like `const reels=[{...}, ...];` all_data = {} for type_ in types: - m = re.search(f"const {type_}=(.*?);", html, flags=re.DOTALL) + m = re.search(rf"const {type_}=\[(.*?)\];", html, flags=re.DOTALL) if m is None: raise RuntimeError(f"Unable to find data for type {type_!r}") - s_data = m.group(1) + s_data = "[" + m.group(1) + "]" try: data = json.loads(s_data) @@ -127,8 +152,7 @@ def _download_data(tunebook_url: str): all_data[type_] = data SAVE_TO.mkdir(exist_ok=True) - stem = Path(urlsplit(tunebook_url).path).stem - with gzip.open(SAVE_TO / f"{stem}.json.gz", "wt") as f: + with gzip.open(tb_info.path, "wt") as f: json.dump(all_data, f, indent=2) @@ -136,15 +160,32 @@ def _load_data(key: str): """Load the data from the saved JSON.""" import gzip - stem = Path(urlsplit(_TBWS[key]).path).stem - fp = SAVE_TO / f"{stem}.json.gz" - with gzip.open(fp, "rt") as f: + tb_info = get_tunebook_info(key) + + with gzip.open(tb_info.path, "rt") as f: return json.load(f) +def load_meta(key: str, *, redownload: bool = False): + """Load the tunebook data, no parsing.""" + + tb_info = get_tunebook_info(key) + + fp = tb_info.path + if not fp.is_file() or redownload: + print("downloading...", end=" ", flush=True) + _download_data(key) + print("done") + + return _load_data(key) + + if __name__ == "__main__": from . import load_example_abc abc = load_example_abc("For the Love of Music") url = abc_to_abctools_url(abc) print(url) + + kss = load_meta("kss") + print(kss.keys()) diff --git a/tests/test_sources.py b/tests/test_sources.py index b0b24eb..fca6a2e 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -268,8 +268,33 @@ def test_eskin_tunebook_url_exist(key): r.raise_for_status() # Bad URLs seem to just redirect to his homepage, # so we need to check the final URL + # TODO: maybe move this to the module if ( r.status_code == 302 and r.headers.get("Location", "").rstrip("/") == "https://michaeleskin.com" ): raise ValueError(f"{key!r} URL {url} redirects to homepage") + + +@pytest.mark.parametrize("key", eskin._TBWS) +def test_eskin_tunebook_data_load(key): + data = eskin.load_meta(key) + + tune_type_keys = { + "airs_songs", + "hornpipes", + "jigs", + "long_dances", + "marches", + "misc_tunes", + "ocarolan", + "polkas", + "reels", + "scotchreels", + "slides", + "slipjigs", + "strathspeys", + "waltzes", + } + + assert data.keys() <= tune_type_keys or list(data) == ["tunes"] From 83daef01c9fa135204e4efe8b6ee86274ee8b863 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 9 Apr 2025 12:30:36 -0500 Subject: [PATCH 08/28] tweaks --- pyabc2/sources/eskin.py | 32 ++++++++++++++++++-------------- tests/test_sources.py | 6 +++--- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 08e97a4..a31b1df 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -14,21 +14,25 @@ SAVE_TO = HERE / "_eskin" -_TBW = "https://michaeleskin.com/tunebook_websites" +_TBWS = "https://michaeleskin.com/tunebook_websites" _CCE_SD = "https://michaeleskin.com/cce_sd" -_TBWS = { +_TUNEBOOK_KEY_TO_URL = { # https://michaeleskin.com/tunebooks.html#websites_irish - "kss": f"{_TBW}/king_street_sessions_tunebook_17Jan2025.html", - "carp": f"{_TBW}/carp_celtic_jam_tunebook_17Jan2025.html", - "hardy": f"{_TBW}/paul_hardy_2024_8feb2025.html", + "kss": f"{_TBWS}/king_street_sessions_tunebook_17Jan2025.html", + "carp": f"{_TBWS}/carp_celtic_jam_tunebook_17Jan2025.html", + "hardy": f"{_TBWS}/paul_hardy_2024_8feb2025.html", "cce_dublin_2001": f"{_CCE_SD}/cce_dublin_2001_tunebook_17Jan2025.html", "cce_san_diego": f"{_CCE_SD}/cce_san_diego_tunes_31jan2025.html", # https://michaeleskin.com/tunebooks.html#websites_18th_century_collections - "aird": f"{_TBW}/james_aird_campin_18jan2025.html", - "playford1": f"{_TBW}/playford_1_partington_17jan2025.html", - "playford2": f"{_TBW}/playford_2_partington_17jan2025.html", - "playford3": f"{_TBW}/playford_3_partington_20jan2025.html", + "aird": f"{_TBWS}/james_aird_campin_18jan2025.html", + "playford1": f"{_TBWS}/playford_1_partington_17jan2025.html", + "playford2": f"{_TBWS}/playford_2_partington_17jan2025.html", + "playford3": f"{_TBWS}/playford_3_partington_20jan2025.html", } +"""Mapping of tunebook keys (defined here, not by Eskin; e.g. 'kss' for King Street Sessions) +to tunebook website URLs, which come from this page: +https://michaeleskin.com/tunebooks.html +""" def abctools_url_to_abc( @@ -45,6 +49,8 @@ def abctools_url_to_abc( ) -> str: """Extract the ABC from an Eskin abctools share URL. + More info: https://michaeleskin.com/tools/generate_share_link.html + Parameters ---------- remove_prefs @@ -64,7 +70,7 @@ def abctools_url_to_abc( query_params = parse_qs(res.query) (lzw,) = query_params["lzw"] # Note `+` has been replaced with space by parse_qs - # js LZString.compressToEncodedURIComponent() is used to compress/encode the ABC + # Note js LZString.compressToEncodedURIComponent() is used to compress/encode the ABC abc = LZString.decompressFromEncodedURIComponent(lzw) @@ -98,7 +104,7 @@ class EskinTunebookInfo(NamedTuple): def get_tunebook_info(key: str) -> EskinTunebookInfo: - url = _TBWS[key] + url = _TUNEBOOK_KEY_TO_URL[key] stem = Path(urlsplit(url).path).stem return EskinTunebookInfo( @@ -160,9 +166,7 @@ def _load_data(key: str): """Load the data from the saved JSON.""" import gzip - tb_info = get_tunebook_info(key) - - with gzip.open(tb_info.path, "rt") as f: + with gzip.open(get_tunebook_info(key).path, "rt") as f: return json.load(f) diff --git a/tests/test_sources.py b/tests/test_sources.py index fca6a2e..47fc949 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -259,11 +259,11 @@ def test_load_url_invalid_domain(): _ = load_url("https://www.google.com") -@pytest.mark.parametrize("key", eskin._TBWS) +@pytest.mark.parametrize("key", eskin._TUNEBOOK_KEY_TO_URL) def test_eskin_tunebook_url_exist(key): import requests - url = eskin._TBWS[key] + url = eskin._TUNEBOOK_KEY_TO_URL[key] r = requests.head(url, timeout=5) r.raise_for_status() # Bad URLs seem to just redirect to his homepage, @@ -276,7 +276,7 @@ def test_eskin_tunebook_url_exist(key): raise ValueError(f"{key!r} URL {url} redirects to homepage") -@pytest.mark.parametrize("key", eskin._TBWS) +@pytest.mark.parametrize("key", eskin._TUNEBOOK_KEY_TO_URL) def test_eskin_tunebook_data_load(key): data = eskin.load_meta(key) From dce5faad75a6c1d41e11326f62710094902aeeb1 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 9 Apr 2025 12:34:12 -0500 Subject: [PATCH 09/28] Please mypy --- pyabc2/sources/eskin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index a31b1df..f89f696 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -38,7 +38,7 @@ def abctools_url_to_abc( url: str, *, - remove_prefs: str | Tuple[str] | Literal[False] = ( + remove_prefs: str | Tuple[str, ...] | Literal[False] = ( r"%%titlefont ", r"%%subtitlefont ", r"%%infofont ", From 97cd5d2778cca38985849b6131e50280408d288e Mon Sep 17 00:00:00 2001 From: zmoon Date: Fri, 11 Apr 2025 10:32:36 -0500 Subject: [PATCH 10/28] Stick with older typing like the rest of the codebase --- pyabc2/sources/eskin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index f89f696..5c9f0de 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -5,7 +5,7 @@ import json import re from pathlib import Path -from typing import Literal, NamedTuple, Tuple +from typing import Literal, NamedTuple, Tuple, Union from urllib.parse import parse_qs, urlsplit from pyabc2.sources._lzstring import LZString @@ -38,7 +38,7 @@ def abctools_url_to_abc( url: str, *, - remove_prefs: str | Tuple[str, ...] | Literal[False] = ( + remove_prefs: Union[str, Tuple[str, ...], Literal[False]] = ( r"%%titlefont ", r"%%subtitlefont ", r"%%infofont ", From ba5c1db5e3321c8f5d3b6880a2e8710658a1385c Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 5 Jan 2026 15:39:13 -0600 Subject: [PATCH 11/28] Add a basic Eskin URL creation test --- tests/test_sources.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_sources.py b/tests/test_sources.py index 47fc949..65f9ff6 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -298,3 +298,18 @@ def test_eskin_tunebook_data_load(key): } assert data.keys() <= tune_type_keys or list(data) == ["tunes"] + + +def test_eskin_abc_url_creation(): + import requests + + abc = load_example_abc("For the Love of Music") + + url = eskin.abc_to_abctools_url(abc) + r = requests.head(url, timeout=5) + r.raise_for_status() + if ( + r.status_code == 302 + and r.headers.get("Location", "").rstrip("/") == "https://michaeleskin.com" + ): + raise ValueError(f"URL {url} redirects to homepage") From b31f8a4b1fbc1911835076bf817525962bd8a645 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 5 Jan 2026 15:41:32 -0600 Subject: [PATCH 12/28] Please linter --- pyabc2/sources/_lzstring.py | 2 ++ pyabc2/sources/eskin.py | 2 ++ tests/test_sources.py | 1 + 3 files changed, 5 insertions(+) diff --git a/pyabc2/sources/_lzstring.py b/pyabc2/sources/_lzstring.py index bc7aacd..d43677a 100644 --- a/pyabc2/sources/_lzstring.py +++ b/pyabc2/sources/_lzstring.py @@ -3,6 +3,8 @@ # - Remove unused imports # - Fix UTF-16 decompress (PR#7) +# type: ignore + from __future__ import division from __future__ import unicode_literals from __future__ import print_function diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 5c9f0de..d6b9511 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -73,6 +73,8 @@ def abctools_url_to_abc( # Note js LZString.compressToEncodedURIComponent() is used to compress/encode the ABC abc = LZString.decompressFromEncodedURIComponent(lzw) + if abc is None: + raise RuntimeError("Failed to decompress LZString data") wanted_lines = [ line.strip() for line in abc.splitlines() if not line.lstrip().startswith(remove_prefs) diff --git a/tests/test_sources.py b/tests/test_sources.py index 65f9ff6..1fcc3ee 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -206,6 +206,7 @@ def test_the_session_load_meta_invalid(): def test_the_session_load_meta_doc_consistency(): s_options = ", ".join(repr(x) for x in sorted(the_session._META_ALLOWED)) expected_line = f"which : {{{s_options}}}" + assert the_session.load_meta.__doc__ is not None assert expected_line in the_session.load_meta.__doc__ From 4d5d61370822f0270ce02dc5d15374c793ac70c8 Mon Sep 17 00:00:00 2001 From: zmoon Date: Mon, 5 Jan 2026 16:07:39 -0600 Subject: [PATCH 13/28] Add short Eskin example and give the list of keys on invalid key --- docs/examples/sources.ipynb | 39 +++++++++++++++++++++++++++++++++++-- pyabc2/sources/eskin.py | 7 ++++++- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/docs/examples/sources.ipynb b/docs/examples/sources.ipynb index 14d3418..62751a4 100644 --- a/docs/examples/sources.ipynb +++ b/docs/examples/sources.ipynb @@ -17,7 +17,7 @@ "metadata": {}, "outputs": [], "source": [ - "from pyabc2.sources import load_example, norbeck, the_session" + "from pyabc2.sources import load_example, norbeck, the_session, eskin" ] }, { @@ -360,11 +360,46 @@ " )\n", ");" ] + }, + { + "cell_type": "markdown", + "id": "31", + "metadata": {}, + "source": [ + "## Eskin\n", + "\n", + "Michael Eskin has tunebooks available at , viewable with his ABC Transcription Tools.\n", + "\n", + "We can load selected tunebooks from there, e.g. the King Street Sessions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32", + "metadata": {}, + "outputs": [], + "source": [ + "dct = eskin.load_meta(\"kss\")\n", + "print(sorted(dct))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33", + "metadata": {}, + "outputs": [], + "source": [ + "from pyabc2 import Tune\n", + "\n", + "Tune(dct[\"jigs\"][0][\"abc\"])" + ] } ], "metadata": { "kernelspec": { - "display_name": "venv", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index d6b9511..0cec043 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -106,7 +106,12 @@ class EskinTunebookInfo(NamedTuple): def get_tunebook_info(key: str) -> EskinTunebookInfo: - url = _TUNEBOOK_KEY_TO_URL[key] + try: + url = _TUNEBOOK_KEY_TO_URL[key.lower()] + except KeyError: + raise ValueError( + f"Unknown Eskin tunebook key: {key!r}. Valid options: {sorted(_TUNEBOOK_KEY_TO_URL)}." + ) from None stem = Path(urlsplit(url).path).stem return EskinTunebookInfo( From ddb36b4decfce7db002b4716d26425dcba98e394 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 10:01:04 -0600 Subject: [PATCH 14/28] Document tunebook keys; add OFla --- docs/api.rst | 17 +++++++++ docs/examples/sources.ipynb | 12 ++++++ pyabc2/sources/__init__.py | 5 ++- pyabc2/sources/eskin.py | 75 +++++++++++++++++++++++++++++++++---- tests/test_sources.py | 26 +++++++++++++ 5 files changed, 127 insertions(+), 8 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index 85a7548..1078a45 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -133,3 +133,20 @@ Functions: the_session.load the_session.load_meta the_session.load_url + +Eskin ABC Tools +--------------- + +.. automodule:: pyabc2.sources.eskin + +Functions: + +.. currentmodule:: pyabc2.sources + +.. autosummary:: + :toctree: api/ + + eskin.load_meta + eskin.load_url + eskin.abctools_url_to_abc + eskin.abc_to_abctools_url diff --git a/docs/examples/sources.ipynb b/docs/examples/sources.ipynb index 62751a4..865bdff 100644 --- a/docs/examples/sources.ipynb +++ b/docs/examples/sources.ipynb @@ -395,6 +395,18 @@ "\n", "Tune(dct[\"jigs\"][0][\"abc\"])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://michaeleskin.com/abctools/abctools.html?lzw=BoLgUAKiBiD2BOACCALApogMrAbhg8gGaICyArgM4CWAxmAEogUA2VADogFZUDmYAwiExUAXon4BDePFjNmYEiACcAegAcYTCACM6sAGkQAcTBGAogBFEFs0cQBBIwCFEAHwdG7zgCaI0333dzKxs7Rxo3RCc0DCd7F3MzRBBXMB5-PxVCFR4EpxUaFUDEdN80HgAjRAkAJmJ3Uszs3Id8wuL-F28nMKdAtIy0LJy8gqLIxvKq2olIipimnIxankjOxG7e+zdUoA\"\n", + "print(url)\n", + "eskin.load_url(url)" + ] } ], "metadata": { diff --git a/pyabc2/sources/__init__.py b/pyabc2/sources/__init__.py index 0ae10db..d662665 100644 --- a/pyabc2/sources/__init__.py +++ b/pyabc2/sources/__init__.py @@ -84,6 +84,7 @@ def load_url(url: str) -> Tune: - Norbeck (``norbeck.nu/abc/``) - The Session (``thesession.org``) + - Eskin ABC Tools (``michaeleskin.com/abctools/``) See Also -------- @@ -92,12 +93,14 @@ def load_url(url: str) -> Tune: """ from urllib.parse import urlsplit - from . import norbeck, the_session + from . import eskin, norbeck, the_session res = urlsplit(url) if res.netloc in norbeck._URL_NETLOCS: return norbeck.load_url(url) elif res.netloc in the_session._URL_NETLOCS: return the_session.load_url(url) + elif res.netloc in eskin._URL_NETLOCS: + return eskin.load_url(url) else: raise NotImplementedError(f"loading URL from {res.netloc} not implemented.") diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 0cec043..9ad6ba9 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -1,5 +1,10 @@ """ -Load data from the Eskin tunebook websites. +Load data from the Eskin ABC Transcription Tools tunebook websites +(https://michaeleskin.com/tunebooks.html). + +Requires: + +* `requests `__ """ import json @@ -8,6 +13,7 @@ from typing import Literal, NamedTuple, Tuple, Union from urllib.parse import parse_qs, urlsplit +from pyabc2 import Tune from pyabc2.sources._lzstring import LZString HERE = Path(__file__).parent @@ -19,10 +25,13 @@ _TUNEBOOK_KEY_TO_URL = { # https://michaeleskin.com/tunebooks.html#websites_irish "kss": f"{_TBWS}/king_street_sessions_tunebook_17Jan2025.html", + "oflaherty_2025": f"{_TBWS}/oflahertys_2025_retreat_tunes_final.html", "carp": f"{_TBWS}/carp_celtic_jam_tunebook_17Jan2025.html", - "hardy": f"{_TBWS}/paul_hardy_2024_8feb2025.html", + "hardy_2024": f"{_TBWS}/paul_hardy_2024_8feb2025.html", + "hardy_2025": f"{_TBWS}/paul_hardy_2025_12aug2025.html", "cce_dublin_2001": f"{_CCE_SD}/cce_dublin_2001_tunebook_17Jan2025.html", - "cce_san_diego": f"{_CCE_SD}/cce_san_diego_tunes_31jan2025.html", + "cce_san_diego_jan2025": f"{_CCE_SD}/cce_san_diego_tunes_31jan2025.html", + "cce_san_diego_nov2025": f"{_CCE_SD}/cce_san_diego_tunes_10nov2025.html", # https://michaeleskin.com/tunebooks.html#websites_18th_century_collections "aird": f"{_TBWS}/james_aird_campin_18jan2025.html", "playford1": f"{_TBWS}/playford_1_partington_17jan2025.html", @@ -34,6 +43,15 @@ https://michaeleskin.com/tunebooks.html """ +# Definitive versions +_TUNEBOOK_ALIAS = { + "cce_san_diego": "cce_san_diego_nov2025", +} +for _alias, _target in _TUNEBOOK_ALIAS.items(): + _TUNEBOOK_KEY_TO_URL[_alias] = _TUNEBOOK_KEY_TO_URL[_target] + +_URL_NETLOCS = {"michaeleskin.com", "www.michaeleskin.com"} + def abctools_url_to_abc( url: str, @@ -47,7 +65,7 @@ def abctools_url_to_abc( r"%%MIDI ", ), ) -> str: - """Extract the ABC from an Eskin abctools share URL. + """Extract the ABC from an Eskin abctools (``michaeleskin.com/abctools/``) share URL. More info: https://michaeleskin.com/tools/generate_share_link.html @@ -64,7 +82,7 @@ def abctools_url_to_abc( remove_prefs = (remove_prefs,) res = urlsplit(url) - assert res.netloc == "michaeleskin.com" + assert res.netloc in _URL_NETLOCS assert res.path.startswith("/abctools/") query_params = parse_qs(res.query) @@ -84,7 +102,7 @@ def abctools_url_to_abc( def abc_to_abctools_url(abc: str) -> str: - """Create an Eskin abctools share URL for `abc`. + """Create an Eskin abctools (``michaeleskin.com/abctools/``) share URL for `abc`. More info: https://michaeleskin.com/tools/generate_share_link.html """ @@ -178,7 +196,39 @@ def _load_data(key: str): def load_meta(key: str, *, redownload: bool = False): - """Load the tunebook data, no parsing.""" + """Load the tunebook data, no parsing. + + Parameters + ---------- + key + Tunebook key (ID), e.g. ``'kss'`` for King Street Sessions. + + .. list-table:: + :header-rows: 1 + :widths: 15 85 + + * - Key + - Description + * - ``aird`` + - James Aird's Airs by Jack Campin + * - ``carp`` + - CARP Celtic Jam Tunebook + * - ``cce_dublin_2001`` + - CCE Dublin 2001 + * - ``cce_san_diego`` + - CCE San Diego + * - ``hardy_{2024,2025}`` + - Paul Hardy's Session Tunebook + * - ``kss`` + - King Street Sessions Tunebook + * - ``oflaherty_2025`` + - O'Flaherty's Retreat Tunes + * - ``playford{1,2,3}`` + - Playford vols. 1--3 + + See https://michaeleskin.com/tunebooks.html + for more information. + """ tb_info = get_tunebook_info(key) @@ -191,6 +241,17 @@ def load_meta(key: str, *, redownload: bool = False): return _load_data(key) +def load_url(url: str) -> Tune: + """Load tune from an Eskin abctools (``michaeleskin.com/abctools/``) share URL. + + Notes + ----- + The ABC is encoded in the URL, so we don't need to load the page. + """ + abc = abctools_url_to_abc(url) + return Tune(abc) + + if __name__ == "__main__": from . import load_example_abc diff --git a/tests/test_sources.py b/tests/test_sources.py index 1fcc3ee..bd4e2cc 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -277,6 +277,32 @@ def test_eskin_tunebook_url_exist(key): raise ValueError(f"{key!r} URL {url} redirects to homepage") +def test_eskin_tunebook_url_current(): + import requests + + url = "https://michaeleskin.com/tunebooks.html" + r = requests.get(url, timeout=5) + r.raise_for_status() + if ( + r.status_code == 302 + and r.headers.get("Location", "").rstrip("/") == "https://michaeleskin.com" + ): + raise ValueError(f"URL {url} redirects to homepage") + html = r.text + + old_keys = { + "cce_san_diego_jan2025", + "hardy_2024", + } + for key, tb_url in eskin._TUNEBOOK_KEY_TO_URL.items(): + m = re.search(rf'href=["\']({tb_url})["\']', html) + if key in old_keys: + assert m is None + else: + if m is None: + raise ValueError(f"Could not find link for tunebook {key!r} in tunebooks page.") + + @pytest.mark.parametrize("key", eskin._TUNEBOOK_KEY_TO_URL) def test_eskin_tunebook_data_load(key): data = eskin.load_meta(key) From 9d2f3c8268468d409c0ebadc51b58b5e363de70e Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 10:33:00 -0600 Subject: [PATCH 15/28] Quiet Pyright a different way that doesn't affect mypy --- pyabc2/sources/_lzstring.py | 2 -- pyproject.toml | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyabc2/sources/_lzstring.py b/pyabc2/sources/_lzstring.py index d43677a..bc7aacd 100644 --- a/pyabc2/sources/_lzstring.py +++ b/pyabc2/sources/_lzstring.py @@ -3,8 +3,6 @@ # - Remove unused imports # - Fix UTF-16 decompress (PR#7) -# type: ignore - from __future__ import division from __future__ import unicode_literals from __future__ import print_function diff --git a/pyproject.toml b/pyproject.toml index a629735..82dff29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,3 +58,6 @@ markers = ["slow"] exclude = ["^venv"] install_types = true ignore_missing_imports = true + +[tool.pyright] +ignore = ["pyabc2/sources/_lzstring.py"] From 30d01e46c38b80770067b7b2e3f8fd8d45e61425 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 13:08:46 -0600 Subject: [PATCH 16/28] Return df from `eskin.load_meta` consistent with The Session current behavior --- docs/examples/sources.ipynb | 22 +++++++++++++++++----- pyabc2/sources/eskin.py | 27 ++++++++++++++++++++++++--- tests/test_sources.py | 9 ++++++--- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/docs/examples/sources.ipynb b/docs/examples/sources.ipynb index 865bdff..83437f0 100644 --- a/docs/examples/sources.ipynb +++ b/docs/examples/sources.ipynb @@ -380,8 +380,8 @@ "metadata": {}, "outputs": [], "source": [ - "dct = eskin.load_meta(\"kss\")\n", - "print(sorted(dct))" + "df = eskin.load_meta(\"kss\")\n", + "df" ] }, { @@ -390,21 +390,33 @@ "id": "33", "metadata": {}, "outputs": [], + "source": [ + "df.group.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34", + "metadata": {}, + "outputs": [], "source": [ "from pyabc2 import Tune\n", "\n", - "Tune(dct[\"jigs\"][0][\"abc\"])" + "Tune(df.query(\"group == 'jigs'\").iloc[0].abc)" ] }, { "cell_type": "code", "execution_count": null, - "id": "34", + "id": "35", "metadata": {}, "outputs": [], "source": [ + "from IPython.display import display, Markdown\n", + "\n", "url = \"https://michaeleskin.com/abctools/abctools.html?lzw=BoLgUAKiBiD2BOACCALApogMrAbhg8gGaICyArgM4CWAxmAEogUA2VADogFZUDmYAwiExUAXon4BDePFjNmYEiACcAegAcYTCACM6sAGkQAcTBGAogBFEFs0cQBBIwCFEAHwdG7zgCaI0333dzKxs7Rxo3RCc0DCd7F3MzRBBXMB5-PxVCFR4EpxUaFUDEdN80HgAjRAkAJmJ3Uszs3Id8wuL-F28nMKdAtIy0LJy8gqLIxvKq2olIipimnIxankjOxG7e+zdUoA\"\n", - "print(url)\n", + "display(Markdown(f\"<{url}>\"))\n", "eskin.load_url(url)" ] } diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 9ad6ba9..1aec498 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -10,12 +10,15 @@ import json import re from pathlib import Path -from typing import Literal, NamedTuple, Tuple, Union +from typing import TYPE_CHECKING, Literal, NamedTuple, Tuple, Union from urllib.parse import parse_qs, urlsplit from pyabc2 import Tune from pyabc2.sources._lzstring import LZString +if TYPE_CHECKING: # pragma: no cover + import pandas + HERE = Path(__file__).parent SAVE_TO = HERE / "_eskin" @@ -60,9 +63,13 @@ def abctools_url_to_abc( r"%%titlefont ", r"%%subtitlefont ", r"%%infofont ", + r"%%partsfont ", + r"%%textfont ", + r"%%tempofont ", r"%irish_rolls_on", r"%abcjs_", r"%%MIDI ", + r"%add_all_playback_links", ), ) -> str: """Extract the ABC from an Eskin abctools (``michaeleskin.com/abctools/``) share URL. @@ -195,7 +202,7 @@ def _load_data(key: str): return json.load(f) -def load_meta(key: str, *, redownload: bool = False): +def load_meta(key: str, *, redownload: bool = False) -> "pandas.DataFrame": """Load the tunebook data, no parsing. Parameters @@ -228,7 +235,12 @@ def load_meta(key: str, *, redownload: bool = False): See https://michaeleskin.com/tunebooks.html for more information. + + See Also + -------- + :doc:`/examples/sources` """ + import pandas as pd tb_info = get_tunebook_info(key) @@ -238,7 +250,16 @@ def load_meta(key: str, *, redownload: bool = False): _download_data(key) print("done") - return _load_data(key) + data = _load_data(key) + + dfs = [] + for group, tunes in data.items(): + df_ = pd.DataFrame(tunes) + df_["group"] = group + dfs.append(df_) + df = pd.concat(dfs, ignore_index=True) + + return df def load_url(url: str) -> Tune: diff --git a/tests/test_sources.py b/tests/test_sources.py index bd4e2cc..e0ec353 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -305,9 +305,9 @@ def test_eskin_tunebook_url_current(): @pytest.mark.parametrize("key", eskin._TUNEBOOK_KEY_TO_URL) def test_eskin_tunebook_data_load(key): - data = eskin.load_meta(key) + df = eskin.load_meta(key) - tune_type_keys = { + tune_group_keys = { "airs_songs", "hornpipes", "jigs", @@ -324,7 +324,10 @@ def test_eskin_tunebook_data_load(key): "waltzes", } - assert data.keys() <= tune_type_keys or list(data) == ["tunes"] + if key in {"kss"}: + assert set(df.group.unique()) <= tune_group_keys + else: + assert df.group.unique().tolist() == ["tunes"] def test_eskin_abc_url_creation(): From 527a3b6e5887ed4aff6cce1be09fd650eb359076 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 13:13:10 -0600 Subject: [PATCH 17/28] Test `sources.load_url` dispatch to Eskin --- tests/test_sources.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_sources.py b/tests/test_sources.py index e0ec353..c3125bb 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -255,6 +255,13 @@ def test_load_url_norbeck(): assert tune.title == "For The Love Of Music" +def test_load_url_eskin(): + url = "https://michaeleskin.com/abctools/abctools.html?lzw=BoLgUAKiBiD2BOACCALApogMrAbhg8gGaICyArgM4CWAxmAEogUA2VADogFZUDmYAwiExUAXon4BDePFjNmYEiACcAegAcYTCACM6sAGkQAcTBGAogBFEFs0cQBBIwCFEAHwdG7zgCaI0333dzKxs7Rxo3RCc0DCd7F3MzRBBXMB5-PxVCFR4EpxUaFUDEdN80HgAjRAkAJmJ3Uszs3Id8wuL-F28nMKdAtIy0LJy8gqLIxvKq2olIipimnIxankjOxG7e+zdUoA" + + tune = load_url(url) + assert tune.title == "For The Love Of Music" + + def test_load_url_invalid_domain(): with pytest.raises(NotImplementedError): _ = load_url("https://www.google.com") From b4daa105f7d32b6477ab437f8329aa84f76ac8c8 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 13:16:30 -0600 Subject: [PATCH 18/28] Test other netlocs for load_url --- tests/test_sources.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_sources.py b/tests/test_sources.py index c3125bb..ca74262 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -238,15 +238,17 @@ def test_int_downcast(): assert s3.dtype == expected_dtype_ext -def test_load_url_the_session(): - tune = load_url("https://thesession.org/tunes/10000") +@pytest.mark.parametrize("netloc", the_session._URL_NETLOCS) +def test_load_url_the_session(netloc): + tune = load_url(f"https://{netloc}/tunes/10000") assert tune.title == "Brian Quinn's" -def test_load_url_norbeck(): +@pytest.mark.parametrize("netloc", norbeck._URL_NETLOCS) +def test_load_url_norbeck(netloc): import requests - url = "https://norbeck.nu/abc/display.asp?rhythm=slip+jig&ref=106" + url = f"https://{netloc}/abc/display.asp?rhythm=slip+jig&ref=106" try: tune = load_url(url) except requests.exceptions.ReadTimeout as e: @@ -255,9 +257,9 @@ def test_load_url_norbeck(): assert tune.title == "For The Love Of Music" -def test_load_url_eskin(): - url = "https://michaeleskin.com/abctools/abctools.html?lzw=BoLgUAKiBiD2BOACCALApogMrAbhg8gGaICyArgM4CWAxmAEogUA2VADogFZUDmYAwiExUAXon4BDePFjNmYEiACcAegAcYTCACM6sAGkQAcTBGAogBFEFs0cQBBIwCFEAHwdG7zgCaI0333dzKxs7Rxo3RCc0DCd7F3MzRBBXMB5-PxVCFR4EpxUaFUDEdN80HgAjRAkAJmJ3Uszs3Id8wuL-F28nMKdAtIy0LJy8gqLIxvKq2olIipimnIxankjOxG7e+zdUoA" - +@pytest.mark.parametrize("netloc", eskin._URL_NETLOCS) +def test_load_url_eskin(netloc): + url = f"https://{netloc}/abctools/abctools.html?lzw=BoLgUAKiBiD2BOACCALApogMrAbhg8gGaICyArgM4CWAxmAEogUA2VADogFZUDmYAwiExUAXon4BDePFjNmYEiACcAegAcYTCACM6sAGkQAcTBGAogBFEFs0cQBBIwCFEAHwdG7zgCaI0333dzKxs7Rxo3RCc0DCd7F3MzRBBXMB5-PxVCFR4EpxUaFUDEdN80HgAjRAkAJmJ3Uszs3Id8wuL-F28nMKdAtIy0LJy8gqLIxvKq2olIipimnIxankjOxG7e+zdUoA" tune = load_url(url) assert tune.title == "For The Love Of Music" From bbe5a58d1856956109098b0f62a3543ef8bc6666 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 13:31:52 -0600 Subject: [PATCH 19/28] cov --- pyabc2/sources/eskin.py | 2 +- pyproject.toml | 3 +++ tests/test_sources.py | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 1aec498..7a7e3f5 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -273,7 +273,7 @@ def load_url(url: str) -> Tune: return Tune(abc) -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover from . import load_example_abc abc = load_example_abc("For the Love of Music") diff --git a/pyproject.toml b/pyproject.toml index 82dff29..9e6402d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,3 +61,6 @@ ignore_missing_imports = true [tool.pyright] ignore = ["pyabc2/sources/_lzstring.py"] + +[tool.coverage.run] +omit = ["pyabc2/sources/_lzstring.py"] diff --git a/tests/test_sources.py b/tests/test_sources.py index ca74262..89c958b 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -352,3 +352,8 @@ def test_eskin_abc_url_creation(): and r.headers.get("Location", "").rstrip("/") == "https://michaeleskin.com" ): raise ValueError(f"URL {url} redirects to homepage") + + +def test_eskin_invalid_tunebook_key(): + with pytest.raises(ValueError, match="Unknown Eskin tunebook key: 'asdf'"): + _ = eskin.get_tunebook_info("asdf") From 2549de5c724abf898104ee08ccf452663317902a Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 13:37:39 -0600 Subject: [PATCH 20/28] Help pytest-xdist sets are an issue https://pytest-xdist.readthedocs.io/en/stable/known-limitations.html --- tests/test_sources.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_sources.py b/tests/test_sources.py index 89c958b..1da106d 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -238,13 +238,13 @@ def test_int_downcast(): assert s3.dtype == expected_dtype_ext -@pytest.mark.parametrize("netloc", the_session._URL_NETLOCS) +@pytest.mark.parametrize("netloc", sorted(the_session._URL_NETLOCS)) def test_load_url_the_session(netloc): tune = load_url(f"https://{netloc}/tunes/10000") assert tune.title == "Brian Quinn's" -@pytest.mark.parametrize("netloc", norbeck._URL_NETLOCS) +@pytest.mark.parametrize("netloc", sorted(norbeck._URL_NETLOCS)) def test_load_url_norbeck(netloc): import requests @@ -257,7 +257,7 @@ def test_load_url_norbeck(netloc): assert tune.title == "For The Love Of Music" -@pytest.mark.parametrize("netloc", eskin._URL_NETLOCS) +@pytest.mark.parametrize("netloc", sorted(eskin._URL_NETLOCS)) def test_load_url_eskin(netloc): url = f"https://{netloc}/abctools/abctools.html?lzw=BoLgUAKiBiD2BOACCALApogMrAbhg8gGaICyArgM4CWAxmAEogUA2VADogFZUDmYAwiExUAXon4BDePFjNmYEiACcAegAcYTCACM6sAGkQAcTBGAogBFEFs0cQBBIwCFEAHwdG7zgCaI0333dzKxs7Rxo3RCc0DCd7F3MzRBBXMB5-PxVCFR4EpxUaFUDEdN80HgAjRAkAJmJ3Uszs3Id8wuL-F28nMKdAtIy0LJy8gqLIxvKq2olIipimnIxankjOxG7e+zdUoA" tune = load_url(url) From c2f8059cfb6450e5fe3e3e95c3307210e9048490 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 13:49:44 -0600 Subject: [PATCH 21/28] Test different Eskin ABC line filtering settings --- pyabc2/sources/eskin.py | 1 + tests/test_sources.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 7a7e3f5..9985916 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -67,6 +67,7 @@ def abctools_url_to_abc( r"%%textfont ", r"%%tempofont ", r"%irish_rolls_on", + r"%swing", r"%abcjs_", r"%%MIDI ", r"%add_all_playback_links", diff --git a/tests/test_sources.py b/tests/test_sources.py index 1da106d..34e4c3b 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -339,6 +339,23 @@ def test_eskin_tunebook_data_load(key): assert df.group.unique().tolist() == ["tunes"] +def test_eskin_abc_url_parsing(): + url = "https://michaeleskin.com/abctools/abctools.html?lzw=BoLgjAUApFAuCWsA2BTAZgewHawAQAUBDJQhLDXMADmigGcBXAIwWXWzyJLIrAGZa8LJkw4CxUkN4CYAB0IAnWHVGcJPSjLgoAtrIyrx3KZtqwUAD1iGuk8qYAqIBwAsUuAIJMmKAJ64IABlwAHoaAFkQABYQqIgARRAAJliAXgBOAAYIACUQHJQUJGg6AHchAHNcTIA6SABpEABxaEImAGMAKzoAfToMBiwAE0M0UiYMX1pwgEkAERncWQUMCoVCHWrp+cWmQjo6ZdWtmFmF3HaXDAUho6rs053cPYOANwwkXAA2OMfzy+uQ3enx+rSGQx6xCQPVkJF8e3aAGsekghIi6BAAEQeHSYzx8XAAIU8SVwTQAorgAD6eDxNDy4TFNPGE8HEmnY3G0jzEunkgBi1MZzLJTXpRLZ1KxOLxHlJPJJZMpNI8dIZTJZko5MsVCr5go5IrF4tZQyqVKp0q5KAqttwhFJeyFGtwFTaVUIFRQQ2dOptdodrsIzuZTDdaFd7iGpMtnLx-o9FTDIcxnuTnu9vutto9pLdKbDhAjXqG7IAukA&format=noten&ssp=10&name=The_Abbey&play=1" + + # default: explicit prefixes + abc = eskin.abctools_url_to_abc(url) + + # any % + abc_rm_any_pct = eskin.abctools_url_to_abc(url, remove_prefs="%") + + # no remove + abc_no_rm = eskin.abctools_url_to_abc(url, remove_prefs=False) + + assert abc == abc_rm_any_pct + assert sum(line.startswith("%") for line in abc_no_rm.splitlines()) > 0 + assert sum(line.startswith(r"%%") for line in abc_no_rm.splitlines()) > 0 + + def test_eskin_abc_url_creation(): import requests From d9ac5a7a11ad2270eee0c00088d5a31a7aa7f719 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 14:12:59 -0600 Subject: [PATCH 22/28] Add explanatory test of Eskin redirection --- tests/test_sources.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_sources.py b/tests/test_sources.py index 34e4c3b..6d8a8da 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -269,6 +269,22 @@ def test_load_url_invalid_domain(): _ = load_url("https://www.google.com") +def test_eskin_tunebook_bad_url_redirects(): + import requests + + # Bad URL (2025 -> 3025) + # Redirects to the home page. + # Nothing in `r.history`. `allow_redirects=False` has no impact. + url = "https://michaeleskin.com/cce_sd/cce_san_diego_tunes_10nov3025.html" + r = requests.head(url, timeout=5) + r.raise_for_status() + + assert r.status_code == 302 + assert r.headers.get("Location", "").rstrip("/") == "https://michaeleskin.com" + assert r.history == [] + assert r.is_redirect + + @pytest.mark.parametrize("key", eskin._TUNEBOOK_KEY_TO_URL) def test_eskin_tunebook_url_exist(key): import requests @@ -340,6 +356,7 @@ def test_eskin_tunebook_data_load(key): def test_eskin_abc_url_parsing(): + # From https://michaeleskin.com/cce_sd/cce_san_diego_tunes_10nov2025.html url = "https://michaeleskin.com/abctools/abctools.html?lzw=BoLgjAUApFAuCWsA2BTAZgewHawAQAUBDJQhLDXMADmigGcBXAIwWXWzyJLIrAGZa8LJkw4CxUkN4CYAB0IAnWHVGcJPSjLgoAtrIyrx3KZtqwUAD1iGuk8qYAqIBwAsUuAIJMmKAJ64IABlwAHoaAFkQABYQqIgARRAAJliAXgBOAAYIACUQHJQUJGg6AHchAHNcTIA6SABpEABxaEImAGMAKzoAfToMBiwAE0M0UiYMX1pwgEkAERncWQUMCoVCHWrp+cWmQjo6ZdWtmFmF3HaXDAUho6rs053cPYOANwwkXAA2OMfzy+uQ3enx+rSGQx6xCQPVkJF8e3aAGsekghIi6BAAEQeHSYzx8XAAIU8SVwTQAorgAD6eDxNDy4TFNPGE8HEmnY3G0jzEunkgBi1MZzLJTXpRLZ1KxOLxHlJPJJZMpNI8dIZTJZko5MsVCr5go5IrF4tZQyqVKp0q5KAqttwhFJeyFGtwFTaVUIFRQQ2dOptdodrsIzuZTDdaFd7iGpMtnLx-o9FTDIcxnuTnu9vutto9pLdKbDhAjXqG7IAukA&format=noten&ssp=10&name=The_Abbey&play=1" # default: explicit prefixes From 104ffe8bd8fc6bd0023318d57d36f6fbb3a167c2 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 14:15:38 -0600 Subject: [PATCH 23/28] Error message if lzw missing (or multiple) --- pyabc2/sources/eskin.py | 5 ++++- tests/test_sources.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 9985916..3b63445 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -94,7 +94,10 @@ def abctools_url_to_abc( assert res.path.startswith("/abctools/") query_params = parse_qs(res.query) - (lzw,) = query_params["lzw"] + try: + (lzw,) = query_params["lzw"] + except Exception as e: + raise ValueError("URL does not contain required 'lzw' parameter") from e # Note `+` has been replaced with space by parse_qs # Note js LZString.compressToEncodedURIComponent() is used to compress/encode the ABC diff --git a/tests/test_sources.py b/tests/test_sources.py index 6d8a8da..dff0ea2 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -373,6 +373,12 @@ def test_eskin_abc_url_parsing(): assert sum(line.startswith(r"%%") for line in abc_no_rm.splitlines()) > 0 +def test_eskin_abc_url_bad(): + url = "https://michaeleskin.com/abctools/abctools.html?" + with pytest.raises(ValueError, match="URL does not contain required 'lzw' parameter"): + _ = eskin.abctools_url_to_abc(url) + + def test_eskin_abc_url_creation(): import requests From 39e430dc98479010d562833eba9e2f3226d44843 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 14:21:20 -0600 Subject: [PATCH 24/28] Normalize tunebook key for result too --- pyabc2/sources/eskin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 3b63445..75a9b0b 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -135,8 +135,9 @@ class EskinTunebookInfo(NamedTuple): def get_tunebook_info(key: str) -> EskinTunebookInfo: + key = key.lower() try: - url = _TUNEBOOK_KEY_TO_URL[key.lower()] + url = _TUNEBOOK_KEY_TO_URL[key] except KeyError: raise ValueError( f"Unknown Eskin tunebook key: {key!r}. Valid options: {sorted(_TUNEBOOK_KEY_TO_URL)}." From 0f08969d0af47cae842f97e83d067d3c63d999ed Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 14:48:58 -0600 Subject: [PATCH 25/28] assert -> log --- pyabc2/sources/eskin.py | 17 +++++++++++++---- tests/test_sources.py | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 75a9b0b..91ee1c4 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -14,11 +14,14 @@ from urllib.parse import parse_qs, urlsplit from pyabc2 import Tune +from pyabc2._util import get_logger as _get_logger from pyabc2.sources._lzstring import LZString if TYPE_CHECKING: # pragma: no cover import pandas +logger = _get_logger(__name__) + HERE = Path(__file__).parent SAVE_TO = HERE / "_eskin" @@ -90,8 +93,10 @@ def abctools_url_to_abc( remove_prefs = (remove_prefs,) res = urlsplit(url) - assert res.netloc in _URL_NETLOCS - assert res.path.startswith("/abctools/") + if res.netloc not in _URL_NETLOCS: + logger.debug(f"Unexpected Eskin URL netloc: {res.netloc}") + if not res.path.startswith("/abctools/"): + logger.debug(f"Unexpected Eskin URL path: {res.path}") query_params = parse_qs(res.query) try: @@ -101,7 +106,10 @@ def abctools_url_to_abc( # Note `+` has been replaced with space by parse_qs # Note js LZString.compressToEncodedURIComponent() is used to compress/encode the ABC - abc = LZString.decompressFromEncodedURIComponent(lzw) + try: + abc = LZString.decompressFromEncodedURIComponent(lzw) + except Exception as e: + raise RuntimeError("Failed to decompress LZString data") from e if abc is None: raise RuntimeError("Failed to decompress LZString data") @@ -188,9 +196,10 @@ def _download_data(key: str): raise for d in data: - assert d.keys() == {"Name", "URL"} d["name"] = d.pop("Name") d["abc"] = abctools_url_to_abc(d.pop("URL")) + if d: # pragma: no cover + logger.debug(f"Extra fields in Eskin tune data: {sorted(d)}") all_data[type_] = data diff --git a/tests/test_sources.py b/tests/test_sources.py index dff0ea2..823648a 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -373,12 +373,29 @@ def test_eskin_abc_url_parsing(): assert sum(line.startswith(r"%%") for line in abc_no_rm.splitlines()) > 0 -def test_eskin_abc_url_bad(): +def test_eskin_abc_url_missing_param(): url = "https://michaeleskin.com/abctools/abctools.html?" with pytest.raises(ValueError, match="URL does not contain required 'lzw' parameter"): _ = eskin.abctools_url_to_abc(url) +def test_eskin_abc_url_bad_param(): + url = "https://michaeleskin.com/abctools/abctools.html?lzw=hi" + with pytest.raises(RuntimeError, match="Failed to decompress LZString data"): + _ = eskin.abctools_url_to_abc(url) + + +def test_eskin_abc_url_bad(caplog): + url = "https://michaeleski.com/deftools/abctools.html?lzw=BoLgjAUApFAuCWsA2BTAZgewHawAQAUBDJQhLDXMADmigGcBXAIwWXWzyJLIrAGZa8LJkw4CxUkN4CYAB0IAnWHVGcJPSjLgoAtrIyrx3KZtqwUAD1iGuk8qYAqIBwAsUuAIJMmKAJ64IABlwAHoaAFkQABYQqIgARRAAJliAXgBOAAYIACUQHJQUJGg6AHchAHNcTIA6SABpEABxaEImAGMAKzoAfToMBiwAE0M0UiYMX1pwgEkAERncWQUMCoVCHWrp+cWmQjo6ZdWtmFmF3HaXDAUho6rs053cPYOANwwkXAA2OMfzy+uQ3enx+rSGQx6xCQPVkJF8e3aAGsekghIi6BAAEQeHSYzx8XAAIU8SVwTQAorgAD6eDxNDy4TFNPGE8HEmnY3G0jzEunkgBi1MZzLJTXpRLZ1KxOLxHlJPJJZMpNI8dIZTJZko5MsVCr5go5IrF4tZQyqVKp0q5KAqttwhFJeyFGtwFTaVUIFRQQ2dOptdodrsIzuZTDdaFd7iGpMtnLx-o9FTDIcxnuTnu9vutto9pLdKbDhAjXqG7IAukA&format=noten&ssp=10&name=The_Abbey&play=1" + with caplog.at_level("DEBUG"): + _ = eskin.abctools_url_to_abc(url) + + assert caplog.messages == [ + "Unexpected Eskin URL netloc: michaeleski.com", + "Unexpected Eskin URL path: /deftools/abctools.html", + ] + + def test_eskin_abc_url_creation(): import requests From baa9697ba0d4a22844b01105c29aa8a5efc3878c Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 15:23:46 -0600 Subject: [PATCH 26/28] Include failed parse info in RuntimeError instead --- pyabc2/sources/eskin.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 91ee1c4..759720c 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -192,8 +192,13 @@ def _download_data(key: str): try: data = json.loads(s_data) except json.JSONDecodeError as e: - print(s_data[e.pos], "context:", s_data[e.pos - 10 : e.pos + 10]) - raise + w = 25 + a = max(0, e.pos - w) + b = min(len(s_data), e.pos + w) + raise RuntimeError( + f"Error parsing JSON data for Eskin tunebook {key!r} group {type_!r}. " + f"Context ({a}:{b}): {s_data[a:b]!r}" + ) from e for d in data: d["name"] = d.pop("Name") From 04290a413cd72ffad3f634b7820ff35091d648b3 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 15:45:21 -0600 Subject: [PATCH 27/28] Document Eskin redownload option --- pyabc2/sources/eskin.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyabc2/sources/eskin.py b/pyabc2/sources/eskin.py index 759720c..03defab 100644 --- a/pyabc2/sources/eskin.py +++ b/pyabc2/sources/eskin.py @@ -254,6 +254,8 @@ def load_meta(key: str, *, redownload: bool = False) -> "pandas.DataFrame": See https://michaeleskin.com/tunebooks.html for more information. + redownload + Re-download the data file. See Also -------- From f3f47d24da0e3a32b406a3a74c10dc075440ca22 Mon Sep 17 00:00:00 2001 From: zmoon Date: Wed, 7 Jan 2026 15:55:07 -0600 Subject: [PATCH 28/28] Add changelog entry --- docs/changes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changes.md b/docs/changes.md index 7fafd85..a154a6a 100644 --- a/docs/changes.md +++ b/docs/changes.md @@ -5,6 +5,7 @@ * Fix loading The Session sets data ({pull}`77`) * Fix HTML display of pitch classes with double accidentals ({pull}`76`) * Fix Norbeck URL gen for multi-word tune types (e.g., slip jig, set dance) +* Add initial support for loading Eskin ABC Transcription Tools tunebooks ({pull}`86`) ## v0.1.0 (2025-07-02)