diff --git a/scripts/safe_build.py b/scripts/safe_build.py index 05689e7..fce879f 100644 --- a/scripts/safe_build.py +++ b/scripts/safe_build.py @@ -33,6 +33,16 @@ skipped too. Where a song shows more than one lyric blockquote (alternate verses, e.g. clem.html), each is removed but any editorial note Dodd wrote between them is preserved. + +Not every song uses a
<blockquote>. Some lay the lyrics out as bare +
<br>-separated lines, or inside a layout (e.g. soma.html, bird.html). For +those we strip the span from the end of the credit/copyright preamble to the +seam, gated so non-lyric pages stay untouched: pages whose region carries list +markup (discographies, title-phrase nav like appl.html / tribute.html) are +skipped, and the span must be verse-dense (>=10 <br>
; real lyric spans have >=27, +everything else <=5). Single-line epigraphs above the credit line, and short +lyric fragments quoted inside the annotations, are fragments -- left in place, by +the same fair-use reasoning that keeps the essays. """ import re @@ -57,6 +67,18 @@ BLOCKQUOTE_RE = re.compile(r"
.*?
", re.I | re.S) +# Some pages lay lyrics out as bare
<br>-separated lines (or inside a layout +#
) with no <blockquote>
at all. There the lyric span runs from the end +# of the credit/copyright preamble to the annotation seam. PREAMBLE_RE finds the +# credit lines (the lyrics start after the LAST one); BOUNDARY_RE finds the +#
/

that ends that line. +PREAMBLE_RE = re.compile( + r"(words?\s+(?:and\s+music\s+)?by|lyrics?\s+by|music\s+by" + r"|copyright|used\s+(?:by|with)[^<\n]*permission)", + re.I, +) +BOUNDARY_RE = re.compile(r"|", re.I) + # Marker left in stripped pages so re-running the pass is a no-op. NOTICE_MARK = "" NOTICE = ( @@ -69,6 +91,19 @@ ) +def _lyric_start(text, lo, seam): + """Where the lyrics begin: just after the last credit/copyright line in the + [lo, seam) region. Falls back to lo if no preamble line is found.""" + last = None + for m in PREAMBLE_RE.finditer(text[lo:seam]): + last = m + if not last: + return lo + pos = lo + last.end() + boundary = BOUNDARY_RE.search(text, pos, seam) + return boundary.end() if boundary else pos + + def strip_page(text): """Return (new_text, n_blocks_removed) if this is a song page with lyrics to strip, else None to leave the page untouched.""" @@ -82,27 +117,42 @@ def strip_page(text): return None # no song-credit line -> essay/bio, skip lo = credit_m.start() - # Lyric blockquotes are those starting between the credit line and the seam. + # Case 1: lyrics wrapped in

(the common layout). Targets are the + # blockquotes starting between the credit line and the seam. targets = [m for m in BLOCKQUOTE_RE.finditer(text) if lo <= m.start() < seam] - if not targets: - return None # song title page with no reproduced lyrics - - # Replace right-to-left so earlier match offsets stay valid. The first lyric - # block (in document order) becomes the notice; any others are dropped, while - # the prose between them -- Dodd's editorial notes -- is left in place. - # - # Some 1990s pages (e.g. eleven.html) never close the lyric
<blockquote> - # before the annotations, so the match runs past the seam and would engulf - # the anchor. Clamp every removal at the seam: never delete a - # byte of the annotation section, even at the cost of leaving a stray, - # browser-ignored </blockquote>
behind. - first = targets[0] - out = text - for m in reversed(targets): - end = min(m.end(), seam) - repl = NOTICE if m is first else "" - out = out[: m.start()] + repl + out[end:] - return out, len(targets) + if targets: + # Replace right-to-left so earlier match offsets stay valid. The first + # lyric block (in document order) becomes the notice; any others are + # dropped, while the prose between them -- Dodd's editorial notes -- is + # left in place. + # + # Some 1990s pages (e.g. eleven.html) never close the lyric
<blockquote> + # before the annotations, so the match runs past the seam and would + # engulf the anchor. Clamp every removal at the seam: never + # delete a byte of the annotation section, even at the cost of leaving a + # stray, browser-ignored </blockquote>
behind. + first = targets[0] + out = text + for m in reversed(targets): + end = min(m.end(), seam) + repl = NOTICE if m is first else "" + out = out[: m.start()] + repl + out[end:] + return out, len(targets) + + # Case 2: no lyric
<blockquote>. Lyrics may instead be bare
<br>-separated + # lines or inside a layout
(e.g. soma.html, bird.html). Strip the + # span from the end of the credit preamble to the seam -- but only when it + # really is a reproduced lyric. Two gates keep non-lyric pages (title-phrase + # annotations, discographies, the home page) untouched: skip if the region + # carries list markup (a discography / nav block), and require a verse-dense + # span. Across the archive that span has >=27 <br>
on real lyric pages and + # <=5 on everything else, so a threshold of 10 separates them cleanly. + if "