at all. There the lyric span runs from the end
+# of the credit/copyright preamble to the annotation seam. PREAMBLE_RE finds the
+# credit lines (the lyrics start after the LAST one); BOUNDARY_RE finds the
+# <br> / <p> that ends that line.
+PREAMBLE_RE = re.compile(  # one credit/copyright phrase per match
+ r"(words?\s+(?:and\s+music\s+)?by|lyrics?\s+by|music\s+by"  # "word(s) [and music] by", "lyric(s) by", "music by"
+ r"|copyright|used\s+(?:by|with)[^<\n]*permission)",  # "copyright", or "used by/with ... permission" with no "<" or newline in between
+ re.I,  # case-insensitive
+)
+BOUNDARY_RE = re.compile(r"
|
", re.I)
+
# Marker left in stripped pages so re-running the pass is a no-op.
NOTICE_MARK = ""
NOTICE = (
@@ -69,6 +91,19 @@
)
+def _lyric_start(text, lo, seam):
+ """Return the offset in text where the lyrics begin.
+
+ Scans text[lo:seam] for PREAMBLE_RE matches and keeps the last one. If
+ there is none, returns lo. Otherwise searches for BOUNDARY_RE between the
+ end of that match and seam and returns the end of the boundary match, or
+ the end of the preamble match itself when no boundary is found before
+ seam. lo and seam are offsets into text delimiting the half-open region
+ [lo, seam)."""
+ last = None
+ for m in PREAMBLE_RE.finditer(text[lo:seam]):  # match offsets are relative to lo
+ last = m
+ if not last:
+ return lo
+ pos = lo + last.end()  # rebase to an absolute offset into text
+ boundary = BOUNDARY_RE.search(text, pos, seam)  # pos/endpos form: offsets stay absolute, search stops at seam
+ return boundary.end() if boundary else pos
+
+
def strip_page(text):
"""Return (new_text, n_blocks_removed) if this is a song page with lyrics to
strip, else None to leave the page untouched."""
@@ -82,27 +117,42 @@ def strip_page(text):
return None # no song-credit line -> essay/bio, skip
lo = credit_m.start()
- # Lyric blockquotes are those starting between the credit line and the seam.
+ # Case 1: lyrics wrapped in <blockquote> (the common layout). Targets are the
+ # blockquotes starting between the credit line and the seam.
targets = [m for m in BLOCKQUOTE_RE.finditer(text) if lo <= m.start() < seam]
- if not targets:
- return None # song title page with no reproduced lyrics
-
- # Replace right-to-left so earlier match offsets stay valid. The first lyric
- # block (in document order) becomes the notice; any others are dropped, while
- # the prose between them -- Dodd's editorial notes -- is left in place.
- #
- # Some 1990s pages (e.g. eleven.html) never close the lyric <blockquote>
- # before the annotations, so the match runs past the seam and would engulf
- # the anchor. Clamp every removal at the seam: never delete a
- # byte of the annotation section, even at the cost of leaving a stray,
- # browser-ignored </blockquote> behind.
- first = targets[0]
- out = text
- for m in reversed(targets):
- end = min(m.end(), seam)
- repl = NOTICE if m is first else ""
- out = out[: m.start()] + repl + out[end:]
- return out, len(targets)
+ if targets:
+ # Replace right-to-left so earlier match offsets stay valid. The first
+ # lyric block (in document order) becomes the notice; any others are
+ # dropped, while the prose between them -- Dodd's editorial notes -- is
+ # left in place.
+ #
+ # Some 1990s pages (e.g. eleven.html) never close the lyric <blockquote>
+ # before the annotations, so the match runs past the seam and would
+ # engulf the anchor. Clamp every removal at the seam: never
+ # delete a byte of the annotation section, even at the cost of leaving a
+ # stray, browser-ignored </blockquote> behind.
+ first = targets[0]
+ out = text
+ for m in reversed(targets):
+ end = min(m.end(), seam)
+ repl = NOTICE if m is first else ""
+ out = out[: m.start()] + repl + out[end:]
+ return out, len(targets)
+
+ # Case 2: no lyric <blockquote>. Lyrics may instead be bare <br>-separated
+ # lines or inside a layout (e.g. soma.html, bird.html). Strip the
+ # span from the end of the credit preamble to the seam -- but only when it
+ # really is a reproduced lyric. Two gates keep non-lyric pages (title-phrase
+ # annotations, discographies, the home page) untouched: skip if the region
+ # carries list markup (a discography / nav block), and require a verse-dense
+ # span. Across the archive that span has >=27 <br> on real lyric pages and
+ # <=5 on everything else, so a threshold of 10 separates them cleanly.
+ if "