From 1eabd2e705e8885bde4b4b56176e7308be77f126 Mon Sep 17 00:00:00 2001
From: Jason Deppen <jDeppen@gmail.com>
Date: Sat, 4 Nov 2023 16:52:55 -0400
Subject: [PATCH] Include labels and note images

---
 keepToText.py | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/keepToText.py b/keepToText.py
index 2510781..6c945a9 100644
--- a/keepToText.py
+++ b/keepToText.py
@@ -5,52 +5,61 @@
 class MyHTMLParser(HTMLParser):
     def attrib_matches(self, tag, attrs):
         return [pair for pair in attrs if
-                pair[0] == self.attrib and pair[1] == self.attribVal]    
+            pair[0] == "class" and pair[1] == "content"]
 
     def handle_starttag(self, tag, attrs):
-        if tag == self.tag:
+        if tag == "div":  # tag name is hard-coded now (was self.tag)
             if self.attrib_matches(tag, attrs) and not self.nesting:
                 self.nesting = 1
             elif self.nesting:
                 self.nesting += 1
         elif tag == "br" and self.nesting:
             self.outf.write("\n")
+        elif tag == "img":  # any <img>, whether or not nesting is active
+            self.outf.write("\n\n")  # blank line before the image marker
+            self.outf.write("oneImage")  # fixed placeholder, one per <img> tag
+        elif tag == "span" and ("class", "label-name") in attrs:  # no attrs[0]: attrs may be empty
+            self.nesting = 1  # makes handle_data write the label text
+            self.wrap = 1  # next handle_data call emits label(...)
 
     def handle_endtag(self, tag):
-        if tag == self.tag and self.nesting:
+        if tag == "div" and self.nesting:
             self.nesting -= 1
-            
+
     def handle_data(self, data):
         if self.nesting:
-            self.outf.write(data.strip())
+            if self.wrap:  # set by handle_starttag when a label-name <span> opens
+                self.outf.write("\n\n")  # blank line before the label
+                self.outf.write("label({0})".format(data.strip()))
+                self.wrap = 0  # only the first text chunk after the span is wrapped
+            else:
+                self.outf.write(data.strip())  # written with surrounding whitespace stripped
     
-    def __init__(self, outf, tag, attrib, attribVal):
+    def __init__(self, outf):
         HTMLParser.__init__(self)
         self.outf = outf
-        self.tag = tag
-        self.attrib = attrib
-        self.attribVal = attribVal
         self.nesting = 0
-        
+        self.wrap = 0
+
 def msg(s):
     print >> sys.stderr, s
     sys.stderr.flush()
 
-def htmlFileToText(inputPath, outputDir, tag, attrib, attribVal):
+def htmlFileToText(inputPath, outputDir):
     basename = os.path.basename(inputPath).replace(".html", ".txt")
     outfname = os.path.join(outputDir, basename)
     with open(inputPath, "r") as inf, open(outfname, "w") as outf:
         html = inf.read()
-        parser = MyHTMLParser(outf, tag, attrib, attribVal)
+        parser = MyHTMLParser(outf)
         parser.feed(html)
         
-def htmlDirToText(inputDir, outputDir, tag, attrib, attribVal):
+def htmlDirToText(inputDir, outputDir):
     try_rmtree(outputDir)
     try_mkdir(outputDir)
     msg("Building text files in {0} ...".format(outputDir))
     
     for path in glob.glob(os.path.join(inputDir, "*.html")):
-        htmlFileToText(path, outputDir, tag, attrib, attribVal)
+        htmlFileToText(path, outputDir)
         
     msg("Done.")
     
@@ -106,8 +115,7 @@ def keepZipToText(zipFileName):
     for dirName in translatedKeepDirs:
 	if os.path.isdir(takeoutDir+"/"+dirName): htmlDir = os.path.join(takeoutDir, dirName)
 
-    htmlDirToText(inputDir=htmlDir, outputDir=outputDir,
-        tag="div", attrib="class", attribVal="content")
+    htmlDirToText(inputDir=htmlDir, outputDir=outputDir)
 
 def main():
     try: