From cdc8b29e0d414d25f0b9de5bdb1a6d813b14981a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Wed, 18 Dec 2024 23:00:18 +0100
Subject: [PATCH 01/11] Feat: Improve `chardict.py` to add n-gram counts

Warning: the output format changes to count n-gram occurrences.

Larger n-grams can be counted by raising the `NGRAM_MAX_LENGTH`
constant.

Switch from os module to pathlib to manage paths.

Fixes #7
---
 bin/chardict.py | 155 ++++++++++++++++++++++++++++++------------------
 1 file changed, 98 insertions(+), 57 deletions(-)

diff --git a/bin/chardict.py b/bin/chardict.py
index 5ec3e5d..2d82505 100755
--- a/bin/chardict.py
+++ b/bin/chardict.py
@@ -1,50 +1,69 @@
 #!/usr/bin/env python3
-"""Turn corpus texts into dictionaries of symbols, bigrams and trigrams."""
+"""Turn corpus texts into dictionaries of n-grams."""
 
 import json
-from os import listdir, path
+from pathlib import Path
 from sys import argv
 
-IGNORED_CHARS = "1234567890 \t\r\n\ufeff"
-
-
-def parse_corpus(file_path):
-    """Count symbols, bigrams and trigrams in a text file."""
-
-    symbols = {}
-    bigrams = {}
-    trigrams = {}
-    char_count = 0
-    prev_symbol = None
-    prev_prev_symbol = None
-
-    # get a dictionary of all symbols (letters, punctuation marks...)
-    file = open(file_path, "r", encoding="utf-8")
-    for char in file.read():
-        symbol = char.lower()
-        if char not in IGNORED_CHARS:
-            char_count += 1
-            if symbol not in symbols:
-                symbols[symbol] = 0
-            symbols[symbol] += 1
-            if prev_symbol is not None:
-                bigram = prev_symbol + symbol
-                if bigram not in bigrams:
-                    bigrams[bigram] = 0
-                bigrams[bigram] += 1
-                if prev_prev_symbol is not None:
-                    trigram = prev_prev_symbol + bigram
-                    if trigram not in trigrams:
-                        trigrams[trigram] = 0
-                    trigrams[trigram] += 1
-            prev_prev_symbol = prev_symbol
-            prev_symbol = symbol
-        else:
-            prev_symbol = None
-    file.close()
+NGRAM_MAX_LENGTH = 5  # Quadrigrams
+IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵"
+APP_NAME = "kalamine"
+APP_AUTHOR = "1dk"
+
+
+def parse_corpus(txt: str) -> dict:
+    """Count ngrams in a string.
+    retuns a dict of ngrams
+        ngrams[1]=symbols
+        ngrams[2]=bigrames
+        ngrams[3]=trigrams
+        etc., up to NGRAM_MAX_LENGTH
+    ngrams[2] is shaped as { "aa": count }
+    """
+
+    ngrams = {}
+    ngrams_count = {}  # ngrams_count counts the total number of ngrams[i] in corpus.
+
+    txt = txt.lower()  # we want to be case **in**sensitive
+
+    for ngram in range(1, NGRAM_MAX_LENGTH):
+        ngrams[ngram] = {}
+        ngrams_count[ngram] = 0
+
+    def get_ngram(txt: str, ngram_start: int, ngram_length: int) -> str:
+        """get a ngram of a given length at given position in txt
+        returns empty string if ngram cannot be provided"""
+        if txt[ngram_start] in IGNORED_CHARS:
+            return ""
+        if ngram_length <= 0:
+            return ""
+        if ngram_start + ngram_length >= len(txt):
+            return ""
+
+        ngram = txt[ngram_start : ngram_start + ngram_length]
+
+        for n in ngram[1:]:  # 1st char already tested
+            if n in IGNORED_CHARS:
+                return ""
+
+        return ngram
+
+    # get all n-grams
+    for ngram_start in range(len(txt)):
+        for ngram_length in range(NGRAM_MAX_LENGTH):
+            _ngram = get_ngram(txt, ngram_start, ngram_length)
+
+            if not _ngram:  # _ngram is ""
+                continue
+
+            if _ngram not in ngrams[ngram_length]:
+                ngrams[ngram_length][_ngram] = 0
+
+            ngrams[ngram_length][_ngram] += 1
+            ngrams_count[ngram_length] += 1
 
     # sort the dictionary by symbol frequency (requires CPython 3.6+)
-    def sort_by_frequency(table, precision=3):
+    def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict:
         sorted_dict = {}
         for key, count in sorted(table.items(), key=lambda x: -x[1]):
             freq = round(100 * count / char_count, precision)
@@ -52,27 +71,49 @@ def sort_by_frequency(table, precision=3):
                 sorted_dict[key] = freq
         return sorted_dict
 
-    results = {}
-    results["corpus"] = file_path
-    results["symbols"] = sort_by_frequency(symbols)
-    results["bigrams"] = sort_by_frequency(bigrams, 4)
-    results["trigrams"] = sort_by_frequency(trigrams)
-    return results
+    for ngram in range(1, NGRAM_MAX_LENGTH):
+        ngrams[ngram] = sort_by_frequency(ngrams[ngram], ngrams_count[ngram], 4)
+
+    return ngrams, ngrams_count
+
+
+def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict:
+    """read a .txt file and provide a dictionary of n-grams"""
+    try:
+        path = Path(file_path)
+        if not path.is_file:
+            raise Exception("Error, this is not a file")
+        if not name:
+            name = path.stem
+        with path.open("r", encoding=encoding) as file:
+            corpus_txt = "↵".join(file.readlines())
+
+    except Exception as e:
+        print(f"file does not exist or could not be read.\n {e}")
+
+    ngrams_freq, ngrams_count = parse_corpus(corpus_txt)
+    return {
+        "name": name,
+        #   "text": corpus_txt,
+        "freq": ngrams_freq,
+        "count": ngrams_count,
+    }
 
 
 if __name__ == "__main__":
     if len(argv) == 2:  # convert one file
-        data = parse_corpus(argv[1])
+        file_path = Path(argv[1])
+        data = read_corpus(str(file_path))
+        output_file_path = file_path.parent / f"{file_path.stem}.json"
+        with open(output_file_path, "w", encoding="utf-8") as outfile:
+            json.dump(data, outfile, indent=4, ensure_ascii=False)
         print(json.dumps(data, indent=4, ensure_ascii=False))
+
     else:  # converts all *.txt files in the script directory
-        bin_dir = path.dirname(__file__)
-        destdir = path.join(bin_dir, "..", "txt")
-        txtdir = path.join(bin_dir, "..", "txt")
-        for filename in listdir(txtdir):
-            if filename.endswith(".txt"):
-                basename = filename[:-4]
-                print(f"...  {basename}")
-                data = parse_corpus(path.join(txtdir, filename))
-                destfile = path.join(destdir, basename + ".json")
-                with open(destfile, "w", encoding="utf-8") as outfile:
+        curent_path = Path(__file__).resolve().parent
+        for file in curent_path.glob("*.txt"):
+            if file.is_file():
+                data = read_corpus(str(file))
+                output_file_path = file.parent / f"{file.stem}.json"
+                with open(output_file_path, "w", encoding="utf-8") as outfile:
                     json.dump(data, outfile, indent=4, ensure_ascii=False)

From f69cf2a8b8e653cc44783c5c9ff09b5598b1f346 Mon Sep 17 00:00:00 2001
From: Fabien Cazenave <fabien@cazenave.cc>
Date: Sun, 22 Dec 2024 17:04:35 +0100
Subject: [PATCH 02/11] minor fixes

---
 Makefile        |  1 +
 bin/chardict.py | 29 ++++++++++++++---------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/Makefile b/Makefile
index ef7e6c5..5abf447 100644
--- a/Makefile
+++ b/Makefile
@@ -15,6 +15,7 @@ json:
 	@mkdir -p json
 	@echo "Creating JSON dicts..."
 	@bin/chardict.py
+	@echo "Merging JSON dicts..."
 	@echo "...  de_modern"
 	@bin/merge.py txt/deu_*.json > json/de_modern.json
 	@echo "...  en_modern"
diff --git a/bin/chardict.py b/bin/chardict.py
index 2d82505..e293e1f 100755
--- a/bin/chardict.py
+++ b/bin/chardict.py
@@ -5,7 +5,7 @@
 from pathlib import Path
 from sys import argv
 
-NGRAM_MAX_LENGTH = 5  # Quadrigrams
+NGRAM_MAX_LENGTH = 4  # trigrams
 IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵"
 APP_NAME = "kalamine"
 APP_AUTHOR = "1dk"
@@ -77,16 +77,15 @@ def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict:
     return ngrams, ngrams_count
 
 
-def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict:
+def read_corpus(file: Path, name: str = "", encoding="utf-8") -> dict:
     """read a .txt file and provide a dictionary of n-grams"""
     try:
-        path = Path(file_path)
-        if not path.is_file:
+        if not file.is_file:
             raise Exception("Error, this is not a file")
         if not name:
-            name = path.stem
-        with path.open("r", encoding=encoding) as file:
-            corpus_txt = "↵".join(file.readlines())
+            name = file.stem
+        with file.open("r", encoding=encoding) as f:
+            corpus_txt = "↵".join(f.readlines())
 
     except Exception as e:
         print(f"file does not exist or could not be read.\n {e}")
@@ -94,7 +93,6 @@ def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict:
     ngrams_freq, ngrams_count = parse_corpus(corpus_txt)
     return {
         "name": name,
-        #   "text": corpus_txt,
         "freq": ngrams_freq,
         "count": ngrams_count,
     }
@@ -102,18 +100,19 @@ def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict:
 
 if __name__ == "__main__":
     if len(argv) == 2:  # convert one file
-        file_path = Path(argv[1])
-        data = read_corpus(str(file_path))
-        output_file_path = file_path.parent / f"{file_path.stem}.json"
+        file = Path(argv[1])
+        data = read_corpus(file)
+        output_file_path = file.parent / f"{file.stem}.json"
         with open(output_file_path, "w", encoding="utf-8") as outfile:
             json.dump(data, outfile, indent=4, ensure_ascii=False)
         print(json.dumps(data, indent=4, ensure_ascii=False))
 
     else:  # converts all *.txt files in the script directory
-        curent_path = Path(__file__).resolve().parent
-        for file in curent_path.glob("*.txt"):
+        txt_dir = Path(__file__).resolve().parent.parent / "txt"
+        for file in sorted(txt_dir.glob("*.txt")):
             if file.is_file():
-                data = read_corpus(str(file))
-                output_file_path = file.parent / f"{file.stem}.json"
+                print(f"...  {file.stem}")
+                data = read_corpus(file)
+                output_file_path = txt_dir / f"{file.stem}.json"
                 with open(output_file_path, "w", encoding="utf-8") as outfile:
                     json.dump(data, outfile, indent=4, ensure_ascii=False)

From d88560ef56a828fa12a685107a6414c16a3318df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Tue, 24 Dec 2024 11:54:22 +0100
Subject: [PATCH 03/11] Refactor: Following #9 advice, cleaned up the code to
 be simpler and only implement current merge behaviour.

---
 bin/merge.py | 120 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 89 insertions(+), 31 deletions(-)

diff --git a/bin/merge.py b/bin/merge.py
index 5efe94b..6f63817 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -3,43 +3,101 @@
 
 import json
 from sys import argv
+from pathlib import Path
 
 
-def merge(filenames, filecount):
-    merged = {
-        "symbols": {},
-        "bigrams": {},
-        "trigrams": {},
-    }
+# sort the merged dictionary by symbol frequency (requires CPython 3.6+)
+def _sort_ngram_by_frequency(table, precision=3):
+    sorted_dict = {}
+    for key, count in sorted(table.items(), key=lambda x: -x[1]):
+        freq = round(count, precision)
+        if freq > 0:
+            sorted_dict[key] = freq
+    return sorted_dict
 
-    # merge dictionaries
+
+def sort_by_frequency(corpus: dict, precision=3):
+    for ngram in range(1, len(corpus["freq"].keys())+1):
+        ngram = str(ngram)
+        corpus["freq"][ngram] = _sort_ngram_by_frequency(
+            corpus["freq"][ngram], precision
+        )
+    return corpus
+
+def read_corpora(filenames: list[Path]) -> list[dict]:
+    """open a collection of corpus from path and dump its content in a dictionary"""
+    corpora_dict = {}
     for filename in filenames:
-        with open(filename, "r") as corpus:
-            data = json.load(corpus)
-            for section in merged.keys():
-                for key, count in data[section].items():
-                    if key not in merged[section]:
-                        merged[section][key] = 0.0
-                    merged[section][key] += count / filecount
-
-    # sort the merged dictionary by symbol frequency (requires CPython 3.6+)
-    def sort_by_frequency(table, precision=2):
-        sorted_dict = {}
-        for key, count in sorted(table.items(), key=lambda x: -x[1]):
-            freq = round(count, precision)
-            if freq > 0:
-                sorted_dict[key] = freq
-        return sorted_dict
-
-    results = {}
-    results["corpus"] = ""
-    results["symbols"] = sort_by_frequency(merged["symbols"])
-    results["bigrams"] = sort_by_frequency(merged["bigrams"], 4)
-    results["trigrams"] = sort_by_frequency(merged["trigrams"])
-    return results
+        try:
+            with open(filename) as f:
+                corpus = json.load(f)
+                corpora_dict[corpus["name"]] = corpus
+        except:
+            print(
+                f"Warning: cannot open the `{filename.stem}` corpus; skipping this file"
+            )
+            continue
+
+    if len(corpora_dict) < 2:
+        print("Error: at least 2 corpuses are needed to merge, aborting")
+        return []
+
+    # removing corpus that do not have the same ngram lenght
+    ngram_length = len( # 1st corpus in corpora
+        next(iter(corpora_dict.values()))["freq"]
+    )
+    for key in corpora_dict.keys():
+        corpus = corpora_dict[key]
+        if len(corpus["freq"]) != ngram_length:
+            _name = corpus["name"]
+            corpora_dict.pop(_name)
+            print(f"Warning: removing {_name} from corpora because ngram length is different")
+    
+    if len(corpora_dict) >= 2:
+        return list(corpora_dict.values())
+
+    print("Error: at least 2 corpuses are needed to merge, aborting")
+    return []
+
+def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict:
+    """merge corpora of same n-gram length, optionally with a giver ratio"""
+    if ratio == []:
+        # merge with same weight by default
+        ratio = [ 1/len(corpora) ] * len(corpora)
+    elif round(sum(ratio),1) != 1:
+        print("Error: provided merge ratio do not add-up to 1; aborting merge")
+
+    output_corpus = corpora[0].copy()
+    output_corpus["name"] = name
+
+    # manage 1st corpus
+    for index in output_corpus["freq"]:
+        n = str(index)
+        for ngram in output_corpus["freq"][n]:
+            output_corpus["freq"][n][ngram] *= ratio[0]
+
+    # manage others
+    for index in range(1, len(output_corpus["freq"].keys()) +1):
+        n = str(index)
+        for corpus_index, corpus in enumerate(corpora[1:]):
+            print(corpus, corpus_index)
+            output_corpus["count"][n] += corpus["count"][n]
+
+            for ngram in corpus["freq"][n]:
+                if ngram not in output_corpus["freq"][n]:
+                    output_corpus["freq"][n][ngram] = 0
+                output_corpus["freq"][n][ngram] += corpus["freq"][n][ngram] * ratio[corpus_index]
+    return output_corpus
+
 
 
 if __name__ == "__main__":
     argl = len(argv) - 1  # number of files to merge
     if argl >= 2:
-        print(json.dumps(merge(argv[1:], argl), indent=4, ensure_ascii=False))
+        dir = Path(__file__).resolve().parent.parent
+        files = [Path(f) for f in argv[1:]]
+        corpora = read_corpora(files)
+        corpus = mix(corpora, name="mixed")
+        with open(f"{corpus["name"]}.json", "w", encoding="utf-8") as outfile:
+            json.dump(corpus, outfile, indent=4, ensure_ascii=False)
+        print(json.dumps(corpus, indent=4, ensure_ascii=False))

From d0593ed758ed17e7996a13ea9aec18b470375fc9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 26 Dec 2024 17:28:12 +0100
Subject: [PATCH 04/11] Refactor: `merge.read_corpora` dropping dict of corpus
 for a list instead

---
 bin/merge.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/bin/merge.py b/bin/merge.py
index 6f63817..9ad01c3 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -26,35 +26,31 @@ def sort_by_frequency(corpus: dict, precision=3):
 
 def read_corpora(filenames: list[Path]) -> list[dict]:
     """open a collection of corpus from path and dump its content in a dictionary"""
-    corpora_dict = {}
+    corpora = []
     for filename in filenames:
         try:
             with open(filename) as f:
                 corpus = json.load(f)
-                corpora_dict[corpus["name"]] = corpus
+                corpora.append(corpus)
         except:
             print(
                 f"Warning: cannot open the `{filename.stem}` corpus; skipping this file"
             )
             continue
 
-    if len(corpora_dict) < 2:
+    if len(corpora) < 2:
         print("Error: at least 2 corpuses are needed to merge, aborting")
         return []
 
     # removing corpus that do not have the same ngram lenght
-    ngram_length = len( # 1st corpus in corpora
-        next(iter(corpora_dict.values()))["freq"]
-    )
-    for key in corpora_dict.keys():
-        corpus = corpora_dict[key]
-        if len(corpus["freq"]) != ngram_length:
-            _name = corpus["name"]
-            corpora_dict.pop(_name)
-            print(f"Warning: removing {_name} from corpora because ngram length is different")
+    ngram_length = len( corpora[0]["freq"] )
+    removed_corpuses = [corpus["name"] for corpus in corpora if len(corpus["freq"]) != ngram_length]
+    corpora = [corpus for corpus in corpora if len(corpus["freq"]) == ngram_length]
+    for _name in removed_corpuses:
+        print(f"Warning: removing {_name} from corpora because ngram length is different")
     
-    if len(corpora_dict) >= 2:
-        return list(corpora_dict.values())
+    if len(corpora) >= 2:
+        return corpora
 
     print("Error: at least 2 corpuses are needed to merge, aborting")
     return []

From ac3e971ad0a829faebc3571f904f9d05257cbc67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 26 Dec 2024 17:47:54 +0100
Subject: [PATCH 05/11] Refactor: adding `merge.mergeable` to check if merge can
 be processed.

---
 bin/merge.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/bin/merge.py b/bin/merge.py
index 9ad01c3..732292b 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -37,10 +37,14 @@ def read_corpora(filenames: list[Path]) -> list[dict]:
                 f"Warning: cannot open the `{filename.stem}` corpus; skipping this file"
             )
             continue
+    return corpora
 
+def mergeable(corpora:list[dict]) -> bool:
+    """check if corpora cam be merge (n-gram of same length)"""
+    error_str = "Error: at least 2 corpuses are needed to merge, aborting"
     if len(corpora) < 2:
-        print("Error: at least 2 corpuses are needed to merge, aborting")
-        return []
+        print(error_str)
+        return False
 
     # removing corpus that do not have the same ngram lenght
     ngram_length = len( corpora[0]["freq"] )
@@ -50,10 +54,10 @@ def read_corpora(filenames: list[Path]) -> list[dict]:
         print(f"Warning: removing {_name} from corpora because ngram length is different")
     
     if len(corpora) >= 2:
-        return corpora
+        return True
 
-    print("Error: at least 2 corpuses are needed to merge, aborting")
-    return []
+    print(error_str)
+    return False
 
 def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict:
     """merge corpora of same n-gram length, optionally with a giver ratio"""
@@ -93,6 +97,9 @@ def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict:
         dir = Path(__file__).resolve().parent.parent
         files = [Path(f) for f in argv[1:]]
         corpora = read_corpora(files)
+        if not mergeable(corpora):
+            print("Error: cannot merge corpora, aborting")
+            exit()
         corpus = mix(corpora, name="mixed")
         with open(f"{corpus["name"]}.json", "w", encoding="utf-8") as outfile:
             json.dump(corpus, outfile, indent=4, ensure_ascii=False)

From 3832f644f16f2da7b0aa4cdb6917faf8ef2f92d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 26 Dec 2024 18:20:57 +0100
Subject: [PATCH 06/11] =?UTF-8?q?Refactor=E2=80=AF:=20updating=20`merge.mi?=
 =?UTF-8?q?x`=20to=20follow=20#9=20guidelines=20(no=20empty=20list=20argum?=
 =?UTF-8?q?ent)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 bin/merge.py | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/bin/merge.py b/bin/merge.py
index 732292b..6793aff 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -59,34 +59,31 @@ def mergeable(corpora:list[dict]) -> bool:
     print(error_str)
     return False
 
-def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict:
+def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict:
     """merge corpora of same n-gram length, optionally with a giver ratio"""
-    if ratio == []:
+    weights = weights or []
+    if weights == []:
         # merge with same weight by default
-        ratio = [ 1/len(corpora) ] * len(corpora)
-    elif round(sum(ratio),1) != 1:
+        weights = [ 1/len(corpora) ] * len(corpora)
+    elif round(sum(weights),1) != 1:
         print("Error: provided merge ratio do not add-up to 1; aborting merge")
 
-    output_corpus = corpora[0].copy()
-    output_corpus["name"] = name
+    ngram_length = range(1, len(corpora[0]["freq"].keys()) +1)
 
-    # manage 1st corpus
-    for index in output_corpus["freq"]:
-        n = str(index)
-        for ngram in output_corpus["freq"][n]:
-            output_corpus["freq"][n][ngram] *= ratio[0]
+    output_corpus = {
+        "name": name,
+        "freq": {str(n):{} for n in ngram_length},
+        "count": {str(n):0 for n in ngram_length},
+    }
 
-    # manage others
-    for index in range(1, len(output_corpus["freq"].keys()) +1):
+    for index in ngram_length:
         n = str(index)
-        for corpus_index, corpus in enumerate(corpora[1:]):
-            print(corpus, corpus_index)
+        for corpus_index, corpus in enumerate(corpora):
             output_corpus["count"][n] += corpus["count"][n]
-
             for ngram in corpus["freq"][n]:
                 if ngram not in output_corpus["freq"][n]:
                     output_corpus["freq"][n][ngram] = 0
-                output_corpus["freq"][n][ngram] += corpus["freq"][n][ngram] * ratio[corpus_index]
+                output_corpus["freq"][n][ngram] += corpus["freq"][n][ngram] * weights[corpus_index]
     return output_corpus
 
 

From 6e59e643154546d8317a1db143d3b9f76281afa0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 26 Dec 2024 18:23:10 +0100
Subject: [PATCH 07/11] Refactor: addressing #9 comment that it should be in
 another PR

---
 bin/chardict.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bin/chardict.py b/bin/chardict.py
index e293e1f..9b8cecf 100755
--- a/bin/chardict.py
+++ b/bin/chardict.py
@@ -7,8 +7,6 @@
 
 NGRAM_MAX_LENGTH = 4  # trigrams
 IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵"
-APP_NAME = "kalamine"
-APP_AUTHOR = "1dk"
 
 
 def parse_corpus(txt: str) -> dict:

From c16107001fdb0ea07d4fbf92ac57fb84f5aff24f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 26 Dec 2024 18:26:04 +0100
Subject: [PATCH 08/11] Refactor: `merge.sort_by_frequency` good practice to
 **not** change variable type

---
 bin/merge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/merge.py b/bin/merge.py
index 6793aff..94cb385 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -17,7 +17,7 @@ def _sort_ngram_by_frequency(table, precision=3):
 
 
 def sort_by_frequency(corpus: dict, precision=3):
-    for ngram in range(1, len(corpus["freq"].keys())+1):
+    for index in range(1, len(corpus["freq"].keys())+1):
         ngram = str(ngram)
         corpus["freq"][ngram] = _sort_ngram_by_frequency(
             corpus["freq"][ngram], precision

From f7e27e159d87030e7f49ea8ea44c78b9680c360c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 26 Dec 2024 18:39:34 +0100
Subject: [PATCH 09/11] =?UTF-8?q?Refactor:=20removing=20=E2=80=9Cname?=
 =?UTF-8?q?=E2=80=9D=20field=20from=20corpus=20json=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 bin/chardict.py |  5 +----
 bin/merge.py    | 12 ++++++------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/bin/chardict.py b/bin/chardict.py
index 9b8cecf..5ac50f1 100755
--- a/bin/chardict.py
+++ b/bin/chardict.py
@@ -75,13 +75,11 @@ def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict:
     return ngrams, ngrams_count
 
 
-def read_corpus(file: Path, name: str = "", encoding="utf-8") -> dict:
+def read_corpus(file: Path, encoding="utf-8") -> dict:
     """read a .txt file and provide a dictionary of n-grams"""
     try:
         if not file.is_file:
             raise Exception("Error, this is not a file")
-        if not name:
-            name = file.stem
         with file.open("r", encoding=encoding) as f:
             corpus_txt = "↵".join(f.readlines())
 
@@ -90,7 +88,6 @@ def read_corpus(file: Path, name: str = "", encoding="utf-8") -> dict:
 
     ngrams_freq, ngrams_count = parse_corpus(corpus_txt)
     return {
-        "name": name,
         "freq": ngrams_freq,
         "count": ngrams_count,
     }
diff --git a/bin/merge.py b/bin/merge.py
index 94cb385..596a80e 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -48,10 +48,10 @@ def mergeable(corpora:list[dict]) -> bool:
 
     # removing corpus that do not have the same ngram lenght
     ngram_length = len( corpora[0]["freq"] )
-    removed_corpuses = [corpus["name"] for corpus in corpora if len(corpus["freq"]) != ngram_length]
+    corpora_initial_length = len(corpora)
     corpora = [corpus for corpus in corpora if len(corpus["freq"]) == ngram_length]
-    for _name in removed_corpuses:
-        print(f"Warning: removing {_name} from corpora because ngram length is different")
+    if len(corpora) != corpora_initial_length:
+        print(f"Error: cannot merge because corpus file format is different; all corpuses do not have the same ngram length")
     
     if len(corpora) >= 2:
         return True
@@ -71,7 +71,6 @@ def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict:
     ngram_length = range(1, len(corpora[0]["freq"].keys()) +1)
 
     output_corpus = {
-        "name": name,
         "freq": {str(n):{} for n in ngram_length},
         "count": {str(n):0 for n in ngram_length},
     }
@@ -97,7 +96,8 @@ def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict:
         if not mergeable(corpora):
             print("Error: cannot merge corpora, aborting")
             exit()
-        corpus = mix(corpora, name="mixed")
-        with open(f"{corpus["name"]}.json", "w", encoding="utf-8") as outfile:
+        name = "mixed"
+        corpus = mix(corpora, name=name)
+        with open(f"{name}.json", "w", encoding="utf-8") as outfile:
             json.dump(corpus, outfile, indent=4, ensure_ascii=False)
         print(json.dumps(corpus, indent=4, ensure_ascii=False))

From 52be518995e1f1946caa675f00db653170de4d85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 26 Dec 2024 18:44:11 +0100
Subject: [PATCH 10/11] Chore: typo fix in comments

---
 bin/merge.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/merge.py b/bin/merge.py
index 596a80e..94afedf 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -46,7 +46,7 @@ def mergeable(corpora:list[dict]) -> bool:
         print(error_str)
         return False
 
-    # removing corpus that do not have the same ngram lenght
+    # removing corpus that do not have the same ngram length
     ngram_length = len( corpora[0]["freq"] )
     corpora_initial_length = len(corpora)
     corpora = [corpus for corpus in corpora if len(corpus["freq"]) == ngram_length]
@@ -60,7 +60,7 @@ def mergeable(corpora:list[dict]) -> bool:
     return False
 
 def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict:
-    """merge corpora of same n-gram length, optionally with a giver ratio"""
+    """merge corpora of same n-gram length, optionally with a given set of weight"""
     weights = weights or []
     if weights == []:
         # merge with same weight by default

From be99fc25ebde3cadaf1e317c477ba9a0f58d8d98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= <cedc.git.4xjor@8alias.com>
Date: Thu, 9 Jan 2025 17:55:17 +0100
Subject: [PATCH 11/11] Chore: Finish PR review and drop "name" field

---
 bin/merge.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/merge.py b/bin/merge.py
index 94afedf..3c466ee 100755
--- a/bin/merge.py
+++ b/bin/merge.py
@@ -59,7 +59,7 @@ def mergeable(corpora:list[dict]) -> bool:
     print(error_str)
     return False
 
-def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict:
+def mix(corpora:list[dict], weights:list[float]=None) -> dict:
     """merge corpora of same n-gram length, optionally with a given set of weight"""
     weights = weights or []
     if weights == []:
@@ -97,7 +97,7 @@ def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict:
             print("Error: cannot merge corpora, aborting")
             exit()
         name = "mixed"
-        corpus = mix(corpora, name=name)
+        corpus = mix(corpora)
         with open(f"{name}.json", "w", encoding="utf-8") as outfile:
             json.dump(corpus, outfile, indent=4, ensure_ascii=False)
         print(json.dumps(corpus, indent=4, ensure_ascii=False))