From cdc8b29e0d414d25f0b9de5bdb1a6d813b14981a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Wed, 18 Dec 2024 23:00:18 +0100 Subject: [PATCH 01/11] Feat: Improve `chardict.py` to add n-gram counts Warning: format change to count n-gram occurrences. Can go up to a larger n-gram length by changing the `NGRAM_MAX_LENGTH` constant. Switch from os module to pathlib to manage paths. Fixes #7 --- bin/chardict.py | 155 ++++++++++++++++++++++++++++++------------------ 1 file changed, 98 insertions(+), 57 deletions(-) diff --git a/bin/chardict.py b/bin/chardict.py index 5ec3e5d..2d82505 100755 --- a/bin/chardict.py +++ b/bin/chardict.py @@ -1,50 +1,69 @@ #!/usr/bin/env python3 -"""Turn corpus texts into dictionaries of symbols, bigrams and trigrams.""" +"""Turn corpus texts into dictionaries of n-grams.""" import json -from os import listdir, path +from pathlib import Path from sys import argv -IGNORED_CHARS = "1234567890 \t\r\n\ufeff" - - -def parse_corpus(file_path): - """Count symbols, bigrams and trigrams in a text file.""" - - symbols = {} - bigrams = {} - trigrams = {} - char_count = 0 - prev_symbol = None - prev_prev_symbol = None - - # get a dictionary of all symbols (letters, punctuation marks...) 
- file = open(file_path, "r", encoding="utf-8") - for char in file.read(): - symbol = char.lower() - if char not in IGNORED_CHARS: - char_count += 1 - if symbol not in symbols: - symbols[symbol] = 0 - symbols[symbol] += 1 - if prev_symbol is not None: - bigram = prev_symbol + symbol - if bigram not in bigrams: - bigrams[bigram] = 0 - bigrams[bigram] += 1 - if prev_prev_symbol is not None: - trigram = prev_prev_symbol + bigram - if trigram not in trigrams: - trigrams[trigram] = 0 - trigrams[trigram] += 1 - prev_prev_symbol = prev_symbol - prev_symbol = symbol - else: - prev_symbol = None - file.close() +NGRAM_MAX_LENGTH = 5 # Quadrigrams +IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵" +APP_NAME = "kalamine" +APP_AUTHOR = "1dk" + + +def parse_corpus(txt: str) -> dict: + """Count ngrams in a string. + retuns a dict of ngrams + ngrams[1]=symbols + ngrams[2]=bigrames + ngrams[3]=trigrams + etc., up to NGRAM_MAX_LENGTH + ngrams[2] is shaped as { "aa": count } + """ + + ngrams = {} + ngrams_count = {} # ngrams_count counts the total number of ngrams[i] in corpus. 
+ + txt = txt.lower() # we want to be case **in**sensitive + + for ngram in range(1, NGRAM_MAX_LENGTH): + ngrams[ngram] = {} + ngrams_count[ngram] = 0 + + def get_ngram(txt: str, ngram_start: int, ngram_length: int) -> str: + """get a ngram of a given length at given position in txt + returns empty string if ngram cannot be provided""" + if txt[ngram_start] in IGNORED_CHARS: + return "" + if ngram_length <= 0: + return "" + if ngram_start + ngram_length >= len(txt): + return "" + + ngram = txt[ngram_start : ngram_start + ngram_length] + + for n in ngram[1:]: # 1st char already tested + if n in IGNORED_CHARS: + return "" + + return ngram + + # get all n-grams + for ngram_start in range(len(txt)): + for ngram_length in range(NGRAM_MAX_LENGTH): + _ngram = get_ngram(txt, ngram_start, ngram_length) + + if not _ngram: # _ngram is "" + continue + + if _ngram not in ngrams[ngram_length]: + ngrams[ngram_length][_ngram] = 0 + + ngrams[ngram_length][_ngram] += 1 + ngrams_count[ngram_length] += 1 # sort the dictionary by symbol frequency (requires CPython 3.6+) - def sort_by_frequency(table, precision=3): + def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict: sorted_dict = {} for key, count in sorted(table.items(), key=lambda x: -x[1]): freq = round(100 * count / char_count, precision) @@ -52,27 +71,49 @@ def sort_by_frequency(table, precision=3): sorted_dict[key] = freq return sorted_dict - results = {} - results["corpus"] = file_path - results["symbols"] = sort_by_frequency(symbols) - results["bigrams"] = sort_by_frequency(bigrams, 4) - results["trigrams"] = sort_by_frequency(trigrams) - return results + for ngram in range(1, NGRAM_MAX_LENGTH): + ngrams[ngram] = sort_by_frequency(ngrams[ngram], ngrams_count[ngram], 4) + + return ngrams, ngrams_count + + +def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict: + """read a .txt file and provide a dictionary of n-grams""" + try: + path = Path(file_path) + if not path.is_file: + 
raise Exception("Error, this is not a file") + if not name: + name = path.stem + with path.open("r", encoding=encoding) as file: + corpus_txt = "↵".join(file.readlines()) + + except Exception as e: + print(f"file does not exist or could not be read.\n {e}") + + ngrams_freq, ngrams_count = parse_corpus(corpus_txt) + return { + "name": name, + # "text": corpus_txt, + "freq": ngrams_freq, + "count": ngrams_count, + } if __name__ == "__main__": if len(argv) == 2: # convert one file - data = parse_corpus(argv[1]) + file_path = Path(argv[1]) + data = read_corpus(str(file_path)) + output_file_path = file_path.parent / f"{file_path.stem}.json" + with open(output_file_path, "w", encoding="utf-8") as outfile: + json.dump(data, outfile, indent=4, ensure_ascii=False) print(json.dumps(data, indent=4, ensure_ascii=False)) + else: # converts all *.txt files in the script directory - bin_dir = path.dirname(__file__) - destdir = path.join(bin_dir, "..", "txt") - txtdir = path.join(bin_dir, "..", "txt") - for filename in listdir(txtdir): - if filename.endswith(".txt"): - basename = filename[:-4] - print(f"... 
{basename}") - data = parse_corpus(path.join(txtdir, filename)) - destfile = path.join(destdir, basename + ".json") - with open(destfile, "w", encoding="utf-8") as outfile: + curent_path = Path(__file__).resolve().parent + for file in curent_path.glob("*.txt"): + if file.is_file(): + data = read_corpus(str(file)) + output_file_path = file.parent / f"{file.stem}.json" + with open(output_file_path, "w", encoding="utf-8") as outfile: json.dump(data, outfile, indent=4, ensure_ascii=False) From f69cf2a8b8e653cc44783c5c9ff09b5598b1f346 Mon Sep 17 00:00:00 2001 From: Fabien Cazenave Date: Sun, 22 Dec 2024 17:04:35 +0100 Subject: [PATCH 02/11] minor fixes --- Makefile | 1 + bin/chardict.py | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index ef7e6c5..5abf447 100644 --- a/Makefile +++ b/Makefile @@ -15,6 +15,7 @@ json: @mkdir -p json @echo "Creating JSON dicts..." @bin/chardict.py + @echo "Merging JSON dicts..." @echo "... de_modern" @bin/merge.py txt/deu_*.json > json/de_modern.json @echo "... 
en_modern" diff --git a/bin/chardict.py b/bin/chardict.py index 2d82505..e293e1f 100755 --- a/bin/chardict.py +++ b/bin/chardict.py @@ -5,7 +5,7 @@ from pathlib import Path from sys import argv -NGRAM_MAX_LENGTH = 5 # Quadrigrams +NGRAM_MAX_LENGTH = 4 # trigrams IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵" APP_NAME = "kalamine" APP_AUTHOR = "1dk" @@ -77,16 +77,15 @@ def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict: return ngrams, ngrams_count -def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict: +def read_corpus(file: Path, name: str = "", encoding="utf-8") -> dict: """read a .txt file and provide a dictionary of n-grams""" try: - path = Path(file_path) - if not path.is_file: + if not file.is_file: raise Exception("Error, this is not a file") if not name: - name = path.stem - with path.open("r", encoding=encoding) as file: - corpus_txt = "↵".join(file.readlines()) + name = file.stem + with file.open("r", encoding=encoding) as f: + corpus_txt = "↵".join(f.readlines()) except Exception as e: print(f"file does not exist or could not be read.\n {e}") @@ -94,7 +93,6 @@ def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict: ngrams_freq, ngrams_count = parse_corpus(corpus_txt) return { "name": name, - # "text": corpus_txt, "freq": ngrams_freq, "count": ngrams_count, } @@ -102,18 +100,19 @@ def read_corpus(file_path: str, name: str = "", encoding="utf-8") -> dict: if __name__ == "__main__": if len(argv) == 2: # convert one file - file_path = Path(argv[1]) - data = read_corpus(str(file_path)) - output_file_path = file_path.parent / f"{file_path.stem}.json" + file = Path(argv[1]) + data = read_corpus(file) + output_file_path = file.parent / f"{file.stem}.json" with open(output_file_path, "w", encoding="utf-8") as outfile: json.dump(data, outfile, indent=4, ensure_ascii=False) print(json.dumps(data, indent=4, ensure_ascii=False)) else: # converts all *.txt files in the script directory - curent_path = 
Path(__file__).resolve().parent - for file in curent_path.glob("*.txt"): + txt_dir = Path(__file__).resolve().parent.parent / "txt" + for file in sorted(txt_dir.glob("*.txt")): if file.is_file(): - data = read_corpus(str(file)) - output_file_path = file.parent / f"{file.stem}.json" + print(f"... {file.stem}") + data = read_corpus(file) + output_file_path = txt_dir / f"{file.stem}.json" with open(output_file_path, "w", encoding="utf-8") as outfile: json.dump(data, outfile, indent=4, ensure_ascii=False) From d88560ef56a828fa12a685107a6414c16a3318df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Tue, 24 Dec 2024 11:54:22 +0100 Subject: [PATCH 03/11] Refactor: Following #9 advice, cleaned up the code to be simpler and only implement current merge behaviour. --- bin/merge.py | 120 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 31 deletions(-) diff --git a/bin/merge.py b/bin/merge.py index 5efe94b..6f63817 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -3,43 +3,101 @@ import json from sys import argv +from pathlib import Path -def merge(filenames, filecount): - merged = { - "symbols": {}, - "bigrams": {}, - "trigrams": {}, - } +# sort the merged dictionary by symbol frequency (requires CPython 3.6+) +def _sort_ngram_by_frequency(table, precision=3): + sorted_dict = {} + for key, count in sorted(table.items(), key=lambda x: -x[1]): + freq = round(count, precision) + if freq > 0: + sorted_dict[key] = freq + return sorted_dict - # merge dictionaries + +def sort_by_frequency(corpus: dict, precision=3): + for ngram in range(1, len(corpus["freq"].keys())+1): + ngram = str(ngram) + corpus["freq"][ngram] = _sort_ngram_by_frequency( + corpus["freq"][ngram], precision + ) + return corpus + +def read_corpora(filenames: list[Path]) -> list[dict]: + """open a collection of corpus from path and dump its content in a dictionary""" + corpora_dict = {} for filename in filenames: - with open(filename, "r") as corpus: - data = 
json.load(corpus) - for section in merged.keys(): - for key, count in data[section].items(): - if key not in merged[section]: - merged[section][key] = 0.0 - merged[section][key] += count / filecount - - # sort the merged dictionary by symbol frequency (requires CPython 3.6+) - def sort_by_frequency(table, precision=2): - sorted_dict = {} - for key, count in sorted(table.items(), key=lambda x: -x[1]): - freq = round(count, precision) - if freq > 0: - sorted_dict[key] = freq - return sorted_dict - - results = {} - results["corpus"] = "" - results["symbols"] = sort_by_frequency(merged["symbols"]) - results["bigrams"] = sort_by_frequency(merged["bigrams"], 4) - results["trigrams"] = sort_by_frequency(merged["trigrams"]) - return results + try: + with open(filename) as f: + corpus = json.load(f) + corpora_dict[corpus["name"]] = corpus + except: + print( + f"Warning: cannot open the `{filename.stem}` corpus; skipping this file" + ) + continue + + if len(corpora_dict) < 2: + print("Error: at least 2 corpuses are needed to merge, aborting") + return [] + + # removing corpus that do not have the same ngram lenght + ngram_length = len( # 1st corpus in corpora + next(iter(corpora_dict.values()))["freq"] + ) + for key in corpora_dict.keys(): + corpus = corpora_dict[key] + if len(corpus["freq"]) != ngram_length: + _name = corpus["name"] + corpora_dict.pop(_name) + print(f"Warning: removing {_name} from corpora because ngram length is different") + + if len(corpora_dict) >= 2: + return list(corpora_dict.values()) + + print("Error: at least 2 corpuses are needed to merge, aborting") + return [] + +def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict: + """merge corpora of same n-gram length, optionally with a giver ratio""" + if ratio == []: + # merge with same weight by default + ratio = [ 1/len(corpora) ] * len(corpora) + elif round(sum(ratio),1) != 1: + print("Error: provided merge ratio do not add-up to 1; aborting merge") + + output_corpus = 
corpora[0].copy() + output_corpus["name"] = name + + # manage 1st corpus + for index in output_corpus["freq"]: + n = str(index) + for ngram in output_corpus["freq"][n]: + output_corpus["freq"][n][ngram] *= ratio[0] + + # manage others + for index in range(1, len(output_corpus["freq"].keys()) +1): + n = str(index) + for corpus_index, corpus in enumerate(corpora[1:]): + print(corpus, corpus_index) + output_corpus["count"][n] += corpus["count"][n] + + for ngram in corpus["freq"][n]: + if ngram not in output_corpus["freq"][n]: + output_corpus["freq"][n][ngram] = 0 + output_corpus["freq"][n][ngram] += corpus["freq"][n][ngram] * ratio[corpus_index] + return output_corpus + if __name__ == "__main__": argl = len(argv) - 1 # number of files to merge if argl >= 2: - print(json.dumps(merge(argv[1:], argl), indent=4, ensure_ascii=False)) + dir = Path(__file__).resolve().parent.parent + files = [Path(f) for f in argv[1:]] + corpora = read_corpora(files) + corpus = mix(corpora, name="mixed") + with open(f"{corpus["name"]}.json", "w", encoding="utf-8") as outfile: + json.dump(corpus, outfile, indent=4, ensure_ascii=False) + print(json.dumps(corpus, indent=4, ensure_ascii=False)) From d0593ed758ed17e7996a13ea9aec18b470375fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 26 Dec 2024 17:28:12 +0100 Subject: [PATCH 04/11] Refactor: `merge.read_corpora` dropping dict of corpus for a list instead --- bin/merge.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/bin/merge.py b/bin/merge.py index 6f63817..9ad01c3 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -26,35 +26,31 @@ def sort_by_frequency(corpus: dict, precision=3): def read_corpora(filenames: list[Path]) -> list[dict]: """open a collection of corpus from path and dump its content in a dictionary""" - corpora_dict = {} + corpora = [] for filename in filenames: try: with open(filename) as f: corpus = json.load(f) - corpora_dict[corpus["name"]] = corpus + 
corpora.append(corpus) except: print( f"Warning: cannot open the `{filename.stem}` corpus; skipping this file" ) continue - if len(corpora_dict) < 2: + if len(corpora) < 2: print("Error: at least 2 corpuses are needed to merge, aborting") return [] # removing corpus that do not have the same ngram lenght - ngram_length = len( # 1st corpus in corpora - next(iter(corpora_dict.values()))["freq"] - ) - for key in corpora_dict.keys(): - corpus = corpora_dict[key] - if len(corpus["freq"]) != ngram_length: - _name = corpus["name"] - corpora_dict.pop(_name) - print(f"Warning: removing {_name} from corpora because ngram length is different") + ngram_length = len( corpora[0]["freq"] ) + removed_corpuses = [corpus["name"] for corpus in corpora if len(corpus["freq"]) != ngram_length] + corpora = [corpus for corpus in corpora if len(corpus["freq"]) == ngram_length] + for _name in removed_corpuses: + print(f"Warning: removing {_name} from corpora because ngram length is different") - if len(corpora_dict) >= 2: - return list(corpora_dict.values()) + if len(corpora) >= 2: + return corpora print("Error: at least 2 corpuses are needed to merge, aborting") return [] From ac3e971ad0a829faebc3571f904f9d05257cbc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 26 Dec 2024 17:47:54 +0100 Subject: [PATCH 05/11] Refactor: adding `merge.mergeable` to check if merge can be processed. 
--- bin/merge.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/bin/merge.py b/bin/merge.py index 9ad01c3..732292b 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -37,10 +37,14 @@ def read_corpora(filenames: list[Path]) -> list[dict]: f"Warning: cannot open the `{filename.stem}` corpus; skipping this file" ) continue + return corpora +def mergeable(corpora:list[dict]) -> bool: + """check if corpora cam be merge (n-gram of same length)""" + error_str = "Error: at least 2 corpuses are needed to merge, aborting" if len(corpora) < 2: - print("Error: at least 2 corpuses are needed to merge, aborting") - return [] + print(error_str) + return False # removing corpus that do not have the same ngram lenght ngram_length = len( corpora[0]["freq"] ) @@ -50,10 +54,10 @@ def read_corpora(filenames: list[Path]) -> list[dict]: print(f"Warning: removing {_name} from corpora because ngram length is different") if len(corpora) >= 2: - return corpora + return True - print("Error: at least 2 corpuses are needed to merge, aborting") - return [] + print(error_str) + return False def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict: """merge corpora of same n-gram length, optionally with a giver ratio""" @@ -93,6 +97,9 @@ def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict: dir = Path(__file__).resolve().parent.parent files = [Path(f) for f in argv[1:]] corpora = read_corpora(files) + if not mergeable(corpora): + print("Error: cannot merge corpora, aborting") + exit() corpus = mix(corpora, name="mixed") with open(f"{corpus["name"]}.json", "w", encoding="utf-8") as outfile: json.dump(corpus, outfile, indent=4, ensure_ascii=False) From 3832f644f16f2da7b0aa4cdb6917faf8ef2f92d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 26 Dec 2024 18:20:57 +0100 Subject: [PATCH 06/11] =?UTF-8?q?Refactor=E2=80=AF:=20updating=20`merge.mi?= 
=?UTF-8?q?x`=20to=20follow=20#9=20guidelines=20(no=20empty=20list=20argum?= =?UTF-8?q?ent)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/merge.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/bin/merge.py b/bin/merge.py index 732292b..6793aff 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -59,34 +59,31 @@ def mergeable(corpora:list[dict]) -> bool: print(error_str) return False -def mix(corpora:list[dict], name:str="mixed", ratio:list[float]=[]) -> dict: +def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict: """merge corpora of same n-gram length, optionally with a giver ratio""" - if ratio == []: + weights = weights or [] + if weights == []: # merge with same weight by default - ratio = [ 1/len(corpora) ] * len(corpora) - elif round(sum(ratio),1) != 1: + weights = [ 1/len(corpora) ] * len(corpora) + elif round(sum(weights),1) != 1: print("Error: provided merge ratio do not add-up to 1; aborting merge") - output_corpus = corpora[0].copy() - output_corpus["name"] = name + ngram_length = range(1, len(corpora[0]["freq"].keys()) +1) - # manage 1st corpus - for index in output_corpus["freq"]: - n = str(index) - for ngram in output_corpus["freq"][n]: - output_corpus["freq"][n][ngram] *= ratio[0] + output_corpus = { + "name": name, + "freq": {str(n):{} for n in ngram_length}, + "count": {str(n):0 for n in ngram_length}, + } - # manage others - for index in range(1, len(output_corpus["freq"].keys()) +1): + for index in ngram_length: n = str(index) - for corpus_index, corpus in enumerate(corpora[1:]): - print(corpus, corpus_index) + for corpus_index, corpus in enumerate(corpora): output_corpus["count"][n] += corpus["count"][n] - for ngram in corpus["freq"][n]: if ngram not in output_corpus["freq"][n]: output_corpus["freq"][n][ngram] = 0 - output_corpus["freq"][n][ngram] += corpus["freq"][n][ngram] * ratio[corpus_index] + 
output_corpus["freq"][n][ngram] += corpus["freq"][n][ngram] * weights[corpus_index] return output_corpus From 6e59e643154546d8317a1db143d3b9f76281afa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 26 Dec 2024 18:23:10 +0100 Subject: [PATCH 07/11] Refactor: addressing #9 comment that it should be in another PR --- bin/chardict.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/chardict.py b/bin/chardict.py index e293e1f..9b8cecf 100755 --- a/bin/chardict.py +++ b/bin/chardict.py @@ -7,8 +7,6 @@ NGRAM_MAX_LENGTH = 4 # trigrams IGNORED_CHARS = "1234567890 \t\r\n\ufeff↵" -APP_NAME = "kalamine" -APP_AUTHOR = "1dk" def parse_corpus(txt: str) -> dict: From c16107001fdb0ea07d4fbf92ac57fb84f5aff24f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 26 Dec 2024 18:26:04 +0100 Subject: [PATCH 08/11] Refactor: `merge.sort_by_frequency` good practice to **not** change variable type --- bin/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/merge.py b/bin/merge.py index 6793aff..94cb385 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -17,7 +17,7 @@ def _sort_ngram_by_frequency(table, precision=3): def sort_by_frequency(corpus: dict, precision=3): - for ngram in range(1, len(corpus["freq"].keys())+1): + for index in range(1, len(corpus["freq"].keys())+1): ngram = str(ngram) corpus["freq"][ngram] = _sort_ngram_by_frequency( corpus["freq"][ngram], precision From f7e27e159d87030e7f49ea8ea44c78b9680c360c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 26 Dec 2024 18:39:34 +0100 Subject: [PATCH 09/11] =?UTF-8?q?Refactor:=20removing=20=E2=80=9Cname?= =?UTF-8?q?=E2=80=9D=20field=20from=20corpus=20json=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bin/chardict.py | 5 +---- bin/merge.py | 12 ++++++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/bin/chardict.py b/bin/chardict.py index 9b8cecf..5ac50f1 
100755 --- a/bin/chardict.py +++ b/bin/chardict.py @@ -75,13 +75,11 @@ def sort_by_frequency(table: dict, char_count: int, precision: int = 3) -> dict: return ngrams, ngrams_count -def read_corpus(file: Path, name: str = "", encoding="utf-8") -> dict: +def read_corpus(file: Path, encoding="utf-8") -> dict: """read a .txt file and provide a dictionary of n-grams""" try: if not file.is_file: raise Exception("Error, this is not a file") - if not name: - name = file.stem with file.open("r", encoding=encoding) as f: corpus_txt = "↵".join(f.readlines()) @@ -90,7 +88,6 @@ def read_corpus(file: Path, name: str = "", encoding="utf-8") -> dict: ngrams_freq, ngrams_count = parse_corpus(corpus_txt) return { - "name": name, "freq": ngrams_freq, "count": ngrams_count, } diff --git a/bin/merge.py b/bin/merge.py index 94cb385..596a80e 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -48,10 +48,10 @@ def mergeable(corpora:list[dict]) -> bool: # removing corpus that do not have the same ngram lenght ngram_length = len( corpora[0]["freq"] ) - removed_corpuses = [corpus["name"] for corpus in corpora if len(corpus["freq"]) != ngram_length] + corpora_initial_length = len(corpora) corpora = [corpus for corpus in corpora if len(corpus["freq"]) == ngram_length] - for _name in removed_corpuses: - print(f"Warning: removing {_name} from corpora because ngram length is different") + if len(corpora) != corpora_initial_length: + print(f"Error: cannot merge because corpus file format is different; all corpuses do not have the same ngram length") if len(corpora) >= 2: return True @@ -71,7 +71,6 @@ def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict: ngram_length = range(1, len(corpora[0]["freq"].keys()) +1) output_corpus = { - "name": name, "freq": {str(n):{} for n in ngram_length}, "count": {str(n):0 for n in ngram_length}, } @@ -97,7 +96,8 @@ def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict: if not mergeable(corpora): print("Error: cannot 
merge corpora, aborting") exit() - corpus = mix(corpora, name="mixed") - with open(f"{corpus["name"]}.json", "w", encoding="utf-8") as outfile: + name = "mixed" + corpus = mix(corpora, name=name) + with open(f"{name}.json", "w", encoding="utf-8") as outfile: json.dump(corpus, outfile, indent=4, ensure_ascii=False) print(json.dumps(corpus, indent=4, ensure_ascii=False)) From 52be518995e1f1946caa675f00db653170de4d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 26 Dec 2024 18:44:11 +0100 Subject: [PATCH 10/11] Chore: typo fix in comments --- bin/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/merge.py b/bin/merge.py index 596a80e..94afedf 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -46,7 +46,7 @@ def mergeable(corpora:list[dict]) -> bool: print(error_str) return False - # removing corpus that do not have the same ngram lenght + # removing corpus that do not have the same ngram length ngram_length = len( corpora[0]["freq"] ) corpora_initial_length = len(corpora) corpora = [corpus for corpus in corpora if len(corpus["freq"]) == ngram_length] @@ -60,7 +60,7 @@ def mergeable(corpora:list[dict]) -> bool: return False def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict: - """merge corpora of same n-gram length, optionally with a giver ratio""" + """merge corpora of same n-gram length, optionally with a given set of weight""" weights = weights or [] if weights == []: # merge with same weight by default From be99fc25ebde3cadaf1e317c477ba9a0f58d8d98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A8d=E2=80=99C?= Date: Thu, 9 Jan 2025 17:55:17 +0100 Subject: [PATCH 11/11] Chore: Finish PR review and drop "name" field --- bin/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/merge.py b/bin/merge.py index 94afedf..3c466ee 100755 --- a/bin/merge.py +++ b/bin/merge.py @@ -59,7 +59,7 @@ def mergeable(corpora:list[dict]) -> bool: print(error_str) return False 
-def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict: +def mix(corpora:list[dict], weights:list[float]=None) -> dict: """merge corpora of same n-gram length, optionally with a given set of weight""" weights = weights or [] if weights == []: @@ -97,7 +97,7 @@ def mix(corpora:list[dict], name:str="mixed", weights:list[float]=None) -> dict: print("Error: cannot merge corpora, aborting") exit() name = "mixed" - corpus = mix(corpora, name=name) + corpus = mix(corpora) with open(f"{name}.json", "w", encoding="utf-8") as outfile: json.dump(corpus, outfile, indent=4, ensure_ascii=False) print(json.dumps(corpus, indent=4, ensure_ascii=False))