# VMSembedding/Makefile (CoderXYZ7/VMSembedding, branch main)

# Every target in this file is a command name, not a file on disk, so all of
# them are declared phony. The list is wrapped with backslash continuations
# purely for readability; Make joins the lines back into one prerequisite list.
.PHONY: \
	all parse embed reduce reduce-tsne reduce-both reduce-3d reduce-ft \
	visualize visualize-3d dash folio \
	analyze bigrams similarity section-vocab vocab-drift nn-graph \
	embed-ft word-families pmi function-words entropy-scatter report \
	analogy char-ngrams line-structure positional-bigrams cooccurrence \
	hapax word-length hmm word-transition cluster-purity context-profile \
	folio-drift morpheme section-distance word-fingerprint word-roles \
	phonotactics line-entropy line-clusters lm-perplexity embed-stability \
	cross-section folio-richness line-similarity paradigm-finder \
	semantic-fields entropy-rate changepoint topic-model \
	char-position-entropy sif-embeddings word-sequence-model \
	word-burstiness affix-entropy folio-similarity-matrix zipf-analysis \
	prefix-suffix-matrix line-position-words cv-skeleton \
	word-network-centrality ppmi-vectors context-asymmetry surprisal-map \
	portal clean help

# Default goal (first rule in the file). `make help` describes it as
# parse -> embed -> UMAP -> visualize (static HTML). It has no recipe of its
# own; the work is done by its prerequisites. Because every target is phony,
# each prerequisite re-runs on every invocation.
all: parse embed reduce visualize

# Root of the pipeline: no prerequisites, runs src/core/parse.py.
parse:
	python3 src/core/parse.py

# Runs src/core/embed.py after parse.
embed: parse
	python3 src/core/embed.py

# Runs src/core/reduce.py with no flags (the help text for `all` calls this
# step UMAP).
reduce: embed
	python3 src/core/reduce.py

# Same script as `reduce`, invoked with --tsne.
reduce-tsne: embed
	python3 src/core/reduce.py --tsne

# Same script, invoked with --both; per `make help` this runs both the UMAP
# and the t-SNE reductions.
reduce-both: embed
	python3 src/core/reduce.py --both

# Runs src/viz/visualize.py after reduce (static HTML, per the help text for `all`).
visualize: reduce
	python3 src/viz/visualize.py

# Interactive Dash app; `make help` gives its address as http://127.0.0.1:8050.
dash: reduce
	python3 src/viz/visualize_dash.py

# Descriptions below are taken from this file's `make help` output.

# Folio-level embedding visualization.
folio: embed
	python3 src/core/folio_embed.py

# EVA prefix/suffix pattern analysis.
analyze: embed
	python3 src/analysis/analyze.py

# Bigram frequency + heatmap + network; only needs parse.
bigrams: parse
	python3 src/analysis/bigrams.py

# Pairwise cosine similarity heatmap (top-N words).
similarity: embed
	python3 src/analysis/similarity_matrix.py

# TF-IDF-style distinctive vocabulary per section; only needs parse.
section-vocab: parse
	python3 src/analysis/section_vocab.py

# Word frequency drift across folio windows; only needs parse.
vocab-drift: parse
	python3 src/analysis/vocab_drift.py

# K-nearest-neighbour graph in UMAP space; depends on reduce rather than embed.
nn-graph: reduce
	python3 src/analysis/nn_graph.py

# Descriptions below are taken from this file's `make help` output.

# FastText character-ngram embeddings; a second embedding branch that hangs
# off parse, independent of `embed`.
embed-ft: parse
	python3 src/core/embed_fasttext.py

# UMAP reduction of the FastText embeddings (reduce.py with --ft).
reduce-ft: embed-ft
	python3 src/core/reduce.py --ft

# Morphological word families via FastText clustering.
word-families: embed-ft
	python3 src/analysis/word_families.py

# Pointwise Mutual Information bigram analysis.
pmi: parse
	python3 src/analysis/pmi.py

# Initial-fraction vs entropy scatter (function-word identification).
# `parse` is listed explicitly even though word-families already reaches it
# through embed-ft.
function-words: parse word-families
	python3 src/analysis/function_words.py

# Prev vs next entropy asymmetry (directional).
entropy-scatter: embed
	python3 src/analysis/entropy_scatter.py

# HTML summary dashboard linking all outputs. Only embed, word-families and
# pmi are declared as prerequisites here.
report: embed word-families pmi
	python3 src/viz/report.py

# Searchable HTML portal (index.html) linking all analyses; runs after report.
portal: report
	python3 src/viz/portal.py

# Morphological offset coherence (w2v vs FastText). Only embed-ft is declared
# as a prerequisite; `embed` is not.
analogy: embed-ft
	python3 src/cli/analogy_discover.py

# Descriptions below are taken from this file's `make help` output.

# EVA character n-gram analysis + transition heatmap.
char-ngrams: parse
	python3 src/analysis/char_ngrams.py

# First/last word + line-length + template analysis. Pulls in the FastText
# branch through word-families.
line-structure: parse word-families
	python3 src/analysis/line_structure.py

# G-squared enrichment of bigrams by line zone.
positional-bigrams: parse
	python3 src/analysis/positional_bigrams.py

# Word co-occurrence network + community detection.
cooccurrence: parse
	python3 src/analysis/cooccurrence_network.py

# Zipf / Heap / hapax legomena frequency analysis.
hapax: parse
	python3 src/analysis/hapax_analysis.py

# Word-length by section / position / folio trend.
word-length: parse
	python3 src/analysis/word_length_profile.py

# Unsupervised HMM (help text: K=6 latent POS-like states).
hmm: embed
	python3 src/analysis/hidden_markov.py

# Descriptions below are taken from this file's `make help` output.

# UMAP 3D reduction of word2vec embeddings (reduce.py with --3d).
reduce-3d: embed
	python3 src/core/reduce.py --3d

# Interactive 3D scatter (section/freq/prefix/HMM colouring per help text).
# NOTE(review): only reduce-3d is a prerequisite; if the HMM colouring needs
# the `hmm` target's output, that dependency is not declared here - confirm.
visualize-3d: reduce-3d
	python3 src/viz/visualize_3d.py

# Directed word transition probability network.
word-transition: embed
	python3 src/analysis/word_transition.py

# ARI/NMI of KMeans vs section/HMM/prefix/length; runs after both embed and hmm.
cluster-purity: embed hmm
	python3 src/analysis/cluster_purity.py

# Left/right context probability heatmap.
context-profile: parse
	python3 src/analysis/context_profile.py

# Folio semantic trajectory (PCA of mean embeddings).
folio-drift: embed
	python3 src/analysis/folio_centroid_drift.py

# EVA morpheme candidates via char n-gram segmentation.
morpheme: parse
	python3 src/analysis/morpheme_inventory.py

# Pairwise section distance matrices (6 metrics).
section-distance: embed word-families
	python3 src/analysis/section_distance.py

# Multi-signal fingerprint card for one word. Usage: make word-fingerprint word=<word>
# $(word) is a Make variable expected from the command line; it is expanded
# unquoted, so an unset value means the script receives no argument at all.
# NOTE(review): what the script does with no argument is not visible here.
word-fingerprint: embed word-families pmi
	python3 src/analysis/word_fingerprint.py $(word)

# Descriptions below are taken from this file's `make help` output.

# GMM functional role clustering (BIC-selected K).
word-roles: embed
	python3 src/analysis/word_roles.py

# EVA phonotactic patterns: positional bias, trigram log-odds, CV shapes.
phonotactics: parse
	python3 src/analysis/phonotactics.py

# Line-slot entropy, adjacent Jaccard, repetition rate, first-word PMI.
line-entropy: parse
	python3 src/analysis/line_entropy.py

# Line-embedding UMAP + KMeans (K=7) section recovery.
line-clusters: embed
	python3 src/analysis/line_clusters.py

# Kneser-Ney bigram LM perplexity scoring per line.
lm-perplexity: parse
	python3 src/analysis/lm_perplexity.py

# Bootstrap embedding stability (15 models, K=10 neighbors).
embed-stability: embed
	python3 src/analysis/embed_stability.py

# Chi-squared section exclusivity, TF-IDF, overlap, border words.
cross-section: embed
	python3 src/analysis/cross_section.py

# Per-folio TTR, MSTTR, hapax rate, entropy, Yule's K.
folio-richness: parse
	python3 src/analysis/folio_richness.py

# Near-duplicate and exact-copy line detection. Depends on line-clusters
# rather than directly on embed or parse.
line-similarity: line-clusters
	python3 src/analysis/line_similarity.py

# Suffix/prefix paradigm detection via embedding offsets.
paradigm-finder: embed
	python3 src/analysis/paradigm_finder.py

# Hierarchical dendrogram of top-200 words (Ward/cosine).
semantic-fields: embed
	python3 src/analysis/semantic_fields.py

# Block entropy H(n) and entropy rate estimation.
entropy-rate: parse
	python3 src/analysis/entropy_rate.py

# Folio embedding change-point detection (MMD + CUSUM).
changepoint: embed
	python3 src/analysis/changepoint.py

# The targets from here to surprisal-map have no entry in `make help`. Each one
# runs a single script under src/analysis/ after its listed prerequisite:
# `parse` for the targets that declare only parse, `embed` for the rest.

# Lists both parse and embed, although embed already depends on parse.
topic-model: parse embed
	python3 src/analysis/topic_model.py

char-position-entropy: parse
	python3 src/analysis/char_position_entropy.py

sif-embeddings: embed
	python3 src/analysis/sif_embeddings.py

word-sequence-model: parse
	python3 src/analysis/word_sequence_model.py

word-burstiness: parse
	python3 src/analysis/word_burstiness.py

affix-entropy: parse
	python3 src/analysis/affix_entropy.py

folio-similarity-matrix: embed
	python3 src/analysis/folio_similarity_matrix.py

zipf-analysis: parse
	python3 src/analysis/zipf_analysis.py

prefix-suffix-matrix: parse
	python3 src/analysis/prefix_suffix_matrix.py

line-position-words: embed
	python3 src/analysis/line_position_words.py

cv-skeleton: parse
	python3 src/analysis/cv_skeleton.py

word-network-centrality: parse
	python3 src/analysis/word_network_centrality.py

ppmi-vectors: embed
	python3 src/analysis/ppmi_vectors.py

context-asymmetry: embed
	python3 src/analysis/context_asymmetry.py

surprisal-map: parse
	python3 src/analysis/surprisal_map.py

# Removes the data/ directory and the HTML files named below from the current
# directory (`make help`: "remove all generated data/ and HTML files").
# rm -rf means entries that do not exist are silently skipped.
# The list is maintained by hand: a target that writes a new file must have
# that file added here.
# NOTE(review): this Makefile does not show which file each script writes, so
# the list cannot be checked for completeness from here.
clean:
	rm -rf data/ index.html voynich_embeddings.html folio_embeddings.html \
	       bigrams_heatmap.html bigrams_network.html \
	       similarity_matrix.html section_vocab.html \
	       vocab_drift.html nn_graph.html \
	       word_families.html pmi_heatmap.html \
	       function_words.html entropy_scatter.html report.html \
	       char_ngrams.html line_structure.html \
	       positional_bigrams.html cooccurrence_network.html \
	       hapax_analysis.html word_length_profile.html hmm_states.html \
	       voynich_3d.html word_transition.html \
	       cluster_purity.html context_profile.html \
	       folio_drift.html morpheme_inventory.html \
	       section_distance.html word_roles.html phonotactics.html \
	       line_entropy.html line_clusters.html lm_perplexity.html \
	       embed_stability.html cross_section.html \
	       folio_richness.html line_similarity.html \
	       paradigm_finder.html semantic_fields.html \
	       entropy_rate.html changepoint.html \
	       topic_model.html char_position_entropy.html \
	       sif_embeddings.html word_sequence_model.html \
	       word_burstiness.html affix_entropy.html \
	       folio_similarity_matrix.html zipf_analysis.html \
	       prefix_suffix_matrix.html line_position_words.html \
	       cv_skeleton.html word_network_centrality.html \
	       ppmi_vectors.html context_asymmetry.html \
	       surprisal_map.html

# Prints a usage summary: one line per make target, then the standalone CLI
# tools. Every line is @-prefixed so only the text is shown, not the echo
# command itself.
#
# Sixteen targets defined in this file previously had no line here
# (reduce-tsne and topic-model through surprisal-map); they are now listed.
# NOTE(review): the descriptions of the newly listed analysis targets only name
# the script each one runs - replace them with a proper one-line summary.
help:
	@echo "Voynich Embedding Pipeline"
	@echo ""
	@echo "  make all          parse → embed → UMAP → visualize (static HTML)"
	@echo "  make reduce-tsne  t-SNE reduction (src/core/reduce.py --tsne)"
	@echo "  make reduce-both  run both UMAP and t-SNE reductions"
	@echo "  make dash         interactive Dash app at http://127.0.0.1:8050"
	@echo "  make folio        folio-level embedding visualization"
	@echo "  make analyze      EVA prefix/suffix pattern analysis"
	@echo "  make bigrams      bigram frequency + heatmap + network"
	@echo "  make similarity   pairwise cosine similarity heatmap (top-N words)"
	@echo "  make section-vocab  TF-IDF-style distinctive vocabulary per section"
	@echo "  make vocab-drift  word frequency drift across folio windows"
	@echo "  make nn-graph     K-nearest-neighbour graph in UMAP space"
	@echo "  make embed-ft     FastText character-ngram embeddings"
	@echo "  make reduce-ft    UMAP reduction of FastText embeddings"
	@echo "  make word-families  morphological word families via FastText clustering"
	@echo "  make pmi          Pointwise Mutual Information bigram analysis"
	@echo "  make function-words  initial-fraction vs entropy scatter (function-word ID)"
	@echo "  make entropy-scatter prev vs next entropy asymmetry (directional)"
	@echo "  make report       HTML summary dashboard linking all outputs"
	@echo "  make portal       generate searchable HTML portal (index.html) linking all analyses"
	@echo "  make analogy      morphological offset coherence (w2v vs FastText)"
	@echo "  make char-ngrams  EVA character n-gram analysis + transition heatmap"
	@echo "  make line-structure  first/last word + line-length + template analysis"
	@echo "  make positional-bigrams  G² enrichment of bigrams by line zone"
	@echo "  make cooccurrence    word co-occurrence network + community detection"
	@echo "  make hapax           Zipf / Heap / hapax legomena frequency analysis"
	@echo "  make word-length     word-length by section / position / folio trend"
	@echo "  make hmm             unsupervised HMM (K=6 latent POS-like states)"
	@echo "  make reduce-3d       UMAP 3D reduction of word2vec embeddings"
	@echo "  make visualize-3d    interactive 3D scatter (section/freq/prefix/HMM)"
	@echo "  make word-transition directed word transition probability network"
	@echo "  make cluster-purity  ARI/NMI of KMeans vs section/HMM/prefix/length"
	@echo "  make context-profile left/right context probability heatmap"
	@echo "  make folio-drift     folio semantic trajectory (PCA of mean embeddings)"
	@echo "  make morpheme        EVA morpheme candidates via char n-gram segmentation"
	@echo "  make section-distance  pairwise section distance matrices (6 metrics)"
	@echo "  make word-fingerprint word=<word>  multi-signal fingerprint card per word"
	@echo "  make word-roles      GMM functional role clustering (BIC-selected K)"
	@echo "  make phonotactics    EVA phonotactic patterns: positional bias, trigram log-odds, CV shapes"
	@echo "  make line-entropy    line-slot entropy, adjacent Jaccard, repetition rate, first-word PMI"
	@echo "  make line-clusters   line-embedding UMAP + KMeans (K=7) section recovery"
	@echo "  make lm-perplexity   Kneser-Ney bigram LM perplexity scoring per line"
	@echo "  make embed-stability bootstrap embedding stability (15 models, K=10 neighbors)"
	@echo "  make cross-section   chi² section exclusivity, TF-IDF, overlap, border words"
	@echo "  make folio-richness  per-folio TTR, MSTTR, hapax rate, entropy, Yule's K"
	@echo "  make line-similarity near-duplicate and exact-copy line detection"
	@echo "  make paradigm-finder suffix/prefix paradigm detection via embedding offsets"
	@echo "  make semantic-fields hierarchical dendrogram of top-200 words (Ward/cosine)"
	@echo "  make entropy-rate    block entropy H(n) and entropy rate estimation"
	@echo "  make changepoint     folio embedding change-point detection (MMD + CUSUM)"
	@echo "  make topic-model     runs src/analysis/topic_model.py"
	@echo "  make char-position-entropy  runs src/analysis/char_position_entropy.py"
	@echo "  make sif-embeddings  runs src/analysis/sif_embeddings.py"
	@echo "  make word-sequence-model  runs src/analysis/word_sequence_model.py"
	@echo "  make word-burstiness runs src/analysis/word_burstiness.py"
	@echo "  make affix-entropy   runs src/analysis/affix_entropy.py"
	@echo "  make folio-similarity-matrix  runs src/analysis/folio_similarity_matrix.py"
	@echo "  make zipf-analysis   runs src/analysis/zipf_analysis.py"
	@echo "  make prefix-suffix-matrix  runs src/analysis/prefix_suffix_matrix.py"
	@echo "  make line-position-words  runs src/analysis/line_position_words.py"
	@echo "  make cv-skeleton     runs src/analysis/cv_skeleton.py"
	@echo "  make word-network-centrality  runs src/analysis/word_network_centrality.py"
	@echo "  make ppmi-vectors    runs src/analysis/ppmi_vectors.py"
	@echo "  make context-asymmetry  runs src/analysis/context_asymmetry.py"
	@echo "  make surprisal-map   runs src/analysis/surprisal_map.py"
	@echo "  make clean        remove all generated data/ and HTML files"
	@echo ""
	@echo "CLI tools:"
	@echo "  python3 src/cli/neighbors.py daiin                 nearest neighbours"
	@echo "  python3 src/cli/neighbors.py --cluster 8           KMeans vocabulary clusters"
	@echo "  python3 src/cli/neighbors.py --analogy A B C       vector analogy A - B + C = ?"
	@echo "  python3 src/cli/neighbors.py --interpolate A B N   semantic path A → B in N steps"