-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpruning.py
More file actions
409 lines (337 loc) · 18.1 KB
/
pruning.py
File metadata and controls
409 lines (337 loc) · 18.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
import threading
import os
import json
import uuid
import logging
from .database import DatabaseManager
from .enrichment import EnrichmentService
from .schemas import MemoryType, RetrievalContext, RetrievalHandler, MesaConfig
from .telemetry import get_tracer
from .mirror import MirrorService
tracer = get_tracer(__name__)
logger = logging.getLogger(__name__)
class PruningEngine:
"""Stateless utility for filtering retrieval candidates based on score quality."""
@staticmethod
def prune(candidates: Dict[int, Dict[str, Any]],
top_n: int = 3,
relative_threshold: float = 0.5,
min_absolute_score: float = 0.3) -> Dict[int, Dict[str, Any]]:
"""
Filters candidates using a multi-gate approach:
1. Top-N limit.
2. Relative threshold (percentage of the top score).
3. Absolute floor (hard cutoff).
"""
if not candidates:
return {}
# 1. Sort by score
sorted_items = sorted(
candidates.items(),
key=lambda x: x[1].get("score", 0.0),
reverse=True
)
top_score = sorted_items[0][1].get("score", 0.0)
# 2. Filter
pruned = {}
for cid, cand in sorted_items[:top_n]:
score = cand.get("score", 0.0)
# Relative Gate: Must be within X% of the best result
is_relative_match = score >= (top_score * relative_threshold)
# Absolute Gate: Must be above the hard floor
is_absolute_match = score >= min_absolute_score
if is_relative_match and is_absolute_match:
pruned[cid] = cand
return pruned
class PruningHandler(RetrievalHandler):
"""Stage E: Quality-based Pruning (The 'Junk' Sieve)"""
def process(self, context: RetrievalContext, retriever: Any) -> None:
if not context.candidates:
return
# Config resolution (expects a PruningConfig-like object or dict)
if hasattr(self.config, "top_n"):
top_n = self.config.top_n
rel_thresh = self.config.relative_threshold
min_abs = self.config.min_absolute_score
else:
# Fallback for manual dict injection
cfg = self.config or {}
top_n = cfg.get("top_n", 3)
rel_thresh = cfg.get("relative_threshold", 0.5)
min_abs = cfg.get("min_absolute_score", 0.3)
original_count = len(context.candidates)
# Execute Pruning
context.candidates = PruningEngine.prune(
context.candidates,
top_n=top_n,
relative_threshold=rel_thresh,
min_absolute_score=min_abs
)
pruned_count = original_count - len(context.candidates)
# Telemetry
context.metrics["pruning"] = {
"pruned": pruned_count,
"remaining": len(context.candidates),
"top_n_limit": top_n,
"relative_threshold": rel_thresh,
"min_absolute_score": min_abs
}
if pruned_count > 0:
logger.info(f"PruningHandler: Filtered {pruned_count} noise candidates. Remaining: {len(context.candidates)}")
class MesaService:
"""
Two-Tier Maintenance Service:
Tier 1: Soft-Prune (Marks fragmented/stale memories as 'ARCHIVED')
Tier 2: Deep Clean (Permanently deletes old archives and executes VACUUM)
"""
def __init__(self, db: DatabaseManager,
enrichment: EnrichmentService,
mirror: Any = None,
config: Optional[Any] = None):
self.db = db
self.enrichment = enrichment
self.mirror = mirror
# 1. Use injected config or safe defaults
if config:
self.config = config
else:
# Fallback for legacy or direct instantiation
from .retrieval import MesaConfig
self.config = MesaConfig()
self.centrality_threshold = self.config.centrality_threshold
self.max_age_days = self.config.retention_days
self.importance_cutoff = self.config.importance_cutoff
self.interval_seconds = self.config.interval_seconds
self.consolidation_threshold = self.config.consolidation_threshold
self.purge_enabled = self.config.purge_enabled
self.dry_run = self.config.dry_run
self._stop_event = threading.Event()
self._thread = None
self.last_deep_clean = None
def start(self):
"""Starts the maintenance loop in a background daemon thread."""
if not self.config.pipeline:
logger.info("MesaService has no active pipeline stages. Skipping background loop.")
return
if self._thread and self._thread.is_alive():
return
self._thread = threading.Thread(target=self._run_loop, daemon=True, name="reverie-mesa")
self._thread.start()
logger.info(f"MesaService started (Interval: {self.interval_seconds}s, Stages: {self.config.pipeline}, DryRun: {self.dry_run})")
def stop(self):
"""Signals the background thread to stop."""
self._stop_event.set()
if self._thread:
self._thread.join(timeout=5.0)
logger.info("MesaService stopped.")
def _run_loop(self):
"""Main background loop following Activation by Inclusion."""
while not self._stop_event.wait(self.interval_seconds):
with tracer.start_as_current_span("reverie.mesa.maintenance_cycle") as span:
try:
for stage in self.config.pipeline:
with tracer.start_as_current_span(f"reverie.mesa.stage.{stage}") as s_span:
if stage == "soft_prune":
self.run_soft_prune()
elif stage == "consolidate":
self.run_hierarchical_consolidation()
elif stage == "deep_clean":
if self.purge_enabled and self._should_deep_clean():
self.run_deep_clean()
except Exception as e:
logger.error(f"MesaService loop error: {e}")
span.record_exception(e)
def _should_deep_clean(self) -> bool:
"""Triggers deep clean every 30 days."""
if self.last_deep_clean is None:
return True
return (datetime.now() - self.last_deep_clean).days >= self.config.deep_clean_interval_days
def run_soft_prune(self):
"""Identifies fragmented memories and marks them as ARCHIVED."""
with tracer.start_as_current_span("reverie.mesa.soft_prune") as span:
logger.info("MesaService: Running Tier 1 (Soft Prune)...")
# Logic:
# - importance_score < cutoff
# - last_accessed_at < now - age_days
# - status = 'ACTIVE'
# - edge count < centrality_threshold
try:
# Subquery to find candidates
candidate_query = f"""
SELECT m.id FROM memories m
LEFT JOIN (
SELECT source_id as node_id FROM memory_relations WHERE source_type = 'MEMORY'
UNION ALL
SELECT target_id as node_id FROM memory_relations WHERE target_type = 'MEMORY'
) a ON m.id = a.node_id
WHERE m.importance_score < ?
AND m.last_accessed_at < datetime('now', ?)
AND m.status = 'ACTIVE'
GROUP BY m.id
HAVING COUNT(a.node_id) < ?
"""
age_filter = f"-{self.max_age_days} days"
cursor = self.db.get_cursor()
with self.db.trace_query("SELECT", "memories", candidate_query, (self.importance_cutoff, age_filter, self.centrality_threshold)) as span:
cursor.execute(candidate_query, (self.importance_cutoff, age_filter, self.centrality_threshold))
candidate_ids = [row[0] for row in cursor.fetchall()]
if not candidate_ids:
logger.debug("MesaService: No fragmentation detected.")
return
if self.dry_run:
logger.info(f"MesaService [DRY RUN]: Would archive {len(candidate_ids)} memories: {candidate_ids}")
return
placeholders = ",".join(["?"] * len(candidate_ids))
update_query = f"UPDATE memories SET status = 'ARCHIVED' WHERE id IN ({placeholders})"
with self.db.write_lock() as cursor:
with self.db.trace_query("UPDATE", "memories", update_query, tuple(candidate_ids), batch_size=len(candidate_ids)) as span:
cursor.execute(update_query, tuple(candidate_ids))
logger.info(f"MesaService: Archived {len(candidate_ids)} fragmented memories.")
# Mirror-as-Code: Export to local archive
if self.mirror:
for mid in candidate_ids:
self.mirror.export_node(mid)
except Exception as e:
logger.error(f"MesaService Soft Prune failed: {e}")
def run_hierarchical_consolidation(self):
"""Identifies clusters of stale/fragmented memories and crystallizes them into Observation Anchors."""
with tracer.start_as_current_span("reverie.mesa.hierarchical_consolidation") as span:
logger.info("MesaService: Running Tier 1.5 (Hierarchical Consolidation)...")
try:
cursor = self.db.get_cursor()
# Find entities mentioned in clusters of stale/fragmented ACTIVE memories
# Criteria:
# - status = 'ACTIVE'
# - shared entity
# - count >= threshold
# - memories are candidates for pruning (stale OR low centrality)
age_filter = f"-{self.max_age_days} days"
query = f"""
SELECT e.id, e.name, COUNT(ma.source_id) as c_count, GROUP_CONCAT(ma.source_id) as member_ids
FROM memory_relations ma
JOIN entities e ON ma.target_id = e.id
JOIN memories m ON ma.source_id = m.id
WHERE ma.source_type = 'MEMORY'
AND ma.target_type = 'ENTITY'
AND m.status = 'ACTIVE'
AND (m.last_accessed_at < datetime('now', ?) OR m.importance_score < ?)
GROUP BY e.id
HAVING c_count >= ?
"""
with self.db.trace_query("SELECT", "memory_relations", query, (age_filter, self.importance_cutoff, self.consolidation_threshold)) as span:
cursor.execute(query, (age_filter, self.importance_cutoff, self.consolidation_threshold))
clusters = cursor.fetchall()
for ent_id, ent_name, count, member_ids_str in clusters:
logger.debug(f"Cluster: Entity {ent_name}, Count {count}")
member_ids = [int(i) for i in member_ids_str.split(',')]
self._consolidate_to_hierarchy(member_ids, ent_name, ent_id)
except Exception as e:
logger.error(f"Hierarchical consolidation failed: {e}")
def _consolidate_to_hierarchy(self, member_ids: List[int], entity_name: str, entity_id: int):
"""Creates a high-level Observation Anchor and archives children with CHILD_OF links."""
logger.info(f"MesaService: Consolidating '{entity_name}' hierarchy ({len(member_ids)} fragments)...")
try:
# 0. Fetch content
id_to_text = {}
with self.db.write_lock() as cursor:
placeholders = ",".join(["?"] * len(member_ids))
fetch_query = f"SELECT id, content_full FROM memories WHERE id IN ({placeholders})"
with self.db.trace_query("SELECT", "memories", fetch_query, tuple(member_ids)) as span:
cursor.execute(fetch_query, tuple(member_ids))
for mid, txt in cursor.fetchall():
id_to_text[mid] = txt
if not id_to_text:
return
# 1. Hierarchical Synthesis
summary_text = self.enrichment.synthesize_memories(id_to_text, entity_name)
# 2. Extract Profile & Importance
profile = self.enrichment.generate_semantic_profile(summary_text)
imp_data = self.enrichment.calculate_importance(summary_text)
with self.db.write_lock() as cursor:
# 3. Save Observation Anchor
metadata = json.dumps({"source_ids": member_ids, "consensus_target": entity_name})
insert_query = """
INSERT INTO memories (
content_full, content_abstract, importance_score, memory_type, status, metadata, guid
) VALUES (?, ?, ?, ?, 'ACTIVE', ?, ?)
"""
guid = str(uuid.uuid4())
with self.db.trace_query("INSERT", "memories", insert_query, (summary_text, profile, 4.5, "OBSERVATION", metadata, guid)) as span:
cursor.execute(insert_query, (summary_text, profile, 4.5, "OBSERVATION", metadata, guid)) # Force high importance for anchors
anchor_id = cursor.lastrowid
# Vector
vec = self.enrichment.generate_embedding(profile)
import sqlite_vec
insert_vec_query = "INSERT INTO memories_vec (rowid, embedding) VALUES (?, ?)"
with self.db.trace_query("INSERT", "memories_vec", insert_vec_query, (anchor_id, "BLOB")) as span:
cursor.execute(insert_vec_query, (anchor_id, sqlite_vec.serialize_float32(vec)))
# 4. Link Hierarchy and Archive
for mid in member_ids:
# CHILD_OF link
rel_query = """
INSERT INTO memory_relations (source_id, source_type, target_id, target_type, relation_type)
VALUES (?, 'MEMORY', ?, 'MEMORY', 'CHILD_OF')
"""
params = (mid, anchor_id)
with self.db.trace_query("INSERT", "memory_relations", rel_query, params) as span:
cursor.execute(rel_query, params)
# Also keep SUPERSEDES for backward compatibility
sup_query = """
INSERT INTO memory_relations (source_id, source_type, target_id, target_type, relation_type)
VALUES (?, 'MEMORY', ?, 'MEMORY', 'SUPERSEDES')
"""
with self.db.trace_query("INSERT", "memory_relations", sup_query, (anchor_id, mid)) as span:
cursor.execute(sup_query, (anchor_id, mid))
# Archive source
with self.db.trace_query("UPDATE", "memories", "UPDATE memories SET status = 'ARCHIVED' WHERE id = ?", (mid,)) as span:
cursor.execute("UPDATE memories SET status = 'ARCHIVED' WHERE id = ?", (mid,))
# Mirror-as-Code: Export child archive
if self.mirror:
self.mirror.export_node(mid)
# 5. Link anchor to entity
mentions_query = """
INSERT INTO memory_relations (source_id, source_type, target_id, target_type, relation_type)
VALUES (?, 'MEMORY', ?, 'ENTITY', 'MENTIONS')
"""
with self.db.trace_query("INSERT", "memory_relations", mentions_query, (anchor_id, entity_id)) as span:
cursor.execute(mentions_query, (anchor_id, entity_id))
logger.info(f"MesaService: Hierarchical crystallization complete. Anchor: {anchor_id}")
# Mirror-as-Code: Export new anchor
if self.mirror:
self.mirror.export_node(anchor_id)
except Exception as e:
logger.error(f"Hierarchical crystallization for {entity_name} failed: {e}")
self.db.conn.rollback()
def run_deep_clean(self):
"""Tier 2: Purges old archives and VACUUMs the database."""
with tracer.start_as_current_span("reverie.mesa.deep_clean") as span:
logger.info("MesaService: Running Tier 2 (Deep Clean)...")
try:
if self.dry_run:
logger.info("MesaService [DRY RUN]: Skipping Deep Clean.")
return
with self.db.write_lock() as cursor:
# 1. Delete ARCHIVED memories older than N days
purge_days = f"-{self.config.archive_retention_days} days"
purge_query = "DELETE FROM memories WHERE status = 'ARCHIVED' AND learned_at < datetime('now', ?)"
with self.db.trace_query("DELETE", "memories", purge_query, (purge_days,)) as span:
cursor.execute(purge_query, (purge_days,))
purged_count = cursor.rowcount
# 2. Cleanup orphaned vector entries (if any)
cleanup_query = "DELETE FROM memories_vec WHERE rowid NOT IN (SELECT id FROM memories)"
with self.db.trace_query("DELETE", "memories_vec", cleanup_query) as span:
cursor.execute(cleanup_query)
logger.info(f"MesaService: Purged {purged_count} records.")
# 3. VACUUM to reclaim space (MUST be outside transaction)
try:
# Using a fresh cursor directly from connection to be safe
with self.db.trace_query("VACUUM", None, "VACUUM") as span:
self.db.conn.execute("VACUUM")
logger.info("MesaService: VACUUM executed successfully.")
except Exception as ev:
logger.warning(f"MesaService: VACUUM failed (likely concurrent access): {ev}")
self.last_deep_clean = datetime.now()
except Exception as e:
logger.error(f"MesaService Deep Clean failed: {e}")