Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 47 additions & 22 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
# theright2read — corpus refresh entry points.
#
# As of 2026-05-06 the LS + RS crawler that builds the parliamentary
# library corpus is the public package `sansad-semantic-crawler`
# (PolyForm-NC), pinned in requirements.txt at v0.2.0. The host project
# supplies the topic profile (`topics/libraries.json`, vendored from
# the upstream `examples/topics/libraries.json` because the package
# install does not include the `examples/` directory) and the output
# directory (`data/_parliament_libraries/`, gitignored).
# The LS + RS crawler is the public package `sansad-semantic-crawler`
# (PolyForm-NC), pinned in requirements.txt. The host project supplies
# the topic profile (`topics/libraries.json`, vendored from the upstream
# `examples/topics/libraries.json` because the package install does not
# include the `examples/` directory) and the output directory
# (`data/_parliament_libraries/`, gitignored).
#
# Two legacy scripts (`scripts/sansad_library_crawl.py`,
# `scripts/sansad_library_parse.py`) were retired in the same commit;
# their LS-side schema variations are now harmonised by the package.
# As of 2026-05-12 the pipeline runs against sansad-semantic-crawler
# v1.0.0, which adds an analytical layer on top of the crawl/parse/export
# basics: extract-answers → analyse-discourse → analyse-ministry. The
# `corpus-enrich` step joins those analytical outputs into
# assets/parliament_libraries.js via scripts/build_parliament_libraries.py
# (the upstream `export` only emits the manifest-derived summary).
#
# After regenerating `assets/parliament_libraries.js`, BUMP THE
# `?v=N` cache-bust suffix everywhere it is referenced — see AGENTS.md
# section 5 for the one-pass `find ... sed` command.
# After regenerating assets/parliament_libraries.js, BUMP THE `?v=N`
# cache-bust suffix everywhere it is referenced.

# Project-local virtualenv used by every corpus target; created on
# demand by the $(PYTHON) rule below.
VENV := .venv
PYTHON := $(VENV)/bin/python
Expand All @@ -24,7 +25,9 @@ TOPIC_PROFILE := topics/libraries.json
# Crawl/analysis output tree (gitignored) and the public JS artefact
# loaded by the site pages.
CORPUS_OUT := data/_parliament_libraries
EXPORT_PATH := assets/parliament_libraries.js

# All command-style targets are phony so a stray file with one of these
# names can never shadow them.
.PHONY: deps corpus-crawl corpus-parse corpus-export corpus-extract-answers \
	corpus-analyse-discourse corpus-analyse-ministry corpus-analyse \
	corpus-enrich corpus-refresh sync-agents help

# Bootstrap rule: any target that lists $(PYTHON) as a prerequisite
# triggers venv creation on first run. `python3 -m venv` creates
# $(VENV)/bin/python, i.e. the target path itself.
$(PYTHON):
	python3 -m venv $(VENV)
Expand Down Expand Up @@ -53,21 +56,43 @@ corpus-export: $(PYTHON)
--js-global PARLIAMENT_LIBRARY_DATA \
--export-path $(EXPORT_PATH)

# Full pipeline: crawl, parse, export. After this finishes, manually
# bump the `?v=N` cache-bust everywhere index.html / data/index.html /
# inequality/index.html load assets/parliament_libraries.js. See
# AGENTS.md section 5 for the canonical sed command.
corpus-refresh: corpus-crawl corpus-parse corpus-export
# v1.0.0 analytical layer — step 1: extract structured answers from the
# parsed corpus into $(CORPUS_OUT).
corpus-extract-answers: $(PYTHON)
	$(PYTHON) -m sansad_semantic_crawler extract-answers --out $(CORPUS_OUT)

# v1.0.0 analytical layer — step 2: discourse classification over the
# extracted answers in $(CORPUS_OUT).
corpus-analyse-discourse: $(PYTHON)
	$(PYTHON) -m sansad_semantic_crawler analyse-discourse --out $(CORPUS_OUT)

# v1.0.0 analytical layer — step 3: per-ministry aggregation, scoped by
# the topic profile.
corpus-analyse-ministry: $(PYTHON)
	$(PYTHON) -m sansad_semantic_crawler analyse-ministry --topic $(TOPIC_PROFILE) --out $(CORPUS_OUT)

corpus-analyse: corpus-extract-answers corpus-analyse-discourse corpus-analyse-ministry

# Join the upstream manifest export with the v1.0.0 analytical outputs
# into a single enriched assets/parliament_libraries.js.
#
# corpus-analyse is a hard prerequisite: build_parliament_libraries.py
# silently treats missing analysis files (analysis_discourse.jsonl,
# ministry_summary_qa.jsonl) as empty arrays, so without this dependency
# a bare `make corpus-enrich` — or `make -j corpus-refresh` — could
# "succeed" while publishing an artefact with zeroed discourse metrics.
corpus-enrich: corpus-export corpus-analyse
	$(PYTHON) scripts/build_parliament_libraries.py

# Full pipeline: crawl → parse → analyse → export → enrich. After this
# finishes, manually bump the `?v=N` cache-bust everywhere index.html /
# data/index.html / inequality/index.html load assets/parliament_libraries.js.
# NOTE(review): left-to-right ordering of these prerequisites holds only
# under serial make; under `make -j`, ordering is guaranteed only by each
# target's own declared prerequisites — confirm corpus-enrich depends on
# the analysis outputs it consumes.
corpus-refresh: corpus-crawl corpus-parse corpus-analyse corpus-enrich

# Regenerate agent rule files.
# NOTE(review): deliberately (presumably) uses system python3 rather than
# $(PYTHON), so it works without the venv — confirm sync_agents.py has no
# pinned dependencies.
sync-agents:
	python3 scripts/sync_agents.py

help:
@echo "Corpus refresh (sansad-semantic-crawler):"
@echo " make corpus-refresh — full pipeline (crawl + parse + export)"
@echo "Corpus refresh (sansad-semantic-crawler v1.0.0):"
@echo " make corpus-refresh — full pipeline (crawl + parse + analyse + enrich)"
@echo " make corpus-crawl ARGS='--max-records 5 --no-download' — smoke-test"
@echo " make corpus-parse — re-extract text from cached PDFs"
@echo " make corpus-export — regenerate assets/parliament_libraries.js"
@echo " make corpus-analyse — extract-answers + analyse-discourse + analyse-ministry"
@echo " make corpus-export — upstream manifest-only export"
@echo " make corpus-enrich — export + join analytical files (the public artefact)"
@echo "Setup:"
@echo " make deps — install pinned deps into .venv"
@echo "Agent rules:"
Expand Down
151 changes: 115 additions & 36 deletions assets/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -193,56 +193,135 @@ $$("#actions-grid").innerHTML = ACTIONS.map((a) => `
<p class="action-text">${esc(a.body)}</p>
</div>`).join("");

// Parliament library corpus. The data is generated by
// scripts/sansad_library_parse.py into assets/parliament_libraries.js.
// Parliament library corpus. Generated by
// scripts/build_parliament_libraries.py into assets/parliament_libraries.js,
// joining the upstream sansad-semantic-crawler manifest with the v1.0.0
// discourse-analysis outputs.
(function renderParliamentLibraries() {
const grid = $("#parl-grid");
if (!grid) return;
const data = window.PARLIAMENT_LIBRARY_DATA || {};
const stats = data.summaryStats || [];
const tags = data.topTags || [];
const questions = data.keyQuestions || [];
if (!$("#parl-headline-stat")) return;

const ds = data.discourseSummary || {};
const ministries = data.ministryDiscourse || [];
const excerpts = data.discourseExcerpts || [];
const rrrlf = data.rrrlfDeflections || [];

// ── Headline stat ─────────────────────────────────────────────────
const headlineEl = $("#parl-headline-stat");
if (headlineEl && ds.responsesClassified) {
const pct = ds.evasionRateClassified != null
? Math.round(ds.evasionRateClassified * 100) + "%"
: "—";
headlineEl.innerHTML = `
<div class="parl-headline-num">${esc(pct)}</div>
<div class="parl-headline-body">
<div class="parl-headline-lede"><strong>${esc(ds.evasiveCount)} of ${esc(ds.responsesClassified)}</strong> classified responses to library questions in Parliament were evasive.</div>
<div class="parl-headline-sub">
Across labels: REJECTED · SUBSTITUTED · FEDERAL_DEFLECTION · DEFLECTED · DATA_WITHHELD · STRUCTURAL_REFUSAL · CONSTITUTIONAL_DEFAULT · REPRESENTATIONAL_SILENCE.
</div>
</div>`;
}

grid.innerHTML = stats.map((p) => `
<div class="parl-stat">
<div class="label">${esc(p.label)}</div>
<div class="value">${esc(p.value)}</div>
<div class="sub">${esc(p.sub)}</div>
</div>`).join("");
// ── RRRLF "State subject" cascade ─────────────────────────────────
const cascadeEl = $("#parl-rrrlf-cascade");
if (cascadeEl) {
cascadeEl.innerHTML = rrrlf.length
? rrrlf.map((r) => {
const asker = (r.askers || []).filter(Boolean).slice(0, 2).join(", ");
return `
<article class="parl-cascade-card">
<div class="parl-cascade-date">${esc(r.date || "undated")}</div>
<div class="parl-cascade-body">
<div class="parl-cascade-quote">&ldquo;${esc(r.matchedPattern)}&rdquo;</div>
<div class="parl-cascade-meta">
${r.uri ? `<a href="${esc(r.uri)}" target="_blank" rel="noopener">` : ""}${esc(r.house)} ${esc(r.qtype)} Q${esc(r.qno)} — ${esc(r.title)}${r.uri ? `</a>` : ""}
</div>
<div class="parl-cascade-min">${esc(r.ministry)} · response to ${esc(asker || "MP")}</div>
</div>
</article>`;
}).join("")
: `<div class="parl-empty">No RRRLF-tagged FEDERAL_DEFLECTION responses in the current corpus.</div>`;
}

// ── Per-ministry evasion bars ────────────────────────────────────
const minRowsEl = $("#parl-ministry-rows");
if (minRowsEl) {
const shown = ministries.filter((m) => (m.recordsClassified || 0) > 0);
minRowsEl.innerHTML = shown.length
? shown.map((m) => {
const rate = m.evasionRateClassified != null
? Math.round(m.evasionRateClassified * 100)
: 0;
const ratePct = `${rate}%`;
return `
<div class="parl-ministry-row">
<div class="parl-ministry-name">${esc(m.ministry)}</div>
<div class="parl-ministry-bar-wrap">
<div class="parl-ministry-bar" style="width:${rate}%"></div>
<div class="parl-ministry-rate">${ratePct}</div>
</div>
<div class="parl-ministry-n">N = ${esc(m.recordsClassified)} classified <span class="t-cream-soft">of ${esc(m.recordsTotal)} total</span></div>
</div>`;
}).join("")
: "";
}

// ── Verbatim evasion cards (one per label) ───────────────────────
const gridEl = $("#parl-evasion-grid");
if (gridEl) {
const seen = new Set();
const oneEach = [];
for (const e of excerpts) {
if (seen.has(e.label)) continue;
seen.add(e.label);
oneEach.push(e);
}
gridEl.innerHTML = oneEach.map((e) => {
const pattern = (e.matchedPattern || "").trim() || e.excerpt;
const citation = `${esc(e.ministry)} · ${esc(e.date)} · ${esc(e.house)} ${esc(e.qtype)} Q${esc(e.qno)} — ${esc(e.title)}`;
const cite = e.uri
? `<a href="${esc(e.uri)}" target="_blank" rel="noopener">${citation}</a>`
: citation;
return `
<article class="parl-evasion-row">
<div class="parl-evasion-tag">${esc(e.label.replace(/_/g, " "))}</div>
<div class="parl-evasion-body">
<div class="parl-evasion-pattern">&ldquo;${esc(pattern)}&rdquo;</div>
<div class="parl-evasion-cite">${cite}</div>
<details class="parl-evasion-more">
<summary>Read the passage</summary>
<blockquote>${esc(e.excerpt)}</blockquote>
${e.politicalFunction ? `<div class="parl-evasion-function">Classifier note: ${esc(e.politicalFunction)}</div>` : ""}
</details>
</div>
</article>`;
}).join("");
}

// ── Collapsible: corpus stats + top tags (kept for completeness) ─
const stats = data.summaryStats || [];
const tags = data.topTags || [];
const grid = $("#parl-grid");
if (grid) {
grid.innerHTML = stats.map((p) => `
<div class="parl-stat">
<div class="label">${esc(p.label)}</div>
<div class="value">${esc(p.value)}</div>
<div class="sub">${esc(p.sub)}</div>
</div>`).join("");
}
const tagEl = $("#parl-tags");
if (tagEl) {
tagEl.innerHTML = tags.length
? tags.map((t) => `<span class="parl-chip">${esc(t.label)} <strong>${esc(t.count)}</strong></span>`).join("")
: `<span class="parl-chip">No topic tags yet</span>`;
}

const questionsEl = $("#parl-questions");
if (questionsEl) {
questionsEl.innerHTML = questions.length
? questions.map((q) => {
const asker = (q.askers || []).filter(Boolean).slice(0, 3).join(", ");
const tagLine = (q.tags || []).join(", ");
const title = q.href
? `<a href="${esc(q.href)}" target="_blank" rel="noopener">${esc(q.title)}</a>`
: esc(q.title);
return `
<article class="parl-question">
<div class="parl-question-meta">${esc(q.label)} · ${esc(q.date || "undated")}${q.ministry ? ` · ${esc(q.ministry)}` : ""}</div>
<h3>${title}</h3>
${asker ? `<div class="parl-question-asker">${esc(asker)}</div>` : ""}
${q.stat ? `<div class="parl-question-stat">${esc(q.stat)}</div>` : ""}
<p>${esc(q.excerpt || "")}</p>
${tagLine ? `<div class="parl-question-tags">${esc(tagLine)}</div>` : ""}
</article>`;
}).join("")
: `<div class="parl-empty">No local Parliament library crawl has been exported yet. Run <code>python scripts/sansad_library_crawl.py</code>, then <code>python scripts/sansad_library_parse.py</code>.</div>`;
}

// ── Method note ──────────────────────────────────────────────────
const sourceEl = $("#parl-source-note");
if (sourceEl) {
const generated = data.generatedAt ? `Generated ${data.generatedAt}. ` : "";
sourceEl.textContent = `${generated}Sources: ${data.sourceManifest || "data/_parliament_libraries/manifest.jsonl"}; Lok Sabha elibrary.sansad.in; Rajya Sabha rsdoc.nic.in.`;
sourceEl.innerHTML = `${esc(generated)}Corpus: ${esc(ds.questionsTotal || 0)} library questions, ${esc(ds.responsesExtracted || 0)} parseable responses, ${esc(ds.responsesClassified || 0)} classified by sansad-semantic-crawler regex_v2 + LLM ensemble. Sources: Lok Sabha <a href="https://elibrary.sansad.in/" target="_blank" rel="noopener">elibrary.sansad.in</a>; Rajya Sabha <a href="https://rsdoc.nic.in/" target="_blank" rel="noopener">rsdoc.nic.in</a>. Classifier: <a href="https://github.com/CommonerLLP/sansad-semantic-crawler" target="_blank" rel="noopener">CommonerLLP/sansad-semantic-crawler</a> v1.0.0.`;
}
})();

Expand Down
Loading
Loading