Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions app/Console/Commands/Tags/RebuildTagCounts.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
<?php

namespace App\Console\Commands\Tags;

use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\File;

/**
 * Artisan command that aggregates recorded-tag counts and writes them to a JSON file.
 *
 * Counts rows of `photo_tags` (joined to non-soft-deleted `photos`, and only
 * where `litter_object_id` is set) grouped by object, category and type. The
 * result is written as `{generated_at, scope, counts}` where each `counts` key
 * is "{object_id}:{category_id}:{type_id}" and a null type id is written as 0.
 *
 * `--scope=total` writes to the `tags.usage_counts_path` config path;
 * `--scope=public` additionally requires `is_public = 1` and `verified >= 2`
 * and writes to `tags.public_counts_path`.
 */
class RebuildTagCounts extends Command
{
    protected $signature = 'olm:rebuild-tag-counts {--scope=total : "total" (internal, all recorded tags) or "public" (verified >= 2 AND is_public — what is visible on the map)}';

    protected $description = 'Rebuild a committed tag counts JSON (recorded tags per object/category/type). --scope=total writes the internal file, --scope=public writes the public on-map file.';

    /**
     * Run the aggregation and write the counts file for the requested scope.
     *
     * @return int self::SUCCESS when the file was written; self::FAILURE for an
     *             invalid scope, an unset target path, or a failed write
     *
     * @throws \JsonException when the payload cannot be encoded as JSON
     */
    public function handle(): int
    {
        $scope = $this->option('scope');

        if (! in_array($scope, ['total', 'public'], true)) {
            $this->error("Invalid --scope '{$scope}'. Use 'total' or 'public'.");

            return self::FAILURE;
        }

        [$configKey, $scopeLabel] = $scope === 'public'
            ? ['tags.public_counts_path', 'verified_public_on_map']
            : ['tags.usage_counts_path', 'total_recorded_tags'];

        $path = config($configKey);

        // Without a target path there is nowhere to write; fail loudly instead
        // of handing null to dirname() / File::put().
        if (! is_string($path) || $path === '') {
            $this->error("Config '{$configKey}' is not set; cannot determine where to write the counts file.");

            return self::FAILURE;
        }

        $this->info('Aggregating tag usage counts...');

        $query = DB::table('photo_tags as pt')
            ->join('photos as p', 'p.id', '=', 'pt.photo_id')
            ->whereNull('p.deleted_at')
            ->whereNotNull('pt.litter_object_id');

        if ($scope === 'public') {
            $query->where('p.is_public', 1)
                ->where('p.verified', '>=', 2);
        }

        // Explicit ordering keeps the key order of the written file stable
        // between runs, so regenerating it only produces diffs for real changes.
        $rows = $query
            ->groupBy('pt.litter_object_id', 'pt.category_id', 'pt.litter_object_type_id')
            ->orderBy('pt.litter_object_id')
            ->orderBy('pt.category_id')
            ->orderBy('pt.litter_object_type_id')
            ->select(
                'pt.litter_object_id',
                'pt.category_id',
                'pt.litter_object_type_id',
                DB::raw('COUNT(*) as cnt')
            )
            ->get();

        $counts = [];

        foreach ($rows as $row) {
            // A tag without a type is keyed with type id 0.
            $key = $row->litter_object_id.':'.$row->category_id.':'.($row->litter_object_type_id ?? 0);
            $counts[$key] = (int) $row->cnt;
        }

        $payload = [
            'generated_at' => now()->toDateString(),
            'scope' => $scopeLabel,
            'counts' => $counts,
        ];

        // JSON_THROW_ON_ERROR: never write the literal "false" that a failed
        // json_encode() would otherwise be cast to.
        $json = json_encode($payload, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_THROW_ON_ERROR)."\n";

        File::ensureDirectoryExists(dirname($path));

        if (File::put($path, $json) === false) {
            $this->error('Failed to write tag counts to '.$path);

            return self::FAILURE;
        }

        $this->info(count($counts).' distinct (object, category, type) keys written to '.$path);

        return self::SUCCESS;
    }
}
122 changes: 122 additions & 0 deletions app/Http/Controllers/API/Tags/GetTagsController.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
use Illuminate\Http\JsonResponse;
use Illuminate\Http\Request;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Cache;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\Facades\File;

class GetTagsController extends Controller
{
Expand Down Expand Up @@ -86,9 +88,129 @@ public function getAllTags(): JsonResponse
'types' => $types,
'category_objects' => $categoryObjects,
'category_object_types' => $categoryObjectTypes,
'tag_usage_counts' => $this->loadTagUsageCounts(),
]);
}

/**
 * Serve the public "most tagged litter" ranking as JSON.
 *
 * Starts from the payload returned by loadPublicTagCounts(), whose `counts`
 * keys are colon-separated ids. Every bucket is folded into a total for its
 * first two segments (object id, category id), so per-type buckets of the
 * same pair are summed. Keys with fewer than two segments are ignored and
 * pairs whose total is zero or negative are left out.
 *
 * The list is sorted by count descending; ties are broken by object id and
 * then category id, both ascending. Only ids and counts are returned — no
 * labels. `generated_at` and `scope` are passed through from the payload.
 *
 * @return JsonResponse JSON object with `generated_at`, `scope` and `most_tagged`
 */
public function getMostTagged(): JsonResponse
{
    $publicCounts = $this->loadPublicTagCounts();

    // Collapse "object:category:type" buckets into "object:category" totals.
    $totalsByPair = [];

    foreach ($publicCounts['counts'] as $bucketKey => $bucketCount) {
        $segments = explode(':', (string) $bucketKey);

        if (count($segments) >= 2) {
            $pair = "{$segments[0]}:{$segments[1]}";
            $totalsByPair[$pair] = ($totalsByPair[$pair] ?? 0) + (int) $bucketCount;
        }
    }

    // Turn the positive totals into response rows.
    $ranking = [];

    foreach ($totalsByPair as $pair => $total) {
        if ($total > 0) {
            $ids = explode(':', $pair);

            $ranking[] = [
                'object_id' => (int) $ids[0],
                'category_id' => (int) $ids[1],
                'count' => $total,
            ];
        }
    }

    // Highest count first; equal counts fall back to ascending ids.
    usort($ranking, function (array $left, array $right): int {
        return $right['count'] <=> $left['count']
            ?: $left['object_id'] <=> $right['object_id']
            ?: $left['category_id'] <=> $right['category_id'];
    });

    return response()->json([
        'generated_at' => $publicCounts['generated_at'],
        'scope' => $publicCounts['scope'],
        'most_tagged' => $ranking,
    ]);
}

/**
 * Load the pre-computed public tag counts payload from the JSON file at the
 * `tags.public_counts_path` config path.
 *
 * The decoded payload is cached forever under a key derived from the file's
 * path, modification time and size, so the file is read and decoded once
 * per distinct version. Including the path keeps two different files with
 * the same mtime from sharing an entry; including the size catches most
 * rewrites that land within the same second as the previous one.
 *
 * An unset path, a path that is not a regular file, or content that does
 * not decode to an array with an array `counts` entry all yield the empty
 * payload. Non-string `generated_at` / `scope` values are reported as null.
 *
 * @return array{generated_at: ?string, scope: ?string, counts: array<string, int>}
 */
protected function loadPublicTagCounts(): array
{
    $empty = ['generated_at' => null, 'scope' => null, 'counts' => []];

    $path = config('tags.public_counts_path');

    // isFile() rather than exists(): a directory at this path cannot be read.
    if (! is_string($path) || $path === '' || ! File::isFile($path)) {
        return $empty;
    }

    $cacheKey = sprintf(
        'tag_counts_public:%s:%d:%d',
        md5($path),
        File::lastModified($path),
        File::size($path)
    );

    return Cache::rememberForever($cacheKey, function () use ($path, $empty) {
        $decoded = json_decode(File::get($path), true);

        if (! is_array($decoded) || ! isset($decoded['counts']) || ! is_array($decoded['counts'])) {
            return $empty;
        }

        $generatedAt = $decoded['generated_at'] ?? null;
        $scope = $decoded['scope'] ?? null;

        return [
            'generated_at' => is_string($generatedAt) ? $generatedAt : null,
            'scope' => is_string($scope) ? $scope : null,
            'counts' => $decoded['counts'],
        ];
    });
}

/**
 * Load the pre-computed tag usage counts map from the JSON file at the
 * `tags.usage_counts_path` config path.
 *
 * The decoded map is cached forever under a key derived from the file's
 * path, modification time and size, so the file is read and decoded once
 * per distinct version — no aggregate query runs here. Including the path
 * keeps two different files with the same mtime from sharing an entry;
 * including the size catches most rewrites that land within the same
 * second as the previous one.
 *
 * An unset path, a path that is not a regular file, or content that does
 * not decode to an array with an array `counts` entry all yield an empty map.
 *
 * @return array<string, int>
 */
protected function loadTagUsageCounts(): array
{
    $path = config('tags.usage_counts_path');

    // isFile() rather than exists(): a directory at this path cannot be read.
    if (! is_string($path) || $path === '' || ! File::isFile($path)) {
        return [];
    }

    $cacheKey = sprintf(
        'tag_usage_counts:%s:%d:%d',
        md5($path),
        File::lastModified($path),
        File::size($path)
    );

    return Cache::rememberForever($cacheKey, function () use ($path) {
        $decoded = json_decode(File::get($path), true);

        return is_array($decoded) && isset($decoded['counts']) && is_array($decoded['counts'])
            ? $decoded['counts']
            : [];
    });
}

/**
* Build a query that filters by available models.
*/
Expand Down
35 changes: 35 additions & 0 deletions config/tags.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?php

return [
/*
|--------------------------------------------------------------------------
| Tag Usage Counts File
|--------------------------------------------------------------------------
|
| Path to the committed JSON file holding the all-time count of recorded
| tags per (litter_object, category, type). It is regenerated by hand with
| `php artisan olm:rebuild-tag-counts` (equivalent to `--scope=total`, the
| default) and read by GetTagsController to embed the counts map in the
| /api/tags/all response. This is the single source of truth for the path
| so both the writer (command) and reader (controller) stay in sync, and
| so tests can point it at a temp file.
|
*/
'usage_counts_path' => resource_path('data/tag_usage_counts.json'),

/*
|--------------------------------------------------------------------------
| Public Tag Counts File
|--------------------------------------------------------------------------
|
| Path to the committed JSON file holding the public-facing count of
| recorded tags, scoped to photos visible on the public map
| (`is_public = 1 AND verified >= 2`). Regenerated by hand with
| `php artisan olm:rebuild-tag-counts --scope=public` and read by
| GetTagsController to serve the /api/tags/most-tagged ranking. Kept at the
| same (object, category, type) granularity as the internal file so the two
| are diffable row-for-row. Separate from the internal total so both numbers
| stay transparent and auditable. The command only writes this file when
| `--scope=public` is passed; the default scope leaves it untouched.
|
*/
'public_counts_path' => resource_path('data/tag_counts_public.json'),
];
38 changes: 36 additions & 2 deletions readme/API.md
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ Side effects: S3 upload (full + bbox thumbnail), reverse geocoding via `ResolveL

**Auth:** None (public)

This is the primary endpoint for building a tag search UI. Returns 7 flat collections that the client must join locally to build a searchable index.
This is the primary endpoint for building a tag search UI. Returns 7 flat collections that the client must join locally to build a searchable index, plus a `tag_usage_counts` map for popularity ranking.

**Response (200):**
```json
Expand Down Expand Up @@ -392,10 +392,17 @@ This is the primary endpoint for building a tag search UI. Returns 7 flat collec
{ "category_litter_object_id": 42, "litter_object_type_id": 1 },
{ "category_litter_object_id": 42, "litter_object_type_id": 2 },
{ "category_litter_object_id": 42, "litter_object_type_id": 3 }
]
],
"tag_usage_counts": {
"12:2:1": 17195,
"12:2:2": 642,
"5:1:0": 44289
}
}
```

**`tag_usage_counts`** — all-time count of recorded tags per `(litter_object_id, category_id, litter_object_type_id)`. Key format is `"{object_id}:{category_id}:{type_id}"` where `type_id` is `0` when the tag has no type. Use it to rank objects in a "Most Tagged" browse view and to order a CLO's type picker by popularity. **Scope:** *total recorded tags* — every tag on a non-deleted photo regardless of verification or public status (the truthful tagging-behaviour signal, not on-map verified counts). Sum a CLO's type buckets (e.g. `12:2:*`) to get its per-category total. The map is read from a committed JSON file (`resources/data/tag_usage_counts.json`) regenerated by hand via `php artisan olm:rebuild-tag-counts`; it is cached on the file's mtime, never computed live. Missing/empty file → `{}`.

**How to build a search index from this data:**

The 7 collections relate as follows:
Expand Down Expand Up @@ -433,6 +440,33 @@ When a user selects "wine", you submit `category_litter_object_id: 42, litter_ob

---

### GET /api/tags/most-tagged — Public "Most Tagged Litter" Ranking

**Auth:** None (public, read-only)

A public-facing ranking of the most-tagged litter, scoped to what is visible on the public map. Unlike `tag_usage_counts` in `/api/tags/all` (which is the internal *total recorded tags* signal), this figure is scoped to `is_public = 1 AND verified >= 2` — the documented "on the map" standard — so a published OLM number reconciles with the map. The divergence from the internal total is small (~2% across all objects).

Returns a list ordered by count descending (ties broken by `object_id` then `category_id` for a stable ranking), at coarse **object + category** granularity — the per-type buckets in the underlying file are summed, and zero-count pairs are hidden. Labels are not included: resolve `object_id`/`category_id` to keys client-side using the `objects` and `categories` collections already shipped by `/api/tags/all`.

**Response:**
```json
{
"generated_at": "2026-06-14",
"scope": "verified_public_on_map",
"most_tagged": [
{ "object_id": 89, "category_id": 12, "count": 85300 },
{ "object_id": 118, "category_id": 15, "count": 76703 },
{ "object_id": 9, "category_id": 8, "count": 43503 }
]
}
```

Served from a committed JSON file (`resources/data/tag_counts_public.json`) regenerated by hand via `php artisan olm:rebuild-tag-counts --scope=public`; cached on the file's mtime, never computed live. Missing/empty/malformed file → `{ "generated_at": null, "scope": null, "most_tagged": [] }`.

> **Note (map-visibility predicates, for the record — not a contract):** three endpoints apply different visibility filters. The cluster layer requires `verified >= 2 AND is_public = 1` (this endpoint matches it); a second clustering path uses `verified >= 1 AND is_public = 1`; and `GET /api/points` (`PointsController`) filters `is_public` only, with no `verified` gate. This is a known latent inconsistency, documented here so the full picture lives in one place.

---

### POST /api/v3/tags — Add Tags to Photo

**Auth:** Required (Sanctum)
Expand Down
1 change: 1 addition & 0 deletions readme/ArtisanCommands.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ These live in `tmp/` and are intended for the v5 migration period only.
|---------|---------|
| `seed:tags` | Run GenerateTagsSeeder (required for test DB setup) |
| `tags:verify-for-user-id {user_id}` | Verify remaining tags for a user |
| `olm:rebuild-tag-counts` | Rebuild a committed tag-counts JSON — recorded-tag counts per (object, category, type). `--scope=total` (default/omitted) writes `resources/data/tag_usage_counts.json` (all recorded tags, embedded in `/api/tags/all`); `--scope=public` writes `resources/data/tag_counts_public.json` (scoped to `is_public AND verified >= 2` — on-map photos — served by `/api/tags/most-tagged`). Run by hand (no schedule); commit the regenerated file. |

---

Expand Down
18 changes: 18 additions & 0 deletions readme/changelog/2026-06-14.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# 2026-06-14

## Session: Tag usage counts dataset

- v5.12.4 — Add `olm:rebuild-tag-counts` artisan command: aggregates all-time recorded-tag counts per `(litter_object_id, category_id, litter_object_type_id)` over tags with a non-null `litter_object_id` (`litter_object_id IS NOT NULL`) on non-soft-deleted photos into the committed JSON file `resources/data/tag_usage_counts.json` (`{generated_at, scope: "total_recorded_tags", counts}`). No DB table, no schedule, no per-request grouped query — regenerated by hand.
- v5.12.4 — `GetTagsController::getAllTags` now embeds the counts map under `tag_usage_counts` in `/api/tags/all`, read from the committed file and cached on file mtime (degrades gracefully to `{}` when the file is missing/empty). Powers the mobile "Most Tagged" browse view and popularity-ordered type picker.
- v5.12.4 — Add `config/tags.php` (`usage_counts_path`) as the single source of truth for the file path, shared by command (write) and controller (read), overridable in tests.
- v5.12.4 — Tests: `RebuildTagCountsCommandTest` (correct counts, soft-delete exclusion, null type_id → `0` key, extra-tag-only exclusion) and `TagUsageCountsTest` (embedded map, missing-file and empty-file graceful degradation). 7 new tests; full Tags + Api/Tags suites green (117 passed).
- v5.12.4 — Docs: `readme/API.md` (`/api/tags/all` response shape + scope note) and `readme/ArtisanCommands.md` (Tags table).

## Session: Public most-tagged litter statistic

- v5.12.5 — Add `--scope` option to `olm:rebuild-tag-counts`: `total` (default/omitted) is byte-for-byte the existing behaviour writing `resources/data/tag_usage_counts.json` (scope `total_recorded_tags`); `public` adds `WHERE is_public = 1 AND verified >= 2` and writes a separate committed file `resources/data/tag_counts_public.json` (scope `verified_public_on_map`) at the same (object, category, type) granularity so the two are diffable row-for-row. Uses the raw query builder's `>= 2` like `ClusteringService` (no `VerificationStatus` enum cast).
- v5.12.5 — Add public, read-only `GET /api/tags/most-tagged`: serves `tag_counts_public.json` as a ranked list at coarse object+category granularity (per-type buckets summed, zero-count pairs hidden, count desc with object/category id tie-break for a stable ranking). Returns `{object_id, category_id, count}` only — labels resolve client-side from the `/api/tags/all` vocabulary. Read cached on file mtime, never a live `GROUP BY`. Missing/empty/malformed file → empty list. Powers a public "Most Tagged Litter" figure that reconciles with the map (~2% below the internal total across all objects).
- v5.12.5 — Add `config/tags.php` `public_counts_path` as the single source of truth for the public file path (writer + reader + tests).
- v5.12.5 — Generated `resources/data/tag_counts_public.json` from the dev DB: 117 (object, category, type) keys, 562,201 on-map tags vs 572,859 total recorded (1.86% scoped out). Top public pairs: plastic/other (85.3k), butts/smoking (76.7k), packaging/food (43.5k), can/softdrinks (37.4k).
- v5.12.5 — Tests: `MostTaggedTest` (ranked coarse list summing types, zero-count exclusion, missing-file and malformed-file graceful degradation) and two new `RebuildTagCountsCommandTest` cases (`--scope=public` filters to `is_public AND verified >= 2`; `--scope=total` leaves the public file untouched). 13 tests green across the Tags + Api/Tags suites.
- v5.12.5 — Docs: `readme/API.md` (new `/api/tags/most-tagged` endpoint) and `readme/ArtisanCommands.md` (`--scope` option). Documented (but did not fix) the map-visibility predicate inconsistency: clusters use `verified >= 2 AND is_public`, a second clustering path uses `verified >= 1 AND is_public`, and `/api/points` filters `is_public` only.
Loading
Loading