nabi-allenby · ChefControl · Feb 22, 2026 · Feb 22, 2026 · Feb 22, 2026 · Feb 22, 2026
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,4 @@ tests/
 *.tgz
 frontend/node_modules/
 frontend/dist/
+.claude/
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/docs/api-reference.md b/docs/api-reference.md
@@ -20,13 +20,14 @@ Start a new crawl from a given URL.
 |-------|------|----------|-------------|
 | `url` | string | Yes | The URL to crawl (must be http or https) |
 | `depth` | integer | Yes | Maximum link depth to follow (1–5, where 1 = root only) |
+| `targeted` | boolean | No | When `true`, only follow links within the same registered domain (eTLD+1) as the root URL. Defaults to `false`. |
 
 **Example:**
 
 ```bash
 curl -X POST http://localhost:8080/api/v1/crawls \
   -H 'Content-Type: application/json' \
-  -d '{"url": "https://example.com", "depth": 2}'
+  -d '{"url": "https://example.com", "depth": 2, "targeted": true}'
 ```
 
 **Response:** `201 Created`
@@ -84,7 +85,8 @@ curl "http://localhost:8080/api/v1/crawls?status=running&limit=10"
       "total": 42,
       "completed": 40,
       "failed": 2,
-      "cancelled": 0
+      "cancelled": 0,
+      "targeted": true
     }
   ],
   "total": 1,
@@ -128,7 +130,8 @@ curl http://localhost:8080/api/v1/crawls/d262a3e7-19de-437f-b0a4-cf1d689b1caf
   "failed": 60,
   "cancelled": 0,
   "root_url": "https://example.com",
-  "requested_depth": 3
+  "requested_depth": 3,
+  "targeted": false
 }
 ```
 

diff --git a/docs/project-vision.md b/docs/project-vision.md
@@ -0,0 +1,47 @@
+# Web crawler vision
+Create a free, open-source, deployable platform for Red & Blue teams that want to discover the web attack surface of their applications.
+
+## About
+This file should be used as a set of general guidelines for development. When design decisions are made, this doc should define the "spirit" of those decisions.
+
+## My philosophy
+1. Don't reinvent the wheel - There is code written by smarter people than you. Be humble and use well-established code and tools.
+2. Open Source - This platform should be open and transparent for everyone to contribute, share, and use.
+3. Respect others - Use this platform for the betterment of software and products. Make the world better than you found it.
+4. Have fun - The process of creating things should be fun. There will be chores, but enjoy the process.
+
+
+## Design Principles (Derived from above)
+These principles are a collection of coding and design rules I personally came across and found to work. A lot of this is based on other people's design principles.
+
+---
+
+### Don't reinvent the wheel
+
+#### Adopt mainstream tools
+Use well-established tools from other open-source projects. Only create custom tools when it's absolutely necessary.
+
+#### Keep it simple, stupid
+Keep the project as simple as possible. The more moving parts, the less scalable it becomes, and the more things break.
+
+### Open Source
+
+#### All source code is public
+The project vision is to be an open-source platform for blue & red teams; anyone can contribute.
+
+#### All source code should be free for individuals
+This platform should always be free for individuals and, for the foreseeable future, for anyone. The code license should reflect that.
+
+### Respect others
+
+#### Respectful crawling
+Rate limiting, robots.txt awareness, and polite user-agent strings by default. The tool should be hard to misuse for DoS or abuse.
+
+### Have fun
+
+#### Visualization graph should be fun to use and explore
+The visuals and tools for exploring the graph should be fun for the user, possibly gamified.
+
+#### Project theme should be fun
+The theme of this project should be cartoony, playful, and fun. The main theme is cobweb (as it's a crawler).
+
diff --git a/feeder/src/job.rs b/feeder/src/job.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};
 
 use neo4rs::{query, Graph};
 
@@ -16,6 +16,8 @@ pub struct UrlJob {
     pub current_depth: i64,
     pub attempts: Option<i64>,
     pub crawl_id: String,
+    pub targeted: bool,
+    pub target_domain: String,
 }
 
 /// Represents a child node to be created in Neo4j.
@@ -28,6 +30,8 @@ struct ChildNode {
     current_depth: i64,
     request_time: String,
     crawl_id: String,
+    targeted: bool,
+    target_domain: String,
 }
 
 /// Atomically fetches and claims a single URL job from Neo4j.
@@ -64,6 +68,8 @@ pub async fn fetch_job(graph: &Graph, stale_timeout: i64) -> Result<Option<UrlJo
                 current_depth: node.get("current_depth")?,
                 attempts: node.get::<i64>("attempts").ok(),
                 crawl_id: node.get("crawl_id").unwrap_or_default(),
+                targeted: node.get::<bool>("targeted").unwrap_or(false),
+                target_domain: node.get::<String>("target_domain").unwrap_or_default(),
             }))
         }
         None => Ok(None),
@@ -110,23 +116,23 @@ async fn validate_job(
 
             tracing::warn!("Request failed: {} -- Attempts: {} -- Error: {}", full_url, attempts, e);
 
-            if attempts >= config.max_attempts {
-                tracing::error!(
-                    "Failure limit reached! Giving up on {} after {} attempts.",
-                    full_url,
-                    attempts
-                );
+            // 4xx errors are permanent — fail immediately without retry
+            let is_permanent = matches!(e, CrawlerError::HttpStatus { status, .. } if (400..500).contains(&status));
+
+            if is_permanent || attempts >= config.max_attempts {
+                if !is_permanent {
+                    tracing::error!(
+                        "Failure limit reached! Giving up on {} after {} attempts.",
+                        full_url,
+                        attempts
+                    );
+                }
                 update_job_status(graph, job, "FAILED", Some(attempts)).await?;
             } else {
-                // Fix: reset to PENDING so other feeders can retry
+                // Reset to PENDING so other feeders can retry
                 update_job_status(graph, job, "PENDING", Some(attempts)).await?;
             }
 
-            // Return permanent failures (4xx) as immediate failure
-            if matches!(e, CrawlerError::HttpStatus { status, .. } if (400..500).contains(&status)) {
-                update_job_status(graph, job, "FAILED", Some(attempts)).await?;
-            }
-
             Ok(None)
         }
     }
@@ -181,7 +187,8 @@ async fn batch_create_children(
                  ON CREATE SET c.ip = $ip, c.domain = $domain, \
                      c.job_status = CASE WHEN $cur_depth = $req_depth THEN 'COMPLETED' ELSE 'PENDING' END, \
                      c.requested_depth = $req_depth, \
-                     c.current_depth = $cur_depth, c.request_time = $req_time \
+                     c.current_depth = $cur_depth, c.request_time = $req_time, \
+                     c.targeted = $targeted, c.target_domain = $target_domain \
                  MERGE (p)-[:Lead]->(c)",
             )
             .param("pname", parent.name.as_str())
@@ -194,7 +201,9 @@ async fn batch_create_children(
             .param("http_type", child.http_type.as_str())
             .param("req_depth", child.requested_depth)
             .param("cur_depth", child.current_depth)
-            .param("req_time", child.request_time.as_str()),
+            .param("req_time", child.request_time.as_str())
+            .param("targeted", child.targeted)
+            .param("target_domain", child.target_domain.as_str()),
         )
         .await?;
     }
@@ -279,11 +288,24 @@ pub async fn feeding(
         None => return Ok(false),
     };
 
-    // Step 2: Extract URLs from HTML
+    // Step 2: Extract URLs from HTML and normalize once
     let extracted_urls = crawler::extract_urls(&page_data.html);
+    let mut normalized_map: HashMap<String, (String, String)> = HashMap::new();
+    for url in &extracted_urls {
+        let (norm_name, http_type) = url_normalize::normalize_url(url);
+        let upper_key = format!("{}{}", http_type, norm_name).to_uppercase();
+        normalized_map.entry(upper_key).or_insert((norm_name, http_type));
+    }
+
+    // Step 2b: Filter by target domain when targeted
+    if job.targeted && !job.target_domain.is_empty() {
+        normalized_map.retain(|_, (norm_name, _)| {
+            url_normalize::is_same_registered_domain(norm_name, &job.target_domain)
+        });
+    }
 
     // Step 3: Deduplicate against existing DB nodes (server-side)
-    let upper_urls: HashSet<String> = extracted_urls.iter().map(|u| u.to_uppercase()).collect();
+    let upper_urls: HashSet<String> = normalized_map.keys().cloned().collect();
     let new_urls = filter_new_urls(graph, &upper_urls, &job.crawl_id).await?;
 
     if new_urls.is_empty() {
@@ -292,24 +314,28 @@ pub async fn feeding(
         return Ok(true);
     }
 
-    // Step 4: Normalize, DNS resolve in parallel, build child list
+    // Step 4: DNS resolve in parallel, build child list
     let normalized: HashSet<(String, String)> = new_urls
         .iter()
-        .map(|u| url_normalize::normalize_url(u))
+        .filter_map(|key| normalized_map.get(key).cloned())
         .collect();
 
     let request_time = format!("{:?}", page_data.elapsed);
     let requested_depth = job.requested_depth;
     let current_depth = job.current_depth;
     let crawl_id = job.crawl_id.clone();
 
+    let targeted = job.targeted;
+    let target_domain = job.target_domain.clone();
+
     let dns_futures: Vec<_> = normalized
         .iter()
         .map(|(name, http_type)| {
             let name = name.clone();
             let http_type = http_type.clone();
             let req_time = request_time.clone();
             let cid = crawl_id.clone();
+            let td = target_domain.clone();
             async move {
                 match dns::get_network_stats(resolver, &name, config.max_dns_depth).await {
                     Ok(stats) => Some(ChildNode {
@@ -321,6 +347,8 @@ pub async fn feeding(
                         current_depth: current_depth + 1,
                         request_time: req_time,
                         crawl_id: cid,
+                        targeted,
+                        target_domain: td,
                     }),
                     Err(e) => {
                         tracing::error!("URL: {} -- FAILED: {}", name, e);

diff --git a/feeder/src/main.rs b/feeder/src/main.rs
@@ -123,6 +123,8 @@ async fn main() -> anyhow::Result<()> {
             current_depth: url_job.current_depth,
             attempts: url_job.attempts,
             crawl_id: url_job.crawl_id.clone(),
+            targeted: url_job.targeted,
+            target_domain: url_job.target_domain.clone(),
         });
 
         // Check for shutdown after claiming but before processing.

diff --git a/frontend/package-lock.json b/frontend/package-lock.json
diff --git a/frontend/package.json b/frontend/package.json
@@ -21,6 +21,7 @@
     "@tanstack/react-query": "^5.62.0",
     "class-variance-authority": "^0.7.1",
     "clsx": "^2.1.1",
+    "d3-force": "^3.0.0",
     "lucide-react": "^0.460.0",
     "react": "^18.3.1",
     "react-dom": "^18.3.1",
@@ -33,6 +34,7 @@
   },
   "devDependencies": {
     "@eslint/js": "^9.15.0",
+    "@types/d3-force": "^3.0.10",
     "@types/react": "^18.3.12",
     "@types/react-dom": "^18.3.1",
     "@vitejs/plugin-react": "^4.3.4",