diff --git a/tasks/manifest.yaml b/tasks/manifest.yaml index 1a80e2a..38b22cc 100644 --- a/tasks/manifest.yaml +++ b/tasks/manifest.yaml @@ -110,6 +110,30 @@ categories: - task_log_apache_critical - task_log_apache_timeline - task_log_syslog_boot + - task_log_nginx_status_codes + - task_log_nginx_traffic + - task_log_nginx_slow_requests + - task_log_nginx_user_agents + - task_log_nginx_errors + - task_log_ssh_failed_logins + - task_log_ssh_brute_force + - task_log_ssh_successful + - task_log_ssh_user_activity + - task_log_ssh_unusual_times + - task_log_hdfs_failures + - task_log_hdfs_connections + - task_log_hdfs_slow_ops + - task_log_hdfs_block_ops + - task_log_hdfs_storage + - task_log_mapreduce_jobs + - task_log_mapreduce_failures + - task_log_mapreduce_slow_tasks + - task_log_mapreduce_resources + - task_log_mapreduce_timeline + - task_log_syslog_anomalies + - task_log_syslog_services + - task_log_syslog_cron + - task_log_syslog_auth_failures meeting_analysis: - task_meeting_council_votes diff --git a/tasks/task_log_hdfs_block_ops.md b/tasks/task_log_hdfs_block_ops.md new file mode 100644 index 0000000..e8b7e5b --- /dev/null +++ b/tasks/task_log_hdfs_block_ops.md @@ -0,0 +1,146 @@ +--- +id: task_log_hdfs_block_ops +name: HDFS DataNode Log - Block Operations Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Block Operations Summary + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and produce a comprehensive summary of all block operations. The log comes from an HDFS cluster and tracks block lifecycle events. + +Your report should include: + +1. **Block Inventory**: Total unique block IDs in the log, with a full list +2. **Operation Types**: For each operation type (allocateBlock, Receiving, Received, addStoredBlock, replicate, PacketResponder), count total occurrences +3. 
**Block Lifecycle Tracking**: For each block that has a complete lifecycle (allocate → receive → stored), document the full chain +4. **Replication Chain**: For blocks with replication events, trace the replication path across nodes +5. **Associated Jobs**: Identify the MapReduce jobs that triggered these block operations (visible in file paths) +6. **Per-Block Detail Table**: Create a table with columns: Block ID, Size (if known), Allocated Path, Nodes Involved, Replication Count + +Write the report to `hdfs_block_ops_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse 2000 log entries and produce: + +**Block Inventory:** +- ~390 unique block IDs + +**Operation Counts:** +- Receiving block: ~1149 +- allocateBlock: ~385 +- Received block: ~19 +- addStoredBlock: ~19 +- PacketResponder: ~12 +- Replicate: 4 + +**Complete Block Lifecycles (blocks with full data):** +- blk_-1608999687919862906: 91178 bytes, allocated for job_200811092030_0001/job.jar +- blk_7503483334202473044: 233217 bytes, allocated for job_200811092030_0001/job.split +- blk_-3544583377289625738: 11971 bytes +- blk_-9073992586687739851: 11977 bytes + +**Replication Chain:** +- blk_-1608999687919862906 was replicated 4 times across the cluster: + 10.250.14.224 → 10.251.215.16 → 10.251.74.79 → 10.251.31.5 → 10.251.90.64 + +**Associated Job:** +- job_200811092030_0001 — MapReduce job, files: job.jar, job.split + +Acceptable variations: +- Block ID lists may be truncated +- Not all 390 blocks need full detail — just those with complete lifecycle data +- Table format may vary + +--- + +## Grading Criteria + +- [ ] `hdfs_block_ops_report.md` is created in the workspace +- [ ] Unique block count is provided (~390) +- [ ] Operation types are counted (receiving, allocate, replicate, etc.) 
+- [ ] At least one block lifecycle is fully traced (allocate → receive → stored) +- [ ] The associated MapReduce job is identified (job_200811092030_0001) + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS block operations summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_block_ops_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "block_count": 0.0, + "operations_counted": 0.0, + "lifecycle_traced": 0.0, + "job_identified": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Block count + has_count = any(n in content for n in ["390", "~390", "385", "~385", "380", "~400"]) + scores["block_count"] = ( + 1.0 if has_count else + 0.5 if any(kw in content for kw in ["hundred", "unique block"]) else 0.0 + ) + + # Check 2: Operations counted + op_keywords = ["receiving", "allocate", "replicate", "addstored", + "packetresponder", "received"] + ops_found = sum(1 for kw in op_keywords if kw in content) + scores["operations_counted"] = ( + 1.0 if ops_found >= 4 else + 0.5 if ops_found >= 2 else 0.0 + ) + + # Check 3: Block lifecycle traced + lifecycle_keywords = ["91178", "233217", "blk_-1608999687919862906", + "blk_7503483334202473044", "lifecycle", "job.jar", "job.split"] + scores["lifecycle_traced"] = ( + 1.0 if sum(1 for kw in lifecycle_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in lifecycle_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: MapReduce job identified + has_job = "job_200811092030_0001" in content or "200811092030" in content + has_mapreduce = "mapreduce" in content or "mapred" in content or "map reduce" in content + scores["job_identified"] = ( + 1.0 if has_job else + 0.5 if has_mapreduce else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + 
+- Most blocks only have "Receiving" and "allocateBlock" entries — the cluster was mid-operation +- Only ~19 blocks have complete lifecycle data with confirmed sizes +- The 390 block IDs represent a MapReduce job's data being distributed across the cluster +- Replication is only logged for blk_-1608999687919862906, which is replicated 4 times +- File paths show this is related to a MapReduce job: `/mnt/hadoop/mapred/system/job_200811092030_0001/` + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_hdfs_connections.md b/tasks/task_log_hdfs_connections.md new file mode 100644 index 0000000..35ec348 --- /dev/null +++ b/tasks/task_log_hdfs_connections.md @@ -0,0 +1,142 @@ +--- +id: task_log_hdfs_connections +name: HDFS DataNode Log - Connection Pattern Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Connection Pattern Analysis + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and produce a report on connection and communication patterns between nodes. The log contains entries from DataNode, FSNamesystem, and PacketResponder components. + +Your report should include: + +1. **Network Topology**: List all unique IP addresses that appear in the log, categorized by their role (source, destination, or both) +2. **Subnet Analysis**: Group IPs by subnet (e.g., 10.250.x.x vs 10.251.x.x). How many nodes are in each subnet? +3. **Most Active Nodes**: Top 10 IPs by frequency of appearance (as source or destination) +4. **Communication Patterns**: Which pairs of nodes communicate most frequently? +5. **DataNode vs NameSystem**: Separate the activity — what comes from DataNode operations vs FSNamesystem operations? +6. 
**Cluster Size Estimate**: Based on the IPs observed, estimate the cluster size + +Write the report to `hdfs_connections_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse 2000 log entries and produce: + +**Network Topology:** +- 202 unique IP addresses observed +- IPs fall in the 10.250.x.x and 10.251.x.x ranges (private network) +- All nodes use port 50010 (HDFS DataNode data transfer port) + +**Subnet Analysis:** +- 10.250.x.x subnet — contains some of the most active nodes +- 10.251.x.x subnet — contains additional DataNode cluster members +- The split suggests a multi-rack HDFS deployment + +**Most Active Nodes:** +- 10.250.19.102 — extremely active (appears as source in many block transfers) +- 10.250.10.6, 10.251.215.16, 10.250.14.224 — also very active + +**Component Activity:** +- DataNode$DataXceiver: Block receive operations (~1149 entries) +- FSNamesystem: Block allocation and storage tracking (~400+ entries) +- DataNode$PacketResponder: Block receive confirmations with sizes + +Acceptable variations: +- Exact IP counts and rankings may vary by parsing approach +- Subnet grouping granularity may differ +- Cluster size estimates will be approximate + +--- + +## Grading Criteria + +- [ ] `hdfs_connections_report.md` is created in the workspace +- [ ] Unique IPs are listed or counted (~202) +- [ ] IPs are grouped by subnet (10.250.x.x vs 10.251.x.x) +- [ ] Most active nodes are identified +- [ ] DataNode vs FSNamesystem activity is distinguished + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS connection pattern analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_connections_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "ips_listed": 0.0, + "subnets_grouped": 0.0, + "active_nodes": 0.0, + "components_separated": 0.0, 
+ } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: IPs listed/counted + has_count = any(n in content for n in ["202", "200", "~200", "over 200"]) + has_ips = "10.250" in content and "10.251" in content + scores["ips_listed"] = ( + 1.0 if has_count and has_ips else + 0.5 if has_ips else 0.0 + ) + + # Check 2: Subnets grouped + subnet_keywords = ["subnet", "10.250", "10.251", "rack", "network segment", + "address range", "ip range"] + scores["subnets_grouped"] = ( + 1.0 if "10.250" in content and "10.251" in content and + sum(1 for kw in subnet_keywords if kw in content) >= 2 else + 0.5 if "10.250" in content and "10.251" in content else 0.0 + ) + + # Check 3: Active nodes identified + active_ips = ["10.250.19.102", "10.251.215.16", "10.250.14.224", "10.250.10.6"] + ips_found = sum(1 for ip in active_ips if ip in content) + scores["active_nodes"] = ( + 1.0 if ips_found >= 2 else + 0.5 if ips_found >= 1 else 0.0 + ) + + # Check 4: Components separated + component_keywords = ["dataxceiver", "dataxeceiver", "fsnamesystem", + "packetresponder", "namenode", "datanode"] + scores["components_separated"] = ( + 1.0 if sum(1 for kw in component_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in component_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- 202 unique IPs — this is a large HDFS cluster +- Two main subnets: 10.250.x.x and 10.251.x.x +- Port 50010 is used throughout — standard HDFS DataNode port +- 10.250.19.102 appears as source in a disproportionate number of entries +- The log captures a burst of activity related to job_200811092030_0001 + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_hdfs_failures.md b/tasks/task_log_hdfs_failures.md new file mode 100644 index 0000000..5dc5df0 --- /dev/null +++ b/tasks/task_log_hdfs_failures.md @@ -0,0 +1,147 @@ +--- +id: task_log_hdfs_failures +name: HDFS DataNode Log - Block and Replication Failure Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Block and Replication Failure Analysis + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and identify any block operation failures, replication issues, or error conditions. The log is from an HDFS cluster and contains DataNode, FSNamesystem, and PacketResponder entries. + +Your report should include: + +1. **Log Overview**: Total entries, date/time range, log level distribution (INFO, WARN, ERROR) +2. **Block Operation Summary**: Count of block receives, allocations, stored block confirmations, and replications +3. **Error and Warning Analysis**: List any WARN or ERROR level entries with details +4. **Replication Activity**: Detail all replication requests — which blocks are being replicated, from where to where? +5. **Failed or Incomplete Operations**: Are there any blocks where receive started but confirmation was never logged? +6. **Health Assessment**: Based on the log, is the HDFS cluster operating normally? + +Write the report to `hdfs_failure_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse 2000 log entries and produce: + +**Log Overview:** +- 2000 entries, all from November 9, 2008 (081109), covering ~28 seconds (203518–203546) +- All entries are INFO level — no WARN or ERROR entries + +**Block Operations:** +- Receiving block: ~1149 entries +- Allocate block: ~385 entries +- Received block (confirmed): ~19 entries +- addStoredBlock: ~19 entries +- PacketResponder: ~12 entries +- Replication requests: 4 + +**Replication Details:** +- Block blk_-1608999687919862906 has 4 replication requests: + - 10.250.14.224 → 10.251.215.16 + - 10.251.215.16 → 10.251.74.79 + - 10.251.107.19 → 10.251.31.5 + - 10.251.31.5 → 10.251.90.64 + +**Health Assessment:** +- No errors or warnings — the cluster appears healthy +- The large number of "Receiving block" entries (1149) with relatively few confirmations (19) suggests high concurrency +- Replication activity for a single block across multiple nodes is normal HDFS behavior + +Acceptable variations: +- Exact counts may differ slightly depending on parsing approach +- Assessment language will vary +- The distinction between "no failures" and "potential incomplete operations" is valid + +--- + +## Grading Criteria + +- [ ] `hdfs_failure_report.md` is created in the workspace +- [ ] Log overview with entry count and time range is provided +- [ ] Block operations are categorized and counted (receive, allocate, replicate) +- [ ] The absence of WARN/ERROR entries is noted (or any found are detailed) +- [ ] A health assessment is provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS failure analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_failure_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "log_overview": 0.0, + "operations_counted": 0.0, + "error_status": 0.0, + 
"health_assessment": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Log overview + has_count = any(n in content for n in ["2000", "2,000"]) + has_date = any(d in content for d in ["081109", "november 9", "nov 9", "2008-11-09", "nov 2008"]) + scores["log_overview"] = ( + 1.0 if has_count and has_date else + 0.5 if has_count or has_date else 0.0 + ) + + # Check 2: Operations categorized + op_keywords = ["receiving", "allocate", "replicate", "addstored", + "packetresponder", "block operation"] + ops_found = sum(1 for kw in op_keywords if kw in content) + scores["operations_counted"] = ( + 1.0 if ops_found >= 3 else + 0.5 if ops_found >= 2 else 0.0 + ) + + # Check 3: Error status noted + error_keywords = ["no error", "no warn", "all info", "no failures", + "0 error", "0 warn", "no warning", "entirely info"] + scores["error_status"] = ( + 1.0 if sum(1 for kw in error_keywords if kw in content) >= 1 else + 0.5 if "info" in content else 0.0 + ) + + # Check 4: Health assessment + health_keywords = ["healthy", "normal", "operating correctly", "no issues", + "good health", "stable", "functioning"] + scores["health_assessment"] = ( + 1.0 if sum(1 for kw in health_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Format: `YYMMDD HHMMSS threadID LEVEL component: message` +- Date: November 9, 2008 (081109), times 203518–203546 (~28 seconds of activity) +- 202 unique IP addresses in the cluster +- 390 unique block IDs +- Block sizes range from 11,971 to 233,217 bytes +- This is a burst of HDFS activity — likely a MapReduce job starting (job_200811092030_0001 visible in paths) + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_hdfs_slow_ops.md b/tasks/task_log_hdfs_slow_ops.md new file mode 100644 index 0000000..0a1205f --- /dev/null +++ b/tasks/task_log_hdfs_slow_ops.md @@ -0,0 +1,144 @@ +--- +id: task_log_hdfs_slow_ops +name: HDFS DataNode Log - Slow Operation Detection +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Slow Operation Detection + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and identify operations that took longer than expected. The log records block receives, allocations, and replications with timestamps. + +Your report should include: + +1. **Block Lifecycle Timing**: For blocks where both "Receiving block" and "Received block" entries exist, calculate the elapsed time +2. **Allocation-to-Receive Timing**: For blocks where both "allocateBlock" and first "Receiving block" entries exist, calculate the delay +3. **Replication Timing**: How quickly are replication requests issued after block allocation? +4. **Slowest Operations**: Rank the top 5 slowest block operations by elapsed time +5. **Block Size vs Time Correlation**: Do larger blocks take longer? Correlate block size with transfer time where both are available +6. **Performance Summary**: Overall assessment of cluster performance during this period + +Write the report to `hdfs_slow_ops_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse timestamps from the log format `YYMMDD HHMMSS` and calculate: + +**Block Lifecycle:** +- The log covers only ~28 seconds (203518 to 203546) +- Most block operations complete within 1-3 seconds +- Confirmed block receives with sizes: 91178 bytes, 233217 bytes, 11971 bytes, 11977 bytes + +**Key observations:** +- blk_-1608999687919862906 (91178 bytes): Allocated at 203518, first receive at 203518, confirmed at 203519 (~1 second) +- blk_7503483334202473044 (233217 bytes): Allocated at 203520, confirmed at 203521 (~1 second) +- blk_-3544583377289625738 (11971 bytes): Confirmed at 203522-203523 +- Block operations are very fast — consistent with a healthy cluster under normal load + +**Replication:** +- 4 replication requests for blk_-1608999687919862906, issued at 203521, 203524, 203527, 203530 +- ~3 second intervals between replication hops + +**Performance:** +- All operations complete in 1-3 seconds — no slow operations detected +- The cluster is performing well during this snapshot + +Acceptable variations: +- Timestamp resolution is 1 second, so some timing analysis will be approximate +- Different approaches to matching start/end events are valid +- Block size correlation may show insufficient data for meaningful analysis + +--- + +## Grading Criteria + +- [ ] `hdfs_slow_ops_report.md` is created in the workspace +- [ ] Block lifecycle timing is calculated for at least one block +- [ ] Block sizes are correlated with operation times where data is available +- [ ] The time range of the log is correctly identified (~28 seconds) +- [ ] A performance assessment is provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS slow operation detection task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_slow_ops_report.md" + + if not report_file.exists(): + return { + 
"output_created": 0.0, + "lifecycle_timing": 0.0, + "size_correlation": 0.0, + "time_range": 0.0, + "performance_assessment": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Block lifecycle timing calculated + has_timing = any(kw in content for kw in ["1 second", "2 second", "3 second", + "elapsed", "duration", "latency", + "took", "completed in"]) + has_block = "blk_" in content or "blk-" in content or "block" in content + scores["lifecycle_timing"] = ( + 1.0 if has_timing and has_block else + 0.5 if has_timing or has_block else 0.0 + ) + + # Check 2: Block sizes mentioned + sizes = ["91178", "233217", "11971", "11977"] + sizes_found = sum(1 for s in sizes if s in content) + has_correlation = any(kw in content for kw in ["size", "bytes", "larger", "smaller"]) + scores["size_correlation"] = ( + 1.0 if sizes_found >= 2 and has_correlation else + 0.5 if sizes_found >= 1 else 0.0 + ) + + # Check 3: Time range identified + has_28s = any(kw in content for kw in ["28 second", "~28", "30 second", + "half a minute", "less than a minute"]) + has_timestamps = "203518" in content or "20:35:18" in content or "20:35" in content + scores["time_range"] = ( + 1.0 if has_28s or has_timestamps else + 0.5 if any(kw in content for kw in ["short", "brief", "seconds"]) else 0.0 + ) + + # Check 4: Performance assessment + perf_keywords = ["healthy", "normal", "fast", "no slow", "performing well", + "efficient", "optimal", "good performance"] + scores["performance_assessment"] = ( + 1.0 if sum(1 for kw in perf_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Timestamp format: YYMMDD HHMMSS (e.g., 081109 203518 = 2008-11-09 20:35:18) +- Resolution: 1 second — so sub-second timing is not available +- 390 unique blocks, but only ~19 have confirmed "Received" entries with sizes +- The cluster is in a burst of activity (job startup), so 
performance is under load +- No errors or warnings suggest all operations completed successfully + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_hdfs_storage.md b/tasks/task_log_hdfs_storage.md new file mode 100644 index 0000000..f7c56e5 --- /dev/null +++ b/tasks/task_log_hdfs_storage.md @@ -0,0 +1,148 @@ +--- +id: task_log_hdfs_storage +name: HDFS DataNode Log - Storage and Capacity Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Storage and Capacity Analysis + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and produce a storage-focused analysis. Examine block sizes, data distribution across nodes, and storage patterns. + +Your report should include: + +1. **Data Volume**: Total bytes stored across all confirmed block receives (where size is known) +2. **Block Size Distribution**: List all known block sizes, calculate min/max/mean/median +3. **Data Distribution by Node**: For each node that confirmed receiving blocks (PacketResponder "Received" entries), total the bytes stored +4. **Storage Path Analysis**: What storage paths are being used? (Extract from allocateBlock file paths) +5. **Replication Factor**: Based on how many nodes receive the same block, what is the effective replication factor? +6. **Capacity Planning**: Based on the data ingestion rate observed, estimate the storage needed for 1 hour of similar activity + +Write the report to `hdfs_storage_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse the log and calculate: + +**Confirmed Block Sizes:** +- blk_-1608999687919862906: 91,178 bytes (received by 3+ nodes) +- blk_7503483334202473044: 233,217 bytes (received by 3 nodes) +- blk_-3544583377289625738: 11,971 bytes (received by 3 nodes) +- blk_-9073992586687739851: 11,977 bytes (received by 3 nodes) + +**Block Size Statistics:** +- Min: 11,971 bytes (~12 KB) +- Max: 233,217 bytes (~228 KB) +- Mean: ~87,086 bytes (~85 KB) +- Total confirmed data: ~348,343 bytes per replica + +**Replication Factor:** +- Each confirmed block is received by 3 nodes → replication factor of 3 +- This is standard HDFS default replication + +**Storage Path:** +- `/mnt/hadoop/mapred/system/job_200811092030_0001/` — MapReduce job staging directory +- Files: job.jar, job.split + +**Capacity Planning:** +- ~28 seconds of activity produced ~390 block allocations +- If each block averages ~85 KB with replication factor 3, that's ~100 MB/minute raw storage +- 1 hour estimate: ~6 GB (rough) + +Acceptable variations: +- Capacity estimates will be very rough given limited confirmed sizes +- Approaches to extrapolation will differ +- Statistics should be based on confirmed sizes only + +--- + +## Grading Criteria + +- [ ] `hdfs_storage_report.md` is created in the workspace +- [ ] Known block sizes are listed (91178, 233217, 11971, 11977) +- [ ] Block size statistics are calculated (min, max, mean) +- [ ] Replication factor is identified (3) +- [ ] Storage paths are extracted from the log + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS storage and capacity analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_storage_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "block_sizes_listed": 0.0, + "statistics_calculated": 0.0, + "replication_factor": 0.0, + 
"storage_paths": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Block sizes listed + sizes = ["91178", "233217", "11971", "11977"] + sizes_found = sum(1 for s in sizes if s in content) + scores["block_sizes_listed"] = ( + 1.0 if sizes_found >= 3 else + 0.5 if sizes_found >= 1 else 0.0 + ) + + # Check 2: Statistics calculated + stat_keywords = ["min", "max", "mean", "median", "average", "total", + "distribution", "range"] + scores["statistics_calculated"] = ( + 1.0 if sum(1 for kw in stat_keywords if kw in content) >= 3 else + 0.5 if sum(1 for kw in stat_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 3: Replication factor identified + has_replication = any(kw in content for kw in ["replication factor", + "replication of 3", + "factor of 3", + "3 replicas", + "three replicas", + "3 copies", + "three copies", + "replicated 3", + "3 nodes"]) + scores["replication_factor"] = 1.0 if has_replication else 0.0 + + # Check 4: Storage paths extracted + has_path = any(p in content for p in ["/mnt/hadoop", "job_200811092030", + "mapred/system", "job.jar", "job.split"]) + scores["storage_paths"] = 1.0 if has_path else 0.0 + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Only 4 unique block sizes are confirmed in the log (from PacketResponder "Received" entries) +- 390 blocks were allocated but only ~19 receive confirmations appear in the log window +- Standard HDFS replication factor is 3, which matches the 3 receive confirmations per block +- The addStoredBlock entries (19) update the NameSystem's block map +- Storage is under `/mnt/hadoop/mapred/system/` — standard MapReduce staging directory + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_failures.md b/tasks/task_log_mapreduce_failures.md new file mode 100644 index 0000000..f7a27a9 --- /dev/null +++ b/tasks/task_log_mapreduce_failures.md @@ -0,0 +1,147 @@ +--- +id: task_log_mapreduce_failures +name: MapReduce Log - Failed Task Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Failed Task Analysis + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and identify any task failures, errors, or anomalies. Focus on anything that went wrong during execution. + +Your report should include: + +1. **Error and Warning Entries**: List all WARN and ERROR level log entries with full context +2. **Task Retries**: Identify any tasks that required multiple attempts (look for attempt numbers > 0) +3. **Root Cause Analysis**: For each error, explain the likely cause +4. **I/O Errors**: Detail any IOException or network-related failures +5. **Impact Assessment**: Did any failures impact the overall job result? +6. **Failure Prevention**: Recommend changes to prevent these failures in future runs + +Write the report to `mapreduce_failures.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should identify: + +**WARN Entries (4 total):** +1. ResponseProcessor for block BP-1347369012-10.190.173.170-1444972147527:blk_1073742514_1708 — related to I/O issue +2. DataStreamer for file /tmp/hadoop-yarn/staging/msrabi/.staging/job_1445062781478_0011/job — write pipeline issue +3. CommitterEvent Processor — FileOutputCommitter recovery task count issue + +**ERROR Entries (1 total):** +1. 
java.io.IOException: Bad response ERROR for block BP-1347369012-10.190.173.170-1444972147527:blk_1073742514_1708 from datanode — an I/O error during block write
+
+**Task Retries:**
+- attempt_1445062781478_0011_m_000006_1 (retry of m_000006_0)
+- attempt_1445062781478_0011_m_000007_1 (retry of m_000007_0)
+- Two map tasks needed a second attempt, suggesting transient failures
+
+**Root Cause:**
+- The IOException and WARN entries relate to HDFS block write failures
+- A DataNode returned a "Bad response ERROR" for block blk_1073742514_1708
+- This is a transient HDFS I/O error, likely caused by a DataNode being unavailable or overloaded
+
+**Impact:**
+- Despite the errors, the overall job SUCCEEDED
+- YARN's retry mechanism handled the transient failures transparently
+- 2 out of 10 map tasks needed retries — 20% retry rate
+
+Acceptable variations:
+- Root cause analysis depth may vary
+- Prevention recommendations will differ
+- Some agents may find additional context around the errors
+
+---
+
+## Grading Criteria
+
+- [ ] `mapreduce_failures.md` is created in the workspace
+- [ ] WARN and ERROR entries are listed (4 WARN, 1 ERROR)
+- [ ] Task retries are identified (m_000006 and m_000007 retried)
+- [ ] The IOException / bad response error is analyzed
+- [ ] Impact assessment notes the job still succeeded
+
+---
+
+## Automated Checks
+
+```python
+def grade(transcript: list, workspace_path: str) -> dict:
+    """Grade the MapReduce failed task analysis."""
+    from pathlib import Path
+
+    scores = {}
+    workspace = Path(workspace_path)
+    report_file = workspace / "mapreduce_failures.md"
+
+    if not report_file.exists():
+        return {
+            "output_created": 0.0,
+            "warn_error_listed": 0.0,
+            "retries_identified": 0.0,
+            "ioexception_analyzed": 0.0,
+            "impact_assessed": 0.0,
+        }
+
+    scores["output_created"] = 1.0
+    content = report_file.read_text(encoding="utf-8").lower()
+
+    # Check 1: WARN/ERROR entries listed
+    has_warn = "warn" in content
+    has_error = "error" in content
+    # Exact counts ("4 warn", "1 error") are intentionally NOT required for credit.
+    scores["warn_error_listed"] = (
+        1.0 if has_warn and has_error else
+        0.5 if has_warn or has_error else 0.0
+    )
+
+    # Check 2: Task retries identified
+    has_006 = "m_000006" in content or "000006" in content
+    has_007 = "m_000007" in content or "000007" in content
+    has_retry = any(kw in content for kw in ["retry", "retried", "reattempt", "second attempt",
+                                             "000006_1", "000007_1", "attempt 1"])  # not bare "_1": it matches the job ID
+    scores["retries_identified"] = (
+        1.0 if (has_006 or has_007) and has_retry else
+        0.5 if has_retry else 0.0
+    )
+
+    # Check 3: IOException analyzed
+    io_keywords = ["ioexception", "io exception", "bad response", "block write",
+                   "datanode", "datastreamer", "blk_1073742514"]
+    scores["ioexception_analyzed"] = (
+        1.0 if sum(1 for kw in io_keywords if kw in content) >= 2 else
+        0.5 if sum(1 for kw in io_keywords if kw in content) >= 1 else 0.0
+    )
+
+    # Check 4: Impact assessment
+    impact_keywords = ["succeeded", "success", "still completed", "job completed",
+                       "transparent", "handled", "recovered", "despite"]
+    scores["impact_assessed"] = (
+        1.0 if sum(1 for kw in impact_keywords if kw in content) >= 1 else 0.0
+    )
+
+    return scores
+```
+
+---
+
+## Additional Notes
+
+**Key facts from the log:**
+
+- 1206 INFO, 4 WARN, 1 ERROR entries out of 1282 total
+- The ERROR is a Java IOException wrapped in a WARN-level DataStreamer message
+- Block BP-1347369012-10.190.173.170-1444972147527:blk_1073742514_1708 had a write failure
+- FileOutputCommitter also logged a WARN about recovery task count
+- Despite these issues, all 10 map tasks and 1 reduce task eventually completed
+
+**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score.
diff --git a/tasks/task_log_mapreduce_jobs.md b/tasks/task_log_mapreduce_jobs.md new file mode 100644 index 0000000..82f81e2 --- /dev/null +++ b/tasks/task_log_mapreduce_jobs.md @@ -0,0 +1,147 @@ +--- +id: task_log_mapreduce_jobs +name: MapReduce Log - Job Completion Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Job Completion Summary + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and produce a comprehensive job completion summary. The log is from a MapReduce v2 (YARN) application. + +Your report should include: + +1. **Job Identification**: Job ID, application attempt ID, and job name/type +2. **Job Configuration**: OutputCommitter type, file system, and any other configuration details +3. **Task Summary**: Total map tasks, total reduce tasks, how many of each completed successfully +4. **Task Completion Timeline**: When did each task complete? Create a timeline showing the order of task completions with timestamps +5. **Job Duration**: Total job runtime from start to finish +6. **Final Status**: Did the job succeed or fail? What was the final transition? + +Write the report to `job_completion_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse 1282 log entries and produce: + +**Job Identification:** +- Job ID: job_1445062781478_0011 +- Application Attempt: appattempt_1445062781478_0011_000001 +- Job type: pagerank (visible in history file path) +- User: msrabi + +**Configuration:** +- OutputCommitter: FileOutputCommitter +- File system: hdfs://msra-sa-41:9000 +- API: mapred newApiCommitter + +**Task Summary:** +- 10 map tasks (m_000000 through m_000009), plus 2 retries (m_000006_1, m_000007_1) +- 1 reduce task (r_000000) +- All 11 tasks completed successfully (10 map + 1 reduce) +- Total of 12 map task attempts, 1 reduce task attempt + +**Timeline:** +- Job start: 15:37:56 +- First map completion: 15:39:24 (m_000009) +- Last map completion: 15:41:25 (m_000006) +- Reduce completion: 15:42:46 (r_000000) +- Job finish: ~15:42:47 + +**Duration:** ~5 minutes (15:37:56 to 15:42:47) + +**Final Status:** SUCCEEDED + +Acceptable variations: +- Timeline formatting may differ +- Duration calculation approach may vary +- Task numbering notation may differ + +--- + +## Grading Criteria + +- [ ] `job_completion_report.md` is created in the workspace +- [ ] Job ID (job_1445062781478_0011) is identified +- [ ] Map and reduce task counts are correct (10 map tasks, 1 reduce task) +- [ ] Job duration is calculated (~5 minutes) +- [ ] Final status is identified as SUCCEEDED + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce job completion summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "job_completion_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "job_id": 0.0, + "task_counts": 0.0, + "duration": 0.0, + "final_status": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Job ID identified + scores["job_id"] = ( + 1.0 if 
"job_1445062781478_0011" in content or "1445062781478_0011" in content else 0.0 + ) + + # Check 2: Task counts correct + has_10_map = any(kw in content for kw in ["10 map", "ten map", "10 mapper"]) + has_1_reduce = any(kw in content for kw in ["1 reduce", "one reduce", "single reduce", + "1 reducer"]) + scores["task_counts"] = ( + 1.0 if has_10_map and has_1_reduce else + 0.5 if has_10_map or has_1_reduce else 0.0 + ) + + # Check 3: Duration calculated + duration_keywords = ["5 minute", "~5 min", "4 minute", "4:51", "4:50", + "approximately 5", "about 5", "15:37", "15:42", + "nearly 5"] + scores["duration"] = ( + 1.0 if sum(1 for kw in duration_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Final status + scores["final_status"] = ( + 1.0 if "succeeded" in content else + 0.5 if "success" in content or "completed" in content else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Single MapReduce job: a pagerank computation by user "msrabi" +- YARN-based (v2 MapReduce) on a cluster with namenode msra-sa-41 +- 12 map task attempts for 10 map tasks (m_000006 and m_000007 each had retries) +- The job history file confirms SUCCEEDED status with 10 maps and 1 reduce +- October 17, 2015 +- Container allocation visible: 13 unique containers used + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_resources.md b/tasks/task_log_mapreduce_resources.md new file mode 100644 index 0000000..6a59617 --- /dev/null +++ b/tasks/task_log_mapreduce_resources.md @@ -0,0 +1,143 @@ +--- +id: task_log_mapreduce_resources +name: MapReduce Log - Resource Utilization Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Resource Utilization Analysis + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and produce a resource utilization report. Focus on container allocation, scheduling, and resource usage patterns. + +Your report should include: + +1. **Container Inventory**: List all containers allocated for this job, with their IDs +2. **Container Allocation Timeline**: When was each container requested and assigned? +3. **Scheduling Analysis**: Track pending maps and reduces over time from the RMContainerAllocator entries +4. **Reduce Scheduling**: When did the reduce slow start threshold get met? What was the completion percentage? +5. **Container Reuse**: Were any containers completed and then reused? +6. **Resource Efficiency**: Based on container allocation vs task completion patterns, assess resource efficiency + +Write the report to `mapreduce_resources.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse RMContainerAllocator entries and produce: + +**Container Inventory:** +- 13 unique containers used (container_1445062781478_0011_01_000001 through 000013) +- Application attempt: 01 + +**Scheduling Progression:** +- Initial state: 10 pending maps, 1 pending reduce +- Reduce slow start threshold repeatedly "not met" from 15:38:00 to 15:39:24 +- First map completes at 15:39:24 (10% complete) — still not enough for reduce +- Reduce scheduling begins when completedMapPercent reaches sufficient threshold +- "completedMapPercent 0.1 totalResources 2" logged at 15:39:24 + +**Container Lifecycle:** +- Containers allocated around 15:38:00–15:38:15 +- Containers released as tasks complete +- "Received completed container" entries track when containers finish + +**Key observations:** +- The reduce task had to wait for enough maps to complete (slow start) +- Map tasks had varying completion times (1.5 to 3.5 minutes) +- Container turnover: some containers were released and their resources freed quickly + +Acceptable variations: +- Container ID enumeration approach may vary +- Timeline granularity may differ +- Resource efficiency assessment is subjective + +--- + +## Grading Criteria + +- [ ] `mapreduce_resources.md` is created in the workspace +- [ ] Containers are listed (13 containers identified) +- [ ] Scheduling progression is tracked (pending maps/reduces over time) +- [ ] Reduce slow start threshold discussion is included +- [ ] Container completion events are analyzed + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce resource utilization analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "mapreduce_resources.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "containers_listed": 0.0, + "scheduling_tracked": 0.0, + "slow_start": 0.0, + 
"container_completion": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Containers listed + has_container = "container_" in content or "container" in content + has_count = any(n in content for n in ["13 container", "13 unique", "thirteen"]) + scores["containers_listed"] = ( + 1.0 if has_container and has_count else + 0.5 if has_container else 0.0 + ) + + # Check 2: Scheduling tracked + sched_keywords = ["pending", "scheduled", "pendingreds", "pendingmaps", + "scheduledmaps", "scheduling"] + scores["scheduling_tracked"] = ( + 1.0 if sum(1 for kw in sched_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in sched_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 3: Reduce slow start discussed + slow_start_keywords = ["slow start", "slowstart", "threshold", "reduce.*wait", + "completedmappercent", "not met"] + scores["slow_start"] = ( + 1.0 if sum(1 for kw in slow_start_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in slow_start_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Container completion analyzed + completion_keywords = ["completed container", "container released", "received completed", + "container finish", "freed"] + scores["container_completion"] = ( + 1.0 if sum(1 for kw in completion_keywords if kw in content) >= 1 else + 0.5 if "complet" in content else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- YARN-based MapReduce (v2) on cluster with RM +- 13 containers for 10 map tasks + 1 reduce + 1 AM container + retries +- The reduce slow start threshold is a standard Hadoop optimization +- "Before Scheduling" / "After Scheduling" entries provide scheduling state snapshots +- Final stats: "PendingReds:0 ScheduledMaps:0" — all resources freed + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_slow_tasks.md b/tasks/task_log_mapreduce_slow_tasks.md new file mode 100644 index 0000000..a16b08d --- /dev/null +++ b/tasks/task_log_mapreduce_slow_tasks.md @@ -0,0 +1,148 @@ +--- +id: task_log_mapreduce_slow_tasks +name: MapReduce Log - Slow Task Identification +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Slow Task Identification + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and identify which map and reduce tasks were slowest. Compare task completion times to find stragglers. + +Your report should include: + +1. **Task Completion Times**: For each completed task, calculate the time from container assignment to task completion +2. **Fastest vs Slowest**: Identify the fastest and slowest map tasks, and the reduce task timing +3. **Straggler Analysis**: Are there any tasks that took significantly longer than average? Quantify the deviation +4. **Retry Impact**: For tasks that were retried (attempt > 0), how did the retry time compare to the original? +5. **Reduce Phase Timing**: When did the reduce task start relative to map completions? How long did it take? +6. **Bottleneck Identification**: What was the critical path? Which task(s) determined the overall job duration? + +Write the report to `slow_tasks_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should extract task completion timestamps and calculate: + +**Map Task Completions (in order):** +1. m_000009: completed 15:39:24 (first) +2. m_000005: completed 15:40:28 +3. m_000003: completed 15:40:32 +4. m_000000: completed 15:40:34 +5. m_000001: completed 15:40:50 +6. m_000002: completed 15:40:50 +7. m_000004: completed 15:40:50 +8. m_000008: completed 15:40:52 +9. m_000007: completed 15:41:12 (retry — _1 attempt) +10. 
m_000006: completed 15:41:25 (retry — _1 attempt, slowest/last) + +**Reduce Task:** +- r_000000: completed 15:42:46 + +**Key findings:** +- m_000009 completed first at 15:39:24 — ~1.5 minutes after job start +- m_000006 completed last at 15:41:25 — ~3.5 minutes after job start (it's a retry) +- The retried tasks (m_000006, m_000007) were the slowest because they had to restart +- Spread between first and last map: ~2 minutes +- Reduce started after enough maps completed and finished about 1.3 minutes later + +Acceptable variations: +- Exact durations depend on which timestamps are used as start reference +- Different definitions of "task start" are acceptable +- Straggler threshold may vary + +--- + +## Grading Criteria + +- [ ] `slow_tasks_report.md` is created in the workspace +- [ ] Individual task completion times are listed +- [ ] Fastest and slowest map tasks are identified +- [ ] Retried tasks (m_000006, m_000007) are flagged as slower +- [ ] The reduce task timing is analyzed separately + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce slow task identification task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "slow_tasks_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "completion_times": 0.0, + "fastest_slowest": 0.0, + "retries_flagged": 0.0, + "reduce_timing": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Task completion times listed + task_ids = ["m_000009", "m_000005", "m_000003", "m_000000", "m_000006"] + tasks_found = sum(1 for t in task_ids if t in content) + scores["completion_times"] = ( + 1.0 if tasks_found >= 4 else + 0.5 if tasks_found >= 2 else 0.0 + ) + + # Check 2: Fastest and slowest identified + has_fastest = any(kw in content for kw in ["fastest", "first to complete", + "earliest", "quickest"]) 
+ has_slowest = any(kw in content for kw in ["slowest", "last to complete", + "longest", "straggler"]) + scores["fastest_slowest"] = ( + 1.0 if has_fastest and has_slowest else + 0.5 if has_fastest or has_slowest else 0.0 + ) + + # Check 3: Retried tasks flagged + has_retry = any(kw in content for kw in ["retry", "retried", "reattempt", + "second attempt"]) + has_slow_retry = any(kw in content for kw in ["m_000006", "m_000007"]) + scores["retries_flagged"] = ( + 1.0 if has_retry and has_slow_retry else + 0.5 if has_retry else 0.0 + ) + + # Check 4: Reduce timing analyzed + has_reduce = "r_000000" in content or "reduce" in content + has_reduce_time = any(t in content for t in ["15:42", "42:46"]) + scores["reduce_timing"] = ( + 1.0 if has_reduce and has_reduce_time else + 0.5 if has_reduce else 0.0 + ) + + return scores + ``` + + --- + + ## Additional Notes + + **Key facts from the log:** + + - Job started at 15:37:56, ended at ~15:42:47 + - Map tasks were assigned containers starting around 15:38:00 + - Reduce slow start threshold was not met until enough maps completed + - Two tasks (m_000006, m_000007) failed on first attempt and succeeded on retry + - The retries added ~30-55 seconds to total map phase time + - The critical path runs through the last map completion (m_000006 at 15:41:25) plus the reduce phase + + **Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_timeline.md b/tasks/task_log_mapreduce_timeline.md new file mode 100644 index 0000000..65aed5e --- /dev/null +++ b/tasks/task_log_mapreduce_timeline.md @@ -0,0 +1,159 @@ +--- +id: task_log_mapreduce_timeline +name: MapReduce Log - Job Timeline Visualization +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Job Timeline Visualization + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and create a detailed timeline visualization of the entire job execution. Show all major events in chronological order. + +Your output should include: + +1. **Event Timeline**: A chronological list of every significant event with timestamp, including: + - Job initialization events + - Container allocations + - Task starts and completions + - Errors and warnings + - Reduce phase start + - Job completion +2. **Phase Diagram**: Divide the job into phases (initialization, map phase, shuffle, reduce phase, cleanup) with start/end times and durations +3. **Gantt-Style Task View**: Show each task (m_000000 through m_000009, r_000000) with approximate start and end times in a text-based timeline +4. **Critical Events**: Highlight the most impactful events (errors, retries, job state transitions) +5. **Concurrency Analysis**: At each point in time, how many tasks were running in parallel? + +Write the report to `mapreduce_timeline.md` as a well-structured markdown document with ASCII/text-based visualizations. 
+ +--- + +## Expected Behavior + +The agent should produce a timeline like: + +**Phase Breakdown:** +| Phase | Start | End | Duration | +|---|---|---|---| +| Initialization | 15:37:56 | 15:38:00 | ~4s | +| Map Phase | 15:38:00 | 15:41:25 | ~3m 25s | +| Reduce Phase | 15:39:24 | 15:42:46 | ~3m 22s | +| Cleanup | 15:42:46 | 15:42:47 | ~1s | +| **Total** | **15:37:56** | **15:42:47** | **~4m 51s** | + +**Key Events:** +- 15:37:56 — MRAppMaster created +- 15:37:57 — OutputCommitter set (FileOutputCommitter) +- 15:38:00 — Container allocation begins (10 maps pending, 1 reduce pending) +- 15:39:24 — First map completes (m_000009), Num completed: 1 +- 15:40:28–15:40:52 — Rapid map completions (tasks 2-8) +- 15:40:45 — WARN: Block I/O error (ResponseProcessor, DataStreamer) +- 15:41:12 — m_000007 completes (retry attempt) +- 15:41:25 — m_000006 completes (retry attempt, last map) +- 15:42:46 — r_000000 completes, Num completed: 11 +- 15:42:46 — Job transitions to SUCCEEDED +- 15:42:47 — Final stats logged + +**Concurrency:** +- Peak: up to 10 map tasks running simultaneously +- After 15:39:24, concurrency decreases as maps complete + +Acceptable variations: +- ASCII visualization style will vary +- Not every log entry needs to be in the timeline — major events are sufficient +- Phase definitions may differ slightly + +--- + +## Grading Criteria + +- [ ] `mapreduce_timeline.md` is created in the workspace +- [ ] Events are listed chronologically with timestamps +- [ ] Phases are identified (init, map, reduce, completion) +- [ ] A visual or structured timeline/gantt is attempted +- [ ] Key events (first map completion, errors, job success) are highlighted + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce timeline visualization task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "mapreduce_timeline.md" + + if not report_file.exists(): + 
return { + "output_created": 0.0, + "chronological_events": 0.0, + "phases_identified": 0.0, + "visual_timeline": 0.0, + "key_events_highlighted": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Chronological events with timestamps + timestamps = ["15:37", "15:38", "15:39", "15:40", "15:41", "15:42"] + ts_found = sum(1 for ts in timestamps if ts in content) + scores["chronological_events"] = ( + 1.0 if ts_found >= 5 else + 0.5 if ts_found >= 3 else 0.0 + ) + + # Check 2: Phases identified + phase_keywords = ["initialization", "init", "map phase", "reduce phase", + "shuffle", "cleanup", "completion", "startup"] + phases_found = sum(1 for kw in phase_keywords if kw in content) + scores["phases_identified"] = ( + 1.0 if phases_found >= 3 else + 0.5 if phases_found >= 2 else 0.0 + ) + + # Check 3: Visual/structured timeline attempted + # Check for table-like structures or ASCII art + lines = content.split("\n") + table_lines = [l for l in lines if l.count("|") >= 2] + ascii_lines = [l for l in lines if any(c in l for c in ["─", "━", "═", "▓", "█", "░"])] + scores["visual_timeline"] = ( + 1.0 if len(table_lines) >= 5 or len(ascii_lines) >= 3 else + 0.5 if len(table_lines) >= 2 else 0.0 + ) + + # Check 4: Key events highlighted + key_events = ["mrappmaster", "first map", "succeeded", "warn", "error", + "m_000009", "r_000000", "retry", "completed"] + events_found = sum(1 for kw in key_events if kw in content) + scores["key_events_highlighted"] = ( + 1.0 if events_found >= 4 else + 0.5 if events_found >= 2 else 0.0 + ) + + return scores + ``` + + --- + + ## Additional Notes + + **Key facts from the log:** + + - Total job duration: ~4 minutes 51 seconds + - Map phase and reduce phase overlap — reduce starts while maps are still running + - The reduce "slow start" threshold meant the reduce task didn't 
get scheduled immediately +- Two map task retries (m_000006, m_000007) extended the map phase by about 35 seconds +- 1282 log entries total, but only ~50 represent major state transitions + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_nginx_errors.md b/tasks/task_log_nginx_errors.md new file mode 100644 index 0000000..6883f30 --- /dev/null +++ b/tasks/task_log_nginx_errors.md @@ -0,0 +1,149 @@ +--- +id: task_log_nginx_errors +name: Nginx Access Log - Error Pattern Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - Error Pattern Analysis + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a detailed report on error patterns (4xx and 5xx responses). Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Error Overview**: Total errors, error rate (as percentage of all requests), breakdown by status code +2. **404 Analysis**: Which paths are returning 404? Are these legitimate missing resources or misconfigured routes? +3. **403 Analysis**: What's being forbidden and from which IPs? +4. **Error by Client IP**: Which IPs generate the most errors? Top 10 with counts +5. **Error by Path**: Which request paths generate the most errors? Top 10 with counts +6. **Temporal Pattern**: Are errors concentrated at certain times or spread evenly? +7. **Remediation Recommendations**: Based on the error patterns, suggest 3 specific fixes + +Write the report to `error_analysis.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse all 1000 JSON log entries and produce: + +**Error Overview:** +- Total errors: 690 (69.0% of all requests) +- 404: 688 errors +- 403: 2 errors +- No 5xx errors observed + +**404 Analysis:** +- All 404s target `/downloads/product_1` and `/downloads/product_2` +- These same paths also return 200 and 304 at other times +- This suggests intermittent resource availability, not permanently missing files + +**403 Analysis:** +- 2 forbidden requests — identify the IPs and paths + +**Top Error IPs:** +- 80.91.33.133 is the highest-volume IP overall and likely the top error generator +- Other high-frequency IPs: 5.83.131.103, 202.143.95.26, 50.57.209.92 + +**Key insight:** +- The extremely high 404 rate (68.8%) on a package download server is unusual +- Package managers retry automatically, which amplifies the error count +- The root cause is likely transient unavailability of download resources + +Acceptable variations: +- Exact counts are deterministic +- Remediation suggestions will vary +- Assessment depth may differ + +--- + +## Grading Criteria + +- [ ] `error_analysis.md` is created in the workspace +- [ ] Error rate and status code breakdown are provided (690 errors, 69%, 404/403 split) +- [ ] 404 errors are analyzed by path (/downloads/product_1, /downloads/product_2) +- [ ] Top error-generating IPs are listed +- [ ] At least 2 remediation recommendations are provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Nginx error pattern analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "error_analysis.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "error_rate_breakdown": 0.0, + "path_analysis": 0.0, + "top_error_ips": 0.0, + "recommendations": 0.0, + } + + scores["output_created"] = 1.0 + content = 
report_file.read_text(encoding="utf-8").lower() + + # Check 1: Error rate and breakdown + has_total = any(n in content for n in ["690", "688", "69%", "68.8%", "69.0%"]) + has_404 = "404" in content + has_403 = "403" in content + scores["error_rate_breakdown"] = ( + 1.0 if has_total and has_404 and has_403 else + 0.5 if has_404 and has_total else 0.0 + ) + + # Check 2: Path analysis + has_product_1 = "product_1" in content + has_product_2 = "product_2" in content + scores["path_analysis"] = ( + 1.0 if has_product_1 and has_product_2 else + 0.5 if has_product_1 or has_product_2 else 0.0 + ) + + # Check 3: Top error IPs + top_ips = ["80.91.33.133", "5.83.131.103", "202.143.95.26", "50.57.209.92"] + ips_found = sum(1 for ip in top_ips if ip in content) + scores["top_error_ips"] = ( + 1.0 if ips_found >= 3 else + 0.5 if ips_found >= 1 else 0.0 + ) + + # Check 4: Recommendations provided + rec_keywords = ["recommend", "suggestion", "fix", "should", "consider", + "implement", "configure", "add", "improve"] + lines = content.split("\n") + rec_lines = [l for l in lines if any(kw in l for kw in rec_keywords)] + scores["recommendations"] = ( + 1.0 if len(rec_lines) >= 2 else + 0.5 if len(rec_lines) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- 690 out of 1000 requests are errors — almost entirely 404s +- Both `/downloads/product_1` and `/downloads/product_2` return a mix of 200, 304, and 404 +- This pattern is consistent with a package repository where files are being updated/rotated +- Only 2 entries with 403 Forbidden +- Zero 5xx errors — the server itself is healthy + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_nginx_slow_requests.md b/tasks/task_log_nginx_slow_requests.md new file mode 100644 index 0000000..2a7e5e3 --- /dev/null +++ b/tasks/task_log_nginx_slow_requests.md @@ -0,0 +1,135 @@ +--- +id: task_log_nginx_slow_requests +name: Nginx Access Log - Find Largest Responses +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - Find Largest Responses + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and identify the requests that generated the largest responses (by bytes transferred). Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Top 10 Largest Responses**: List the requests with the highest byte counts, including timestamp, client IP, request path, status code, and bytes +2. **Byte Distribution Summary**: Overall statistics — min, max, mean, median bytes transferred (excluding zero-byte responses) +3. **Zero-Byte Responses**: Count of zero-byte responses and which status codes produce them +4. **Large Response Analysis**: What paths and client IPs are associated with the largest transfers? +5. **Efficiency Assessment**: What percentage of requests result in actual data transfer vs cache hits (304)? + +Write the report to `large_responses_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse all 1000 JSON log entries and produce: + +**Top Largest Responses:** +- Maximum bytes observed: ~3318 bytes +- Largest responses are 200 OK responses for `/downloads/product_1` and `/downloads/product_2` +- Top byte values include: 3318, 3316, 3301, 2582, 2578, etc. 
+ +**Zero-Byte Analysis:** +- 304 Not Modified responses all have 0 bytes (274 entries) +- 404 responses have small byte counts (300-340 range typically) + +**Efficiency:** +- ~274 out of 1000 requests are 304 (cache hits) — 27.4% +- 200 OK with data: ~35 requests — 3.5% +- 404 errors: 688 requests — these transfer small error pages + +Acceptable variations: +- Exact byte values are deterministic from the log +- Assessment wording will vary +- Top 10 vs top 20 is fine + +--- + +## Grading Criteria + +- [ ] `large_responses_report.md` is created in the workspace +- [ ] Top largest responses are listed with byte counts +- [ ] Zero-byte / 304 responses are analyzed separately +- [ ] Distribution statistics (min, max, mean or median) are provided +- [ ] Paths associated with largest responses are identified + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Nginx largest responses task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "large_responses_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "top_responses_listed": 0.0, + "zero_byte_analysis": 0.0, + "distribution_stats": 0.0, + "paths_identified": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Top responses listed with byte counts + has_max_bytes = any(b in content for b in ["3318", "3316", "3301"]) + has_ranking = any(kw in content for kw in ["top", "largest", "biggest", "highest"]) + scores["top_responses_listed"] = ( + 1.0 if has_max_bytes and has_ranking else + 0.5 if has_max_bytes else 0.0 + ) + + # Check 2: Zero-byte / 304 analysis + has_zero = "0 byte" in content or "zero byte" in content or "zero-byte" in content or "no data" in content + has_304 = "304" in content + scores["zero_byte_analysis"] = ( + 1.0 if has_304 and has_zero else + 0.5 if has_304 else 0.0 + ) + + # Check 
3: Distribution statistics + stat_keywords = ["min", "max", "mean", "median", "average", "total bytes", + "distribution", "range"] + scores["distribution_stats"] = ( + 1.0 if sum(1 for kw in stat_keywords if kw in content) >= 3 else + 0.5 if sum(1 for kw in stat_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Paths identified + has_product_1 = "product_1" in content + has_product_2 = "product_2" in content + has_downloads = "downloads" in content or "/download" in content + scores["paths_identified"] = ( + 1.0 if has_product_1 and has_product_2 else + 0.5 if has_downloads else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Maximum response size is only ~3318 bytes — this is a lightweight download server +- The vast majority of responses are either 304 (0 bytes) or 404 (small error page) +- Only ~35 requests return 200 with actual content +- All requests target just two paths: `/downloads/product_1` and `/downloads/product_2` + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_nginx_status_codes.md b/tasks/task_log_nginx_status_codes.md new file mode 100644 index 0000000..595eefd --- /dev/null +++ b/tasks/task_log_nginx_status_codes.md @@ -0,0 +1,142 @@ +--- +id: task_log_nginx_status_codes +name: Nginx Access Log - HTTP Status Code Distribution +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - HTTP Status Code Distribution + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a report on HTTP status code distribution. Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Total Requests**: Total number of log entries +2. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the Nginx status code distribution task.

    Checks the report produced by the agent against four content
    criteria via case-insensitive substring matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    criteria = ("output_created", "total_count", "status_codes_listed",
                "categories_grouped", "top_error_ips")
    report_path = Path(workspace_path) / "status_code_report.md"

    # No report file: everything scores zero.
    if not report_path.exists():
        return {name: 0.0 for name in criteria}

    text = report_path.read_text(encoding="utf-8").lower()
    results = {"output_created": 1.0}

    # Check 1: total request count (with or without thousands separator).
    results["total_count"] = 1.0 if ("1000" in text or "1,000" in text) else 0.0

    # Check 2: individual status codes mentioned in the report.
    code_hits = sum(1 for code in ("200", "304", "404", "403") if code in text)
    if code_hits >= 4:
        results["status_codes_listed"] = 1.0
    elif code_hits >= 3:
        results["status_codes_listed"] = 0.5
    else:
        results["status_codes_listed"] = 0.0

    # Check 3: codes grouped into categories (numeric label or name).
    category_hits = sum([
        "2xx" in text or "success" in text,
        "3xx" in text or "redirect" in text,
        "4xx" in text or "client error" in text,
    ])
    if category_hits >= 3:
        results["categories_grouped"] = 1.0
    elif category_hits >= 2:
        results["categories_grouped"] = 0.5
    else:
        results["categories_grouped"] = 0.0

    # Check 4: known top error-generating IPs identified.
    known_error_ips = ("80.91.33.133", "5.83.131.103",
                       "202.143.95.26", "50.57.209.92")
    ip_hits = sum(1 for ip in known_error_ips if ip in text)
    if ip_hits >= 2:
        results["top_error_ips"] = 1.0
    elif ip_hits >= 1:
        results["top_error_ips"] = 0.5
    else:
        results["top_error_ips"] = 0.0

    return results
diff --git a/tasks/task_log_nginx_traffic.md b/tasks/task_log_nginx_traffic.md new file mode 100644 index 0000000..cc490cf --- /dev/null +++ b/tasks/task_log_nginx_traffic.md @@ -0,0 +1,137 @@ +--- +id: task_log_nginx_traffic +name: Nginx Access Log - Traffic Patterns by Time +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - Traffic Patterns by Time + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a report on traffic patterns over time. Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Time Range**: The full date/time range covered by the log +2. **Hourly Traffic Breakdown**: Number of requests per hour +3. **Peak and Low Traffic**: Identify the busiest and quietest hours +4. **Bandwidth Over Time**: Total bytes transferred per hour +5. **Request Rate Trends**: Are requests steady, bursty, or showing a trend? +6. **Per-IP Activity Over Time**: Identify IPs that appear across multiple hours vs those that appear in bursts + +Write the report to `traffic_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the Nginx traffic patterns task.

    Scans the agent-produced report for the required time-based
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "traffic_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "time_range": 0.0,
            "hourly_breakdown": 0.0,
            "peak_identified": 0.0,
            "bandwidth_analysis": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Time range identified (date plus start/end or duration).
    has_date = any(d in content for d in ["may 17", "2015-05-17", "17/may/2015", "may 2015"])
    has_range = any(t in content for t in ["08:05", "16:05", "8 hour", "eight hour"])
    scores["time_range"] = (
        1.0 if has_date and has_range else
        0.5 if has_date else 0.0
    )

    # Check 2: Traffic broken down by time period.
    time_keywords = ["hour", "period", "interval", "08:", "09:", "10:", "11:", "12:",
                     "13:", "14:", "15:", "16:"]
    time_sections = sum(1 for kw in time_keywords if kw in content)
    scores["hourly_breakdown"] = (
        1.0 if time_sections >= 4 else
        0.5 if time_sections >= 2 else 0.0
    )

    # Check 3: Peak/busiest periods identified.
    # Count the keyword hits once rather than re-scanning the report
    # for each threshold branch.
    peak_keywords = ["peak", "busiest", "highest", "most active", "maximum",
                     "lowest", "quietest", "least"]
    peak_hits = sum(1 for kw in peak_keywords if kw in content)
    scores["peak_identified"] = (
        1.0 if peak_hits >= 2 else
        0.5 if peak_hits >= 1 else 0.0
    )

    # Check 4: Bandwidth/bytes analysis (same single-pass counting).
    bandwidth_keywords = ["bytes", "bandwidth", "transfer", "data", "0 bytes",
                          "304", "not modified"]
    bandwidth_hits = sum(1 for kw in bandwidth_keywords if kw in content)
    scores["bandwidth_analysis"] = (
        1.0 if bandwidth_hits >= 2 else
        0.5 if bandwidth_hits >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_nginx_user_agents.md b/tasks/task_log_nginx_user_agents.md new file mode 100644 index 0000000..e1fa29b --- /dev/null +++ b/tasks/task_log_nginx_user_agents.md @@ -0,0 +1,143 @@ +--- +id: task_log_nginx_user_agents +name: Nginx Access Log - User Agent Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - User Agent Analysis + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a comprehensive user agent analysis. Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Unique User Agents**: Total count of distinct user agent strings +2. **User Agent Ranking**: List all user agents sorted by request count, with count and percentage +3. **Client Type Classification**: Categorize agents into types (package managers, web browsers, bots/crawlers, command-line tools, unknown/empty) +4. **Agent-to-IP Mapping**: For each user agent, how many unique IPs use it? +5. **Success vs Error Rate by Agent**: For each agent, what percentage of requests result in errors (4xx/5xx)? +6. **Conclusions**: What type of server is this based on the user agent profile? + +Write the report to `user_agent_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse all 1000 JSON log entries and produce: + +**Unique User Agents:** 14 distinct agent strings (including "-" for empty) + +**Top User Agents:** +- `Debian APT-HTTP/1.3 (0.9.7.9)` — 370 requests (37.0%) +- `Debian APT-HTTP/1.3 (0.8.16~exp12ubuntu10.16)` — 177 (17.7%) +- `Debian APT-HTTP/1.3 (0.8.16~exp12ubuntu10.22)` — 118 (11.8%) +- `Debian APT-HTTP/1.3 (1.0.1ubuntu2)` — 116 (11.6%) +- `Debian APT-HTTP/1.3 (0.8.16~exp12ubuntu10.21)` — 64 (6.4%) +- Additional APT variants and a few others (Go 1.1 package http, urlgrabber, etc.) + +**Classification:** +- Package managers (Debian APT): vast majority (~95%+) +- Other automated tools: Go HTTP client, urlgrabber +- Empty/missing agent ("-"): small number + +**Conclusions:** +- This is clearly a Debian/Ubuntu package repository or software download mirror +- Multiple APT versions indicate clients running different Ubuntu/Debian releases +- Very little if any human browser traffic + +Acceptable variations: +- Exact counts are deterministic from the log +- Classification categories may use different names +- Assessment language will vary + +--- + +## Grading Criteria + +- [ ] `user_agent_report.md` is created in the workspace +- [ ] All user agents are listed with counts +- [ ] Agents are classified by type (package manager, bot, etc.) 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the Nginx user agent analysis task.

    Scans the agent-produced report for the required user-agent
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "user_agent_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "agents_listed": 0.0,
            "agents_classified": 0.0,
            "apt_dominant": 0.0,
            "server_purpose": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: User agents listed with counts.
    # 370/177/118/116 are the known request counts of the top APT variants.
    has_apt = "apt-http" in content or "apt http" in content or "debian apt" in content
    has_counts = any(c in content for c in ["370", "177", "118", "116"])
    scores["agents_listed"] = (
        1.0 if has_apt and has_counts else
        0.5 if has_apt else 0.0
    )

    # Check 2: Agents classified by type.
    # Count keyword hits once instead of re-scanning per threshold branch.
    type_keywords = ["package manager", "bot", "crawler", "automated", "tool",
                     "browser", "command line", "cli", "client type", "categor"]
    type_hits = sum(1 for kw in type_keywords if kw in content)
    scores["agents_classified"] = (
        1.0 if type_hits >= 2 else
        0.5 if type_hits >= 1 else 0.0
    )

    # Check 3: APT identified as dominant.
    dominant_keywords = ["dominant", "majority", "most common", "primary",
                         "most frequent", "largest", "overwhelming"]
    has_dominant = any(kw in content for kw in dominant_keywords)
    scores["apt_dominant"] = (
        1.0 if has_apt and has_dominant else
        0.5 if has_apt else 0.0
    )

    # Check 4: Server purpose inferred (same single-pass counting).
    purpose_keywords = ["repository", "mirror", "download", "package",
                        "software", "debian", "ubuntu", "apt"]
    purpose_hits = sum(1 for kw in purpose_keywords if kw in content)
    scores["server_purpose"] = (
        1.0 if purpose_hits >= 3 else
        0.5 if purpose_hits >= 1 else 0.0
    )

    return scores
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH brute force detection task.

    Parses the agent-produced JSON threat report and scores it against
    four content criteria.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path
    import json

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "brute_force_report.json"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "sources_identified": 0.0,
            "top_attacker": 0.0,
            "attack_classified": 0.0,
            "recommendations": 0.0,
        }

    scores["output_created"] = 1.0

    try:
        data = json.loads(report_file.read_text(encoding="utf-8"))
    except Exception:
        # Malformed/unreadable JSON: the report exists but nothing inside
        # is gradable. (`except Exception` replaces the previous redundant
        # `(json.JSONDecodeError, Exception)` tuple — JSONDecodeError is
        # already an Exception subclass, so behavior is identical.)
        return {
            "output_created": 1.0,
            "sources_identified": 0.0,
            "top_attacker": 0.0,
            "attack_classified": 0.0,
            "recommendations": 0.0,
        }

    # Flattened lower-case view of the whole document for substring checks.
    full_text = json.dumps(data).lower()

    # Check 1: At least 3 brute-force sources identified.
    sources = data.get("brute_force_sources", [])
    if not isinstance(sources, list):
        sources = []
    scores["sources_identified"] = (
        1.0 if len(sources) >= 3 else
        0.5 if len(sources) >= 1 else 0.0
    )

    # Check 2: Top attacker (heaviest source IP in the log) identified.
    scores["top_attacker"] = 1.0 if "183.62.140.253" in full_text else 0.0

    # Check 3: Attack type classified anywhere in the document.
    has_classification = "dictionary" in full_text or "targeted" in full_text or "attack_type" in full_text
    scores["attack_classified"] = 1.0 if has_classification else 0.0

    # Check 4: Recommendations provided (structured list or keywords).
    recs = data.get("recommendations", [])
    if not isinstance(recs, list):
        recs = []
    has_recs = len(recs) >= 2 or any(kw in full_text for kw in
        ["fail2ban", "rate limit", "firewall", "block", "key-based",
         "disable password", "allowlist", "whitelist", "deny"])
    scores["recommendations"] = 1.0 if has_recs else 0.5 if len(recs) >= 1 else 0.0

    return scores
successful login (user fztu from 119.137.62.142) is NOT from an attacking IP + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_ssh_failed_logins.md b/tasks/task_log_ssh_failed_logins.md new file mode 100644 index 0000000..546185c --- /dev/null +++ b/tasks/task_log_ssh_failed_logins.md @@ -0,0 +1,143 @@ +--- +id: task_log_ssh_failed_logins +name: SSH Auth Log - Failed Login Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - Failed Login Analysis + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and produce a detailed report on failed login attempts. The log is from a server named "LabSZ" and covers SSH authentication events. + +Your report should include: + +1. **Overview**: Total log entries, date range, total failed login attempts +2. **Failed Password Attempts**: Count of "Failed password" entries, broken down by source IP +3. **Invalid User Attempts**: Count of attempts using non-existent usernames, with a list of the top 10 most-tried usernames +4. **Top Attacking IPs**: Top 10 source IPs by number of failed attempts, with counts +5. **Authentication Methods**: What authentication methods are being attempted (password, publickey, etc.)? +6. **Reverse DNS Failures**: How many entries show "POSSIBLE BREAK-IN ATTEMPT" warnings? +7. **Summary Assessment**: Is this server under active attack? What does the pattern suggest? + +Write the report to `failed_login_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH failed login analysis task.

    Scans the agent-produced report for the required failed-login
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "failed_login_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "failed_count": 0.0,
            "top_ips": 0.0,
            "invalid_usernames": 0.0,
            "attack_assessment": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Failed attempt count (~366, allow off-by-one parses).
    # NOTE(review): "failed password" also satisfies this check, which is
    # intentionally lenient — confirm that leniency is desired.
    has_failed_count = any(n in content for n in ["366", "365", "367", "failed password"])
    scores["failed_count"] = (
        1.0 if has_failed_count else
        0.5 if "failed" in content and any(c.isdigit() for c in content) else 0.0
    )

    # Check 2: Top attacking IPs identified.
    top_ips = ["183.62.140.253", "187.141.143.180", "103.99.0.122", "112.95.230.3"]
    ips_found = sum(1 for ip in top_ips if ip in content)
    scores["top_ips"] = (
        1.0 if ips_found >= 3 else
        0.5 if ips_found >= 1 else 0.0
    )

    # Check 3: Invalid usernames listed.
    usernames = ["admin", "oracle", "support", "test", "webmaster", "guest"]
    users_found = sum(1 for u in usernames if u in content)
    scores["invalid_usernames"] = (
        1.0 if users_found >= 3 else
        0.5 if users_found >= 1 else 0.0
    )

    # Check 4: Attack assessment.
    # Count keyword hits once instead of re-scanning per threshold branch.
    attack_keywords = ["brute force", "brute-force", "attack", "compromise",
                       "malicious", "automated", "scanning", "dictionary"]
    attack_hits = sum(1 for kw in attack_keywords if kw in content)
    scores["attack_assessment"] = (
        1.0 if attack_hits >= 2 else
        0.5 if attack_hits >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_ssh_successful.md b/tasks/task_log_ssh_successful.md new file mode 100644 index 0000000..ec4dd1f --- /dev/null +++ b/tasks/task_log_ssh_successful.md @@ -0,0 +1,143 @@ +--- +id: task_log_ssh_successful +name: SSH Auth Log - Successful Authentication Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - Successful Authentication Summary + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and produce a report focused on successful authentications. Among the noise of failed attempts, identify all legitimate access. + +Your report should include: + +1. **Successful Logins**: List every successful authentication with timestamp, username, source IP, port, and authentication method +2. **Success vs Failure Ratio**: What percentage of all authentication attempts succeeded? +3. **Legitimate User Profile**: For each successfully authenticated user, describe their access pattern +4. **Session Activity**: Any evidence of what happened after login (session opened/closed events)? +5. **Source IP Validation**: Is the successful login IP associated with any failed attempts as well? +6. **Anomaly Check**: Does the successful login appear legitimate, or does it look suspicious (e.g., coming from an IP that was also brute-forcing)? + +Write the report to `successful_auth_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH successful authentication summary task.

    Scans the agent-produced report for identification of the single
    successful login and the required supporting analysis.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "successful_auth_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "login_identified": 0.0,
            "ratio_calculated": 0.0,
            "ip_checked": 0.0,
            "legitimacy_assessed": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Successful login identified (user fztu from 119.137.62.142).
    # (Removed the unused `has_accepted` flag — it was computed but never
    # factored into any score.)
    has_user = "fztu" in content
    has_ip = "119.137.62.142" in content
    scores["login_identified"] = (
        1.0 if has_user and has_ip else
        0.5 if has_user or has_ip else 0.0
    )

    # Check 2: Success/failure ratio calculated.
    ratio_keywords = ["ratio", "percent", "1 success", "1 out of", "only 1",
                      "single success", "one success", "0."]
    scores["ratio_calculated"] = (
        1.0 if sum(1 for kw in ratio_keywords if kw in content) >= 1 else 0.0
    )

    # Check 3: Login IP cross-checked against failed-attempt sources.
    check_keywords = ["119.137.62.142", "not associated", "not found",
                      "does not appear", "no failed", "legitimate",
                      "only successful", "no other"]
    scores["ip_checked"] = (
        1.0 if has_ip and sum(1 for kw in check_keywords if kw in content) >= 2 else
        0.5 if has_ip else 0.0
    )

    # Check 4: Legitimacy assessment present.
    legit_keywords = ["legitimate", "authorized", "valid", "genuine",
                      "suspicious", "anomal", "normal", "expected"]
    scores["legitimacy_assessed"] = (
        1.0 if sum(1 for kw in legit_keywords if kw in content) >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_ssh_unusual_times.md b/tasks/task_log_ssh_unusual_times.md new file mode 100644 index 0000000..c8e2910 --- /dev/null +++ b/tasks/task_log_ssh_unusual_times.md @@ -0,0 +1,147 @@ +--- +id: task_log_ssh_unusual_times +name: SSH Auth Log - Unusual Hour Login Detection +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - Unusual Hour Login Detection + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and identify login activity occurring at unusual hours. Assume normal business hours are 08:00–18:00 local server time. + +Your report should include: + +1. **Hourly Distribution**: Count of authentication events per hour +2. **Off-Hours Activity**: All authentication events occurring before 08:00 or after 18:00 +3. **Early Morning Analysis**: Detailed breakdown of events between 06:00–08:00 (the earliest in the log) +4. **Successful Logins Timing**: When did the successful login(s) occur? During business hours or not? +5. **Attack Timing Patterns**: Do attackers prefer certain hours? Is there a pattern? +6. **Temporal Risk Assessment**: Based on timing patterns, what times should be monitored most closely? + +Write the report to `unusual_hours_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH unusual hours detection task.

    Scans the agent-produced report for the required time-of-day
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "unusual_hours_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "hourly_distribution": 0.0,
            "off_hours_identified": 0.0,
            "successful_timing": 0.0,
            "timing_patterns": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Hourly distribution provided (accept several label formats).
    hour_markers = ["06:", "07:", "08:", "09:", "10:"]
    alt_markers = ["6 am", "7 am", "8 am", "9 am", "10 am", "6:00", "7:00",
                   "8:00", "9:00", "10:00"]
    all_markers = hour_markers + alt_markers
    hours_found = sum(1 for m in all_markers if m in content)
    scores["hourly_distribution"] = (
        1.0 if hours_found >= 4 else
        0.5 if hours_found >= 2 else 0.0
    )

    # Check 2: Off-hours (pre-08:00) events identified.
    off_hours_keywords = ["before 08", "before 8:00", "early morning",
                          "06:55", "pre-business", "off-hour", "off hour",
                          "unusual hour", "outside business"]
    scores["off_hours_identified"] = (
        1.0 if sum(1 for kw in off_hours_keywords if kw in content) >= 1 else 0.0
    )

    # Check 3: Successful login timing noted (fztu at 09:32, business hours).
    has_fztu = "fztu" in content
    has_time = "09:32" in content or "9:32" in content
    has_business = any(kw in content for kw in ["business hour", "normal hour",
                                                "working hour", "during"])
    scores["successful_timing"] = (
        1.0 if has_fztu and (has_time or has_business) else
        0.5 if has_fztu else 0.0
    )

    # Check 4: Timing patterns analyzed.
    # Count keyword hits once instead of re-scanning per threshold branch.
    pattern_keywords = ["peak", "escalat", "increas", "pattern", "trend",
                        "676", "530", "busiest", "most active", "concentrated"]
    pattern_hits = sum(1 for kw in pattern_keywords if kw in content)
    scores["timing_patterns"] = (
        1.0 if pattern_hits >= 2 else
        0.5 if pattern_hits >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_ssh_user_activity.md b/tasks/task_log_ssh_user_activity.md new file mode 100644 index 0000000..4224f5a --- /dev/null +++ b/tasks/task_log_ssh_user_activity.md @@ -0,0 +1,141 @@ +--- +id: task_log_ssh_user_activity +name: SSH Auth Log - User Login Activity Report +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - User Login Activity Report + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and produce a user-focused activity report. For every username mentioned in the log (both valid and invalid), summarize their authentication activity. + +Your report should include: + +1. **All Usernames Attempted**: List every username that appears in the log (both valid system users and invalid/non-existent users) +2. **Valid vs Invalid Users**: Classify each username as valid (accepted by the system) or invalid (rejected as non-existent) +3. **Per-User Summary**: For each username, show: number of attempts, source IPs, success/failure, first and last attempt timestamp +4. **Most Targeted Users**: Rank usernames by number of failed attempts +5. **Username Patterns**: Are attackers using a dictionary? Common patterns (admin, root, test, service accounts)? +6. **User Risk Assessment**: Which usernames, if they existed, would pose the greatest security risk? + +Write the report to `user_activity_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should identify: + +**Invalid Users (top by frequency):** +- admin (18 attempts), oracle (6), support (5), test (4), inspur (3), 0 (3), matlab (3), webmaster (2), guest (2), 1234 (2), and others + +**Valid Users:** +- fztu — the only user with a successful login +- root — likely a valid user that's targeted (check for "Failed password for root" vs "Failed password for invalid user root") + +**Username Patterns:** +- Common service accounts: admin, oracle, support, webmaster +- Default credentials: test, guest, 1234, 0 +- Application-specific: matlab, inspur +- This is clearly a dictionary attack using common username lists + +**Risk Assessment:** +- "admin" and "root" are highest risk — if compromised, full system access +- "oracle" suggests attackers know this is likely a Linux server running databases +- Numeric usernames like "0" and "1234" indicate automated/scripted attacks + +Acceptable variations: +- The distinction between valid and invalid users depends on parsing "invalid user" messages +- Some usernames may be ambiguous +- Risk assessment language will vary + +--- + +## Grading Criteria + +- [ ] `user_activity_report.md` is created in the workspace +- [ ] Both valid and invalid usernames are listed +- [ ] The most-targeted username (admin) is identified +- [ ] Username patterns are analyzed (dictionary attack, common defaults) +- [ ] A risk assessment is provided for the most dangerous usernames + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the SSH user activity report task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "user_activity_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "usernames_listed": 0.0, + "admin_targeted": 0.0, + "patterns_analyzed": 0.0, + "risk_assessment": 0.0, + } + + scores["output_created"] = 1.0 + content = 
report_file.read_text(encoding="utf-8").lower() + + # Check 1: Both valid and invalid usernames listed + invalid_users = ["admin", "oracle", "support", "test", "webmaster", "guest"] + valid_users = ["fztu"] + invalid_found = sum(1 for u in invalid_users if u in content) + valid_found = sum(1 for u in valid_users if u in content) + scores["usernames_listed"] = ( + 1.0 if invalid_found >= 3 and valid_found >= 1 else + 0.5 if invalid_found >= 2 else 0.0 + ) + + # Check 2: Admin identified as most targeted + scores["admin_targeted"] = ( + 1.0 if "admin" in content and any(kw in content for kw in + ["most", "top", "highest", "18", "target"]) else + 0.5 if "admin" in content else 0.0 + ) + + # Check 3: Username patterns analyzed + pattern_keywords = ["dictionary", "common", "default", "service account", + "automated", "wordlist", "brute", "pattern"] + scores["patterns_analyzed"] = ( + 1.0 if sum(1 for kw in pattern_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in pattern_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Risk assessment + risk_keywords = ["risk", "danger", "critical", "compromise", "privilege", + "escalat", "root access", "full access"] + scores["risk_assessment"] = ( + 1.0 if sum(1 for kw in risk_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in risk_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- ~100 "Invalid user" entries with various usernames +- "admin" is tried 18 times — the most popular target +- User "fztu" is the only confirmed valid user (successful login) +- "root" appears in failed password attempts but NOT as "invalid user" — suggesting root is a real account +- The username list reads like a standard SSH brute-force dictionary + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_syslog_anomalies.md b/tasks/task_log_syslog_anomalies.md new file mode 100644 index 0000000..81bbc0f --- /dev/null +++ b/tasks/task_log_syslog_anomalies.md @@ -0,0 +1,140 @@ +--- +id: task_log_syslog_anomalies +name: Linux Syslog - Anomaly Detection +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Anomaly Detection + +## Prompt + +Analyze the Linux syslog at `syslog.log` and identify anomalous or suspicious entries. The log is from a server named "combo" running a Linux 2.6 kernel, covering several months of activity. + +Your report should include: + +1. **Log Overview**: Total entries, date range, top services by volume +2. **Security Anomalies**: Entries that indicate potential attacks, exploits, or unauthorized access attempts +3. **Format String Attack Detection**: Look for entries with unusual binary content or exploit payloads in service input +4. **FTP Anomalies**: The log has heavy FTP traffic — identify any suspicious FTP connection patterns (bursts, unusual sources) +5. **rpc.statd Exploitation**: Check for rpc.statd gethostbyname errors with malformed hostnames (buffer overflow attempts) +6. **Anomaly Summary**: Rank the top 5 most concerning anomalies with severity and evidence + +Write the report to `syslog_anomalies.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse 5000 entries and identify: + +**Log Overview:** +- 5000 entries, June 9 to September 14 (2005, based on kernel version) +- Top services: ftpd (1655), sshd/pam_unix (1610), kernel (545), su/pam_unix (394) + +**Security Anomalies:** +1. 
**rpc.statd format string attack** (~9 entries on Jun 13): + - `gethostbyname error for ^X...%8x%8x...%hn%51859x%hn` — this is a buffer overflow/format string exploitation attempt against rpc.statd + - The payload contains format string specifiers (%x, %hn) which are classic exploit patterns +2. **SSH brute force** — heavy sshd(pam_unix) authentication failure volume +3. **FTP flood** — 1655 FTP connection entries, with bursts (e.g., 209.184.7.130 with multiple simultaneous connections) +4. **Authentication failures** — 2000+ pam_unix auth failure entries across SSH and other services + +**rpc.statd Exploitation:** +- 9 entries at Jun 13 11:55:04–11:55:09 +- Malformed hostname contains NOP sled (\220\220\220\220) and format string payload +- This is an attempted remote code execution exploit + +Acceptable variations: +- Anomaly ranking may differ +- Additional anomalies beyond the expected ones are welcome +- Severity assessments will vary + +--- + +## Grading Criteria + +- [ ] `syslog_anomalies.md` is created in the workspace +- [ ] Log overview with date range and service breakdown is provided +- [ ] rpc.statd format string attack is identified as a security anomaly +- [ ] FTP connection patterns are analyzed +- [ ] SSH authentication failures are flagged + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog anomaly detection task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "syslog_anomalies.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "log_overview": 0.0, + "rpc_statd_attack": 0.0, + "ftp_analysis": 0.0, + "ssh_failures": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Log overview + has_count = any(n in content for n in ["5000", "5,000"]) + has_date = any(d in content for d in ["jun", "june", "sep", "september"]) + 
scores["log_overview"] = ( + 1.0 if has_count and has_date else + 0.5 if has_count or has_date else 0.0 + ) + + # Check 2: rpc.statd attack identified + rpc_keywords = ["rpc.statd", "rpc statd", "gethostbyname", "format string", + "buffer overflow", "exploit", "%hn", "nop sled", "\\220"] + scores["rpc_statd_attack"] = ( + 1.0 if sum(1 for kw in rpc_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in rpc_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 3: FTP analysis + ftp_keywords = ["ftpd", "ftp", "1655", "209.184", "connection flood", + "burst", "ftp connection"] + scores["ftp_analysis"] = ( + 1.0 if sum(1 for kw in ftp_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in ftp_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: SSH failures flagged + ssh_keywords = ["sshd", "ssh", "authentication failure", "brute force", + "failed", "pam_unix"] + scores["ssh_failures"] = ( + 1.0 if sum(1 for kw in ssh_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in ssh_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Server: "combo", Linux 2.6.5-1.358, Fedora/Red Hat based +- Date range: Jun 9 to Sep 14 (year 2005, based on kernel build date) +- The rpc.statd attack on Jun 13 is the most serious anomaly — a real exploit attempt +- 1655 FTP connections, heavily concentrated from certain IPs (209.184.7.130) +- The su(pam_unix) entries (394) show regular privilege escalation — likely legitimate cron jobs +- Some entries contain non-UTF-8 bytes (binary exploit payloads) + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_syslog_auth_failures.md b/tasks/task_log_syslog_auth_failures.md new file mode 100644 index 0000000..5573f17 --- /dev/null +++ b/tasks/task_log_syslog_auth_failures.md @@ -0,0 +1,152 @@ +--- +id: task_log_syslog_auth_failures +name: Linux Syslog - Authentication Failure Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Authentication Failure Summary + +## Prompt + +Analyze the Linux syslog at `syslog.log` and produce a comprehensive summary of all authentication failures. The log contains PAM authentication events from multiple services. + +Your report should include: + +1. **Total Auth Failures**: Count all authentication failure entries across all services +2. **Failures by Service**: Break down failures by service (sshd, ftpd, login, su, etc.) +3. **Failures by Source**: Top 10 source hosts/IPs generating the most failures +4. **Targeted Users**: Which user accounts are being targeted in failed auth attempts? +5. **Temporal Distribution**: When do most authentication failures occur? Any spikes? +6. **FTP vs SSH Analysis**: Compare the authentication attack patterns across FTP and SSH — are the same sources attacking both? +7. **Recommendations**: Based on the failure patterns, recommend specific security improvements + +Write the report to `auth_failures_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse the ~2000+ PAM-related entries and produce: + +**Total Auth Failures:** +- Over 2000 authentication-related PAM entries +- Primary sources: sshd(pam_unix) (~1610 entries), ftpd connections (~1655 entries) + +**Failures by Service:** +- sshd(pam_unix) — the dominant source of auth failure messages +- ftpd — heavy connection volume (ftpd logs connections, not always explicit failures) +- su(pam_unix) — 394 entries (mostly legitimate — session opens/closes for cron) +- login(pam_unix) — 14 entries +- klogind — 46 entries (Kerberos login daemon) + +**Failures by Source:** +- SSH attacks come from various remote hosts (rhost= in pam entries) +- FTP connections concentrated from specific IPs (e.g., 209.184.7.130) +- Some hosts appear in both SSH and FTP failure logs + +**Targeted Users:** +- root — primary target for SSH brute force +- Various invalid usernames tried via SSH + +**Temporal Distribution:** +- Log spans Jun 9 to Sep 14 +- Attack spikes visible on specific dates +- SSH brute force tends to cluster in time + +Acceptable variations: +- Exact counts depend on how "authentication failure" is defined +- FTP entries may or may not be classified as auth failures +- Temporal analysis granularity may vary + +--- + +## Grading Criteria + +- [ ] `auth_failures_report.md` is created in the workspace +- [ ] Authentication failures are counted (2000+ pam-related entries) +- [ ] Failures are broken down by service (sshd, ftpd, su, etc.) 
+- [ ] Top source hosts are listed +- [ ] Recommendations for security improvement are provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog authentication failure summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "auth_failures_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "failures_counted": 0.0, + "by_service": 0.0, + "source_hosts": 0.0, + "recommendations": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Failures counted + has_count = any(kw in content for kw in ["2000", "2,000", "1610", "1,610", + "1655", "1,655", "thousand"]) + has_failure = "authentication failure" in content or "auth fail" in content or "failed" in content + scores["failures_counted"] = ( + 1.0 if has_count and has_failure else + 0.5 if has_failure else 0.0 + ) + + # Check 2: Broken down by service + services = ["sshd", "ftpd", "ftp", "su(pam", "su ", "login", "klogin", "pam_unix"] + services_found = sum(1 for s in services if s in content) + scores["by_service"] = ( + 1.0 if services_found >= 3 else + 0.5 if services_found >= 2 else 0.0 + ) + + # Check 3: Source hosts listed + host_indicators = ["rhost", "source", "remote", "ip", "host", "209.184", + "sagonet", "iasi", "astral"] + scores["source_hosts"] = ( + 1.0 if sum(1 for kw in host_indicators if kw in content) >= 3 else + 0.5 if sum(1 for kw in host_indicators if kw in content) >= 1 else 0.0 + ) + + # Check 4: Recommendations provided + rec_keywords = ["recommend", "should", "implement", "consider", "disable", + "block", "firewall", "fail2ban", "key-based", "rate limit"] + rec_lines = [l for l in content.split("\n") if any(kw in l for kw in rec_keywords)] + scores["recommendations"] = ( + 1.0 if len(rec_lines) >= 2 else + 0.5 if len(rec_lines) >= 1 else 0.0 + ) + + return 
scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Server "combo" has multiple authentication surfaces: SSH, FTP, telnet (klogind), local login, su +- sshd(pam_unix) is the most frequent auth service (1610 entries) +- ftpd has 1655 connection entries — the heaviest service by volume +- su(pam_unix) sessions (394) are mostly legitimate (cron jobs, uid=0 running as service users) +- login(pam_unix) has 14 entries — console/terminal logins +- klogind has 46 entries — Kerberos remote login +- The system is a 2005-era Fedora server with many exposed services — a security hardening candidate + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_syslog_cron.md b/tasks/task_log_syslog_cron.md new file mode 100644 index 0000000..b4e42b6 --- /dev/null +++ b/tasks/task_log_syslog_cron.md @@ -0,0 +1,148 @@ +--- +id: task_log_syslog_cron +name: Linux Syslog - Cron Job Execution Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Cron Job Execution Analysis + +## Prompt + +Analyze the Linux syslog at `syslog.log` and produce a report on cron job and scheduled task activity. Look for crond, anacron, logrotate, and any other scheduled execution evidence. + +Your report should include: + +1. **Cron Service Status**: When does crond start? How many startup events are there? +2. **Anacron Activity**: Document all anacron startup events and their timing +3. **Logrotate Activity**: Identify logrotate executions and what services they affect +4. **su Session Patterns**: The `su(pam_unix)` entries often indicate cron executing tasks as different users — analyze these patterns +5. **Scheduled Task Timeline**: Create a timeline of all scheduled/periodic activity +6. 
**Recurring Patterns**: Identify any regular patterns (daily, weekly) in scheduled executions + +Write the report to `cron_analysis.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should identify: + +**Cron/Anacron Startups:** +- crond startup: Jun 9 06:06:49, Jun 10 11:32:10, Jul 27 14:42:23 (3 events, aligned with system boots) +- anacron startup: Jun 9 06:06:51, Jun 10 11:32:12, Jul 27 14:42:25 (follows crond immediately) + +**Logrotate Activity:** +- 97 logrotate entries in the log +- Logrotate triggers service restarts (especially CUPS) +- Runs periodically — likely daily via anacron/cron + +**su(pam_unix) Patterns:** +- 394 su session entries +- Sessions opened for users like: htt, cyrus, news +- Pattern: "session opened for user X by (uid=0)" → "session closed for user X" +- These are cron jobs running as specific service users + +**Key scheduled users:** +- htt (web server) — regular su sessions +- cyrus (mail) — regular su sessions +- news — periodic sessions + +**Recurring Patterns:** +- Daily: logrotate runs, su sessions for service users +- At boot: crond → anacron startup sequence +- Weekly: CUPS restart pattern (shutdown + startup) + +Acceptable variations: +- Pattern detection approaches may differ +- Timeline granularity may vary +- Some scheduled patterns require inference from the su session data + +--- + +## Grading Criteria + +- [ ] `cron_analysis.md` is created in the workspace +- [ ] Crond and anacron startup events are documented +- [ ] Logrotate activity is identified (97 entries) +- [ ] su(pam_unix) sessions are linked to scheduled tasks +- [ ] Recurring patterns are identified (daily, at boot, etc.) 
+ +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog cron job analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "cron_analysis.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "cron_anacron": 0.0, + "logrotate": 0.0, + "su_sessions": 0.0, + "recurring_patterns": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Crond and anacron documented + has_crond = "crond" in content or "cron daemon" in content + has_anacron = "anacron" in content + scores["cron_anacron"] = ( + 1.0 if has_crond and has_anacron else + 0.5 if has_crond else 0.0 + ) + + # Check 2: Logrotate identified + has_logrotate = "logrotate" in content + has_count = "97" in content or any(kw in content for kw in + ["frequent", "numerous", "many logrotate"]) + scores["logrotate"] = ( + 1.0 if has_logrotate and has_count else + 0.5 if has_logrotate else 0.0 + ) + + # Check 3: su sessions analyzed + has_su = "su(" in content or "su(pam" in content or "su session" in content + has_users = sum(1 for u in ["htt", "cyrus", "news"] if u in content) + scores["su_sessions"] = ( + 1.0 if has_su and has_users >= 2 else + 0.5 if has_su else 0.0 + ) + + # Check 4: Recurring patterns identified + pattern_keywords = ["daily", "weekly", "periodic", "regular", "recurring", + "schedule", "pattern", "at boot", "every"] + scores["recurring_patterns"] = ( + 1.0 if sum(1 for kw in pattern_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in pattern_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- crond and anacron always start together at boot, 2 seconds apart +- logrotate is the most active periodic process (97 entries) +- su sessions (394 entries) are the main indicator of scheduled task 
execution +- Common cron user pattern: uid=0 opens session for service user, then closes it +- The server runs mail (cyrus), web (htt), and news services — all with cron maintenance + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_syslog_services.md b/tasks/task_log_syslog_services.md new file mode 100644 index 0000000..d74db4c --- /dev/null +++ b/tasks/task_log_syslog_services.md @@ -0,0 +1,152 @@ +--- +id: task_log_syslog_services +name: Linux Syslog - Service Start/Stop Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Service Start/Stop Summary + +## Prompt + +Analyze the Linux syslog at `syslog.log` and produce a summary of all service start and stop events. The log is from a server named "combo" running Linux 2.6. + +Your report should include: + +1. **Service Inventory**: List every service mentioned in the log with startup or shutdown events +2. **System Boot Events**: Identify all system boots/restarts by finding clusters of service startups +3. **Per-Service Status**: For each service, list all start/stop events with timestamps +4. **CUPS Special Case**: The CUPS print service has frequent restarts — document its pattern separately +5. **Service Dependencies**: Based on startup ordering, infer which services start first (core OS) vs later (applications) +6. **Uptime Estimate**: Based on boot events, estimate the server's uptime between restarts + +Write the report to `service_status_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should identify: + +**System Boots (3 detected):** +1. Jun 9 ~06:06 — Full boot (syslogd, klogd, kernel, irqbalance, portmap, etc.) +2. Jun 10 ~11:32 — Full boot (same service sequence) +3. 
Jul 27 ~14:42 — Another boot event + +**Service Inventory (20+ services with startup events):** +- Core: syslogd, klogd, irqbalance, portmap +- Network: rpc.statd, rpc.idmapd, sendmail, sm-client, named +- Security: spamd, privoxy +- Hardware: bluetooth (hcid, sdpd), smartd, apmd, gpm +- Printing: cupsd (17 startups!) +- Scheduling: crond, anacron, xinetd +- Web: htt + +**CUPS Pattern:** +- cupsd has 17 startup events and 15 shutdown events +- Regular weekly pattern: shutdown early morning (04:0x) followed by startup +- This matches logrotate triggering CUPS restart + +**Service Dependencies (boot order):** +1. syslogd, klogd (logging first) +2. kernel messages +3. irqbalance, portmap (system services) +4. Network services (rpc, named) +5. Application services (cups, cron, sendmail) + +Acceptable variations: +- Boot detection approaches may differ +- Service categorization is subjective +- Uptime calculations are approximate + +--- + +## Grading Criteria + +- [ ] `service_status_report.md` is created in the workspace +- [ ] System boot events are identified (at least 2 boots found) +- [ ] Services are listed with their start/stop events +- [ ] CUPS frequent restarts are noted (17 startups) +- [ ] Service startup ordering is analyzed + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog service status summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "service_status_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "boots_identified": 0.0, + "services_listed": 0.0, + "cups_pattern": 0.0, + "startup_ordering": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Boot events identified + boot_dates = ["jun 9", "jun 10", "jul 27", "june 9", "june 10", "july 27"] + boots_found = sum(1 for d in boot_dates if d in content) + 
has_boot = any(kw in content for kw in ["boot", "restart", "reboot", "startup"]) + scores["boots_identified"] = ( + 1.0 if boots_found >= 2 and has_boot else + 0.5 if boots_found >= 1 else 0.0 + ) + + # Check 2: Services listed + services = ["syslogd", "klogd", "crond", "cupsd", "cups", "sendmail", + "portmap", "sshd", "named", "xinetd", "smartd", "gpm"] + services_found = sum(1 for s in services if s in content) + scores["services_listed"] = ( + 1.0 if services_found >= 6 else + 0.5 if services_found >= 3 else 0.0 + ) + + # Check 3: CUPS pattern noted + has_cups = "cups" in content + has_frequent = any(kw in content for kw in ["17", "frequent", "multiple", + "regular", "weekly", "logrotate"]) + scores["cups_pattern"] = ( + 1.0 if has_cups and has_frequent else + 0.5 if has_cups else 0.0 + ) + + # Check 4: Startup ordering analyzed + order_keywords = ["order", "first", "before", "after", "sequence", + "dependency", "boot order", "startup order"] + scores["startup_ordering"] = ( + 1.0 if sum(1 for kw in order_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in order_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Server: "combo", Fedora/Red Hat, Linux 2.6.5-1.358 +- 3 full system boots detected in the log +- CUPS restarts weekly — likely triggered by logrotate +- 20+ unique services with startup events +- The boot sequence is consistent across all 3 boots +- Boot sequence shows clear ordering: logging → system → network → application services + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score.