diff --git a/tasks/manifest.yaml b/tasks/manifest.yaml index 1a80e2a..38b22cc 100644 --- a/tasks/manifest.yaml +++ b/tasks/manifest.yaml @@ -110,6 +110,30 @@ categories: - task_log_apache_critical - task_log_apache_timeline - task_log_syslog_boot + - task_log_nginx_status_codes + - task_log_nginx_traffic + - task_log_nginx_slow_requests + - task_log_nginx_user_agents + - task_log_nginx_errors + - task_log_ssh_failed_logins + - task_log_ssh_brute_force + - task_log_ssh_successful + - task_log_ssh_user_activity + - task_log_ssh_unusual_times + - task_log_hdfs_failures + - task_log_hdfs_connections + - task_log_hdfs_slow_ops + - task_log_hdfs_block_ops + - task_log_hdfs_storage + - task_log_mapreduce_jobs + - task_log_mapreduce_failures + - task_log_mapreduce_slow_tasks + - task_log_mapreduce_resources + - task_log_mapreduce_timeline + - task_log_syslog_anomalies + - task_log_syslog_services + - task_log_syslog_cron + - task_log_syslog_auth_failures meeting_analysis: - task_meeting_council_votes diff --git a/tasks/task_log_hdfs_block_ops.md b/tasks/task_log_hdfs_block_ops.md new file mode 100644 index 0000000..e8b7e5b --- /dev/null +++ b/tasks/task_log_hdfs_block_ops.md @@ -0,0 +1,146 @@ +--- +id: task_log_hdfs_block_ops +name: HDFS DataNode Log - Block Operations Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Block Operations Summary + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and produce a comprehensive summary of all block operations. The log comes from an HDFS cluster and tracks block lifecycle events. + +Your report should include: + +1. **Block Inventory**: Total unique block IDs in the log, with a full list +2. **Operation Types**: For each operation type (allocateBlock, Receiving, Received, addStoredBlock, replicate, PacketResponder), count total occurrences +3. 
**Block Lifecycle Tracking**: For each block that has a complete lifecycle (allocate → receive → stored), document the full chain +4. **Replication Chain**: For blocks with replication events, trace the replication path across nodes +5. **Associated Jobs**: Identify the MapReduce jobs that triggered these block operations (visible in file paths) +6. **Per-Block Detail Table**: Create a table with columns: Block ID, Size (if known), Allocated Path, Nodes Involved, Replication Count + +Write the report to `hdfs_block_ops_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse 2000 log entries and produce: + +**Block Inventory:** +- ~390 unique block IDs + +**Operation Counts:** +- Receiving block: ~1149 +- allocateBlock: ~385 +- Received block: ~19 +- addStoredBlock: ~19 +- PacketResponder: ~12 +- Replicate: 4 + +**Complete Block Lifecycles (blocks with full data):** +- blk_-1608999687919862906: 91178 bytes, allocated for job_200811092030_0001/job.jar +- blk_7503483334202473044: 233217 bytes, allocated for job_200811092030_0001/job.split +- blk_-3544583377289625738: 11971 bytes +- blk_-9073992586687739851: 11977 bytes + +**Replication Chain:** +- blk_-1608999687919862906 was replicated 4 times across the cluster: + 10.250.14.224 → 10.251.215.16 → 10.251.74.79 → 10.251.31.5 → 10.251.90.64 + +**Associated Job:** +- job_200811092030_0001 — MapReduce job, files: job.jar, job.split + +Acceptable variations: +- Block ID lists may be truncated +- Not all 390 blocks need full detail — just those with complete lifecycle data +- Table format may vary + +--- + +## Grading Criteria + +- [ ] `hdfs_block_ops_report.md` is created in the workspace +- [ ] Unique block count is provided (~390) +- [ ] Operation types are counted (receiving, allocate, replicate, etc.) 
+- [ ] At least one block lifecycle is fully traced (allocate → receive → stored) +- [ ] The associated MapReduce job is identified (job_200811092030_0001) + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS block operations summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_block_ops_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "block_count": 0.0, + "operations_counted": 0.0, + "lifecycle_traced": 0.0, + "job_identified": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Block count + has_count = any(n in content for n in ["390", "~390", "385", "~385", "380", "~400"]) + scores["block_count"] = ( + 1.0 if has_count else + 0.5 if any(kw in content for kw in ["hundred", "unique block"]) else 0.0 + ) + + # Check 2: Operations counted + op_keywords = ["receiving", "allocate", "replicate", "addstored", + "packetresponder", "received"] + ops_found = sum(1 for kw in op_keywords if kw in content) + scores["operations_counted"] = ( + 1.0 if ops_found >= 4 else + 0.5 if ops_found >= 2 else 0.0 + ) + + # Check 3: Block lifecycle traced + lifecycle_keywords = ["91178", "233217", "blk_-1608999687919862906", + "blk_7503483334202473044", "lifecycle", "job.jar", "job.split"] + scores["lifecycle_traced"] = ( + 1.0 if sum(1 for kw in lifecycle_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in lifecycle_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: MapReduce job identified + has_job = "job_200811092030_0001" in content or "200811092030" in content + has_mapreduce = "mapreduce" in content or "mapred" in content or "map reduce" in content + scores["job_identified"] = ( + 1.0 if has_job else + 0.5 if has_mapreduce else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + 
+- Most blocks only have "Receiving" and "allocateBlock" entries — the cluster was mid-operation +- Only ~19 blocks have complete lifecycle data with confirmed sizes +- The 390 block IDs represent a MapReduce job's data being distributed across the cluster +- Replication is only logged for blk_-1608999687919862906, which is replicated 4 times +- File paths show this is related to a MapReduce job: `/mnt/hadoop/mapred/system/job_200811092030_0001/` + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_hdfs_connections.md b/tasks/task_log_hdfs_connections.md new file mode 100644 index 0000000..35ec348 --- /dev/null +++ b/tasks/task_log_hdfs_connections.md @@ -0,0 +1,142 @@ +--- +id: task_log_hdfs_connections +name: HDFS DataNode Log - Connection Pattern Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Connection Pattern Analysis + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and produce a report on connection and communication patterns between nodes. The log contains entries from DataNode, FSNamesystem, and PacketResponder components. + +Your report should include: + +1. **Network Topology**: List all unique IP addresses that appear in the log, categorized by their role (source, destination, or both) +2. **Subnet Analysis**: Group IPs by subnet (e.g., 10.250.x.x vs 10.251.x.x). How many nodes are in each subnet? +3. **Most Active Nodes**: Top 10 IPs by frequency of appearance (as source or destination) +4. **Communication Patterns**: Which pairs of nodes communicate most frequently? +5. **DataNode vs NameSystem**: Separate the activity — what comes from DataNode operations vs FSNamesystem operations? +6. 
**Cluster Size Estimate**: Based on the IPs observed, estimate the cluster size + +Write the report to `hdfs_connections_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse 2000 log entries and produce: + +**Network Topology:** +- 202 unique IP addresses observed +- IPs fall in the 10.250.x.x and 10.251.x.x ranges (private network) +- All nodes use port 50010 (HDFS DataNode data transfer port) + +**Subnet Analysis:** +- 10.250.x.x subnet — contains some of the most active nodes +- 10.251.x.x subnet — contains additional DataNode cluster members +- The split suggests a multi-rack HDFS deployment + +**Most Active Nodes:** +- 10.250.19.102 — extremely active (appears as source in many block transfers) +- 10.250.10.6, 10.251.215.16, 10.250.14.224 — also very active + +**Component Activity:** +- DataNode$DataXceiver: Block receive operations (~1149 entries) +- FSNamesystem: Block allocation and storage tracking (~400+ entries) +- DataNode$PacketResponder: Block receive confirmations with sizes + +Acceptable variations: +- Exact IP counts and rankings may vary by parsing approach +- Subnet grouping granularity may differ +- Cluster size estimates will be approximate + +--- + +## Grading Criteria + +- [ ] `hdfs_connections_report.md` is created in the workspace +- [ ] Unique IPs are listed or counted (~202) +- [ ] IPs are grouped by subnet (10.250.x.x vs 10.251.x.x) +- [ ] Most active nodes are identified +- [ ] DataNode vs FSNamesystem activity is distinguished + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS connection pattern analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_connections_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "ips_listed": 0.0, + "subnets_grouped": 0.0, + "active_nodes": 0.0, + "components_separated": 0.0, 
+ } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: IPs listed/counted + has_count = any(n in content for n in ["202", "200", "~200", "over 200"]) + has_ips = "10.250" in content and "10.251" in content + scores["ips_listed"] = ( + 1.0 if has_count and has_ips else + 0.5 if has_ips else 0.0 + ) + + # Check 2: Subnets grouped + subnet_keywords = ["subnet", "10.250", "10.251", "rack", "network segment", + "address range", "ip range"] + scores["subnets_grouped"] = ( + 1.0 if "10.250" in content and "10.251" in content and + sum(1 for kw in subnet_keywords if kw in content) >= 2 else + 0.5 if "10.250" in content and "10.251" in content else 0.0 + ) + + # Check 3: Active nodes identified + active_ips = ["10.250.19.102", "10.251.215.16", "10.250.14.224", "10.250.10.6"] + ips_found = sum(1 for ip in active_ips if ip in content) + scores["active_nodes"] = ( + 1.0 if ips_found >= 2 else + 0.5 if ips_found >= 1 else 0.0 + ) + + # Check 4: Components separated + component_keywords = ["dataxceiver", "dataxeceiver", "fsnamesystem", + "packetresponder", "namenode", "datanode"] + scores["components_separated"] = ( + 1.0 if sum(1 for kw in component_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in component_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- 202 unique IPs — this is a large HDFS cluster +- Two main subnets: 10.250.x.x and 10.251.x.x +- Port 50010 is used throughout — standard HDFS DataNode port +- 10.250.19.102 appears as source in a disproportionate number of entries +- The log captures a burst of activity related to job_200811092030_0001 + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_hdfs_failures.md b/tasks/task_log_hdfs_failures.md new file mode 100644 index 0000000..5dc5df0 --- /dev/null +++ b/tasks/task_log_hdfs_failures.md @@ -0,0 +1,147 @@ +--- +id: task_log_hdfs_failures +name: HDFS DataNode Log - Block and Replication Failure Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Block and Replication Failure Analysis + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and identify any block operation failures, replication issues, or error conditions. The log is from an HDFS cluster and contains DataNode, FSNamesystem, and PacketResponder entries. + +Your report should include: + +1. **Log Overview**: Total entries, date/time range, log level distribution (INFO, WARN, ERROR) +2. **Block Operation Summary**: Count of block receives, allocations, stored block confirmations, and replications +3. **Error and Warning Analysis**: List any WARN or ERROR level entries with details +4. **Replication Activity**: Detail all replication requests — which blocks are being replicated, from where to where? +5. **Failed or Incomplete Operations**: Are there any blocks where receive started but confirmation was never logged? +6. **Health Assessment**: Based on the log, is the HDFS cluster operating normally? + +Write the report to `hdfs_failure_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse 2000 log entries and produce: + +**Log Overview:** +- 2000 entries, all from November 9, 2008 (081109), covering ~28 seconds (203518–203546) +- All entries are INFO level — no WARN or ERROR entries + +**Block Operations:** +- Receiving block: ~1149 entries +- Allocate block: ~385 entries +- Received block (confirmed): ~19 entries +- addStoredBlock: ~19 entries +- PacketResponder: ~12 entries +- Replication requests: 4 + +**Replication Details:** +- Block blk_-1608999687919862906 has 4 replication requests: + - 10.250.14.224 → 10.251.215.16 + - 10.251.215.16 → 10.251.74.79 + - 10.251.107.19 → 10.251.31.5 + - 10.251.31.5 → 10.251.90.64 + +**Health Assessment:** +- No errors or warnings — the cluster appears healthy +- The large number of "Receiving block" entries (1149) with relatively few confirmations (19) suggests high concurrency +- Replication activity for a single block across multiple nodes is normal HDFS behavior + +Acceptable variations: +- Exact counts may differ slightly depending on parsing approach +- Assessment language will vary +- The distinction between "no failures" and "potential incomplete operations" is valid + +--- + +## Grading Criteria + +- [ ] `hdfs_failure_report.md` is created in the workspace +- [ ] Log overview with entry count and time range is provided +- [ ] Block operations are categorized and counted (receive, allocate, replicate) +- [ ] The absence of WARN/ERROR entries is noted (or any found are detailed) +- [ ] A health assessment is provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS failure analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_failure_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "log_overview": 0.0, + "operations_counted": 0.0, + "error_status": 0.0, + 
"health_assessment": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Log overview + has_count = any(n in content for n in ["2000", "2,000"]) + has_date = any(d in content for d in ["081109", "november 9", "nov 9", "2008-11-09", "nov 2008"]) + scores["log_overview"] = ( + 1.0 if has_count and has_date else + 0.5 if has_count or has_date else 0.0 + ) + + # Check 2: Operations categorized + op_keywords = ["receiving", "allocate", "replicate", "addstored", + "packetresponder", "block operation"] + ops_found = sum(1 for kw in op_keywords if kw in content) + scores["operations_counted"] = ( + 1.0 if ops_found >= 3 else + 0.5 if ops_found >= 2 else 0.0 + ) + + # Check 3: Error status noted + error_keywords = ["no error", "no warn", "all info", "no failures", + "0 error", "0 warn", "no warning", "entirely info"] + scores["error_status"] = ( + 1.0 if sum(1 for kw in error_keywords if kw in content) >= 1 else + 0.5 if "info" in content else 0.0 + ) + + # Check 4: Health assessment + health_keywords = ["healthy", "normal", "operating correctly", "no issues", + "good health", "stable", "functioning"] + scores["health_assessment"] = ( + 1.0 if sum(1 for kw in health_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Format: `YYMMDD HHMMSS threadID LEVEL component: message` +- Date: November 9, 2008 (081109), times 203518–203546 (~28 seconds of activity) +- 202 unique IP addresses in the cluster +- 390 unique block IDs +- Block sizes range from 11,971 to 233,217 bytes +- This is a burst of HDFS activity — likely a MapReduce job starting (job_200811092030_0001 visible in paths) + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_hdfs_slow_ops.md b/tasks/task_log_hdfs_slow_ops.md new file mode 100644 index 0000000..0a1205f --- /dev/null +++ b/tasks/task_log_hdfs_slow_ops.md @@ -0,0 +1,144 @@ +--- +id: task_log_hdfs_slow_ops +name: HDFS DataNode Log - Slow Operation Detection +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Slow Operation Detection + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and identify operations that took longer than expected. The log records block receives, allocations, and replications with timestamps. + +Your report should include: + +1. **Block Lifecycle Timing**: For blocks where both "Receiving block" and "Received block" entries exist, calculate the elapsed time +2. **Allocation-to-Receive Timing**: For blocks where both "allocateBlock" and first "Receiving block" entries exist, calculate the delay +3. **Replication Timing**: How quickly are replication requests issued after block allocation? +4. **Slowest Operations**: Rank the top 5 slowest block operations by elapsed time +5. **Block Size vs Time Correlation**: Do larger blocks take longer? Correlate block size with transfer time where both are available +6. **Performance Summary**: Overall assessment of cluster performance during this period + +Write the report to `hdfs_slow_ops_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse timestamps from the log format `YYMMDD HHMMSS` and calculate: + +**Block Lifecycle:** +- The log covers only ~28 seconds (203518 to 203546) +- Most block operations complete within 1-3 seconds +- Confirmed block receives with sizes: 91178 bytes, 233217 bytes, 11971 bytes, 11977 bytes + +**Key observations:** +- blk_-1608999687919862906 (91178 bytes): Allocated at 203518, first receive at 203518, confirmed at 203519 (~1 second) +- blk_7503483334202473044 (233217 bytes): Allocated at 203520, confirmed at 203521 (~1 second) +- blk_-3544583377289625738 (11971 bytes): Confirmed at 203522-203523 +- Block operations are very fast — consistent with a healthy cluster under normal load + +**Replication:** +- 4 replication requests for blk_-1608999687919862906, issued at 203521, 203524, 203527, 203530 +- ~3 second intervals between replication hops + +**Performance:** +- All operations complete in 1-3 seconds — no slow operations detected +- The cluster is performing well during this snapshot + +Acceptable variations: +- Timestamp resolution is 1 second, so some timing analysis will be approximate +- Different approaches to matching start/end events are valid +- Block size correlation may show insufficient data for meaningful analysis + +--- + +## Grading Criteria + +- [ ] `hdfs_slow_ops_report.md` is created in the workspace +- [ ] Block lifecycle timing is calculated for at least one block +- [ ] Block sizes are correlated with operation times where data is available +- [ ] The time range of the log is correctly identified (~28 seconds) +- [ ] A performance assessment is provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS slow operation detection task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_slow_ops_report.md" + + if not report_file.exists(): + return { + 
"output_created": 0.0, + "lifecycle_timing": 0.0, + "size_correlation": 0.0, + "time_range": 0.0, + "performance_assessment": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Block lifecycle timing calculated + has_timing = any(kw in content for kw in ["1 second", "2 second", "3 second", + "elapsed", "duration", "latency", + "took", "completed in"]) + has_block = "blk_" in content or "blk-" in content or "block" in content + scores["lifecycle_timing"] = ( + 1.0 if has_timing and has_block else + 0.5 if has_timing or has_block else 0.0 + ) + + # Check 2: Block sizes mentioned + sizes = ["91178", "233217", "11971", "11977"] + sizes_found = sum(1 for s in sizes if s in content) + has_correlation = any(kw in content for kw in ["size", "bytes", "larger", "smaller"]) + scores["size_correlation"] = ( + 1.0 if sizes_found >= 2 and has_correlation else + 0.5 if sizes_found >= 1 else 0.0 + ) + + # Check 3: Time range identified + has_28s = any(kw in content for kw in ["28 second", "~28", "30 second", + "half a minute", "less than a minute"]) + has_timestamps = "203518" in content or "20:35:18" in content or "20:35" in content + scores["time_range"] = ( + 1.0 if has_28s or has_timestamps else + 0.5 if any(kw in content for kw in ["short", "brief", "seconds"]) else 0.0 + ) + + # Check 4: Performance assessment + perf_keywords = ["healthy", "normal", "fast", "no slow", "performing well", + "efficient", "optimal", "good performance"] + scores["performance_assessment"] = ( + 1.0 if sum(1 for kw in perf_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Timestamp format: YYMMDD HHMMSS (e.g., 081109 203518 = 2008-11-09 20:35:18) +- Resolution: 1 second — so sub-second timing is not available +- 390 unique blocks, but only ~19 have confirmed "Received" entries with sizes +- The cluster is in a burst of activity (job startup), so 
performance is under load +- No errors or warnings suggest all operations completed successfully + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_hdfs_storage.md b/tasks/task_log_hdfs_storage.md new file mode 100644 index 0000000..f7c56e5 --- /dev/null +++ b/tasks/task_log_hdfs_storage.md @@ -0,0 +1,148 @@ +--- +id: task_log_hdfs_storage +name: HDFS DataNode Log - Storage and Capacity Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "hdfs_datanode.log" + source: "logs/hdfs_datanode.log" +--- + +# HDFS DataNode Log - Storage and Capacity Analysis + +## Prompt + +Analyze the HDFS DataNode log at `hdfs_datanode.log` and produce a storage-focused analysis. Examine block sizes, data distribution across nodes, and storage patterns. + +Your report should include: + +1. **Data Volume**: Total bytes stored across all confirmed block receives (where size is known) +2. **Block Size Distribution**: List all known block sizes, calculate min/max/mean/median +3. **Data Distribution by Node**: For each node that confirmed receiving blocks (PacketResponder "Received" entries), total the bytes stored +4. **Storage Path Analysis**: What storage paths are being used? (Extract from allocateBlock file paths) +5. **Replication Factor**: Based on how many nodes receive the same block, what is the effective replication factor? +6. **Capacity Planning**: Based on the data ingestion rate observed, estimate the storage needed for 1 hour of similar activity + +Write the report to `hdfs_storage_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse the log and calculate: + +**Confirmed Block Sizes:** +- blk_-1608999687919862906: 91,178 bytes (received by 3+ nodes) +- blk_7503483334202473044: 233,217 bytes (received by 3 nodes) +- blk_-3544583377289625738: 11,971 bytes (received by 3 nodes) +- blk_-9073992586687739851: 11,977 bytes (received by 3 nodes) + +**Block Size Statistics:** +- Min: 11,971 bytes (~12 KB) +- Max: 233,217 bytes (~228 KB) +- Mean: ~87,086 bytes (~85 KB) +- Total confirmed data: ~348,343 bytes per replica + +**Replication Factor:** +- Each confirmed block is received by 3 nodes → replication factor of 3 +- This is standard HDFS default replication + +**Storage Path:** +- `/mnt/hadoop/mapred/system/job_200811092030_0001/` — MapReduce job staging directory +- Files: job.jar, job.split + +**Capacity Planning:** +- ~28 seconds of activity produced ~390 block allocations +- If each block averages ~85 KB with replication factor 3, that's ~100 MB/minute raw storage +- 1 hour estimate: ~6 GB (rough) + +Acceptable variations: +- Capacity estimates will be very rough given limited confirmed sizes +- Approaches to extrapolation will differ +- Statistics should be based on confirmed sizes only + +--- + +## Grading Criteria + +- [ ] `hdfs_storage_report.md` is created in the workspace +- [ ] Known block sizes are listed (91178, 233217, 11971, 11977) +- [ ] Block size statistics are calculated (min, max, mean) +- [ ] Replication factor is identified (3) +- [ ] Storage paths are extracted from the log + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the HDFS storage and capacity analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "hdfs_storage_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "block_sizes_listed": 0.0, + "statistics_calculated": 0.0, + "replication_factor": 0.0, + 
"storage_paths": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Block sizes listed + sizes = ["91178", "233217", "11971", "11977"] + sizes_found = sum(1 for s in sizes if s in content) + scores["block_sizes_listed"] = ( + 1.0 if sizes_found >= 3 else + 0.5 if sizes_found >= 1 else 0.0 + ) + + # Check 2: Statistics calculated + stat_keywords = ["min", "max", "mean", "median", "average", "total", + "distribution", "range"] + scores["statistics_calculated"] = ( + 1.0 if sum(1 for kw in stat_keywords if kw in content) >= 3 else + 0.5 if sum(1 for kw in stat_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 3: Replication factor identified + has_replication = any(kw in content for kw in ["replication factor", + "replication of 3", + "factor of 3", + "3 replicas", + "three replicas", + "3 copies", + "three copies", + "replicated 3", + "3 nodes"]) + scores["replication_factor"] = 1.0 if has_replication else 0.0 + + # Check 4: Storage paths extracted + has_path = any(p in content for p in ["/mnt/hadoop", "job_200811092030", + "mapred/system", "job.jar", "job.split"]) + scores["storage_paths"] = 1.0 if has_path else 0.0 + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Only 4 unique block sizes are confirmed in the log (from PacketResponder "Received" entries) +- 390 blocks were allocated but only ~19 receive confirmations appear in the log window +- Standard HDFS replication factor is 3, which matches the 3 receive confirmations per block +- The addStoredBlock entries (19) update the NameSystem's block map +- Storage is under `/mnt/hadoop/mapred/system/` — standard MapReduce staging directory + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_failures.md b/tasks/task_log_mapreduce_failures.md new file mode 100644 index 0000000..f7a27a9 --- /dev/null +++ b/tasks/task_log_mapreduce_failures.md @@ -0,0 +1,147 @@ +--- +id: task_log_mapreduce_failures +name: MapReduce Log - Failed Task Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Failed Task Analysis + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and identify any task failures, errors, or anomalies. Focus on anything that went wrong during execution. + +Your report should include: + +1. **Error and Warning Entries**: List all WARN and ERROR level log entries with full context +2. **Task Retries**: Identify any tasks that required multiple attempts (look for attempt numbers > 0) +3. **Root Cause Analysis**: For each error, explain the likely cause +4. **I/O Errors**: Detail any IOException or network-related failures +5. **Impact Assessment**: Did any failures impact the overall job result? +6. **Failure Prevention**: Recommend changes to prevent these failures in future runs + +Write the report to `mapreduce_failures.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should identify: + +**WARN Entries (4 total):** +1. ResponseProcessor for block BP-1347369012-10.190.173.170-1444972147527:blk_1073742514_1708 — related to I/O issue +2. DataStreamer for file /tmp/hadoop-yarn/staging/msrabi/.staging/job_1445062781478_0011/job — write pipeline issue +3. CommitterEvent Processor — FileOutputCommitter recovery task count issue + +**ERROR Entries (1 total):** +1. 
java.io.IOException: Bad response ERROR for block BP-1347369012-10.190.173.170-1444972147527:blk_1073742514_1708 from datanode — an I/O error during block write
+
+**Task Retries:**
+- attempt_1445062781478_0011_m_000006_1 (retry of m_000006_0)
+- attempt_1445062781478_0011_m_000007_1 (retry of m_000007_0)
+- Two map tasks needed a second attempt, suggesting transient failures
+
+**Root Cause:**
+- The IOException and WARN entries relate to HDFS block write failures
+- A DataNode returned a "Bad response ERROR" for block blk_1073742514_1708
+- This is a transient HDFS I/O error, likely caused by a DataNode being unavailable or overloaded
+
+**Impact:**
+- Despite the errors, the overall job SUCCEEDED
+- YARN's retry mechanism handled the transient failures transparently
+- 2 out of 10 map tasks needed retries — 20% retry rate
+
+Acceptable variations:
+- Root cause analysis depth may vary
+- Prevention recommendations will differ
+- Some agents may find additional context around the errors
+
+---
+
+## Grading Criteria
+
+- [ ] `mapreduce_failures.md` is created in the workspace
+- [ ] WARN and ERROR entries are listed (4 WARN, 1 ERROR)
+- [ ] Task retries are identified (m_000006 and m_000007 retried)
+- [ ] The IOException / bad response error is analyzed
+- [ ] Impact assessment notes the job still succeeded
+
+---
+
+## Automated Checks
+
+```python
+def grade(transcript: list, workspace_path: str) -> dict:
+    """Grade the MapReduce failed task analysis."""
+    from pathlib import Path
+
+    scores = {}
+    workspace = Path(workspace_path)
+    report_file = workspace / "mapreduce_failures.md"
+
+    if not report_file.exists():
+        return {
+            "output_created": 0.0,
+            "warn_error_listed": 0.0,
+            "retries_identified": 0.0,
+            "ioexception_analyzed": 0.0,
+            "impact_assessed": 0.0,
+        }
+
+    scores["output_created"] = 1.0
+    content = report_file.read_text(encoding="utf-8").lower()
+
+    # Check 1: WARN/ERROR entries listed
+    has_warn = "warn" in content
+    has_error = "error" in content
+    # Exact counts ("4 warn", "1 error") are intentionally NOT required for credit.
+    scores["warn_error_listed"] = (
+        1.0 if has_warn and has_error else
+        0.5 if has_warn or has_error else 0.0
+    )
+
+    # Check 2: Task retries identified
+    has_006 = "m_000006" in content or "000006" in content
+    has_007 = "m_000007" in content or "000007" in content
+    has_retry = any(kw in content for kw in ["retry", "retried", "reattempt", "second attempt",
+                                             "000006_1", "000007_1", "attempt 1"])  # not bare "_1": it matches the job ID
+    scores["retries_identified"] = (
+        1.0 if (has_006 or has_007) and has_retry else
+        0.5 if has_retry else 0.0
+    )
+
+    # Check 3: IOException analyzed
+    io_keywords = ["ioexception", "io exception", "bad response", "block write",
+                   "datanode", "datastreamer", "blk_1073742514"]
+    scores["ioexception_analyzed"] = (
+        1.0 if sum(1 for kw in io_keywords if kw in content) >= 2 else
+        0.5 if sum(1 for kw in io_keywords if kw in content) >= 1 else 0.0
+    )
+
+    # Check 4: Impact assessment
+    impact_keywords = ["succeeded", "success", "still completed", "job completed",
+                       "transparent", "handled", "recovered", "despite"]
+    scores["impact_assessed"] = (
+        1.0 if sum(1 for kw in impact_keywords if kw in content) >= 1 else 0.0
+    )
+
+    return scores
+```
+
+---
+
+## Additional Notes
+
+**Key facts from the log:**
+
+- 1206 INFO, 4 WARN, 1 ERROR entries out of 1282 total
+- The ERROR is a Java IOException wrapped in a WARN-level DataStreamer message
+- Block BP-1347369012-10.190.173.170-1444972147527:blk_1073742514_1708 had a write failure
+- FileOutputCommitter also logged a WARN about recovery task count
+- Despite these issues, all 10 map tasks and 1 reduce task eventually completed
+
+**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score.
diff --git a/tasks/task_log_mapreduce_jobs.md b/tasks/task_log_mapreduce_jobs.md new file mode 100644 index 0000000..82f81e2 --- /dev/null +++ b/tasks/task_log_mapreduce_jobs.md @@ -0,0 +1,147 @@ +--- +id: task_log_mapreduce_jobs +name: MapReduce Log - Job Completion Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Job Completion Summary + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and produce a comprehensive job completion summary. The log is from a MapReduce v2 (YARN) application. + +Your report should include: + +1. **Job Identification**: Job ID, application attempt ID, and job name/type +2. **Job Configuration**: OutputCommitter type, file system, and any other configuration details +3. **Task Summary**: Total map tasks, total reduce tasks, how many of each completed successfully +4. **Task Completion Timeline**: When did each task complete? Create a timeline showing the order of task completions with timestamps +5. **Job Duration**: Total job runtime from start to finish +6. **Final Status**: Did the job succeed or fail? What was the final transition? + +Write the report to `job_completion_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse 1282 log entries and produce: + +**Job Identification:** +- Job ID: job_1445062781478_0011 +- Application Attempt: appattempt_1445062781478_0011_000001 +- Job type: pagerank (visible in history file path) +- User: msrabi + +**Configuration:** +- OutputCommitter: FileOutputCommitter +- File system: hdfs://msra-sa-41:9000 +- API: mapred newApiCommitter + +**Task Summary:** +- 10 map tasks (m_000000 through m_000009), plus 2 retries (m_000006_1, m_000007_1) +- 1 reduce task (r_000000) +- All 11 tasks completed successfully (10 map + 1 reduce) +- Total of 12 map task attempts, 1 reduce task attempt + +**Timeline:** +- Job start: 15:37:56 +- First map completion: 15:39:24 (m_000009) +- Last map completion: 15:41:25 (m_000006) +- Reduce completion: 15:42:46 (r_000000) +- Job finish: ~15:42:47 + +**Duration:** ~5 minutes (15:37:56 to 15:42:47) + +**Final Status:** SUCCEEDED + +Acceptable variations: +- Timeline formatting may differ +- Duration calculation approach may vary +- Task numbering notation may differ + +--- + +## Grading Criteria + +- [ ] `job_completion_report.md` is created in the workspace +- [ ] Job ID (job_1445062781478_0011) is identified +- [ ] Map and reduce task counts are correct (10 map tasks, 1 reduce task) +- [ ] Job duration is calculated (~5 minutes) +- [ ] Final status is identified as SUCCEEDED + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce job completion summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "job_completion_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "job_id": 0.0, + "task_counts": 0.0, + "duration": 0.0, + "final_status": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Job ID identified + scores["job_id"] = ( + 1.0 if 
"job_1445062781478_0011" in content or "1445062781478_0011" in content else 0.0 + ) + + # Check 2: Task counts correct + has_10_map = any(kw in content for kw in ["10 map", "ten map", "10 mapper"]) + has_1_reduce = any(kw in content for kw in ["1 reduce", "one reduce", "single reduce", + "1 reducer"]) + scores["task_counts"] = ( + 1.0 if has_10_map and has_1_reduce else + 0.5 if has_10_map or has_1_reduce else 0.0 + ) + + # Check 3: Duration calculated + duration_keywords = ["5 minute", "~5 min", "4 minute", "4:51", "4:50", + "approximately 5", "about 5", "15:37", "15:42", + "nearly 5"] + scores["duration"] = ( + 1.0 if sum(1 for kw in duration_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Final status + scores["final_status"] = ( + 1.0 if "succeeded" in content else + 0.5 if "success" in content or "completed" in content else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Single MapReduce job: a pagerank computation by user "msrabi" +- YARN-based (v2 MapReduce) on a cluster with namenode msra-sa-41 +- 12 map task attempts for 10 map tasks (m_000006 and m_000007 each had retries) +- The job history file confirms SUCCEEDED status with 10 maps and 1 reduce +- October 17, 2015 +- Container allocation visible: 13 unique containers used + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_resources.md b/tasks/task_log_mapreduce_resources.md new file mode 100644 index 0000000..6a59617 --- /dev/null +++ b/tasks/task_log_mapreduce_resources.md @@ -0,0 +1,143 @@ +--- +id: task_log_mapreduce_resources +name: MapReduce Log - Resource Utilization Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Resource Utilization Analysis + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and produce a resource utilization report. Focus on container allocation, scheduling, and resource usage patterns. + +Your report should include: + +1. **Container Inventory**: List all containers allocated for this job, with their IDs +2. **Container Allocation Timeline**: When was each container requested and assigned? +3. **Scheduling Analysis**: Track pending maps and reduces over time from the RMContainerAllocator entries +4. **Reduce Scheduling**: When did the reduce slow start threshold get met? What was the completion percentage? +5. **Container Reuse**: Were any containers completed and then reused? +6. **Resource Efficiency**: Based on container allocation vs task completion patterns, assess resource efficiency + +Write the report to `mapreduce_resources.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse RMContainerAllocator entries and produce: + +**Container Inventory:** +- 13 unique containers used (container_1445062781478_0011_01_000001 through 000013) +- Application attempt: 01 + +**Scheduling Progression:** +- Initial state: 10 pending maps, 1 pending reduce +- Reduce slow start threshold repeatedly "not met" from 15:38:00 to 15:39:24 +- First map completes at 15:39:24 (10% complete) — still not enough for reduce +- Reduce scheduling begins when completedMapPercent reaches sufficient threshold +- "completedMapPercent 0.1 totalResources 2" logged at 15:39:24 + +**Container Lifecycle:** +- Containers allocated around 15:38:00–15:38:15 +- Containers released as tasks complete +- "Received completed container" entries track when containers finish + +**Key observations:** +- The reduce task had to wait for enough maps to complete (slow start) +- Map tasks had varying completion times (1.5 to 3.5 minutes) +- Container turnover: some containers were released and their resources freed quickly + +Acceptable variations: +- Container ID enumeration approach may vary +- Timeline granularity may differ +- Resource efficiency assessment is subjective + +--- + +## Grading Criteria + +- [ ] `mapreduce_resources.md` is created in the workspace +- [ ] Containers are listed (13 containers identified) +- [ ] Scheduling progression is tracked (pending maps/reduces over time) +- [ ] Reduce slow start threshold discussion is included +- [ ] Container completion events are analyzed + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce resource utilization analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "mapreduce_resources.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "containers_listed": 0.0, + "scheduling_tracked": 0.0, + "slow_start": 0.0, + 
"container_completion": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Containers listed + has_container = "container_" in content or "container" in content + has_count = any(n in content for n in ["13 container", "13 unique", "thirteen"]) + scores["containers_listed"] = ( + 1.0 if has_container and has_count else + 0.5 if has_container else 0.0 + ) + + # Check 2: Scheduling tracked + sched_keywords = ["pending", "scheduled", "pendingreds", "pendingmaps", + "scheduledmaps", "scheduling"] + scores["scheduling_tracked"] = ( + 1.0 if sum(1 for kw in sched_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in sched_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 3: Reduce slow start discussed + slow_start_keywords = ["slow start", "slowstart", "threshold", "reduce.*wait", + "completedmappercent", "not met"] + scores["slow_start"] = ( + 1.0 if sum(1 for kw in slow_start_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in slow_start_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Container completion analyzed + completion_keywords = ["completed container", "container released", "received completed", + "container finish", "freed"] + scores["container_completion"] = ( + 1.0 if sum(1 for kw in completion_keywords if kw in content) >= 1 else + 0.5 if "complet" in content else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- YARN-based MapReduce (v2) on cluster with RM +- 13 containers for 10 map tasks + 1 reduce + 1 AM container + retries +- The reduce slow start threshold is a standard Hadoop optimization +- "Before Scheduling" / "After Scheduling" entries provide scheduling state snapshots +- Final stats: "PendingReds:0 ScheduledMaps:0" — all resources freed + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_slow_tasks.md b/tasks/task_log_mapreduce_slow_tasks.md new file mode 100644 index 0000000..a16b08d --- /dev/null +++ b/tasks/task_log_mapreduce_slow_tasks.md @@ -0,0 +1,148 @@ +--- +id: task_log_mapreduce_slow_tasks +name: MapReduce Log - Slow Task Identification +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Slow Task Identification + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and identify which map and reduce tasks were slowest. Compare task completion times to find stragglers. + +Your report should include: + +1. **Task Completion Times**: For each completed task, calculate the time from container assignment to task completion +2. **Fastest vs Slowest**: Identify the fastest and slowest map tasks, and the reduce task timing +3. **Straggler Analysis**: Are there any tasks that took significantly longer than average? Quantify the deviation +4. **Retry Impact**: For tasks that were retried (attempt > 0), how did the retry time compare to the original? +5. **Reduce Phase Timing**: When did the reduce task start relative to map completions? How long did it take? +6. **Bottleneck Identification**: What was the critical path? Which task(s) determined the overall job duration? + +Write the report to `slow_tasks_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should extract task completion timestamps and calculate: + +**Map Task Completions (in order):** +1. m_000009: completed 15:39:24 (first) +2. m_000005: completed 15:40:28 +3. m_000003: completed 15:40:32 +4. m_000000: completed 15:40:34 +5. m_000001: completed 15:40:50 +6. m_000002: completed 15:40:50 +7. m_000004: completed 15:40:50 +8. m_000008: completed 15:40:52 +9. m_000007: completed 15:41:12 (retry — _1 attempt) +10. 
m_000006: completed 15:41:25 (retry — _1 attempt, slowest/last) + +**Reduce Task:** +- r_000000: completed 15:42:46 + +**Key findings:** +- m_000009 completed first at 15:39:24 — ~1.5 minutes after job start +- m_000006 completed last at 15:41:25 — ~3.5 minutes after job start (it's a retry) +- The retried tasks (m_000006, m_000007) were the slowest because they had to restart +- Spread between first and last map: ~2 minutes +- Reduce started after enough maps completed and finished about 1.3 minutes later + +Acceptable variations: +- Exact durations depend on which timestamps are used as start reference +- Different definitions of "task start" are acceptable +- Straggler threshold may vary + +--- + +## Grading Criteria + +- [ ] `slow_tasks_report.md` is created in the workspace +- [ ] Individual task completion times are listed +- [ ] Fastest and slowest map tasks are identified +- [ ] Retried tasks (m_000006, m_000007) are flagged as slower +- [ ] The reduce task timing is analyzed separately + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce slow task identification task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "slow_tasks_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "completion_times": 0.0, + "fastest_slowest": 0.0, + "retries_flagged": 0.0, + "reduce_timing": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Task completion times listed + task_ids = ["m_000009", "m_000005", "m_000003", "m_000000", "m_000006"] + tasks_found = sum(1 for t in task_ids if t in content) + scores["completion_times"] = ( + 1.0 if tasks_found >= 4 else + 0.5 if tasks_found >= 2 else 0.0 + ) + + # Check 2: Fastest and slowest identified + has_fastest = any(kw in content for kw in ["fastest", "first to complete", + "earliest", "quickest"]) 
+ has_slowest = any(kw in content for kw in ["slowest", "last to complete", + "longest", "straggler"]) + scores["fastest_slowest"] = ( + 1.0 if has_fastest and has_slowest else + 0.5 if has_fastest or has_slowest else 0.0 + ) + + # Check 3: Retried tasks flagged + has_retry = any(kw in content for kw in ["retry", "retried", "reattempt", + "second attempt"]) + has_slow_retry = any(kw in content for kw in ["m_000006", "m_000007"]) + scores["retries_flagged"] = ( + 1.0 if has_retry and has_slow_retry else + 0.5 if has_retry else 0.0 + ) + + # Check 4: Reduce timing analyzed + has_reduce = "r_000000" in content or "reduce" in content + has_reduce_time = any(t in content for t in ["15:42", "42:46"]) + scores["reduce_timing"] = ( + 1.0 if has_reduce and has_reduce_time else + 0.5 if has_reduce else 0.0 + ) + + return scores + ``` + + --- + + ## Additional Notes + + **Key facts from the log:** + + - Job started at 15:37:56, ended at ~15:42:47 + - Map tasks were assigned containers starting around 15:38:00 + - Reduce slow start threshold was not met until enough maps completed + - Two tasks (m_000006, m_000007) failed on first attempt and succeeded on retry + - The retries added ~30-55 seconds to total map phase time + - The critical path runs through the last map completion (m_000006 at 15:41:25) plus the reduce phase + + **Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_mapreduce_timeline.md b/tasks/task_log_mapreduce_timeline.md new file mode 100644 index 0000000..65aed5e --- /dev/null +++ b/tasks/task_log_mapreduce_timeline.md @@ -0,0 +1,159 @@ +--- +id: task_log_mapreduce_timeline +name: MapReduce Log - Job Timeline Visualization +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "mapreduce.log" + source: "logs/hadoop_mapreduce.log" +--- + +# MapReduce Log - Job Timeline Visualization + +## Prompt + +Analyze the Hadoop MapReduce application log at `mapreduce.log` and create a detailed timeline visualization of the entire job execution. Show all major events in chronological order. + +Your output should include: + +1. **Event Timeline**: A chronological list of every significant event with timestamp, including: + - Job initialization events + - Container allocations + - Task starts and completions + - Errors and warnings + - Reduce phase start + - Job completion +2. **Phase Diagram**: Divide the job into phases (initialization, map phase, shuffle, reduce phase, cleanup) with start/end times and durations +3. **Gantt-Style Task View**: Show each task (m_000000 through m_000009, r_000000) with approximate start and end times in a text-based timeline +4. **Critical Events**: Highlight the most impactful events (errors, retries, job state transitions) +5. **Concurrency Analysis**: At each point in time, how many tasks were running in parallel? + +Write the report to `mapreduce_timeline.md` as a well-structured markdown document with ASCII/text-based visualizations. 
+ +--- + +## Expected Behavior + +The agent should produce a timeline like: + +**Phase Breakdown:** +| Phase | Start | End | Duration | +|---|---|---|---| +| Initialization | 15:37:56 | 15:38:00 | ~4s | +| Map Phase | 15:38:00 | 15:41:25 | ~3m 25s | +| Reduce Phase | 15:39:24 | 15:42:46 | ~3m 22s | +| Cleanup | 15:42:46 | 15:42:47 | ~1s | +| **Total** | **15:37:56** | **15:42:47** | **~4m 51s** | + +**Key Events:** +- 15:37:56 — MRAppMaster created +- 15:37:57 — OutputCommitter set (FileOutputCommitter) +- 15:38:00 — Container allocation begins (10 maps pending, 1 reduce pending) +- 15:39:24 — First map completes (m_000009), Num completed: 1 +- 15:40:28–15:40:52 — Rapid map completions (tasks 2-8) +- 15:40:45 — WARN: Block I/O error (ResponseProcessor, DataStreamer) +- 15:41:12 — m_000007 completes (retry attempt) +- 15:41:25 — m_000006 completes (retry attempt, last map) +- 15:42:46 — r_000000 completes, Num completed: 11 +- 15:42:46 — Job transitions to SUCCEEDED +- 15:42:47 — Final stats logged + +**Concurrency:** +- Peak: up to 10 map tasks running simultaneously +- After 15:39:24, concurrency decreases as maps complete + +Acceptable variations: +- ASCII visualization style will vary +- Not every log entry needs to be in the timeline — major events are sufficient +- Phase definitions may differ slightly + +--- + +## Grading Criteria + +- [ ] `mapreduce_timeline.md` is created in the workspace +- [ ] Events are listed chronologically with timestamps +- [ ] Phases are identified (init, map, reduce, completion) +- [ ] A visual or structured timeline/gantt is attempted +- [ ] Key events (first map completion, errors, job success) are highlighted + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the MapReduce timeline visualization task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "mapreduce_timeline.md" + + if not report_file.exists(): + 
return { + "output_created": 0.0, + "chronological_events": 0.0, + "phases_identified": 0.0, + "visual_timeline": 0.0, + "key_events_highlighted": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Chronological events with timestamps + timestamps = ["15:37", "15:38", "15:39", "15:40", "15:41", "15:42"] + ts_found = sum(1 for ts in timestamps if ts in content) + scores["chronological_events"] = ( + 1.0 if ts_found >= 5 else + 0.5 if ts_found >= 3 else 0.0 + ) + + # Check 2: Phases identified + phase_keywords = ["initialization", "init", "map phase", "reduce phase", + "shuffle", "cleanup", "completion", "startup"] + phases_found = sum(1 for kw in phase_keywords if kw in content) + scores["phases_identified"] = ( + 1.0 if phases_found >= 3 else + 0.5 if phases_found >= 2 else 0.0 + ) + + # Check 3: Visual/structured timeline attempted + # Check for table-like structures or ASCII art + lines = content.split("\n") + table_lines = [l for l in lines if l.count("|") >= 2] + ascii_lines = [l for l in lines if any(c in l for c in ["─", "━", "═", "▓", "█", "░"])] + scores["visual_timeline"] = ( + 1.0 if len(table_lines) >= 5 or len(ascii_lines) >= 3 else + 0.5 if len(table_lines) >= 2 else 0.0 + ) + + # Check 4: Key events highlighted + key_events = ["mrappmaster", "first map", "succeeded", "warn", "error", + "m_000009", "r_000000", "retry", "completed"] + events_found = sum(1 for kw in key_events if kw in content) + scores["key_events_highlighted"] = ( + 1.0 if events_found >= 4 else + 0.5 if events_found >= 2 else 0.0 + ) + + return scores + ``` + + --- + + ## Additional Notes + + **Key facts from the log:** + + - Total job duration: ~4 minutes 51 seconds + - Map phase and reduce phase overlap — reduce starts while maps are still running + - The reduce "slow start" threshold meant the reduce task didn't 
get scheduled immediately +- Two map task retries (m_000006, m_000007) extended the map phase by about 35 seconds +- 1282 log entries total, but only ~50 represent major state transitions + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_nginx_errors.md b/tasks/task_log_nginx_errors.md new file mode 100644 index 0000000..6883f30 --- /dev/null +++ b/tasks/task_log_nginx_errors.md @@ -0,0 +1,149 @@ +--- +id: task_log_nginx_errors +name: Nginx Access Log - Error Pattern Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - Error Pattern Analysis + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a detailed report on error patterns (4xx and 5xx responses). Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Error Overview**: Total errors, error rate (as percentage of all requests), breakdown by status code +2. **404 Analysis**: Which paths are returning 404? Are these legitimate missing resources or misconfigured routes? +3. **403 Analysis**: What's being forbidden and from which IPs? +4. **Error by Client IP**: Which IPs generate the most errors? Top 10 with counts +5. **Error by Path**: Which request paths generate the most errors? Top 10 with counts +6. **Temporal Pattern**: Are errors concentrated at certain times or spread evenly? +7. **Remediation Recommendations**: Based on the error patterns, suggest 3 specific fixes + +Write the report to `error_analysis.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse all 1000 JSON log entries and produce: + +**Error Overview:** +- Total errors: 690 (69.0% of all requests) +- 404: 688 errors +- 403: 2 errors +- No 5xx errors observed + +**404 Analysis:** +- All 404s target `/downloads/product_1` and `/downloads/product_2` +- These same paths also return 200 and 304 at other times +- This suggests intermittent resource availability, not permanently missing files + +**403 Analysis:** +- 2 forbidden requests — identify the IPs and paths + +**Top Error IPs:** +- 80.91.33.133 is the highest-volume IP overall and likely the top error generator +- Other high-frequency IPs: 5.83.131.103, 202.143.95.26, 50.57.209.92 + +**Key insight:** +- The extremely high 404 rate (68.8%) on a package download server is unusual +- Package managers retry automatically, which amplifies the error count +- The root cause is likely transient unavailability of download resources + +Acceptable variations: +- Exact counts are deterministic +- Remediation suggestions will vary +- Assessment depth may differ + +--- + +## Grading Criteria + +- [ ] `error_analysis.md` is created in the workspace +- [ ] Error rate and status code breakdown are provided (690 errors, 69%, 404/403 split) +- [ ] 404 errors are analyzed by path (/downloads/product_1, /downloads/product_2) +- [ ] Top error-generating IPs are listed +- [ ] At least 2 remediation recommendations are provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Nginx error pattern analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "error_analysis.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "error_rate_breakdown": 0.0, + "path_analysis": 0.0, + "top_error_ips": 0.0, + "recommendations": 0.0, + } + + scores["output_created"] = 1.0 + content = 
report_file.read_text(encoding="utf-8").lower() + + # Check 1: Error rate and breakdown + has_total = any(n in content for n in ["690", "688", "69%", "68.8%", "69.0%"]) + has_404 = "404" in content + has_403 = "403" in content + scores["error_rate_breakdown"] = ( + 1.0 if has_total and has_404 and has_403 else + 0.5 if has_404 and has_total else 0.0 + ) + + # Check 2: Path analysis + has_product_1 = "product_1" in content + has_product_2 = "product_2" in content + scores["path_analysis"] = ( + 1.0 if has_product_1 and has_product_2 else + 0.5 if has_product_1 or has_product_2 else 0.0 + ) + + # Check 3: Top error IPs + top_ips = ["80.91.33.133", "5.83.131.103", "202.143.95.26", "50.57.209.92"] + ips_found = sum(1 for ip in top_ips if ip in content) + scores["top_error_ips"] = ( + 1.0 if ips_found >= 3 else + 0.5 if ips_found >= 1 else 0.0 + ) + + # Check 4: Recommendations provided + rec_keywords = ["recommend", "suggestion", "fix", "should", "consider", + "implement", "configure", "add", "improve"] + lines = content.split("\n") + rec_lines = [l for l in lines if any(kw in l for kw in rec_keywords)] + scores["recommendations"] = ( + 1.0 if len(rec_lines) >= 2 else + 0.5 if len(rec_lines) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- 690 out of 1000 requests are errors — almost entirely 404s +- Both `/downloads/product_1` and `/downloads/product_2` return a mix of 200, 304, and 404 +- This pattern is consistent with a package repository where files are being updated/rotated +- Only 2 entries with 403 Forbidden +- Zero 5xx errors — the server itself is healthy + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_nginx_slow_requests.md b/tasks/task_log_nginx_slow_requests.md new file mode 100644 index 0000000..2a7e5e3 --- /dev/null +++ b/tasks/task_log_nginx_slow_requests.md @@ -0,0 +1,135 @@ +--- +id: task_log_nginx_slow_requests +name: Nginx Access Log - Find Largest Responses +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - Find Largest Responses + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and identify the requests that generated the largest responses (by bytes transferred). Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Top 10 Largest Responses**: List the requests with the highest byte counts, including timestamp, client IP, request path, status code, and bytes +2. **Byte Distribution Summary**: Overall statistics — min, max, mean, median bytes transferred (excluding zero-byte responses) +3. **Zero-Byte Responses**: Count of zero-byte responses and which status codes produce them +4. **Large Response Analysis**: What paths and client IPs are associated with the largest transfers? +5. **Efficiency Assessment**: What percentage of requests result in actual data transfer vs cache hits (304)? + +Write the report to `large_responses_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse all 1000 JSON log entries and produce: + +**Top Largest Responses:** +- Maximum bytes observed: ~3318 bytes +- Largest responses are 200 OK responses for `/downloads/product_1` and `/downloads/product_2` +- Top byte values include: 3318, 3316, 3301, 2582, 2578, etc. 
+ +**Zero-Byte Analysis:** +- 304 Not Modified responses all have 0 bytes (274 entries) +- 404 responses have small byte counts (300-340 range typically) + +**Efficiency:** +- ~274 out of 1000 requests are 304 (cache hits) — 27.4% +- 200 OK with data: ~35 requests — 3.5% +- 404 errors: 688 requests — these transfer small error pages + +Acceptable variations: +- Exact byte values are deterministic from the log +- Assessment wording will vary +- Top 10 vs top 20 is fine + +--- + +## Grading Criteria + +- [ ] `large_responses_report.md` is created in the workspace +- [ ] Top largest responses are listed with byte counts +- [ ] Zero-byte / 304 responses are analyzed separately +- [ ] Distribution statistics (min, max, mean or median) are provided +- [ ] Paths associated with largest responses are identified + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Nginx largest responses task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "large_responses_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "top_responses_listed": 0.0, + "zero_byte_analysis": 0.0, + "distribution_stats": 0.0, + "paths_identified": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Top responses listed with byte counts + has_max_bytes = any(b in content for b in ["3318", "3316", "3301"]) + has_ranking = any(kw in content for kw in ["top", "largest", "biggest", "highest"]) + scores["top_responses_listed"] = ( + 1.0 if has_max_bytes and has_ranking else + 0.5 if has_max_bytes else 0.0 + ) + + # Check 2: Zero-byte / 304 analysis + has_zero = "0 byte" in content or "zero byte" in content or "zero-byte" in content or "no data" in content + has_304 = "304" in content + scores["zero_byte_analysis"] = ( + 1.0 if has_304 and has_zero else + 0.5 if has_304 else 0.0 + ) + + # Check 
3: Distribution statistics + stat_keywords = ["min", "max", "mean", "median", "average", "total bytes", + "distribution", "range"] + scores["distribution_stats"] = ( + 1.0 if sum(1 for kw in stat_keywords if kw in content) >= 3 else + 0.5 if sum(1 for kw in stat_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Paths identified + has_product_1 = "product_1" in content + has_product_2 = "product_2" in content + has_downloads = "downloads" in content or "/download" in content + scores["paths_identified"] = ( + 1.0 if has_product_1 and has_product_2 else + 0.5 if has_downloads else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Maximum response size is only ~3318 bytes — this is a lightweight download server +- The vast majority of responses are either 304 (0 bytes) or 404 (small error page) +- Only ~35 requests return 200 with actual content +- All requests target just two paths: `/downloads/product_1` and `/downloads/product_2` + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_nginx_status_codes.md b/tasks/task_log_nginx_status_codes.md new file mode 100644 index 0000000..595eefd --- /dev/null +++ b/tasks/task_log_nginx_status_codes.md @@ -0,0 +1,142 @@ +--- +id: task_log_nginx_status_codes +name: Nginx Access Log - HTTP Status Code Distribution +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - HTTP Status Code Distribution + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a report on HTTP status code distribution. Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Total Requests**: Total number of log entries +2. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the Nginx status code distribution task.

    Checks the report produced by the agent against four content
    criteria via case-insensitive substring matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    criteria = ("output_created", "total_count", "status_codes_listed",
                "categories_grouped", "top_error_ips")
    report_path = Path(workspace_path) / "status_code_report.md"

    # No report file: everything scores zero.
    if not report_path.exists():
        return {name: 0.0 for name in criteria}

    text = report_path.read_text(encoding="utf-8").lower()
    results = {"output_created": 1.0}

    # Check 1: total request count (with or without thousands separator).
    results["total_count"] = 1.0 if ("1000" in text or "1,000" in text) else 0.0

    # Check 2: individual status codes mentioned in the report.
    code_hits = sum(1 for code in ("200", "304", "404", "403") if code in text)
    if code_hits >= 4:
        results["status_codes_listed"] = 1.0
    elif code_hits >= 3:
        results["status_codes_listed"] = 0.5
    else:
        results["status_codes_listed"] = 0.0

    # Check 3: codes grouped into categories (numeric label or name).
    category_hits = sum([
        "2xx" in text or "success" in text,
        "3xx" in text or "redirect" in text,
        "4xx" in text or "client error" in text,
    ])
    if category_hits >= 3:
        results["categories_grouped"] = 1.0
    elif category_hits >= 2:
        results["categories_grouped"] = 0.5
    else:
        results["categories_grouped"] = 0.0

    # Check 4: known top error-generating IPs identified.
    known_error_ips = ("80.91.33.133", "5.83.131.103",
                       "202.143.95.26", "50.57.209.92")
    ip_hits = sum(1 for ip in known_error_ips if ip in text)
    if ip_hits >= 2:
        results["top_error_ips"] = 1.0
    elif ip_hits >= 1:
        results["top_error_ips"] = 0.5
    else:
        results["top_error_ips"] = 0.0

    return results
diff --git a/tasks/task_log_nginx_traffic.md b/tasks/task_log_nginx_traffic.md new file mode 100644 index 0000000..cc490cf --- /dev/null +++ b/tasks/task_log_nginx_traffic.md @@ -0,0 +1,137 @@ +--- +id: task_log_nginx_traffic +name: Nginx Access Log - Traffic Patterns by Time +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - Traffic Patterns by Time + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a report on traffic patterns over time. Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Time Range**: The full date/time range covered by the log +2. **Hourly Traffic Breakdown**: Number of requests per hour +3. **Peak and Low Traffic**: Identify the busiest and quietest hours +4. **Bandwidth Over Time**: Total bytes transferred per hour +5. **Request Rate Trends**: Are requests steady, bursty, or showing a trend? +6. **Per-IP Activity Over Time**: Identify IPs that appear across multiple hours vs those that appear in bursts + +Write the report to `traffic_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the Nginx traffic patterns task.

    Scans the agent-produced report for the required time-based
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "traffic_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "time_range": 0.0,
            "hourly_breakdown": 0.0,
            "peak_identified": 0.0,
            "bandwidth_analysis": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Time range identified (date plus start/end or duration).
    has_date = any(d in content for d in ["may 17", "2015-05-17", "17/may/2015", "may 2015"])
    has_range = any(t in content for t in ["08:05", "16:05", "8 hour", "eight hour"])
    scores["time_range"] = (
        1.0 if has_date and has_range else
        0.5 if has_date else 0.0
    )

    # Check 2: Traffic broken down by time period.
    time_keywords = ["hour", "period", "interval", "08:", "09:", "10:", "11:", "12:",
                     "13:", "14:", "15:", "16:"]
    time_sections = sum(1 for kw in time_keywords if kw in content)
    scores["hourly_breakdown"] = (
        1.0 if time_sections >= 4 else
        0.5 if time_sections >= 2 else 0.0
    )

    # Check 3: Peak/busiest periods identified.
    # Count the keyword hits once rather than re-scanning the report
    # for each threshold branch.
    peak_keywords = ["peak", "busiest", "highest", "most active", "maximum",
                     "lowest", "quietest", "least"]
    peak_hits = sum(1 for kw in peak_keywords if kw in content)
    scores["peak_identified"] = (
        1.0 if peak_hits >= 2 else
        0.5 if peak_hits >= 1 else 0.0
    )

    # Check 4: Bandwidth/bytes analysis (same single-pass counting).
    bandwidth_keywords = ["bytes", "bandwidth", "transfer", "data", "0 bytes",
                          "304", "not modified"]
    bandwidth_hits = sum(1 for kw in bandwidth_keywords if kw in content)
    scores["bandwidth_analysis"] = (
        1.0 if bandwidth_hits >= 2 else
        0.5 if bandwidth_hits >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_nginx_user_agents.md b/tasks/task_log_nginx_user_agents.md new file mode 100644 index 0000000..e1fa29b --- /dev/null +++ b/tasks/task_log_nginx_user_agents.md @@ -0,0 +1,143 @@ +--- +id: task_log_nginx_user_agents +name: Nginx Access Log - User Agent Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "nginx_access.log" + source: "logs/nginx_access_json.log" +--- + +# Nginx Access Log - User Agent Analysis + +## Prompt + +Analyze the Nginx JSON access log at `nginx_access.log` and produce a comprehensive user agent analysis. Each line is a JSON object with fields: `time`, `remote_ip`, `remote_user`, `request`, `response`, `bytes`, `referrer`, `agent`. + +Your report should include: + +1. **Unique User Agents**: Total count of distinct user agent strings +2. **User Agent Ranking**: List all user agents sorted by request count, with count and percentage +3. **Client Type Classification**: Categorize agents into types (package managers, web browsers, bots/crawlers, command-line tools, unknown/empty) +4. **Agent-to-IP Mapping**: For each user agent, how many unique IPs use it? +5. **Success vs Error Rate by Agent**: For each agent, what percentage of requests result in errors (4xx/5xx)? +6. **Conclusions**: What type of server is this based on the user agent profile? + +Write the report to `user_agent_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse all 1000 JSON log entries and produce: + +**Unique User Agents:** 14 distinct agent strings (including "-" for empty) + +**Top User Agents:** +- `Debian APT-HTTP/1.3 (0.9.7.9)` — 370 requests (37.0%) +- `Debian APT-HTTP/1.3 (0.8.16~exp12ubuntu10.16)` — 177 (17.7%) +- `Debian APT-HTTP/1.3 (0.8.16~exp12ubuntu10.22)` — 118 (11.8%) +- `Debian APT-HTTP/1.3 (1.0.1ubuntu2)` — 116 (11.6%) +- `Debian APT-HTTP/1.3 (0.8.16~exp12ubuntu10.21)` — 64 (6.4%) +- Additional APT variants and a few others (Go 1.1 package http, urlgrabber, etc.) + +**Classification:** +- Package managers (Debian APT): vast majority (~95%+) +- Other automated tools: Go HTTP client, urlgrabber +- Empty/missing agent ("-"): small number + +**Conclusions:** +- This is clearly a Debian/Ubuntu package repository or software download mirror +- Multiple APT versions indicate clients running different Ubuntu/Debian releases +- Very little if any human browser traffic + +Acceptable variations: +- Exact counts are deterministic from the log +- Classification categories may use different names +- Assessment language will vary + +--- + +## Grading Criteria + +- [ ] `user_agent_report.md` is created in the workspace +- [ ] All user agents are listed with counts +- [ ] Agents are classified by type (package manager, bot, etc.) 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the Nginx user agent analysis task.

    Scans the agent-produced report for the required user-agent
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "user_agent_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "agents_listed": 0.0,
            "agents_classified": 0.0,
            "apt_dominant": 0.0,
            "server_purpose": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: User agents listed with counts.
    # 370/177/118/116 are the known request counts of the top APT variants.
    has_apt = "apt-http" in content or "apt http" in content or "debian apt" in content
    has_counts = any(c in content for c in ["370", "177", "118", "116"])
    scores["agents_listed"] = (
        1.0 if has_apt and has_counts else
        0.5 if has_apt else 0.0
    )

    # Check 2: Agents classified by type.
    # Count keyword hits once instead of re-scanning per threshold branch.
    type_keywords = ["package manager", "bot", "crawler", "automated", "tool",
                     "browser", "command line", "cli", "client type", "categor"]
    type_hits = sum(1 for kw in type_keywords if kw in content)
    scores["agents_classified"] = (
        1.0 if type_hits >= 2 else
        0.5 if type_hits >= 1 else 0.0
    )

    # Check 3: APT identified as dominant.
    dominant_keywords = ["dominant", "majority", "most common", "primary",
                         "most frequent", "largest", "overwhelming"]
    has_dominant = any(kw in content for kw in dominant_keywords)
    scores["apt_dominant"] = (
        1.0 if has_apt and has_dominant else
        0.5 if has_apt else 0.0
    )

    # Check 4: Server purpose inferred (same single-pass counting).
    purpose_keywords = ["repository", "mirror", "download", "package",
                        "software", "debian", "ubuntu", "apt"]
    purpose_hits = sum(1 for kw in purpose_keywords if kw in content)
    scores["server_purpose"] = (
        1.0 if purpose_hits >= 3 else
        0.5 if purpose_hits >= 1 else 0.0
    )

    return scores
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH brute force detection task.

    Parses the agent-produced JSON threat report and scores it against
    four content criteria.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path
    import json

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "brute_force_report.json"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "sources_identified": 0.0,
            "top_attacker": 0.0,
            "attack_classified": 0.0,
            "recommendations": 0.0,
        }

    scores["output_created"] = 1.0

    try:
        data = json.loads(report_file.read_text(encoding="utf-8"))
    except Exception:
        # Malformed/unreadable JSON: the report exists but nothing inside
        # is gradable. (`except Exception` replaces the previous redundant
        # `(json.JSONDecodeError, Exception)` tuple — JSONDecodeError is
        # already an Exception subclass, so behavior is identical.)
        return {
            "output_created": 1.0,
            "sources_identified": 0.0,
            "top_attacker": 0.0,
            "attack_classified": 0.0,
            "recommendations": 0.0,
        }

    # Flattened lower-case view of the whole document for substring checks.
    full_text = json.dumps(data).lower()

    # Check 1: At least 3 brute-force sources identified.
    sources = data.get("brute_force_sources", [])
    if not isinstance(sources, list):
        sources = []
    scores["sources_identified"] = (
        1.0 if len(sources) >= 3 else
        0.5 if len(sources) >= 1 else 0.0
    )

    # Check 2: Top attacker (heaviest source IP in the log) identified.
    scores["top_attacker"] = 1.0 if "183.62.140.253" in full_text else 0.0

    # Check 3: Attack type classified anywhere in the document.
    has_classification = "dictionary" in full_text or "targeted" in full_text or "attack_type" in full_text
    scores["attack_classified"] = 1.0 if has_classification else 0.0

    # Check 4: Recommendations provided (structured list or keywords).
    recs = data.get("recommendations", [])
    if not isinstance(recs, list):
        recs = []
    has_recs = len(recs) >= 2 or any(kw in full_text for kw in
        ["fail2ban", "rate limit", "firewall", "block", "key-based",
         "disable password", "allowlist", "whitelist", "deny"])
    scores["recommendations"] = 1.0 if has_recs else 0.5 if len(recs) >= 1 else 0.0

    return scores
successful login (user fztu from 119.137.62.142) is NOT from an attacking IP + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_ssh_failed_logins.md b/tasks/task_log_ssh_failed_logins.md new file mode 100644 index 0000000..546185c --- /dev/null +++ b/tasks/task_log_ssh_failed_logins.md @@ -0,0 +1,143 @@ +--- +id: task_log_ssh_failed_logins +name: SSH Auth Log - Failed Login Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - Failed Login Analysis + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and produce a detailed report on failed login attempts. The log is from a server named "LabSZ" and covers SSH authentication events. + +Your report should include: + +1. **Overview**: Total log entries, date range, total failed login attempts +2. **Failed Password Attempts**: Count of "Failed password" entries, broken down by source IP +3. **Invalid User Attempts**: Count of attempts using non-existent usernames, with a list of the top 10 most-tried usernames +4. **Top Attacking IPs**: Top 10 source IPs by number of failed attempts, with counts +5. **Authentication Methods**: What authentication methods are being attempted (password, publickey, etc.)? +6. **Reverse DNS Failures**: How many entries show "POSSIBLE BREAK-IN ATTEMPT" warnings? +7. **Summary Assessment**: Is this server under active attack? What does the pattern suggest? + +Write the report to `failed_login_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH failed login analysis task.

    Scans the agent-produced report for the required failed-login
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "failed_login_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "failed_count": 0.0,
            "top_ips": 0.0,
            "invalid_usernames": 0.0,
            "attack_assessment": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Failed attempt count (~366, allow off-by-one parses).
    # NOTE(review): "failed password" also satisfies this check, which is
    # intentionally lenient — confirm that leniency is desired.
    has_failed_count = any(n in content for n in ["366", "365", "367", "failed password"])
    scores["failed_count"] = (
        1.0 if has_failed_count else
        0.5 if "failed" in content and any(c.isdigit() for c in content) else 0.0
    )

    # Check 2: Top attacking IPs identified.
    top_ips = ["183.62.140.253", "187.141.143.180", "103.99.0.122", "112.95.230.3"]
    ips_found = sum(1 for ip in top_ips if ip in content)
    scores["top_ips"] = (
        1.0 if ips_found >= 3 else
        0.5 if ips_found >= 1 else 0.0
    )

    # Check 3: Invalid usernames listed.
    usernames = ["admin", "oracle", "support", "test", "webmaster", "guest"]
    users_found = sum(1 for u in usernames if u in content)
    scores["invalid_usernames"] = (
        1.0 if users_found >= 3 else
        0.5 if users_found >= 1 else 0.0
    )

    # Check 4: Attack assessment.
    # Count keyword hits once instead of re-scanning per threshold branch.
    attack_keywords = ["brute force", "brute-force", "attack", "compromise",
                       "malicious", "automated", "scanning", "dictionary"]
    attack_hits = sum(1 for kw in attack_keywords if kw in content)
    scores["attack_assessment"] = (
        1.0 if attack_hits >= 2 else
        0.5 if attack_hits >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_ssh_successful.md b/tasks/task_log_ssh_successful.md new file mode 100644 index 0000000..ec4dd1f --- /dev/null +++ b/tasks/task_log_ssh_successful.md @@ -0,0 +1,143 @@ +--- +id: task_log_ssh_successful +name: SSH Auth Log - Successful Authentication Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - Successful Authentication Summary + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and produce a report focused on successful authentications. Among the noise of failed attempts, identify all legitimate access. + +Your report should include: + +1. **Successful Logins**: List every successful authentication with timestamp, username, source IP, port, and authentication method +2. **Success vs Failure Ratio**: What percentage of all authentication attempts succeeded? +3. **Legitimate User Profile**: For each successfully authenticated user, describe their access pattern +4. **Session Activity**: Any evidence of what happened after login (session opened/closed events)? +5. **Source IP Validation**: Is the successful login IP associated with any failed attempts as well? +6. **Anomaly Check**: Does the successful login appear legitimate, or does it look suspicious (e.g., coming from an IP that was also brute-forcing)? + +Write the report to `successful_auth_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH successful authentication summary task.

    Scans the agent-produced report for identification of the single
    successful login and the required supporting analysis.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "successful_auth_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "login_identified": 0.0,
            "ratio_calculated": 0.0,
            "ip_checked": 0.0,
            "legitimacy_assessed": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Successful login identified (user fztu from 119.137.62.142).
    # (Removed the unused `has_accepted` flag — it was computed but never
    # factored into any score.)
    has_user = "fztu" in content
    has_ip = "119.137.62.142" in content
    scores["login_identified"] = (
        1.0 if has_user and has_ip else
        0.5 if has_user or has_ip else 0.0
    )

    # Check 2: Success/failure ratio calculated.
    ratio_keywords = ["ratio", "percent", "1 success", "1 out of", "only 1",
                      "single success", "one success", "0."]
    scores["ratio_calculated"] = (
        1.0 if sum(1 for kw in ratio_keywords if kw in content) >= 1 else 0.0
    )

    # Check 3: Login IP cross-checked against failed-attempt sources.
    check_keywords = ["119.137.62.142", "not associated", "not found",
                      "does not appear", "no failed", "legitimate",
                      "only successful", "no other"]
    scores["ip_checked"] = (
        1.0 if has_ip and sum(1 for kw in check_keywords if kw in content) >= 2 else
        0.5 if has_ip else 0.0
    )

    # Check 4: Legitimacy assessment present.
    legit_keywords = ["legitimate", "authorized", "valid", "genuine",
                      "suspicious", "anomal", "normal", "expected"]
    scores["legitimacy_assessed"] = (
        1.0 if sum(1 for kw in legit_keywords if kw in content) >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_ssh_unusual_times.md b/tasks/task_log_ssh_unusual_times.md new file mode 100644 index 0000000..c8e2910 --- /dev/null +++ b/tasks/task_log_ssh_unusual_times.md @@ -0,0 +1,147 @@ +--- +id: task_log_ssh_unusual_times +name: SSH Auth Log - Unusual Hour Login Detection +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - Unusual Hour Login Detection + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and identify login activity occurring at unusual hours. Assume normal business hours are 08:00–18:00 local server time. + +Your report should include: + +1. **Hourly Distribution**: Count of authentication events per hour +2. **Off-Hours Activity**: All authentication events occurring before 08:00 or after 18:00 +3. **Early Morning Analysis**: Detailed breakdown of events between 06:00–08:00 (the earliest in the log) +4. **Successful Logins Timing**: When did the successful login(s) occur? During business hours or not? +5. **Attack Timing Patterns**: Do attackers prefer certain hours? Is there a pattern? +6. **Temporal Risk Assessment**: Based on timing patterns, what times should be monitored most closely? + +Write the report to `unusual_hours_report.md` as a well-structured markdown document. 
def grade(transcript: list, workspace_path: str) -> dict:
    """Grade the SSH unusual hours detection task.

    Scans the agent-produced report for the required time-of-day
    analysis sections using case-insensitive keyword matching.

    Args:
        transcript: Conversation transcript (unused by this grader).
        workspace_path: Path to the task workspace directory.

    Returns:
        dict mapping each criterion name to a score in {0.0, 0.5, 1.0}.
    """
    from pathlib import Path

    scores = {}
    workspace = Path(workspace_path)
    report_file = workspace / "unusual_hours_report.md"

    # Missing report: every criterion fails.
    if not report_file.exists():
        return {
            "output_created": 0.0,
            "hourly_distribution": 0.0,
            "off_hours_identified": 0.0,
            "successful_timing": 0.0,
            "timing_patterns": 0.0,
        }

    scores["output_created"] = 1.0
    # Lower-case once so every keyword check is case-insensitive.
    content = report_file.read_text(encoding="utf-8").lower()

    # Check 1: Hourly distribution provided (accept several label formats).
    hour_markers = ["06:", "07:", "08:", "09:", "10:"]
    alt_markers = ["6 am", "7 am", "8 am", "9 am", "10 am", "6:00", "7:00",
                   "8:00", "9:00", "10:00"]
    all_markers = hour_markers + alt_markers
    hours_found = sum(1 for m in all_markers if m in content)
    scores["hourly_distribution"] = (
        1.0 if hours_found >= 4 else
        0.5 if hours_found >= 2 else 0.0
    )

    # Check 2: Off-hours (pre-08:00) events identified.
    off_hours_keywords = ["before 08", "before 8:00", "early morning",
                          "06:55", "pre-business", "off-hour", "off hour",
                          "unusual hour", "outside business"]
    scores["off_hours_identified"] = (
        1.0 if sum(1 for kw in off_hours_keywords if kw in content) >= 1 else 0.0
    )

    # Check 3: Successful login timing noted (fztu at 09:32, business hours).
    has_fztu = "fztu" in content
    has_time = "09:32" in content or "9:32" in content
    has_business = any(kw in content for kw in ["business hour", "normal hour",
                                                "working hour", "during"])
    scores["successful_timing"] = (
        1.0 if has_fztu and (has_time or has_business) else
        0.5 if has_fztu else 0.0
    )

    # Check 4: Timing patterns analyzed.
    # Count keyword hits once instead of re-scanning per threshold branch.
    pattern_keywords = ["peak", "escalat", "increas", "pattern", "trend",
                        "676", "530", "busiest", "most active", "concentrated"]
    pattern_hits = sum(1 for kw in pattern_keywords if kw in content)
    scores["timing_patterns"] = (
        1.0 if pattern_hits >= 2 else
        0.5 if pattern_hits >= 1 else 0.0
    )

    return scores
diff --git a/tasks/task_log_ssh_user_activity.md b/tasks/task_log_ssh_user_activity.md new file mode 100644 index 0000000..4224f5a --- /dev/null +++ b/tasks/task_log_ssh_user_activity.md @@ -0,0 +1,141 @@ +--- +id: task_log_ssh_user_activity +name: SSH Auth Log - User Login Activity Report +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "auth.log" + source: "logs/openssh_auth.log" +--- + +# SSH Auth Log - User Login Activity Report + +## Prompt + +Analyze the OpenSSH authentication log at `auth.log` and produce a user-focused activity report. For every username mentioned in the log (both valid and invalid), summarize their authentication activity. + +Your report should include: + +1. **All Usernames Attempted**: List every username that appears in the log (both valid system users and invalid/non-existent users) +2. **Valid vs Invalid Users**: Classify each username as valid (accepted by the system) or invalid (rejected as non-existent) +3. **Per-User Summary**: For each username, show: number of attempts, source IPs, success/failure, first and last attempt timestamp +4. **Most Targeted Users**: Rank usernames by number of failed attempts +5. **Username Patterns**: Are attackers using a dictionary? Common patterns (admin, root, test, service accounts)? +6. **User Risk Assessment**: Which usernames, if they existed, would pose the greatest security risk? + +Write the report to `user_activity_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should identify: + +**Invalid Users (top by frequency):** +- admin (18 attempts), oracle (6), support (5), test (4), inspur (3), 0 (3), matlab (3), webmaster (2), guest (2), 1234 (2), and others + +**Valid Users:** +- fztu — the only user with a successful login +- root — likely a valid user that's targeted (check for "Failed password for root" vs "Failed password for invalid user root") + +**Username Patterns:** +- Common service accounts: admin, oracle, support, webmaster +- Default credentials: test, guest, 1234, 0 +- Application-specific: matlab, inspur +- This is clearly a dictionary attack using common username lists + +**Risk Assessment:** +- "admin" and "root" are highest risk — if compromised, full system access +- "oracle" suggests attackers know this is likely a Linux server running databases +- Numeric usernames like "0" and "1234" indicate automated/scripted attacks + +Acceptable variations: +- The distinction between valid and invalid users depends on parsing "invalid user" messages +- Some usernames may be ambiguous +- Risk assessment language will vary + +--- + +## Grading Criteria + +- [ ] `user_activity_report.md` is created in the workspace +- [ ] Both valid and invalid usernames are listed +- [ ] The most-targeted username (admin) is identified +- [ ] Username patterns are analyzed (dictionary attack, common defaults) +- [ ] A risk assessment is provided for the most dangerous usernames + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the SSH user activity report task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "user_activity_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "usernames_listed": 0.0, + "admin_targeted": 0.0, + "patterns_analyzed": 0.0, + "risk_assessment": 0.0, + } + + scores["output_created"] = 1.0 + content = 
report_file.read_text(encoding="utf-8").lower() + + # Check 1: Both valid and invalid usernames listed + invalid_users = ["admin", "oracle", "support", "test", "webmaster", "guest"] + valid_users = ["fztu"] + invalid_found = sum(1 for u in invalid_users if u in content) + valid_found = sum(1 for u in valid_users if u in content) + scores["usernames_listed"] = ( + 1.0 if invalid_found >= 3 and valid_found >= 1 else + 0.5 if invalid_found >= 2 else 0.0 + ) + + # Check 2: Admin identified as most targeted + scores["admin_targeted"] = ( + 1.0 if "admin" in content and any(kw in content for kw in + ["most", "top", "highest", "18", "target"]) else + 0.5 if "admin" in content else 0.0 + ) + + # Check 3: Username patterns analyzed + pattern_keywords = ["dictionary", "common", "default", "service account", + "automated", "wordlist", "brute", "pattern"] + scores["patterns_analyzed"] = ( + 1.0 if sum(1 for kw in pattern_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in pattern_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: Risk assessment + risk_keywords = ["risk", "danger", "critical", "compromise", "privilege", + "escalat", "root access", "full access"] + scores["risk_assessment"] = ( + 1.0 if sum(1 for kw in risk_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in risk_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- ~100 "Invalid user" entries with various usernames +- "admin" is tried 18 times — the most popular target +- User "fztu" is the only confirmed valid user (successful login) +- "root" appears in failed password attempts but NOT as "invalid user" — suggesting root is a real account +- The username list reads like a standard SSH brute-force dictionary + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_syslog_anomalies.md b/tasks/task_log_syslog_anomalies.md new file mode 100644 index 0000000..81bbc0f --- /dev/null +++ b/tasks/task_log_syslog_anomalies.md @@ -0,0 +1,140 @@ +--- +id: task_log_syslog_anomalies +name: Linux Syslog - Anomaly Detection +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Anomaly Detection + +## Prompt + +Analyze the Linux syslog at `syslog.log` and identify anomalous or suspicious entries. The log is from a server named "combo" running a Linux 2.6 kernel, covering several months of activity. + +Your report should include: + +1. **Log Overview**: Total entries, date range, top services by volume +2. **Security Anomalies**: Entries that indicate potential attacks, exploits, or unauthorized access attempts +3. **Format String Attack Detection**: Look for entries with unusual binary content or exploit payloads in service input +4. **FTP Anomalies**: The log has heavy FTP traffic — identify any suspicious FTP connection patterns (bursts, unusual sources) +5. **rpc.statd Exploitation**: Check for rpc.statd gethostbyname errors with malformed hostnames (buffer overflow attempts) +6. **Anomaly Summary**: Rank the top 5 most concerning anomalies with severity and evidence + +Write the report to `syslog_anomalies.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should parse 5000 entries and identify: + +**Log Overview:** +- 5000 entries, June 9 to September 14 (2005, based on kernel version) +- Top services: ftpd (1655), sshd/pam_unix (1610), kernel (545), su/pam_unix (394) + +**Security Anomalies:** +1. 
**rpc.statd format string attack** (~9 entries on Jun 13): + - `gethostbyname error for ^X...%8x%8x...%hn%51859x%hn` — this is a buffer overflow/format string exploitation attempt against rpc.statd + - The payload contains format string specifiers (%x, %hn) which are classic exploit patterns +2. **SSH brute force** — heavy sshd(pam_unix) authentication failure volume +3. **FTP flood** — 1655 FTP connection entries, with bursts (e.g., 209.184.7.130 with multiple simultaneous connections) +4. **Authentication failures** — 2000+ pam_unix auth failure entries across SSH and other services + +**rpc.statd Exploitation:** +- 9 entries at Jun 13 11:55:04–11:55:09 +- Malformed hostname contains NOP sled (\220\220\220\220) and format string payload +- This is an attempted remote code execution exploit + +Acceptable variations: +- Anomaly ranking may differ +- Additional anomalies beyond the expected ones are welcome +- Severity assessments will vary + +--- + +## Grading Criteria + +- [ ] `syslog_anomalies.md` is created in the workspace +- [ ] Log overview with date range and service breakdown is provided +- [ ] rpc.statd format string attack is identified as a security anomaly +- [ ] FTP connection patterns are analyzed +- [ ] SSH authentication failures are flagged + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog anomaly detection task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "syslog_anomalies.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "log_overview": 0.0, + "rpc_statd_attack": 0.0, + "ftp_analysis": 0.0, + "ssh_failures": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Log overview + has_count = any(n in content for n in ["5000", "5,000"]) + has_date = any(d in content for d in ["jun", "june", "sep", "september"]) + 
scores["log_overview"] = ( + 1.0 if has_count and has_date else + 0.5 if has_count or has_date else 0.0 + ) + + # Check 2: rpc.statd attack identified + rpc_keywords = ["rpc.statd", "rpc statd", "gethostbyname", "format string", + "buffer overflow", "exploit", "%hn", "nop sled", "\\220"] + scores["rpc_statd_attack"] = ( + 1.0 if sum(1 for kw in rpc_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in rpc_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 3: FTP analysis + ftp_keywords = ["ftpd", "ftp", "1655", "209.184", "connection flood", + "burst", "ftp connection"] + scores["ftp_analysis"] = ( + 1.0 if sum(1 for kw in ftp_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in ftp_keywords if kw in content) >= 1 else 0.0 + ) + + # Check 4: SSH failures flagged + ssh_keywords = ["sshd", "ssh", "authentication failure", "brute force", + "failed", "pam_unix"] + scores["ssh_failures"] = ( + 1.0 if sum(1 for kw in ssh_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in ssh_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Server: "combo", Linux 2.6.5-1.358, Fedora/Red Hat based +- Date range: Jun 9 to Sep 14 (year 2005, based on kernel build date) +- The rpc.statd attack on Jun 13 is the most serious anomaly — a real exploit attempt +- 1655 FTP connections, heavily concentrated from certain IPs (209.184.7.130) +- The su(pam_unix) entries (394) show regular privilege escalation — likely legitimate cron jobs +- Some entries contain non-UTF-8 bytes (binary exploit payloads) + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. 
diff --git a/tasks/task_log_syslog_auth_failures.md b/tasks/task_log_syslog_auth_failures.md new file mode 100644 index 0000000..5573f17 --- /dev/null +++ b/tasks/task_log_syslog_auth_failures.md @@ -0,0 +1,152 @@ +--- +id: task_log_syslog_auth_failures +name: Linux Syslog - Authentication Failure Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Authentication Failure Summary + +## Prompt + +Analyze the Linux syslog at `syslog.log` and produce a comprehensive summary of all authentication failures. The log contains PAM authentication events from multiple services. + +Your report should include: + +1. **Total Auth Failures**: Count all authentication failure entries across all services +2. **Failures by Service**: Break down failures by service (sshd, ftpd, login, su, etc.) +3. **Failures by Source**: Top 10 source hosts/IPs generating the most failures +4. **Targeted Users**: Which user accounts are being targeted in failed auth attempts? +5. **Temporal Distribution**: When do most authentication failures occur? Any spikes? +6. **FTP vs SSH Analysis**: Compare the authentication attack patterns across FTP and SSH — are the same sources attacking both? +7. **Recommendations**: Based on the failure patterns, recommend specific security improvements + +Write the report to `auth_failures_report.md` as a well-structured markdown document. 
+ +--- + +## Expected Behavior + +The agent should parse the ~2000+ PAM-related entries and produce: + +**Total Auth Failures:** +- Over 2000 authentication-related PAM entries +- Primary sources: sshd(pam_unix) (~1610 entries), ftpd connections (~1655 entries) + +**Failures by Service:** +- sshd(pam_unix) — the dominant source of auth failure messages +- ftpd — heavy connection volume (ftpd logs connections, not always explicit failures) +- su(pam_unix) — 394 entries (mostly legitimate — session opens/closes for cron) +- login(pam_unix) — 14 entries +- klogind — 46 entries (Kerberos login daemon) + +**Failures by Source:** +- SSH attacks come from various remote hosts (rhost= in pam entries) +- FTP connections concentrated from specific IPs (e.g., 209.184.7.130) +- Some hosts appear in both SSH and FTP failure logs + +**Targeted Users:** +- root — primary target for SSH brute force +- Various invalid usernames tried via SSH + +**Temporal Distribution:** +- Log spans Jun 9 to Sep 14 +- Attack spikes visible on specific dates +- SSH brute force tends to cluster in time + +Acceptable variations: +- Exact counts depend on how "authentication failure" is defined +- FTP entries may or may not be classified as auth failures +- Temporal analysis granularity may vary + +--- + +## Grading Criteria + +- [ ] `auth_failures_report.md` is created in the workspace +- [ ] Authentication failures are counted (2000+ pam-related entries) +- [ ] Failures are broken down by service (sshd, ftpd, su, etc.) 
+- [ ] Top source hosts are listed +- [ ] Recommendations for security improvement are provided + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog authentication failure summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "auth_failures_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "failures_counted": 0.0, + "by_service": 0.0, + "source_hosts": 0.0, + "recommendations": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Failures counted + has_count = any(kw in content for kw in ["2000", "2,000", "1610", "1,610", + "1655", "1,655", "thousand"]) + has_failure = "authentication failure" in content or "auth fail" in content or "failed" in content + scores["failures_counted"] = ( + 1.0 if has_count and has_failure else + 0.5 if has_failure else 0.0 + ) + + # Check 2: Broken down by service + services = ["sshd", "ftpd", "ftp", "su(pam", "su ", "login", "klogin", "pam_unix"] + services_found = sum(1 for s in services if s in content) + scores["by_service"] = ( + 1.0 if services_found >= 3 else + 0.5 if services_found >= 2 else 0.0 + ) + + # Check 3: Source hosts listed + host_indicators = ["rhost", "source", "remote", "ip", "host", "209.184", + "sagonet", "iasi", "astral"] + scores["source_hosts"] = ( + 1.0 if sum(1 for kw in host_indicators if kw in content) >= 3 else + 0.5 if sum(1 for kw in host_indicators if kw in content) >= 1 else 0.0 + ) + + # Check 4: Recommendations provided + rec_keywords = ["recommend", "should", "implement", "consider", "disable", + "block", "firewall", "fail2ban", "key-based", "rate limit"] + rec_lines = [l for l in content.split("\n") if any(kw in l for kw in rec_keywords)] + scores["recommendations"] = ( + 1.0 if len(rec_lines) >= 2 else + 0.5 if len(rec_lines) >= 1 else 0.0 + ) + + return 
scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Server "combo" has multiple authentication surfaces: SSH, FTP, telnet (klogind), local login, su +- sshd(pam_unix) is the most frequent auth service (1610 entries) +- ftpd has 1655 connection entries — the heaviest service by volume +- su(pam_unix) sessions (394) are mostly legitimate (cron jobs, uid=0 running as service users) +- login(pam_unix) has 14 entries — console/terminal logins +- klogind has 46 entries — Kerberos remote login +- The system is a 2005-era Fedora server with many exposed services — a security hardening candidate + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_syslog_cron.md b/tasks/task_log_syslog_cron.md new file mode 100644 index 0000000..b4e42b6 --- /dev/null +++ b/tasks/task_log_syslog_cron.md @@ -0,0 +1,148 @@ +--- +id: task_log_syslog_cron +name: Linux Syslog - Cron Job Execution Analysis +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Cron Job Execution Analysis + +## Prompt + +Analyze the Linux syslog at `syslog.log` and produce a report on cron job and scheduled task activity. Look for crond, anacron, logrotate, and any other scheduled execution evidence. + +Your report should include: + +1. **Cron Service Status**: When does crond start? How many startup events are there? +2. **Anacron Activity**: Document all anacron startup events and their timing +3. **Logrotate Activity**: Identify logrotate executions and what services they affect +4. **su Session Patterns**: The `su(pam_unix)` entries often indicate cron executing tasks as different users — analyze these patterns +5. **Scheduled Task Timeline**: Create a timeline of all scheduled/periodic activity +6. 
**Recurring Patterns**: Identify any regular patterns (daily, weekly) in scheduled executions + +Write the report to `cron_analysis.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should identify: + +**Cron/Anacron Startups:** +- crond startup: Jun 9 06:06:49, Jun 10 11:32:10, Jul 27 14:42:23 (3 events, aligned with system boots) +- anacron startup: Jun 9 06:06:51, Jun 10 11:32:12, Jul 27 14:42:25 (follows crond immediately) + +**Logrotate Activity:** +- 97 logrotate entries in the log +- Logrotate triggers service restarts (especially CUPS) +- Runs periodically — likely daily via anacron/cron + +**su(pam_unix) Patterns:** +- 394 su session entries +- Sessions opened for users like: htt, cyrus, news +- Pattern: "session opened for user X by (uid=0)" → "session closed for user X" +- These are cron jobs running as specific service users + +**Key scheduled users:** +- htt (web server) — regular su sessions +- cyrus (mail) — regular su sessions +- news — periodic sessions + +**Recurring Patterns:** +- Daily: logrotate runs, su sessions for service users +- At boot: crond → anacron startup sequence +- Weekly: CUPS restart pattern (shutdown + startup) + +Acceptable variations: +- Pattern detection approaches may differ +- Timeline granularity may vary +- Some scheduled patterns require inference from the su session data + +--- + +## Grading Criteria + +- [ ] `cron_analysis.md` is created in the workspace +- [ ] Crond and anacron startup events are documented +- [ ] Logrotate activity is identified (97 entries) +- [ ] su(pam_unix) sessions are linked to scheduled tasks +- [ ] Recurring patterns are identified (daily, at boot, etc.) 
+ +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog cron job analysis task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "cron_analysis.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "cron_anacron": 0.0, + "logrotate": 0.0, + "su_sessions": 0.0, + "recurring_patterns": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Crond and anacron documented + has_crond = "crond" in content or "cron daemon" in content + has_anacron = "anacron" in content + scores["cron_anacron"] = ( + 1.0 if has_crond and has_anacron else + 0.5 if has_crond else 0.0 + ) + + # Check 2: Logrotate identified + has_logrotate = "logrotate" in content + has_count = "97" in content or any(kw in content for kw in + ["frequent", "numerous", "many logrotate"]) + scores["logrotate"] = ( + 1.0 if has_logrotate and has_count else + 0.5 if has_logrotate else 0.0 + ) + + # Check 3: su sessions analyzed + has_su = "su(" in content or "su(pam" in content or "su session" in content + has_users = sum(1 for u in ["htt", "cyrus", "news"] if u in content) + scores["su_sessions"] = ( + 1.0 if has_su and has_users >= 2 else + 0.5 if has_su else 0.0 + ) + + # Check 4: Recurring patterns identified + pattern_keywords = ["daily", "weekly", "periodic", "regular", "recurring", + "schedule", "pattern", "at boot", "every"] + scores["recurring_patterns"] = ( + 1.0 if sum(1 for kw in pattern_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in pattern_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- crond and anacron always start together at boot, 2 seconds apart +- logrotate is the most active periodic process (97 entries) +- su sessions (394 entries) are the main indicator of scheduled task 
execution +- Common cron user pattern: uid=0 opens session for service user, then closes it +- The server runs mail (cyrus), web (htt), and news services — all with cron maintenance + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score. diff --git a/tasks/task_log_syslog_services.md b/tasks/task_log_syslog_services.md new file mode 100644 index 0000000..d74db4c --- /dev/null +++ b/tasks/task_log_syslog_services.md @@ -0,0 +1,152 @@ +--- +id: task_log_syslog_services +name: Linux Syslog - Service Start/Stop Summary +category: log_analysis +grading_type: hybrid +timeout_seconds: 180 +workspace_files: + - dest: "syslog.log" + source: "logs/linux_syslog.log" +--- + +# Linux Syslog - Service Start/Stop Summary + +## Prompt + +Analyze the Linux syslog at `syslog.log` and produce a summary of all service start and stop events. The log is from a server named "combo" running Linux 2.6. + +Your report should include: + +1. **Service Inventory**: List every service mentioned in the log with startup or shutdown events +2. **System Boot Events**: Identify all system boots/restarts by finding clusters of service startups +3. **Per-Service Status**: For each service, list all start/stop events with timestamps +4. **CUPS Special Case**: The CUPS print service has frequent restarts — document its pattern separately +5. **Service Dependencies**: Based on startup ordering, infer which services start first (core OS) vs later (applications) +6. **Uptime Estimate**: Based on boot events, estimate the server's uptime between restarts + +Write the report to `service_status_report.md` as a well-structured markdown document. + +--- + +## Expected Behavior + +The agent should identify: + +**System Boots (3 detected):** +1. Jun 9 ~06:06 — Full boot (syslogd, klogd, kernel, irqbalance, portmap, etc.) +2. Jun 10 ~11:32 — Full boot (same service sequence) +3. 
Jul 27 ~14:42 — Another boot event + +**Service Inventory (20+ services with startup events):** +- Core: syslogd, klogd, irqbalance, portmap +- Network: rpc.statd, rpc.idmapd, sendmail, sm-client, named +- Security: spamd, privoxy +- Hardware: bluetooth (hcid, sdpd), smartd, apmd, gpm +- Printing: cupsd (17 startups!) +- Scheduling: crond, anacron, xinetd +- Web: htt + +**CUPS Pattern:** +- cupsd has 17 startup events and 15 shutdown events +- Regular weekly pattern: shutdown early morning (04:0x) followed by startup +- This matches logrotate triggering CUPS restart + +**Service Dependencies (boot order):** +1. syslogd, klogd (logging first) +2. kernel messages +3. irqbalance, portmap (system services) +4. Network services (rpc, named) +5. Application services (cups, cron, sendmail) + +Acceptable variations: +- Boot detection approaches may differ +- Service categorization is subjective +- Uptime calculations are approximate + +--- + +## Grading Criteria + +- [ ] `service_status_report.md` is created in the workspace +- [ ] System boot events are identified (at least 2 boots found) +- [ ] Services are listed with their start/stop events +- [ ] CUPS frequent restarts are noted (17 startups) +- [ ] Service startup ordering is analyzed + +--- + +## Automated Checks + +```python +def grade(transcript: list, workspace_path: str) -> dict: + """Grade the Linux syslog service status summary task.""" + from pathlib import Path + + scores = {} + workspace = Path(workspace_path) + report_file = workspace / "service_status_report.md" + + if not report_file.exists(): + return { + "output_created": 0.0, + "boots_identified": 0.0, + "services_listed": 0.0, + "cups_pattern": 0.0, + "startup_ordering": 0.0, + } + + scores["output_created"] = 1.0 + content = report_file.read_text(encoding="utf-8").lower() + + # Check 1: Boot events identified + boot_dates = ["jun 9", "jun 10", "jul 27", "june 9", "june 10", "july 27"] + boots_found = sum(1 for d in boot_dates if d in content) + 
has_boot = any(kw in content for kw in ["boot", "restart", "reboot", "startup"]) + scores["boots_identified"] = ( + 1.0 if boots_found >= 2 and has_boot else + 0.5 if boots_found >= 1 else 0.0 + ) + + # Check 2: Services listed + services = ["syslogd", "klogd", "crond", "cupsd", "cups", "sendmail", + "portmap", "sshd", "named", "xinetd", "smartd", "gpm"] + services_found = sum(1 for s in services if s in content) + scores["services_listed"] = ( + 1.0 if services_found >= 6 else + 0.5 if services_found >= 3 else 0.0 + ) + + # Check 3: CUPS pattern noted + has_cups = "cups" in content + has_frequent = any(kw in content for kw in ["17", "frequent", "multiple", + "regular", "weekly", "logrotate"]) + scores["cups_pattern"] = ( + 1.0 if has_cups and has_frequent else + 0.5 if has_cups else 0.0 + ) + + # Check 4: Startup ordering analyzed + order_keywords = ["order", "first", "before", "after", "sequence", + "dependency", "boot order", "startup order"] + scores["startup_ordering"] = ( + 1.0 if sum(1 for kw in order_keywords if kw in content) >= 2 else + 0.5 if sum(1 for kw in order_keywords if kw in content) >= 1 else 0.0 + ) + + return scores +``` + +--- + +## Additional Notes + +**Key facts from the log:** + +- Server: "combo", Fedora/Red Hat, Linux 2.6.5-1.358 +- 3 full system boots detected in the log +- CUPS restarts weekly — likely triggered by logrotate +- 20+ unique services with startup events +- The boot sequence is consistent across all 3 boots +- Boot sequence shows clear ordering: logging → system → network → application services + +**Grading weights (equal):** Each of the five criteria contributes 0.2 to the final score.