From cfa7c4ae83ad40072031fec38cec8d18b40d9d8f Mon Sep 17 00:00:00 2001 From: Julz19 Date: Sat, 16 May 2026 03:57:31 -0400 Subject: [PATCH] Add DNS history and runtime regex checks --- README.md | 63 +++++++ dooked/include/cli_preprocessor.hpp | 17 ++ dooked/include/utils/containers.hpp | 12 +- dooked/include/utils/exceptions.hpp | 1 + dooked/include/utils/io_utils.hpp | 77 +++++++- dooked/source/cli_preprocessor.cpp | 227 ++++++++++++++++++++++++ dooked/source/http/requests_handler.cpp | 6 + dooked/source/http/resolver.cpp | 23 ++- dooked/source/main.cpp | 8 + 9 files changed, 424 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index f1a761c..3ca879c 100644 --- a/README.md +++ b/README.md @@ -39,3 +39,66 @@ make ## Usage For comprehensive help, use `dooked --help` + +### History tracking + +When a previous JSON output file is passed back as input, dooked now carries +record history forward into the next JSON output: + +- `first-seen`: when this DNS record first appeared in dooked output +- `last-seen`: when this DNS record was last observed +- `seen`: how many runs have observed the same domain/type/value tuple + +The history metadata makes load-balanced records less noisy because an IP can +rotate out of a single run without losing track of when it was first and last observed. +Records from previous JSON input are preserved in the next output when they are +not observed in the current run, with their previous `last-seen` value intact. + +Additional reporting flags: + +```sh +dooked -i previous.json --fs +dooked -i previous.json --ls 2 +dooked -i previous.json --lsd 05/16/2026 +``` + +- `--fs` reports DNS records that are first seen in the current run. +- `--ls <days>` reports records from the previous JSON that are missing from + the current run and were last seen at least `<days>` days ago. +- `--lsd <MM/DD/YYYY>` reports missing records last seen on or before the + supplied US-formatted date. 
+ +### Runtime regex checks + +Use `--checks <file>` to load custom regex alerts at runtime. The config +can be either an array or an object with a `checks` array: + +```json +{ + "checks": [ + { + "field": "domain", + "regex": "(dev|test)", + "alert": "development-looking domain", + "ignore_case": true + }, + { + "field": "page_content", + "regex": "Copyright 2024", + "alert": "outdated copyright banner" + }, + { + "field": "content_length", + "regex": "^[1-9][0-9]{5,}$", + "alert": "large response body" + } + ] +} +``` + +Supported fields are `domain`, `domain_name`, `type`, `rdata`, `info`, `ttl`, +`http_code`, `code_string`, `content_length`, `body`, `page_content`, +`response_body`, `first-seen`, `first_seen`, `last-seen`, `last_seen`, and +`seen`. Use `"field": "*"` to run a regex against every supported field. +Response bodies are capped at 1 MiB in memory for runtime checks and are not written +into JSON output. diff --git a/dooked/include/cli_preprocessor.hpp b/dooked/include/cli_preprocessor.hpp index 43fa1ba..364d3bc 100644 --- a/dooked/include/cli_preprocessor.hpp +++ b/dooked/include/cli_preprocessor.hpp @@ -2,6 +2,7 @@ #include "dns/dns_resolver.hpp" #include "utils/io_utils.hpp" +#include #include // maximum sockets to open regardless of the number of threads @@ -19,12 +20,23 @@ struct cli_args_t { std::string resolver_filename{}; std::string output_filename{}; std::string input_filename{}; + std::string last_seen_date{}; + std::string check_config_filename{}; int file_type{}; int post_http_request{}; int thread_count{}; int content_length{-1}; + int last_seen_days{-1}; bool include_date{false}; + bool show_first_seen{false}; +}; + +struct regex_check_t { + std::string field{}; + std::regex pattern{}; + std::string pattern_text{}; + std::string alert{}; }; struct runtime_args_t { @@ -33,9 +45,14 @@ struct runtime_args_t { std::optional> previous_data{}; std::unique_ptr output_file{}; std::string output_filename{}; + std::string run_timestamp{}; + std::string 
last_seen_date{}; + std::vector regex_checks{}; http_process_e http_request_time_{}; int thread_count{}; int content_length{-1}; + int last_seen_days{-1}; + bool show_first_seen{false}; }; void run_program(cli_args_t const &cli_args); diff --git a/dooked/include/utils/containers.hpp b/dooked/include/utils/containers.hpp index 7c955f6..473d72b 100644 --- a/dooked/include/utils/containers.hpp +++ b/dooked/include/utils/containers.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace dooked { @@ -31,6 +32,7 @@ template class circular_queue_t { struct http_response_t { int content_length_{}; int http_status_{}; + std::string response_body_{}; }; template struct http_dns_response_t { @@ -52,9 +54,10 @@ template class map_container_t { } void insert_impl(std::string const &name, int const len, - int const http_status) { + int const http_status, std::string const &body) { map_[name].http_result_.content_length_ = len; map_[name].http_result_.http_status_ = http_status; + map_[name].http_result_.response_body_ = body; } public: @@ -74,12 +77,13 @@ template class map_container_t { append_impl(key, value); } - void insert(std::string const &name, int const len, int const http_status) { + void insert(std::string const &name, int const len, int const http_status, + std::string const &body = {}) { if (!opt_mutex_) { - return insert_impl(name, len, http_status); + return insert_impl(name, len, http_status, body); } std::lock_guard lock_g{*opt_mutex_}; - insert_impl(name, len, http_status); + insert_impl(name, len, http_status, body); } // only used by main thread, after all "computations" has been // done. There's no need for locks here. 
diff --git a/dooked/include/utils/exceptions.hpp b/dooked/include/utils/exceptions.hpp index a749a1b..846d544 100644 --- a/dooked/include/utils/exceptions.hpp +++ b/dooked/include/utils/exceptions.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include namespace dooked { diff --git a/dooked/include/utils/io_utils.hpp b/dooked/include/utils/io_utils.hpp index 829b09e..e14e52c 100644 --- a/dooked/include/utils/io_utils.hpp +++ b/dooked/include/utils/io_utils.hpp @@ -5,8 +5,10 @@ #include #include #include +#include #include #include +#include #include namespace dooked { @@ -26,14 +28,26 @@ void trim(std::string &); struct json_data_t { std::string domain_name{}; std::string rdata{}; + std::string first_seen{}; + std::string last_seen{}; int ttl{}; int http_code{}; int content_length{}; + int seen{1}; dns_record_type_e type{}; static json_data_t serialize(std::string const &d, int const len, int const http_code, json::object_t &json_object) { + auto const get_string = [&](char const *primary, char const *fallback) { + if (auto iter = json_object.find(primary); iter != json_object.end()) { + return iter->second.get(); + } + if (auto iter = json_object.find(fallback); iter != json_object.end()) { + return iter->second.get(); + } + return std::string{}; + }; json_data_t data{}; data.domain_name = d; data.type = @@ -42,6 +56,11 @@ struct json_data_t { data.ttl = json_object["ttl"].get(); data.content_length = len; data.http_code = http_code; + data.first_seen = get_string("first-seen", "first_seen"); + data.last_seen = get_string("last-seen", "last_seen"); + if (auto iter = json_object.find("seen"); iter != json_object.end()) { + data.seen = iter->second.get(); + } return data; } }; @@ -54,6 +73,25 @@ struct jd_domain_comparator_t { namespace detail { +inline std::string record_key(std::string const &domain, + dns_record_type_e const type, + std::string const &rdata) { + return domain + "\n" + dns_record_type_to_str(type) + "\n" + rdata; +} + +inline std::map 
+previous_record_map(std::optional> const &previous) { + std::map records{}; + if (!previous) { + return records; + } + for (auto const &record : *previous) { + records[record_key(record.domain_name, record.type, record.rdata)] = + record; + } + return records; +} + template void write_json_result_impl(map_container_t const &result_map, RtType const &rt_args) { @@ -67,10 +105,47 @@ void write_json_result_impl(map_container_t const &result_map, } json::array_t list; + auto const previous_records = previous_record_map(rt_args.previous_data); + auto const now = rt_args.run_timestamp; for (auto const &result_pair : result_map.cresult()) { json::object_t internal_object; auto &http_result = result_pair.second.http_result_; - internal_object["dns_probe"] = result_pair.second.dns_result_list_; + json::array_t dns_probe{}; + std::set written_record_keys{}; + for (auto const &record : result_pair.second.dns_result_list_) { + auto const key = record_key(result_pair.first, record.type, record.rdata); + written_record_keys.insert(key); + json::object_t record_json{}; + record_json["ttl"] = record.ttl; + record_json["type"] = dns_record_type_to_str(record.type); + record_json["info"] = record.rdata; + + auto const previous_iter = previous_records.find(key); + if (previous_iter == previous_records.end()) { + record_json["first-seen"] = now; + record_json["seen"] = 1; + } else { + auto const &previous = previous_iter->second; + record_json["first-seen"] = + previous.first_seen.empty() ? 
now : previous.first_seen; + record_json["seen"] = previous.seen + 1; + } + record_json["last-seen"] = now; + dns_probe.push_back(std::move(record_json)); + } + for (auto const &[key, previous] : previous_records) { + if (previous.domain_name != result_pair.first || + written_record_keys.count(key) != 0) { + continue; + } + dns_probe.push_back({{"ttl", previous.ttl}, + {"type", dns_record_type_to_str(previous.type)}, + {"info", previous.rdata}, + {"first-seen", previous.first_seen}, + {"last-seen", previous.last_seen}, + {"seen", previous.seen}}); + } + internal_object["dns_probe"] = std::move(dns_probe); internal_object["content_length"] = http_result.content_length_; internal_object["http_code"] = http_result.http_status_; internal_object["code_string"] = code_string(http_result.http_status_); diff --git a/dooked/source/cli_preprocessor.cpp b/dooked/source/cli_preprocessor.cpp index c08d7fb..7d89bbe 100644 --- a/dooked/source/cli_preprocessor.cpp +++ b/dooked/source/cli_preprocessor.cpp @@ -6,7 +6,12 @@ #include "utils/string_utils.hpp" #include #include +#include +#include +#include +#include #include +#include #include // defined (and assigned to) in main.cpp @@ -18,6 +23,221 @@ namespace dooked { namespace net = boost::asio; using namespace fmt::v7::literals; +std::string current_timestamp() { + std::string timestamp{}; + if (!timet_to_string(timestamp, std::time(nullptr), "%Y-%m-%d %H:%M:%S")) { + return {}; + } + return timestamp; +} + +std::optional parse_timestamp(std::string const &value) { + if (value.empty()) { + return std::nullopt; + } + + for (auto const *format : {"%Y-%m-%d %H:%M:%S", "%m/%d/%Y %H:%M:%S", + "%m/%d/%Y"}) { + std::tm parsed{}; + std::istringstream stream{value}; + stream >> std::get_time(&parsed, format); + if (!stream.fail()) { + parsed.tm_isdst = -1; + return std::mktime(&parsed); + } + } + return std::nullopt; +} + +std::optional last_seen_cutoff(runtime_args_t const &rt_args) { + if (!rt_args.last_seen_date.empty()) { + return 
parse_timestamp(rt_args.last_seen_date); + } + if (rt_args.last_seen_days >= 0) { + auto const now = std::chrono::system_clock::now(); + auto const cutoff = + now - std::chrono::hours(24 * rt_args.last_seen_days); + return std::chrono::system_clock::to_time_t(cutoff); + } + return std::nullopt; +} + +bool is_not_seen_since(json_data_t const &record, std::time_t const cutoff) { + auto const parsed = parse_timestamp(record.last_seen); + return !parsed || *parsed <= cutoff; +} + +std::map +record_fields(std::string const &domain, probe_result_t const &record, + http_response_t const &http_result, + json_data_t const *previous_record, + std::string const &run_timestamp) { + auto const first_seen = + previous_record && !previous_record->first_seen.empty() + ? previous_record->first_seen + : run_timestamp; + auto const seen = previous_record ? previous_record->seen + 1 : 1; + + return {{"domain", domain}, + {"domain_name", domain}, + {"type", dns_record_type_to_str(record.type)}, + {"rdata", record.rdata}, + {"info", record.rdata}, + {"ttl", std::to_string(record.ttl)}, + {"http_code", std::to_string(http_result.http_status_)}, + {"code_string", code_string(http_result.http_status_)}, + {"content_length", std::to_string(http_result.content_length_)}, + {"body", http_result.response_body_}, + {"page_content", http_result.response_body_}, + {"response_body", http_result.response_body_}, + {"first-seen", first_seen}, + {"first_seen", first_seen}, + {"last-seen", run_timestamp}, + {"last_seen", run_timestamp}, + {"seen", std::to_string(seen)}}; +} + +void report_first_last_seen(runtime_args_t const &rt_args, + map_container_t const &result_map) { + if (!rt_args.previous_data || + (!rt_args.show_first_seen && rt_args.last_seen_days < 0 && + rt_args.last_seen_date.empty())) { + return; + } + + auto const previous_records = + detail::previous_record_map(rt_args.previous_data); + std::set current_records{}; + for (auto const &result_pair : result_map.cresult()) { + for (auto 
const &record : result_pair.second.dns_result_list_) { + auto const key = + detail::record_key(result_pair.first, record.type, record.rdata); + current_records.insert(key); + if (rt_args.show_first_seen && previous_records.count(key) == 0) { + spdlog::info("[FIRST-SEEN][{}][{}] `{}`", result_pair.first, + dns_record_type_to_str(record.type), record.rdata); + } + } + } + + auto const cutoff = last_seen_cutoff(rt_args); + if (!cutoff) { + return; + } + for (auto const &[key, previous] : previous_records) { + if (current_records.count(key) == 0 && + is_not_seen_since(previous, *cutoff)) { + spdlog::warn("[LAST-SEEN][{}][{}] `{}` last seen `{}`", + previous.domain_name, dns_record_type_to_str(previous.type), + previous.rdata, + previous.last_seen.empty() ? "unknown" : previous.last_seen); + } + } +} + +std::vector +load_regex_checks(std::string const &check_config_filename) { + if (check_config_filename.empty()) { + return {}; + } + + std::ifstream input_file(check_config_filename); + if (!input_file) { + spdlog::error("unable to open regex check config `{}`", + check_config_filename); + return {}; + } + + json content{}; + try { + input_file >> content; + } catch (std::exception const &e) { + spdlog::error("unable to parse regex check config `{}`: {}", + check_config_filename, e.what()); + return {}; + } + + json checks{}; + if (content.is_array()) { + checks = content; + } else if (content.contains("checks") && content["checks"].is_array()) { + checks = content["checks"]; + } else { + spdlog::error( + "regex check config must be an array or contain a `checks` array"); + return {}; + } + std::vector result{}; + for (auto const &check : checks) { + auto const field = check.value("field", ""); + auto const pattern_text = check.value("regex", ""); + auto const alert = check.value("alert", "regex check matched"); + auto const ignore_case = check.value("ignore_case", false); + if (field.empty() || pattern_text.empty()) { + spdlog::warn("skipping regex check with missing 
field or regex"); + continue; + } + + try { + auto flags = std::regex::ECMAScript; + if (ignore_case) { + flags |= std::regex::icase; + } + result.push_back( + {field, std::regex(pattern_text, flags), pattern_text, alert}); + } catch (std::regex_error const &e) { + spdlog::warn("skipping invalid regex `{}`: {}", pattern_text, e.what()); + } + } + return result; +} + +void report_regex_checks(runtime_args_t const &rt_args, + map_container_t const &result_map) { + if (rt_args.regex_checks.empty()) { + return; + } + + auto const previous_records = + detail::previous_record_map(rt_args.previous_data); + for (auto const &result_pair : result_map.cresult()) { + auto const &http_result = result_pair.second.http_result_; + for (auto const &record : result_pair.second.dns_result_list_) { + auto const previous_iter = previous_records.find( + detail::record_key(result_pair.first, record.type, record.rdata)); + auto const previous_record = previous_iter == previous_records.end() + ? nullptr + : &previous_iter->second; + auto const fields = + record_fields(result_pair.first, record, http_result, previous_record, + rt_args.run_timestamp); + for (auto const &check : rt_args.regex_checks) { + if (check.field == "*") { + for (auto const &[field, value] : fields) { + if (std::regex_search(value, check.pattern)) { + spdlog::warn("[ALERT][{}][{}][{}] {}: `{}`", field, + result_pair.first, + dns_record_type_to_str(record.type), check.alert, + value); + } + } + continue; + } + + auto const field_iter = fields.find(check.field); + if (field_iter == fields.end()) { + continue; + } + if (std::regex_search(field_iter->second, check.pattern)) { + spdlog::warn("[ALERT][{}][{}][{}] {}: `{}`", check.field, + result_pair.first, dns_record_type_to_str(record.type), + check.alert, field_iter->second); + } + } + } + } +} + void compare_http_result(int const base_cl, json_data_t const &prev_http_result, http_response_t const ¤t_result) { auto const current_req_cl = current_result.content_length_; @@ 
-354,6 +574,8 @@ void start_name_checking(runtime_args_t &&rt_args) { spdlog::info("Writing JSON output"); } write_json_result(result_map, rt_args); + report_regex_checks(rt_args, result_map); + report_first_last_seen(rt_args, result_map); // compare old with new result -- only if we had previous record if (rt_args.previous_data) { @@ -477,6 +699,11 @@ void run_program(cli_args_t const &cli_args) { static_cast(cli_args.post_http_request); rt_args.thread_count = cli_args.thread_count; rt_args.content_length = cli_args.content_length; + rt_args.run_timestamp = current_timestamp(); + rt_args.last_seen_days = cli_args.last_seen_days; + rt_args.last_seen_date = cli_args.last_seen_date; + rt_args.show_first_seen = cli_args.show_first_seen; + rt_args.regex_checks = load_regex_checks(cli_args.check_config_filename); return start_name_checking(std::move(rt_args)); } diff --git a/dooked/source/http/requests_handler.cpp b/dooked/source/http/requests_handler.cpp index d21a592..2f291e4 100644 --- a/dooked/source/http/requests_handler.cpp +++ b/dooked/source/http/requests_handler.cpp @@ -138,6 +138,7 @@ void http_request_handler_t::on_data_received( if (status_code_simple == 2) { response_int = response_type_e::ok; + response_string = response_->body(); } else if (status_code_simple == 3) { // redirected response_string = (*response_)[http::field::location].to_string(); if (response_string.empty()) { @@ -150,6 +151,7 @@ void http_request_handler_t::on_data_received( } } } else if (status_code_simple == 4) { + response_string = response_->body(); if (http_status_code == 404) { response_int = response_type_e::not_found; } else if (http_status_code == 400) { @@ -158,6 +160,7 @@ void http_request_handler_t::on_data_received( response_int = response_type_e::forbidden; } } else if (status_code_simple == 5) { + response_string = response_->body(); response_int = response_type_e::server_error; } else { #ifdef _DEBUG @@ -364,6 +367,7 @@ void https_request_handler_t::on_data_received( if 
(status_code_simple == 2) { response_int = response_type_e::ok; + response_string = response_->body(); } else if (status_code_simple == 3) { // redirected response_string = (*response_)[http::field::location].to_string(); if (response_string.empty()) { @@ -376,6 +380,7 @@ void https_request_handler_t::on_data_received( } } } else if (status_code_simple == 4) { + response_string = response_->body(); if (status_code == 404) { response_int = response_type_e::not_found; } else if (status_code == 400) { @@ -384,6 +389,7 @@ void https_request_handler_t::on_data_received( response_int = response_type_e::forbidden; } } else if (status_code_simple == 5) { + response_string = response_->body(); response_int = response_type_e::server_error; } else { response_int = response_type_e::unknown_response; diff --git a/dooked/source/http/resolver.cpp b/dooked/source/http/resolver.cpp index 95332a4..8f54bd2 100644 --- a/dooked/source/http/resolver.cpp +++ b/dooked/source/http/resolver.cpp @@ -4,6 +4,14 @@ namespace dooked { +std::string bounded_body(std::string const &body) { + constexpr std::size_t max_body_bytes = 1024 * 1024; + if (body.size() <= max_body_bytes) { + return body; + } + return body.substr(0, max_body_bytes); +} + http_resolver_t::http_resolver_t(net::io_context &ioc, ssl::context *sslc, domain_list_t &names, map_container_t &result_map) @@ -65,11 +73,13 @@ void http_resolver_t::tcp_request_result(response_type_e const rt, std::string const &response_string) { switch (rt) { case response_type_e::bad_request: { - result_map_.insert(name_, content_length, 400); + result_map_.insert(name_, content_length, 400, + bounded_body(response_string)); return send_next_request(); } case response_type_e::forbidden: { - result_map_.insert(name_, content_length, 403); + result_map_.insert(name_, content_length, 403, + bounded_body(response_string)); return send_next_request(); } case response_type_e::cannot_resolve_name: { @@ -97,11 +107,13 @@ void 
http_resolver_t::tcp_request_result(response_type_e const rt, return send_https_request(response_string); } case response_type_e::not_found: { // HTTP(S) 404 - result_map_.insert(name_, content_length, 404); + result_map_.insert(name_, content_length, 404, + bounded_body(response_string)); return send_next_request(); } case response_type_e::ok: { - result_map_.insert(name_, content_length, 200); + result_map_.insert(name_, content_length, 200, + bounded_body(response_string)); return send_next_request(); } case response_type_e::recv_timed_out: { // retry, wait timeout @@ -122,7 +134,8 @@ void http_resolver_t::tcp_request_result(response_type_e const rt, return switch_ssl_method(response_string); } case response_type_e::server_error: { - result_map_.insert(name_, content_length, 503); + result_map_.insert(name_, content_length, 503, + bounded_body(response_string)); return send_next_request(); } default: { diff --git a/dooked/source/main.cpp b/dooked/source/main.cpp index cf29460..90d307a 100644 --- a/dooked/source/main.cpp +++ b/dooked/source/main.cpp @@ -41,6 +41,14 @@ int main(int argc, char **argv) { "defers http request until after all DNS requests have been completed"); app.add_flag("--compare-cl", compare_cl, "compare content-length of HTTP requests"); + app.add_flag("--fs", cli_args.show_first_seen, + "show records that are first seen in the current run"); + app.add_option("--ls", cli_args.last_seen_days, + "show records not seen since at least this many days ago"); + app.add_option("--lsd", cli_args.last_seen_date, + "show records not seen since the given US date (MM/DD/YYYY)"); + app.add_option("--checks", cli_args.check_config_filename, + "JSON file containing runtime regex alert checks"); app.add_flag("--nbc", no_bytes_count, "in case `content-length` is missing in an HTTP header field,"