From 7cc8f374916987340b950a1676a5c886fd5930c4 Mon Sep 17 00:00:00 2001 From: Lukas Date: Mon, 11 May 2026 13:47:38 -0400 Subject: [PATCH 1/2] Add runtime regex checks --- README.md | 31 ++++ dooked/include/cli_preprocessor.hpp | 11 ++ dooked/include/utils/containers.hpp | 13 +- dooked/include/utils/exceptions.hpp | 1 + dooked/source/cli_preprocessor.cpp | 201 ++++++++++++++++++++++++ dooked/source/dns/dns_resolver.cpp | 10 +- dooked/source/http/requests_handler.cpp | 24 ++- dooked/source/http/resolver.cpp | 10 +- dooked/source/main.cpp | 2 + 9 files changed, 285 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index f1a761c..c6cd9f1 100644 --- a/README.md +++ b/README.md @@ -39,3 +39,34 @@ make ## Usage For comprehensive help, use `dooked --help` + +### Runtime regex checks + +Pass `--checks ` to run custom regex checks against collected fields and +print alerts when they match. The checks file can be a JSON object with a +`checks` array: + +```json +{ + "checks": [ + { + "field": "domain", + "regex": "dev|test", + "alert": "domain name contains an environment marker", + "ignore_case": true + }, + { + "field": "response_body", + "regex": "copyright 2025", + "alert": "page may contain an outdated copyright banner", + "ignore_case": true + } + ] +} +``` + +Supported fields are `domain`, `domain_name`, `type`, `record_type`, `info`, +`rdata`, `ttl`, `content_length`, `http_code`, `code_string`, and +`http_status`. Page content can be checked with `response_body`, `body`, +`page_content`, or `content`; it is only used for matching and is not written to +the JSON output. 
diff --git a/dooked/include/cli_preprocessor.hpp b/dooked/include/cli_preprocessor.hpp index 43fa1ba..65b1a08 100644 --- a/dooked/include/cli_preprocessor.hpp +++ b/dooked/include/cli_preprocessor.hpp @@ -2,6 +2,7 @@ #include "dns/dns_resolver.hpp" #include "utils/io_utils.hpp" +#include #include // maximum sockets to open regardless of the number of threads @@ -19,6 +20,7 @@ struct cli_args_t { std::string resolver_filename{}; std::string output_filename{}; std::string input_filename{}; + std::string regex_checks_filename{}; int file_type{}; int post_http_request{}; @@ -27,6 +29,14 @@ struct cli_args_t { bool include_date{false}; }; +struct regex_check_t { + std::string field{}; + std::string pattern{}; + std::string alert{}; + bool ignore_case{}; + std::regex expression{}; +}; + struct runtime_args_t { std::optional resolvers{}; opt_domain_list_t names{}; @@ -36,6 +46,7 @@ struct runtime_args_t { http_process_e http_request_time_{}; int thread_count{}; int content_length{-1}; + std::vector regex_checks{}; }; void run_program(cli_args_t const &cli_args); diff --git a/dooked/include/utils/containers.hpp b/dooked/include/utils/containers.hpp index 7c955f6..b59d01c 100644 --- a/dooked/include/utils/containers.hpp +++ b/dooked/include/utils/containers.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace dooked { @@ -31,6 +32,7 @@ template class circular_queue_t { struct http_response_t { int content_length_{}; int http_status_{}; + std::string response_body_{}; }; template struct http_dns_response_t { @@ -52,9 +54,11 @@ template class map_container_t { } void insert_impl(std::string const &name, int const len, - int const http_status) { + int const http_status, + std::string const &response_body = {}) { map_[name].http_result_.content_length_ = len; map_[name].http_result_.http_status_ = http_status; + map_[name].http_result_.response_body_ = response_body; } public: @@ -74,12 +78,13 @@ template class map_container_t { append_impl(key, value); } 
- void insert(std::string const &name, int const len, int const http_status) { + void insert(std::string const &name, int const len, int const http_status, + std::string const &response_body = {}) { if (!opt_mutex_) { - return insert_impl(name, len, http_status); + return insert_impl(name, len, http_status, response_body); } std::lock_guard lock_g{*opt_mutex_}; - insert_impl(name, len, http_status); + insert_impl(name, len, http_status, response_body); } // only used by main thread, after all "computations" has been // done. There's no need for locks here. diff --git a/dooked/include/utils/exceptions.hpp b/dooked/include/utils/exceptions.hpp index a749a1b..846d544 100644 --- a/dooked/include/utils/exceptions.hpp +++ b/dooked/include/utils/exceptions.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include namespace dooked { diff --git a/dooked/source/cli_preprocessor.cpp b/dooked/source/cli_preprocessor.cpp index c08d7fb..8680a8a 100644 --- a/dooked/source/cli_preprocessor.cpp +++ b/dooked/source/cli_preprocessor.cpp @@ -4,8 +4,11 @@ #include "utils/exceptions.hpp" #include "utils/random_utils.hpp" #include "utils/string_utils.hpp" +#include #include #include +#include +#include #include #include @@ -18,6 +21,196 @@ namespace dooked { namespace net = boost::asio; using namespace fmt::v7::literals; +std::string normalize_check_field(std::string field) { + std::transform(field.begin(), field.end(), field.begin(), [](char ch) { + if (ch == '-' || ch == ' ') { + return '_'; + } + return (char)std::tolower((unsigned char)ch); + }); + return field; +} + +bool is_domain_check_field(std::string const &field) { + return field == "domain" || field == "domain_name" || + field == "content_length" || field == "http_code" || + field == "code_string" || field == "http_status" || + field == "response_body" || field == "body" || + field == "page_content" || field == "content"; +} + +bool is_record_check_field(std::string const &field) { + return field == "type" || field == "record_type" 
|| field == "info" || + field == "rdata" || field == "ttl"; +} + +bool is_supported_check_field(std::string const &field) { + return is_domain_check_field(field) || is_record_check_field(field); +} + +std::optional +domain_check_value(std::string const &field, std::string const &domain_name, + http_response_t const &http_result) { + if (field == "domain" || field == "domain_name") { + return domain_name; + } + if (field == "content_length") { + return std::to_string(http_result.content_length_); + } + if (field == "http_code") { + return std::to_string(http_result.http_status_); + } + if (field == "code_string" || field == "http_status") { + return code_string(http_result.http_status_); + } + if (field == "response_body" || field == "body" || + field == "page_content" || field == "content") { + return http_result.response_body_; + } + return std::nullopt; +} + +std::optional +record_check_value(std::string const &field, probe_result_t const &record) { + if (field == "type" || field == "record_type") { + return dns_record_type_to_str(record.type); + } + if (field == "info" || field == "rdata") { + return record.rdata; + } + if (field == "ttl") { + return std::to_string(record.ttl); + } + return std::nullopt; +} + +std::string alert_value(std::string value) { + for (auto &ch : value) { + if (ch == '\n' || ch == '\r' || ch == '\t') { + ch = ' '; + } + } + constexpr std::size_t max_alert_value_length = 240; + if (value.size() > max_alert_value_length) { + value = value.substr(0, max_alert_value_length) + "..."; + } + return value; +} + +void report_regex_match(regex_check_t const &check, + std::string const &domain_name, + std::string const &value) { + spdlog::warn("[REGEX][{}][{}] {} (value: `{}`)", check.field, domain_name, + check.alert, alert_value(value)); +} + +void report_regex_match(regex_check_t const &check, + std::string const &domain_name, + probe_result_t const &record, + std::string const &value) { + spdlog::warn("[REGEX][{}][{}][{}] {} (value: `{}`)", 
check.field, + domain_name, dns_record_type_to_str(record.type), check.alert, + alert_value(value)); +} + +void run_regex_checks(map_container_t const &result_map, + std::vector const &checks) { + if (checks.empty()) { + return; + } + + for (auto const &result_pair : result_map.cresult()) { + auto const &domain_name = result_pair.first; + auto const &domain_result = result_pair.second; + for (auto const &check : checks) { + if (is_domain_check_field(check.field)) { + auto const value = + domain_check_value(check.field, domain_name, + domain_result.http_result_); + if (value && std::regex_search(*value, check.expression)) { + report_regex_match(check, domain_name, *value); + } + continue; + } + + for (auto const &record : domain_result.dns_result_list_) { + auto const value = record_check_value(check.field, record); + if (value && std::regex_search(*value, check.expression)) { + report_regex_match(check, domain_name, record, *value); + } + } + } + } +} + +std::optional> +load_regex_checks(std::string const &filename) { + if (filename.empty()) { + return std::vector{}; + } + + std::ifstream input_file(filename); + if (!input_file) { + spdlog::error("Unable to open regex checks file `{}`", filename); + return std::nullopt; + } + + try { + json root{}; + input_file >> root; + json const *checks_json = nullptr; + if (root.is_array()) { + checks_json = &root; + } else if (root.is_object() && root.contains("checks") && + root["checks"].is_array()) { + checks_json = &root["checks"]; + } + + if (!checks_json) { + spdlog::error("Regex checks file must be an array or contain a `checks` " + "array"); + return std::nullopt; + } + + std::vector checks{}; + for (auto const &item : *checks_json) { + if (!item.is_object()) { + spdlog::error("Each regex check must be a JSON object"); + return std::nullopt; + } + + auto field = normalize_check_field(item.value("field", "")); + auto pattern = item.value("regex", item.value("pattern", "")); + auto alert = item.value("alert", ""); + 
auto const ignore_case = item.value("ignore_case", false); + + if (field.empty() || pattern.empty() || alert.empty()) { + spdlog::error("Each regex check requires `field`, `regex`, and " + "`alert`"); + return std::nullopt; + } + if (!is_supported_check_field(field)) { + spdlog::error("Unsupported regex check field `{}`", field); + return std::nullopt; + } + + auto options = std::regex_constants::ECMAScript; + if (ignore_case) { + options |= std::regex_constants::icase; + } + checks.push_back({field, pattern, alert, ignore_case, + std::regex(pattern, options)}); + } + return checks; + } catch (std::regex_error const &e) { + spdlog::error("Invalid regex in checks file `{}`: {}", filename, e.what()); + } catch (std::exception const &e) { + spdlog::error("Unable to parse regex checks file `{}`: {}", filename, + e.what()); + } + return std::nullopt; +} + void compare_http_result(int const base_cl, json_data_t const &prev_http_result, http_response_t const ¤t_result) { auto const current_req_cl = current_result.content_length_; @@ -350,6 +543,8 @@ void start_name_checking(runtime_args_t &&rt_args) { } thread_pool->join(); } + run_regex_checks(result_map, rt_args.regex_checks); + if (!silent) { spdlog::info("Writing JSON output"); } @@ -380,6 +575,12 @@ void start_name_checking(runtime_args_t &&rt_args) { void run_program(cli_args_t const &cli_args) { runtime_args_t rt_args{}; + auto regex_checks = load_regex_checks(cli_args.regex_checks_filename); + if (!regex_checks) { + return; + } + rt_args.regex_checks = std::move(*regex_checks); + // settle resolvers. 
std::vector resolver_strings{}; if (cli_args.resolver_filename.empty()) { diff --git a/dooked/source/dns/dns_resolver.cpp b/dooked/source/dns/dns_resolver.cpp index 851745f..5f5ec73 100644 --- a/dooked/source/dns/dns_resolver.cpp +++ b/dooked/source/dns/dns_resolver.cpp @@ -417,11 +417,11 @@ void custom_resolver_socket_t::http_result_obtained( switch (rt) { case response_type_e::bad_request: { - result_map_.insert(name_, content_length, 400); + result_map_.insert(name_, content_length, 400, response_string); return dns_continue_probe(); } case response_type_e::forbidden: { - result_map_.insert(name_, content_length, 403); + result_map_.insert(name_, content_length, 403, response_string); return dns_continue_probe(); } case response_type_e::cannot_resolve_name: { @@ -447,11 +447,11 @@ void custom_resolver_socket_t::http_result_obtained( return send_https_request(response_string); } case response_type_e::not_found: { // HTTP(S) 404 - result_map_.insert(name_, content_length, 404); + result_map_.insert(name_, content_length, 404, response_string); return dns_continue_probe(); } case response_type_e::ok: { - result_map_.insert(name_, content_length, 200); + result_map_.insert(name_, content_length, 200, response_string); return dns_continue_probe(); } case response_type_e::recv_timed_out: { // retry, wait timeout @@ -477,7 +477,7 @@ void custom_resolver_socket_t::http_result_obtained( return send_https_request(response_string); } case response_type_e::server_error: { - result_map_.insert(name_, content_length, 503); + result_map_.insert(name_, content_length, 503, response_string); return dns_continue_probe(); } default: { diff --git a/dooked/source/http/requests_handler.cpp b/dooked/source/http/requests_handler.cpp index d21a592..94454ec 100644 --- a/dooked/source/http/requests_handler.cpp +++ b/dooked/source/http/requests_handler.cpp @@ -1,5 +1,6 @@ #include "http/requests_handler.hpp" #include "utils/random_utils.hpp" +#include #include #include #include @@ -10,6 
+11,11 @@ extern bool silent; namespace dooked { +template +std::string field_value_to_string(FieldValue const &value) { + return std::string(value.data(), value.size()); +} + http_request_handler_t::http_request_handler_t(net::io_context &io_context, std::string domain_name) : io_{io_context}, domain_{std::move(domain_name)} {} @@ -138,8 +144,10 @@ void http_request_handler_t::on_data_received( if (status_code_simple == 2) { response_int = response_type_e::ok; + response_string = response_->body(); } else if (status_code_simple == 3) { // redirected - response_string = (*response_)[http::field::location].to_string(); + response_string = + field_value_to_string((*response_)[http::field::location]); if (response_string.empty()) { response_int = response_type_e::unknown_response; } else { @@ -150,6 +158,7 @@ void http_request_handler_t::on_data_received( } } } else if (status_code_simple == 4) { + response_string = response_->body(); if (http_status_code == 404) { response_int = response_type_e::not_found; } else if (http_status_code == 400) { @@ -158,6 +167,7 @@ void http_request_handler_t::on_data_received( response_int = response_type_e::forbidden; } } else if (status_code_simple == 5) { + response_string = response_->body(); response_int = response_type_e::server_error; } else { #ifdef _DEBUG @@ -171,7 +181,8 @@ void http_request_handler_t::on_data_received( int content_length{}; if (response_->has_content_length()) { try { - auto const cl_str = (*response_)[http::field::content_length].to_string(); + auto const cl_str = + field_value_to_string((*response_)[http::field::content_length]); content_length = std::stoi(cl_str); } catch (std::exception const &) { } @@ -364,8 +375,10 @@ void https_request_handler_t::on_data_received( if (status_code_simple == 2) { response_int = response_type_e::ok; + response_string = response_->body(); } else if (status_code_simple == 3) { // redirected - response_string = (*response_)[http::field::location].to_string(); + 
response_string = + field_value_to_string((*response_)[http::field::location]); if (response_string.empty()) { response_int = response_type_e::unknown_response; } else { @@ -376,6 +389,7 @@ void https_request_handler_t::on_data_received( } } } else if (status_code_simple == 4) { + response_string = response_->body(); if (status_code == 404) { response_int = response_type_e::not_found; } else if (status_code == 400) { @@ -384,6 +398,7 @@ void https_request_handler_t::on_data_received( response_int = response_type_e::forbidden; } } else if (status_code_simple == 5) { + response_string = response_->body(); response_int = response_type_e::server_error; } else { response_int = response_type_e::unknown_response; @@ -392,7 +407,8 @@ void https_request_handler_t::on_data_received( int content_length = 0; if (response_->has_content_length()) { try { - auto const cl_str = (*response_)[http::field::content_length].to_string(); + auto const cl_str = + field_value_to_string((*response_)[http::field::content_length]); content_length = std::stoi(cl_str); } catch (std::exception const &) { } diff --git a/dooked/source/http/resolver.cpp b/dooked/source/http/resolver.cpp index 95332a4..35ae835 100644 --- a/dooked/source/http/resolver.cpp +++ b/dooked/source/http/resolver.cpp @@ -65,11 +65,11 @@ void http_resolver_t::tcp_request_result(response_type_e const rt, std::string const &response_string) { switch (rt) { case response_type_e::bad_request: { - result_map_.insert(name_, content_length, 400); + result_map_.insert(name_, content_length, 400, response_string); return send_next_request(); } case response_type_e::forbidden: { - result_map_.insert(name_, content_length, 403); + result_map_.insert(name_, content_length, 403, response_string); return send_next_request(); } case response_type_e::cannot_resolve_name: { @@ -97,11 +97,11 @@ void http_resolver_t::tcp_request_result(response_type_e const rt, return send_https_request(response_string); } case response_type_e::not_found: { // 
HTTP(S) 404 - result_map_.insert(name_, content_length, 404); + result_map_.insert(name_, content_length, 404, response_string); return send_next_request(); } case response_type_e::ok: { - result_map_.insert(name_, content_length, 200); + result_map_.insert(name_, content_length, 200, response_string); return send_next_request(); } case response_type_e::recv_timed_out: { // retry, wait timeout @@ -122,7 +122,7 @@ void http_resolver_t::tcp_request_result(response_type_e const rt, return switch_ssl_method(response_string); } case response_type_e::server_error: { - result_map_.insert(name_, content_length, 503); + result_map_.insert(name_, content_length, 503, response_string); return send_next_request(); } default: { diff --git a/dooked/source/main.cpp b/dooked/source/main.cpp index cf29460..e118f9d 100644 --- a/dooked/source/main.cpp +++ b/dooked/source/main.cpp @@ -34,6 +34,8 @@ int main(int argc, char **argv) { app.add_option( "-c,--content-length", cli_args.content_length, "show content lengths that changed more than --content-length"); + app.add_option("--checks,--check-config", cli_args.regex_checks_filename, + "JSON file with runtime regex checks"); app.add_flag("-d,--include-date", cli_args.include_date, "append present datetime(-ddMMyyyy_hhmmss) in output name"); app.add_flag( From 36e62457e5aee53cb4beb80c4b3707dc71d07e9a Mon Sep 17 00:00:00 2001 From: Lukas Date: Tue, 12 May 2026 08:10:21 -0400 Subject: [PATCH 2/2] Cap regex response body matching --- README.md | 13 ++++++++----- dooked/source/cli_preprocessor.cpp | 2 +- dooked/source/http/requests_handler.cpp | 20 ++++++++++++++------ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index c6cd9f1..b6c8f6a 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ For comprehensive help, use `dooked --help` ### Runtime regex checks -Pass `--checks ` to run custom regex checks against collected fields and -print alerts when they match. 
The checks file can be a JSON object with a -`checks` array: +Pass `--checks <file>` or `--check-config <file>` to run custom regex checks +against collected fields and print alerts when they match. The checks file can +be a JSON object with a `checks` array or the array itself: ```json { @@ -65,8 +65,11 @@ print alerts when they match. The checks file can be a JSON object with a } ``` +Each check requires `field`, `regex`, and `alert`; `pattern` is accepted as an +alias for `regex`, and `ignore_case` is optional. + Supported fields are `domain`, `domain_name`, `type`, `record_type`, `info`, `rdata`, `ttl`, `content_length`, `http_code`, `code_string`, and `http_status`. Page content can be checked with `response_body`, `body`, -`page_content`, or `content`; it is only used for matching and is not written to -the JSON output. +`page_content`, or `content`; dooked keeps at most the first 64 KiB in memory +for matching and does not write page content to the JSON output. diff --git a/dooked/source/cli_preprocessor.cpp b/dooked/source/cli_preprocessor.cpp index 8680a8a..d1c694c 100644 --- a/dooked/source/cli_preprocessor.cpp +++ b/dooked/source/cli_preprocessor.cpp @@ -532,7 +532,7 @@ void start_name_checking(runtime_args_t &&rt_args) { // if we deferred HTTP/S "probe", now is the time to get to it if (deferring) { - io_context.reset(); + io_context.restart(); thread_pool.emplace(thread_count); rt_args.names.emplace(std::move(*deferred_names_)); for (std::size_t index = 0; index < thread_count; ++index) { diff --git a/dooked/source/http/requests_handler.cpp b/dooked/source/http/requests_handler.cpp index 94454ec..c7fba66 100644 --- a/dooked/source/http/requests_handler.cpp +++ b/dooked/source/http/requests_handler.cpp @@ -16,6 +16,14 @@ std::string field_value_to_string(FieldValue const &value) { return std::string(value.data(), value.size()); } +std::string response_body_for_checks(std::string const &body) { + constexpr std::size_t max_body_size = 64 * 1024; + if (body.size() > 
max_body_size) { + return body.substr(0, max_body_size); + } + return body; +} + http_request_handler_t::http_request_handler_t(net::io_context &io_context, std::string domain_name) : io_{io_context}, domain_{std::move(domain_name)} {} @@ -144,7 +152,7 @@ void http_request_handler_t::on_data_received( if (status_code_simple == 2) { response_int = response_type_e::ok; - response_string = response_->body(); + response_string = response_body_for_checks(response_->body()); } else if (status_code_simple == 3) { // redirected response_string = field_value_to_string((*response_)[http::field::location]); @@ -158,7 +166,7 @@ void http_request_handler_t::on_data_received( } } } else if (status_code_simple == 4) { - response_string = response_->body(); + response_string = response_body_for_checks(response_->body()); if (http_status_code == 404) { response_int = response_type_e::not_found; } else if (http_status_code == 400) { @@ -167,7 +175,7 @@ void http_request_handler_t::on_data_received( response_int = response_type_e::forbidden; } } else if (status_code_simple == 5) { - response_string = response_->body(); + response_string = response_body_for_checks(response_->body()); response_int = response_type_e::server_error; } else { #ifdef _DEBUG @@ -375,7 +383,7 @@ void https_request_handler_t::on_data_received( if (status_code_simple == 2) { response_int = response_type_e::ok; - response_string = response_->body(); + response_string = response_body_for_checks(response_->body()); } else if (status_code_simple == 3) { // redirected response_string = field_value_to_string((*response_)[http::field::location]); @@ -389,7 +397,7 @@ void https_request_handler_t::on_data_received( } } } else if (status_code_simple == 4) { - response_string = response_->body(); + response_string = response_body_for_checks(response_->body()); if (status_code == 404) { response_int = response_type_e::not_found; } else if (status_code == 400) { @@ -398,7 +406,7 @@ void 
https_request_handler_t::on_data_received( response_int = response_type_e::forbidden; } } else if (status_code_simple == 5) { - response_string = response_->body(); + response_string = response_body_for_checks(response_->body()); response_int = response_type_e::server_error; } else { response_int = response_type_e::unknown_response;