From 1454bf9fe330bf67c3da81b9a7d5df9285b47233 Mon Sep 17 00:00:00 2001 From: Alexei Bykov Date: Fri, 20 Sep 2024 19:20:29 +0300 Subject: [PATCH 01/13] Add an option to skip dependency installation for CMake --- CMakeLists.txt | 61 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 491ad67..591190c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,8 +15,8 @@ LIST(GET VERSION_DIGITS 2 CPACK_PACKAGE_VERSION_PATCH) SET(CPACK_PACKAGE_NAME "robots") SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Google's robots.txt parser and matcher C++ library") SET(CPACK_PACKAGE_VENDOR "Google Inc.") -SET(CPACK_PACAKGE_DESCRIPTION_FILE "${CMAKE_SOURCE_DIR}/README.md") -SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE") +SET(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.md") +SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${CPACK_PACKAGE_DESCRIPTION_SUMMARY} ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") SET(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") @@ -51,6 +51,7 @@ SET(CPACK_SOURCE_IGNORE_FILES OPTION(ROBOTS_BUILD_STATIC "If ON, robots will build also the static library" ON) OPTION(ROBOTS_BUILD_TESTS "If ON, robots will build test targets" OFF) OPTION(ROBOTS_INSTALL "If ON, enable the installation of the targets" ON) +OPTION(ROBOTS_SKIP_DEPS "If ON, skip build dependency installation" OFF) ############ helper libs ############ @@ -60,17 +61,19 @@ INCLUDE(ExternalProject) ############ dependencies ############## -CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt) -EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 
RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) +IF(NOT ROBOTS_SKIP_DEPS) + CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt) + EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) -IF(result) - MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") -ENDIF() + IF(result) + MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") + ENDIF() -EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) + EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) -IF(result) - MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") + IF(result) + MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") + ENDIF() ENDIF() # abseil-cpp @@ -84,23 +87,27 @@ IF(MSVC) ADD_DEFINITIONS(/DNOMINMAX /DWIN32_LEAN_AND_MEAN=1 /D_CRT_SECURE_NO_WARNINGS) ENDIF(MSVC) -ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src - ${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build - EXCLUDE_FROM_ALL) +IF(NOT ROBOTS_SKIP_DEPS) + ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src + ${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build + EXCLUDE_FROM_ALL) +ENDIF() IF(ROBOTS_BUILD_TESTS) INCLUDE(CTest) - # googletest - ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src - ${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build - EXCLUDE_FROM_ALL) + IF(NOT ROBOTS_SKIP_DEPS) + # googletest + ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src + ${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build + EXCLUDE_FROM_ALL) - SET(INSTALL_GTEST 0) - SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + SET(INSTALL_GTEST 0) + SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - IF(CMAKE_VERSION VERSION_LESS 2.8.11) - INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include) + IF(CMAKE_VERSION VERSION_LESS 2.8.11) + 
INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include) + ENDIF() ENDIF() ENDIF(ROBOTS_BUILD_TESTS) @@ -121,6 +128,10 @@ INCLUDE_DIRECTORIES(.) ######### targets ########### +IF(ROBOTS_SKIP_DEPS) + find_package(absl) +ENDIF() + SET(LIBROBOTS_LIBS) SET(robots_SRCS ./robots.cc) @@ -168,7 +179,13 @@ IF(ROBOTS_BUILD_TESTS) ENABLE_TESTING() ADD_EXECUTABLE(robots-test ./robots_test.cc) - TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main) + IF(ROBOTS_SKIP_DEPS) + find_package(GTest REQUIRED) + TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} ${robots_LIBS} GTest::gtest GTest::gtest_main) + ELSE() + TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main) + ENDIF() + ADD_TEST(NAME robots-test COMMAND robots-test) ENDIF(ROBOTS_BUILD_TESTS) From 33a9944008732598f64e32b25a62bc9ad88c12c6 Mon Sep 17 00:00:00 2001 From: Leonardo Poletto Date: Thu, 28 Aug 2025 19:04:42 -0300 Subject: [PATCH 02/13] docs: fix README examples to match actual CLI output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Corrected example output (`allowed: YES` → `: ALLOWED`) - Documented behavior when robots.txt is empty - Added note on exit codes (`0` = ALLOWED, `1` = DISALLOWED) These changes bring the README in line with the current `robots_main` output, making usage clearer for scripting and automation. --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2d5a2fc..ea679b9 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Target //:robots_main up-to-date: bazel-bin/robots_main ... bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url - user-agent 'YourBot' with url 'https://example.com/url' allowed: YES + user-agent 'YourBot' with url 'https://example.com/url': ALLOWED ``` #### Building with CMake @@ -104,8 +104,12 @@ Test project robotstxt/c-build Total Test time (real) = 0.02 sec ... 
$ robots ~/local/path/to/robots.txt YourBot https://example.com/url - user-agent 'YourBot' with url 'https://example.com/url' allowed: YES + user-agent 'YourBot' with url 'https://example.com/url': ALLOWED ``` +> **Note**: If the robots file is empty, the parser also prints: +`notice: robots file is empty so all user-agents are allowed` + +> **Exit codes:** `0` = `ALLOWED`, `1` = `DISALLOWED`. ## Notes From da6c46afe46577f4a4f32bd5974ad818d2a47a32 Mon Sep 17 00:00:00 2001 From: Leonardo Poletto Date: Thu, 28 Aug 2025 19:25:52 -0300 Subject: [PATCH 03/13] =?UTF-8?q?Change=20url=20=E2=86=92=20URI=20in=20the?= =?UTF-8?q?=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ea679b9..721a5e0 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Target //:robots_main up-to-date: bazel-bin/robots_main ... bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url - user-agent 'YourBot' with url 'https://example.com/url': ALLOWED + user-agent 'YourBot' with URI 'https://example.com/url': ALLOWED ``` #### Building with CMake @@ -104,7 +104,7 @@ Test project robotstxt/c-build Total Test time (real) = 0.02 sec ... 
$ robots ~/local/path/to/robots.txt YourBot https://example.com/url - user-agent 'YourBot' with url 'https://example.com/url': ALLOWED + user-agent 'YourBot' with URI 'https://example.com/url': ALLOWED ``` > **Note**: If the robots file is empty, the parser also prints: `notice: robots file is empty so all user-agents are allowed` From 7066f3151e73b60a08576522482bb61fee1c3bad Mon Sep 17 00:00:00 2001 From: Leonardo Poletto Date: Thu, 28 Aug 2025 19:37:18 -0300 Subject: [PATCH 04/13] docs(README): document empty robots.txt behavior and exit codes - Clarify parser output when robots.txt is empty - Add note on exit codes (`0` = ALLOWED, `1` = DISALLOWED) - Makes CLI behavior clearer for scripting use --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 721a5e0..764efd7 100644 --- a/README.md +++ b/README.md @@ -107,9 +107,12 @@ $ robots ~/local/path/to/robots.txt YourBot https://example.com/url user-agent 'YourBot' with URI 'https://example.com/url': ALLOWED ``` > **Note**: If the robots file is empty, the parser also prints: -`notice: robots file is empty so all user-agents are allowed` +> +> ``` +> notice: robots file is empty so all user-agents are allowed +> ``` -> **Exit codes:** `0` = `ALLOWED`, `1` = `DISALLOWED`. 
+> **Exit codes:** `0` = ALLOWED, `1` = DISALLOWED ## Notes From e1069a6dc3181e20043a071adae27f49b89b3970 Mon Sep 17 00:00:00 2001 From: elisa-luo Date: Wed, 3 Dec 2025 15:10:32 -0800 Subject: [PATCH 05/13] fix CMAKE_CXX_STANDARD --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 491ad67..e35b1fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ PROJECT(robots) CMAKE_MINIMUM_REQUIRED(VERSION 3.0) -SET(CMAKE_CXX_STANDARD 14) +SET(CMAKE_CXX_STANDARD 17) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) SET(VERSION "0.0.0") From 55815b79fa73bdec29d4595d6bb9a9388b8c4275 Mon Sep 17 00:00:00 2001 From: Abhishek Singh <87878794+3x10RaiseTo8@users.noreply.github.com> Date: Sun, 11 Jan 2026 12:59:18 +0530 Subject: [PATCH 06/13] Add 'content-signal' to unsupported tags list --- reporting_robots.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reporting_robots.cc b/reporting_robots.cc index 7c39518..32fd12e 100644 --- a/reporting_robots.cc +++ b/reporting_robots.cc @@ -16,7 +16,7 @@ namespace googlebot { // have some use cases; to the best of our knowledge other tags we find, don't. 
// (for example, "unicorn" from "unicorn: /value") static const std::vector kUnsupportedTags = { - "clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow"}; + "clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow", "content-signal"}; void RobotsParsingReporter::Digest(int line_num, RobotsParsedLine::RobotsTagName parsed_tag) { From fef529ace1d2421945cd6e7e90fc9057678cd210 Mon Sep 17 00:00:00 2001 From: Nazavtra Date: Thu, 15 Jan 2026 22:42:25 +0200 Subject: [PATCH 07/13] Add CI workflow --- .github/workflows/ci.yml | 52 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..a6ab78a --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,52 @@ +name: CI + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + bazel: + name: Bazel (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel + uses: bazel-contrib/setup-bazel@0.14.0 + with: + bazelisk-cache: true + disk-cache: ${{ github.workflow }} + repository-cache: true + + - name: Build + run: bazel build //... + + - name: Test + run: bazel test //... 
--test_output=errors + + cmake: + name: CMake (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Configure + run: cmake -B build -DROBOTS_BUILD_TESTS=ON + + - name: Build + run: cmake --build build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu) + + - name: Test + run: ctest --test-dir build --output-on-failure From 632678defe444b399a05bb1bd281afe7c1c14896 Mon Sep 17 00:00:00 2001 From: Nazavtra Date: Thu, 15 Jan 2026 22:42:25 +0200 Subject: [PATCH 08/13] Fix CI workflow for macOS --- .github/workflows/ci.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6ab78a..d092bee 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,10 +43,14 @@ jobs: - uses: actions/checkout@v4 - name: Configure - run: cmake -B build -DROBOTS_BUILD_TESTS=ON + run: > + cmake -B cmake-build + -DROBOTS_BUILD_TESTS=ON + -DCMAKE_C_COMPILER=clang + -DCMAKE_CXX_COMPILER=clang++ - name: Build - run: cmake --build build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu) + run: cmake --build cmake-build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu) - name: Test - run: ctest --test-dir build --output-on-failure + run: ctest --test-dir cmake-build --output-on-failure \ No newline at end of file From 89f3049d65758de31e1668bcd5e7665c75e027d1 Mon Sep 17 00:00:00 2001 From: Nazavtra Date: Thu, 15 Jan 2026 23:09:40 +0200 Subject: [PATCH 09/13] Use cmake v3.10 --- CMakeLists.txt | 2 +- CMakeLists.txt.in | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e35b1fe..29e6f04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 3.10) PROJECT(robots) -CMAKE_MINIMUM_REQUIRED(VERSION 3.0) SET(CMAKE_CXX_STANDARD 17) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) diff --git a/CMakeLists.txt.in 
b/CMakeLists.txt.in index 9bc5dc3..35bb945 100644 --- a/CMakeLists.txt.in +++ b/CMakeLists.txt.in @@ -1,6 +1,5 @@ - +CMAKE_MINIMUM_REQUIRED(VERSION 3.10) PROJECT(dependency-downloader NONE) -CMAKE_MINIMUM_REQUIRED(VERSION 3.0) INCLUDE(ExternalProject) From e020acd242096b369c5be11360ae9571d2c906dd Mon Sep 17 00:00:00 2001 From: Nazavtra Date: Thu, 15 Jan 2026 23:14:27 +0200 Subject: [PATCH 10/13] Add CodeQL --- .github/workflows/codeql.yml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..8166fd6 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,30 @@ +name: CodeQL + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + security-events: write + + steps: + - uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: cpp + + - name: Build + run: | + cmake -B build -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ + cmake --build build -j$(nproc) + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 From d1d51a9eb2feec63555daea4cdfe0c83c9cfbefe Mon Sep 17 00:00:00 2001 From: Nazavtra Date: Thu, 15 Jan 2026 23:23:40 +0200 Subject: [PATCH 11/13] Use C++23 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29e6f04..0f9e28b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 3.10) PROJECT(robots) -SET(CMAKE_CXX_STANDARD 17) +SET(CMAKE_CXX_STANDARD 23) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) SET(VERSION "0.0.0") From 71008124f796e9c30a98539694be2aa8b22112f2 Mon Sep 17 00:00:00 2001 From: Nazavtra Date: Thu, 15 Jan 2026 23:34:38 +0200 Subject: [PATCH 12/13] Fix clang-tidy warnings --- robots.cc | 7 +++---- 
robots.h | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/robots.cc b/robots.cc index 93b1fd3..a2b862a 100644 --- a/robots.cc +++ b/robots.cc @@ -58,7 +58,7 @@ namespace googlebot { // Match. class RobotsMatchStrategy { public: - virtual ~RobotsMatchStrategy() {} + virtual ~RobotsMatchStrategy() = default; virtual int MatchAllow(absl::string_view path, absl::string_view pattern) = 0; @@ -124,7 +124,6 @@ static const char* kHexDigits = "0123456789ABCDEF"; // authority, and fragment. Result always starts with "/". // Returns "/" if the url doesn't have a path or is not valid. std::string GetPathParamsQuery(const std::string& url) { - std::string path; // Initial two slashes are ignored. size_t search_start = 0; @@ -304,7 +303,7 @@ class RobotsTxtParser { void ParseAndEmitLine(int current_line, char* line, bool* line_too_long_strict); - bool NeedEscapeValueForKey(const Key& key); + static bool NeedEscapeValueForKey(const Key& key); absl::string_view robots_body_; RobotsParseHandler* const handler_; @@ -476,7 +475,7 @@ void RobotsTxtParser::Parse() { // characters matched by a pattern is returned as its match priority. class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy { public: - LongestMatchRobotsMatchStrategy() { } + LongestMatchRobotsMatchStrategy() = default; // Disallow copying and assignment. LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) = diff --git a/robots.h b/robots.h index 335d3d5..4eadcc7 100644 --- a/robots.h +++ b/robots.h @@ -44,8 +44,8 @@ namespace googlebot { // ParseRobotsTxt() in the sequence they have been found in the file. class RobotsParseHandler { public: - RobotsParseHandler() {} - virtual ~RobotsParseHandler() {} + RobotsParseHandler() = default; + virtual ~RobotsParseHandler() = default; // Disallow copying and assignment. 
RobotsParseHandler(const RobotsParseHandler&) = delete; From 68cd9aaa78d63c88f43cf07c69a3c6f1b98cd09e Mon Sep 17 00:00:00 2001 From: Nazavtra Date: Thu, 15 Jan 2026 23:51:03 +0200 Subject: [PATCH 13/13] Change CMAKE_SOURCE_DIR to CMAKE_CURRENT_SOURCE_DIR --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cca413..3189c2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,7 +168,7 @@ IF(ROBOTS_INSTALL) ARCHIVE DESTINATION lib ) - INSTALL(FILES ${CMAKE_SOURCE_DIR}/robots.h DESTINATION include) + INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/robots.h DESTINATION include) INSTALL(TARGETS robots-main DESTINATION bin) ENDIF(ROBOTS_INSTALL)