diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..d092bee --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,56 @@ +name: CI + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + bazel: + name: Bazel (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Setup Bazel + uses: bazel-contrib/setup-bazel@0.14.0 + with: + bazelisk-cache: true + disk-cache: ${{ github.workflow }} + repository-cache: true + + - name: Build + run: bazel build //... + + - name: Test + run: bazel test //... --test_output=errors + + cmake: + name: CMake (${{ matrix.os }}) + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v4 + + - name: Configure + run: > + cmake -B cmake-build + -DROBOTS_BUILD_TESTS=ON + -DCMAKE_C_COMPILER=clang + -DCMAKE_CXX_COMPILER=clang++ + + - name: Build + run: cmake --build cmake-build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu) + + - name: Test + run: ctest --test-dir cmake-build --output-on-failure \ No newline at end of file diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..8166fd6 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,30 @@ +name: CodeQL + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + security-events: write + + steps: + - uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: cpp + + - name: Build + run: | + cmake -B build -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ + cmake --build build -j$(nproc) + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 diff --git a/CMakeLists.txt b/CMakeLists.txt index 
491ad67..3189c2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ +CMAKE_MINIMUM_REQUIRED(VERSION 3.20) PROJECT(robots) -CMAKE_MINIMUM_REQUIRED(VERSION 3.0) -SET(CMAKE_CXX_STANDARD 14) +SET(CMAKE_CXX_STANDARD 23) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) SET(VERSION "0.0.0") @@ -15,8 +15,8 @@ LIST(GET VERSION_DIGITS 2 CPACK_PACKAGE_VERSION_PATCH) SET(CPACK_PACKAGE_NAME "robots") SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Google's robots.txt parser and matcher C++ library") SET(CPACK_PACKAGE_VENDOR "Google Inc.") -SET(CPACK_PACAKGE_DESCRIPTION_FILE "${CMAKE_SOURCE_DIR}/README.md") -SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE") +SET(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.md") +SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${CPACK_PACKAGE_DESCRIPTION_SUMMARY} ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") SET(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") @@ -51,6 +51,7 @@ SET(CPACK_SOURCE_IGNORE_FILES OPTION(ROBOTS_BUILD_STATIC "If ON, robots will build also the static library" ON) OPTION(ROBOTS_BUILD_TESTS "If ON, robots will build test targets" OFF) OPTION(ROBOTS_INSTALL "If ON, enable the installation of the targets" ON) +OPTION(ROBOTS_SKIP_DEPS "If ON, skip build dependency installation" OFF) ############ helper libs ############ @@ -60,17 +61,19 @@ INCLUDE(ExternalProject) ############ dependencies ############## -CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt) -EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) +IF(NOT ROBOTS_SKIP_DEPS) + CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt) + EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 
RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) -IF(result) - MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") -ENDIF() + IF(result) + MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") + ENDIF() -EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) + EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs) -IF(result) - MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") + IF(result) + MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}") + ENDIF() ENDIF() # abseil-cpp @@ -84,23 +87,27 @@ IF(MSVC) ADD_DEFINITIONS(/DNOMINMAX /DWIN32_LEAN_AND_MEAN=1 /D_CRT_SECURE_NO_WARNINGS) ENDIF(MSVC) -ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src - ${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build - EXCLUDE_FROM_ALL) +IF(NOT ROBOTS_SKIP_DEPS) + ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src + ${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build + EXCLUDE_FROM_ALL) +ENDIF() IF(ROBOTS_BUILD_TESTS) INCLUDE(CTest) - # googletest - ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src - ${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build - EXCLUDE_FROM_ALL) + IF(NOT ROBOTS_SKIP_DEPS) + # googletest + ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src + ${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build + EXCLUDE_FROM_ALL) - SET(INSTALL_GTEST 0) - SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + SET(INSTALL_GTEST 0) + SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE) - IF(CMAKE_VERSION VERSION_LESS 2.8.11) - INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include) + IF(CMAKE_VERSION VERSION_LESS 2.8.11) + INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include) + ENDIF() ENDIF() ENDIF(ROBOTS_BUILD_TESTS) @@ -121,6 +128,10 @@ INCLUDE_DIRECTORIES(.) 
######### targets ########### +IF(ROBOTS_SKIP_DEPS) + find_package(absl REQUIRED) +ENDIF() + SET(LIBROBOTS_LIBS) SET(robots_SRCS ./robots.cc) @@ -157,7 +168,7 @@ IF(ROBOTS_INSTALL) ARCHIVE DESTINATION lib ) - INSTALL(FILES ${CMAKE_SOURCE_DIR}/robots.h DESTINATION include) + INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/robots.h DESTINATION include) INSTALL(TARGETS robots-main DESTINATION bin) ENDIF(ROBOTS_INSTALL) @@ -168,7 +179,13 @@ IF(ROBOTS_BUILD_TESTS) ENABLE_TESTING() ADD_EXECUTABLE(robots-test ./robots_test.cc) - TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main) + IF(ROBOTS_SKIP_DEPS) + find_package(GTest REQUIRED) + TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} ${robots_LIBS} GTest::gtest GTest::gtest_main) + ELSE() + TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main) + ENDIF() + ADD_TEST(NAME robots-test COMMAND robots-test) ENDIF(ROBOTS_BUILD_TESTS) diff --git a/CMakeLists.txt.in b/CMakeLists.txt.in index 9bc5dc3..35bb945 100644 --- a/CMakeLists.txt.in +++ b/CMakeLists.txt.in @@ -1,6 +1,5 @@ - +CMAKE_MINIMUM_REQUIRED(VERSION 3.10) PROJECT(dependency-downloader NONE) -CMAKE_MINIMUM_REQUIRED(VERSION 3.0) INCLUDE(ExternalProject) diff --git a/README.md b/README.md index 2d5a2fc..764efd7 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ Target //:robots_main up-to-date: bazel-bin/robots_main ... bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url - user-agent 'YourBot' with url 'https://example.com/url' allowed: YES + user-agent 'YourBot' with URI 'https://example.com/url': ALLOWED ``` #### Building with CMake @@ -104,8 +104,15 @@ Test project robotstxt/c-build Total Test time (real) = 0.02 sec ... 
$ robots ~/local/path/to/robots.txt YourBot https://example.com/url - user-agent 'YourBot' with url 'https://example.com/url' allowed: YES + user-agent 'YourBot' with URI 'https://example.com/url': ALLOWED ``` +> **Note**: If the robots file is empty, the parser also prints: +> +> ``` +> notice: robots file is empty so all user-agents are allowed +> ``` + +> **Exit codes:** `0` = ALLOWED, `1` = DISALLOWED ## Notes diff --git a/reporting_robots.cc b/reporting_robots.cc index 7c39518..32fd12e 100644 --- a/reporting_robots.cc +++ b/reporting_robots.cc @@ -16,7 +16,7 @@ namespace googlebot { // have some use cases; to the best of our knowledge other tags we find, don't. // (for example, "unicorn" from "unicorn: /value") static const std::vector kUnsupportedTags = { - "clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow"}; + "clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow", "content-signal"}; void RobotsParsingReporter::Digest(int line_num, RobotsParsedLine::RobotsTagName parsed_tag) { diff --git a/robots.cc b/robots.cc index 93b1fd3..a2b862a 100644 --- a/robots.cc +++ b/robots.cc @@ -58,7 +58,7 @@ namespace googlebot { // Match. class RobotsMatchStrategy { public: - virtual ~RobotsMatchStrategy() {} + virtual ~RobotsMatchStrategy() = default; virtual int MatchAllow(absl::string_view path, absl::string_view pattern) = 0; @@ -124,7 +124,6 @@ static const char* kHexDigits = "0123456789ABCDEF"; // authority, and fragment. Result always starts with "/". // Returns "/" if the url doesn't have a path or is not valid. std::string GetPathParamsQuery(const std::string& url) { - std::string path; // Initial two slashes are ignored. 
size_t search_start = 0; @@ -304,7 +303,7 @@ class RobotsTxtParser { void ParseAndEmitLine(int current_line, char* line, bool* line_too_long_strict); - bool NeedEscapeValueForKey(const Key& key); + static bool NeedEscapeValueForKey(const Key& key); absl::string_view robots_body_; RobotsParseHandler* const handler_; @@ -476,7 +475,7 @@ void RobotsTxtParser::Parse() { // characters matched by a pattern is returned as its match priority. class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy { public: - LongestMatchRobotsMatchStrategy() { } + LongestMatchRobotsMatchStrategy() = default; // Disallow copying and assignment. LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) = diff --git a/robots.h b/robots.h index 335d3d5..4eadcc7 100644 --- a/robots.h +++ b/robots.h @@ -44,8 +44,8 @@ namespace googlebot { // ParseRobotsTxt() in the sequence they have been found in the file. class RobotsParseHandler { public: - RobotsParseHandler() {} - virtual ~RobotsParseHandler() {} + RobotsParseHandler() = default; + virtual ~RobotsParseHandler() = default; // Disallow copying and assignment. RobotsParseHandler(const RobotsParseHandler&) = delete;