Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Continuous integration: builds and tests the project with both build
# systems (Bazel and CMake) on Linux and macOS for every push and pull
# request targeting master.
name: CI

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

# Neither job writes to the repository, so the workflow token is limited to
# read-only access to the contents (enough for actions/checkout).
permissions:
  contents: read

jobs:
  # Bazel build + test on each OS in the matrix.
  bazel:
    name: Bazel (${{ matrix.os }})
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the other OS finish even if one fails, so both results are visible.
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]

    steps:
      - uses: actions/checkout@v4

      - name: Setup Bazel
        uses: bazel-contrib/setup-bazel@0.14.0
        with:
          bazelisk-cache: true
          disk-cache: ${{ github.workflow }}
          repository-cache: true

      - name: Build
        run: bazel build //...

      - name: Test
        run: bazel test //... --test_output=errors

  # CMake configure + build + ctest on each OS in the matrix.
  cmake:
    name: CMake (${{ matrix.os }})
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest]

    steps:
      - uses: actions/checkout@v4

      # The folded scalar (>) joins the lines below into one cmake command.
      - name: Configure
        run: >
          cmake -B cmake-build
          -DROBOTS_BUILD_TESTS=ON
          -DCMAKE_C_COMPILER=clang
          -DCMAKE_CXX_COMPILER=clang++

      # nproc is tried first; sysctl is the fallback when nproc is unavailable.
      - name: Build
        run: cmake --build cmake-build -j$(nproc 2>/dev/null || sysctl -n hw.ncpu)

      - name: Test
        run: ctest --test-dir cmake-build --output-on-failure
30 changes: 30 additions & 0 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# CodeQL static analysis of the C++ sources, run for every push and pull
# request targeting master.
name: CodeQL

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    # Declaring a permissions block sets every scope not listed here to
    # "none", so the read scopes the job relies on are spelled out explicitly.
    permissions:
      # Needed to upload the analysis results.
      security-events: write
      # Needed by actions/checkout and by the CodeQL action in private repos.
      contents: read
      actions: read

    steps:
      - uses: actions/checkout@v4

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: cpp

      # Manual build between init and analyze so CodeQL can observe the
      # compiler invocations.
      - name: Build
        run: |
          cmake -B build -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
          cmake --build build -j$(nproc)

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3
67 changes: 42 additions & 25 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# This file sets CMAKE_CXX_STANDARD to 23, a value CMake only accepts from
# version 3.20 onwards. Requiring 3.20 here makes older CMake stop immediately
# with a clear version message instead of rejecting the standard value later.
CMAKE_MINIMUM_REQUIRED(VERSION 3.20)
PROJECT(robots)
CMAKE_MINIMUM_REQUIRED(VERSION 3.0)

SET(CMAKE_CXX_STANDARD 14)
SET(CMAKE_CXX_STANDARD 23)
SET(CMAKE_POSITION_INDEPENDENT_CODE ON)

SET(VERSION "0.0.0")
Expand All @@ -15,8 +15,8 @@ LIST(GET VERSION_DIGITS 2 CPACK_PACKAGE_VERSION_PATCH)
SET(CPACK_PACKAGE_NAME "robots")
SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Google's robots.txt parser and matcher C++ library")
SET(CPACK_PACKAGE_VENDOR "Google Inc.")
SET(CPACK_PACAKGE_DESCRIPTION_FILE "${CMAKE_SOURCE_DIR}/README.md")
SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/LICENSE")
SET(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
SET(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")

SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${CPACK_PACKAGE_DESCRIPTION_SUMMARY} ${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
SET(CPACK_SOURCE_PACKAGE_FILE_NAME "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}")
Expand Down Expand Up @@ -51,6 +51,7 @@ SET(CPACK_SOURCE_IGNORE_FILES
OPTION(ROBOTS_BUILD_STATIC "If ON, robots will build also the static library" ON)
OPTION(ROBOTS_BUILD_TESTS "If ON, robots will build test targets" OFF)
OPTION(ROBOTS_INSTALL "If ON, enable the installation of the targets" ON)
OPTION(ROBOTS_SKIP_DEPS "If ON, skip build dependency installation" OFF)

############ helper libs ############

Expand All @@ -60,17 +61,19 @@ INCLUDE(ExternalProject)

############ dependencies ##############

CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt)
EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)
IF(NOT ROBOTS_SKIP_DEPS)
CONFIGURE_FILE(CMakeLists.txt.in libs/CMakeLists.txt)
EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)

IF(result)
MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}")
ENDIF()
IF(result)
MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}")
ENDIF()

EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)
EXECUTE_PROCESS(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libs)

IF(result)
MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}")
IF(result)
MESSAGE(FATAL_ERROR "Failed to download dependencies: ${result}")
ENDIF()
ENDIF()

# abseil-cpp
Expand All @@ -84,23 +87,27 @@ IF(MSVC)
ADD_DEFINITIONS(/DNOMINMAX /DWIN32_LEAN_AND_MEAN=1 /D_CRT_SECURE_NO_WARNINGS)
ENDIF(MSVC)

ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src
${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build
EXCLUDE_FROM_ALL)
IF(NOT ROBOTS_SKIP_DEPS)
ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-src
${CMAKE_CURRENT_BINARY_DIR}/libs/abseil-cpp-build
EXCLUDE_FROM_ALL)
ENDIF()

IF(ROBOTS_BUILD_TESTS)
INCLUDE(CTest)

# googletest
ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src
${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build
EXCLUDE_FROM_ALL)
IF(NOT ROBOTS_SKIP_DEPS)
# googletest
ADD_SUBDIRECTORY(${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-src
${CMAKE_CURRENT_BINARY_DIR}/libs/gtest-build
EXCLUDE_FROM_ALL)

SET(INSTALL_GTEST 0)
SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
SET(INSTALL_GTEST 0)
SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

IF(CMAKE_VERSION VERSION_LESS 2.8.11)
INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include)
IF(CMAKE_VERSION VERSION_LESS 2.8.11)
INCLUDE_DIRECTORIES(${gtest_SOURCE_DIR}/include)
ENDIF()
ENDIF()
ENDIF(ROBOTS_BUILD_TESTS)

Expand All @@ -121,6 +128,10 @@ INCLUDE_DIRECTORIES(.)

######### targets ###########

# With ROBOTS_SKIP_DEPS the bundled abseil-cpp checkout is not added as a
# subdirectory, so Abseil has to come from an installed package. REQUIRED makes
# a missing installation stop the configure step right here with a clear
# message instead of letting configuration continue without Abseil.
IF(ROBOTS_SKIP_DEPS)
  FIND_PACKAGE(absl REQUIRED)
ENDIF()

SET(LIBROBOTS_LIBS)

SET(robots_SRCS ./robots.cc)
Expand Down Expand Up @@ -157,7 +168,7 @@ IF(ROBOTS_INSTALL)
ARCHIVE DESTINATION lib
)

INSTALL(FILES ${CMAKE_SOURCE_DIR}/robots.h DESTINATION include)
INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/robots.h DESTINATION include)

INSTALL(TARGETS robots-main DESTINATION bin)
ENDIF(ROBOTS_INSTALL)
Expand All @@ -168,7 +179,13 @@ IF(ROBOTS_BUILD_TESTS)
ENABLE_TESTING()

ADD_EXECUTABLE(robots-test ./robots_test.cc)
TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main)
# Choose where robots-test gets googletest from:
#  - default build: link the gtest_main target of the bundled googletest;
#  - ROBOTS_SKIP_DEPS: locate an installed GoogleTest and link its imported
#    targets, together with ${robots_LIBS}.
IF(NOT ROBOTS_SKIP_DEPS)
  TARGET_LINK_LIBRARIES(robots-test ${LIBROBOTS_LIBS} gtest_main)
ELSE()
  FIND_PACKAGE(GTest REQUIRED)
  TARGET_LINK_LIBRARIES(robots-test
    ${LIBROBOTS_LIBS}
    ${robots_LIBS}
    GTest::gtest
    GTest::gtest_main)
ENDIF()

ADD_TEST(NAME robots-test COMMAND robots-test)
ENDIF(ROBOTS_BUILD_TESTS)

3 changes: 1 addition & 2 deletions CMakeLists.txt.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@

CMAKE_MINIMUM_REQUIRED(VERSION 3.10)
PROJECT(dependency-downloader NONE)
CMAKE_MINIMUM_REQUIRED(VERSION 3.0)

INCLUDE(ExternalProject)

Expand Down
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ Target //:robots_main up-to-date:
bazel-bin/robots_main
...
bazel-robots$ bazel run robots_main -- ~/local/path/to/robots.txt YourBot https://example.com/url
user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
user-agent 'YourBot' with URI 'https://example.com/url': ALLOWED
```

#### Building with CMake
Expand Down Expand Up @@ -104,8 +104,15 @@ Test project robotstxt/c-build
Total Test time (real) = 0.02 sec
...
$ robots ~/local/path/to/robots.txt YourBot https://example.com/url
user-agent 'YourBot' with url 'https://example.com/url' allowed: YES
user-agent 'YourBot' with URI 'https://example.com/url': ALLOWED
```
> **Note**: If the robots file is empty, the parser also prints:
>
> ```
> notice: robots file is empty so all user-agents are allowed
> ```

> **Exit codes:** `0` = ALLOWED, `1` = DISALLOWED

## Notes

Expand Down
2 changes: 1 addition & 1 deletion reporting_robots.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ namespace googlebot {
// have some use cases; to the best of our knowledge other tags we find, don't.
// (for example, "unicorn" from "unicorn: /value")
static const std::vector<std::string> kUnsupportedTags = {
"clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow"};
"clean-param", "crawl-delay", "host", "noarchive", "noindex", "nofollow", "content-signal"};

void RobotsParsingReporter::Digest(int line_num,
RobotsParsedLine::RobotsTagName parsed_tag) {
Expand Down
7 changes: 3 additions & 4 deletions robots.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ namespace googlebot {
// Match.
class RobotsMatchStrategy {
public:
virtual ~RobotsMatchStrategy() {}
virtual ~RobotsMatchStrategy() = default;

virtual int MatchAllow(absl::string_view path,
absl::string_view pattern) = 0;
Expand Down Expand Up @@ -124,7 +124,6 @@ static const char* kHexDigits = "0123456789ABCDEF";
// authority, and fragment. Result always starts with "/".
// Returns "/" if the url doesn't have a path or is not valid.
std::string GetPathParamsQuery(const std::string& url) {
std::string path;

// Initial two slashes are ignored.
size_t search_start = 0;
Expand Down Expand Up @@ -304,7 +303,7 @@ class RobotsTxtParser {

void ParseAndEmitLine(int current_line, char* line,
bool* line_too_long_strict);
bool NeedEscapeValueForKey(const Key& key);
static bool NeedEscapeValueForKey(const Key& key);

absl::string_view robots_body_;
RobotsParseHandler* const handler_;
Expand Down Expand Up @@ -476,7 +475,7 @@ void RobotsTxtParser::Parse() {
// characters matched by a pattern is returned as its match priority.
class LongestMatchRobotsMatchStrategy : public RobotsMatchStrategy {
public:
LongestMatchRobotsMatchStrategy() { }
LongestMatchRobotsMatchStrategy() = default;

// Disallow copying and assignment.
LongestMatchRobotsMatchStrategy(const LongestMatchRobotsMatchStrategy&) =
Expand Down
4 changes: 2 additions & 2 deletions robots.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ namespace googlebot {
// ParseRobotsTxt() in the sequence they have been found in the file.
class RobotsParseHandler {
public:
RobotsParseHandler() {}
virtual ~RobotsParseHandler() {}
RobotsParseHandler() = default;
virtual ~RobotsParseHandler() = default;

// Disallow copying and assignment.
RobotsParseHandler(const RobotsParseHandler&) = delete;
Expand Down