From 84495943b3127ef51045c5dffc9b7463584ed17e Mon Sep 17 00:00:00 2001 From: Joshua <138818689+francojoshua@users.noreply.github.com> Date: Wed, 19 Jul 2023 10:45:01 -0400 Subject: [PATCH 1/2] Add __get_bot_names function --- read_log.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/read_log.py b/read_log.py index 4ea23e7..38353a8 100644 --- a/read_log.py +++ b/read_log.py @@ -43,3 +43,33 @@ print("No match found.") df = pd.DataFrame(data) + + +def __get_bot_names(df, normalize=False): + """ + Get a frequency dictionary of bot requests + """ + pattern = re.compile(r"([A-Za-z]+bot)\b", re.IGNORECASE) + bot_names = {} + + # Find all user agents that contain the word "bot" case-insensitive + mask = df["user_agent"].str.lower().str.contains("bot") + + for agent in df[mask]["user_agent"]: + match = pattern.search(agent) + if match: + bot_name = match.group(1) + if bot_name not in bot_names: + bot_names[bot_name] = 1 + else: + bot_names[bot_name] += 1 + + # Convert the numbers into percentages that add up to 1 if noted. 
+ if normalize: + total = sum(bot_names.values()) + + for name in bot_names: + bot_names[name] = round(bot_names[name] / total, 5) + + # Sort by the frequency in descending order + return dict(sorted(bot_names.items(), key=lambda item: item[1], reverse=True)) From 81e75b25655473b4893aa73bc3ad7b553fd917ff Mon Sep 17 00:00:00 2001 From: Joshua <138818689+francojoshua@users.noreply.github.com> Date: Wed, 19 Jul 2023 15:55:47 -0400 Subject: [PATCH 2/2] Move function up --- read_log.py | 61 +++++++++++++++++++++++++++-------------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/read_log.py b/read_log.py index 38353a8..61576e2 100644 --- a/read_log.py +++ b/read_log.py @@ -3,6 +3,37 @@ import pandas as pd + +def __get_bot_names(df, normalize=False): + """ + Get a frequency dictionary of bot requests + """ + pattern = re.compile(r"([A-Za-z]+bot)\b", re.IGNORECASE) + bot_names = {} + + # Find all user agents that contain the word "bot" case-insensitive + mask = df["user_agent"].str.lower().str.contains("bot") + + for agent in df[mask]["user_agent"]: + match = pattern.search(agent) + if match: + bot_name = match.group(1) + if bot_name not in bot_names: + bot_names[bot_name] = 1 + else: + bot_names[bot_name] += 1 + + # Convert the numbers into percentages that add up to 1 if noted. 
+ if normalize: + total = sum(bot_names.values()) + + for name in bot_names: + bot_names[name] = round(bot_names[name] / total, 5) + + # Sort by the frequency in descending order + return dict(sorted(bot_names.items(), key=lambda item: item[1], reverse=True)) + + methods = ["GET", "HEAD", "POST", "PUT", "DELETE", "OPTIONS", "TRACE", "PATCH"] pattern = r'^([\d.]+) - - \[([^]]+)\] "([^"]*)" (\d+) (\d+) "([^"]*)" "([^"]*)" "-"$' @@ -43,33 +74,3 @@ print("No match found.") df = pd.DataFrame(data) - - -def __get_bot_names(df, normalize=False): - """ - Get a frequency dictionary of bot requests - """ - pattern = re.compile(r"([A-Za-z]+bot)\b", re.IGNORECASE) - bot_names = {} - - # Find all user agents that contain the word "bot" case-insensitive - mask = df["user_agent"].str.lower().str.contains("bot") - - for agent in df[mask]["user_agent"]: - match = pattern.search(agent) - if match: - bot_name = match.group(1) - if bot_name not in bot_names: - bot_names[bot_name] = 1 - else: - bot_names[bot_name] += 1 - - # Convert the numbers into percentages that add up to 1 if noted. - if normalize: - total = sum(bot_names.values()) - - for name in bot_names: - bot_names[name] = round(bot_names[name] / total, 5) - - # Sort by the frequency in descending order - return dict(sorted(bot_names.items(), key=lambda item: item[1], reverse=True))