diff --git a/ghunt/cli.py b/ghunt/cli.py
index c445b2c3..a05587e9 100644
--- a/ghunt/cli.py
+++ b/ghunt/cli.py
@@ -25,6 +25,11 @@ def parse_and_run():
     parser_drive = subparsers.add_parser('drive', help="Get information on a Drive file or folder.")
     parser_drive.add_argument("file_id", help="Example: 1N__vVu4c9fCt4EHxfthUNzVOs_tp8l6tHcMBnpOZv_M")
     parser_drive.add_argument('--json', type=str, help="File to write the JSON output to.")
+
+    ### YouTube module
+    parser_youtube = subparsers.add_parser('youtube', help="Get information on a YouTube channel (doesn't work with channels created after Google removed IDs from the page source, and relies on the page having been archived by Wayback Machine.")
+    parser_youtube.add_argument("channel_url", help="Example: https://www.youtube.com/@YouTube")
+    parser_youtube.add_argument('--json', type=str, help="File to write the JSON output to.")
 
     ### Parsing
     args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
@@ -44,4 +49,7 @@ def process_args(args: argparse.Namespace):
             trio.run(gaia.hunt, None, args.gaia_id, args.json)
         case "drive":
             from ghunt.modules import drive
-            trio.run(drive.hunt, None, args.file_id, args.json)
\ No newline at end of file
+            trio.run(drive.hunt, None, args.file_id, args.json)
+        case "youtube":
+            from ghunt.modules import youtube
+            trio.run(youtube.hunt, None, args.channel_url, args.json)
diff --git a/ghunt/modules/youtube.py b/ghunt/modules/youtube.py
new file mode 100644
index 00000000..93fb3ed1
--- /dev/null
+++ b/ghunt/modules/youtube.py
@@ -0,0 +1,54 @@
+from ghunt.helpers.utils import get_httpx_client
+from ghunt import globals as gb
+
+import requests, re, waybackpy, argparse, trio, httpx
+
+async def hunt(as_client: httpx.AsyncClient, channel_url: str, json_file: str=None):
+    """
+    Find the Gaia ID behind a YouTube channel and run the Gaia module on it.
+
+    The channel page is downloaded and searched for a
+    "youtube.com/channel/ID" URL. The Wayback Machine archive of that URL
+    nearest to 2019 is then downloaded and searched for a
+    "plus.google.com/DIGITS" link; the digits are handed to gaia.hunt.
+
+    as_client is unused: both pages are fetched with requests, and gaia.hunt
+    is called with None as its client.
+    channel_url is the address of the channel page to start from.
+    json_file is forwarded to gaia.hunt unchanged (the CLI passes the value
+    of --json, a path string or None).
+
+    Raises requests.HTTPError if either page answers with an error status,
+    and ValueError if either pattern is missing from the page it is
+    searched in.
+    """
+    # later: add a way to change this later
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0"
+    # requests has no default timeout; set one so a stalled server cannot hang the run.
+    timeout = 30
+
+    # NOTE: requests blocks the event loop while it downloads. That is tolerable
+    # for this one-shot CLI call, but an async client would be the proper fix.
+    r = requests.get(channel_url, timeout=timeout)
+    r.raise_for_status()
+    # later: maybe add a prompt here to ask the user if the channel ID looks valid? and if it doesn't, it can iterate through all the different channel IDs and ask for each one?
+    match_channel_id = re.search(r"(https?:\/\/)(www\.)?youtube\.com\/(channel)\/[\w-]+", r.text)
+    if match_channel_id is None:
+        raise ValueError(f"No youtube.com/channel/ID URL found in the page at {channel_url}")
+    channel_id_url = match_channel_id.group(0)
+
+    # later: switch to memento API for access to more archives?
+    waybackpy_url_object = waybackpy.Url(channel_id_url, user_agent)
+    # later: add a way to see if there are any archives at all, and if there are any before Plus IDs were removed?
+    nearest_archive_url = waybackpy_url_object.near(year=2019)
+    r_archived = requests.get(nearest_archive_url, timeout=timeout)
+    r_archived.raise_for_status()
+
+    # Dots are escaped so that only the literal plus.google.com host matches.
+    match_gaia_id = re.search(r"(?:https?:\/\/plus\.google\.com\/)([0-9]+)", r_archived.text)
+    if match_gaia_id is None:
+        raise ValueError(f"No plus.google.com profile link found in the archived page {nearest_archive_url}")
+    gaia_id = match_gaia_id.group(1)
+
+    from ghunt.modules import gaia
+    await gaia.hunt(None, gaia_id, json_file)