Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 80 additions & 24 deletions gitfive/lib/commits.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

import trio
from bs4 import BeautifulSoup
from alive_progress import alive_bar
Expand All @@ -8,11 +10,54 @@
from gitfive.lib.instruments import TrioAliveProgress


def _extract_payload(raw_body: str):
    """
    Return the embedded `payload` dict of a GitHub commits page, or None.

    The new GitHub commits page renders an embedded JSON payload inside a
    `<script type="application/json">` tag. The old `<li class="js-commits-list-item">`
    DOM has been removed. The payload contains a `commitGroups` list whose entries
    each hold a `commits` list with `oid` / `authors` / `bodyMessageHtml` fields —
    everything Metamon needs to map fake commit hashes back to the recognised
    GitHub user (the `authors[1]` entry, if any).

    :param raw_body: raw HTML of the commits page.
    :return: the first `payload` dict that contains a `commitGroups` key, or
        None when no JSON script tag carries one.
    """
    body = BeautifulSoup(raw_body, 'html.parser')
    for script in body.find_all('script', {'type': 'application/json'}):
        # `.string` is None when the tag has no single text child.
        text = script.string or ''
        # Cheap substring test before paying for a full JSON parse.
        if 'commitGroups' not in text:
            continue
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            continue
        # Valid JSON is not necessarily an object (it may be a list or a
        # scalar); guard before calling `.get` so such a tag is skipped
        # instead of raising AttributeError.
        if not isinstance(data, dict):
            continue
        payload = data.get('payload')
        if isinstance(payload, dict) and 'commitGroups' in payload:
            return payload
    return None


def _iter_commits(payload):
    """
    Lazily yield every commit entry of `payload`, group after group.

    A missing or null `commitGroups` list, and a missing or null `commits`
    list inside a group, are both treated as empty.
    """
    groups = payload.get('commitGroups') or ()
    for bucket in groups:
        yield from bucket.get('commits') or ()


def _target_author(commit):
    """
    Pick the author entry that GitHub linked to a real account.

    Each Metamon commit has two `authors` entries: the local committer
    (`gitfive_hunter`, login=None) and the impersonated co-author. The first
    entry with a truthy `login` other than `gitfive_hunter` is returned;
    None is returned when no entry qualifies (the email is unknown to GH).
    """
    candidates = commit.get('authors') or ()
    return next(
        (
            entry
            for entry in candidates
            if (handle := entry.get('login')) and handle != 'gitfive_hunter'
        ),
        None,
    )


async def fetch_avatar(runner: GitfiveRunner, email: str, avatar_link: str, username: str,
out: Dict[str, str|bool], check_only: bool):
async with runner.limiters["commits_fetch_avatar"]:
is_target = (username.lower() == runner.target.username.lower())
if check_only:
if check_only:
if is_target:
runner.rc.print(f"[+] [Target's email] 🐱 {email} -> @{username}", style="cyan")

Expand Down Expand Up @@ -49,42 +94,53 @@ async def fetch_commits(runner: GitfiveRunner, repo_name: str, emails_index: Dic

if req.status_code == 429:
exit(f'Rate-limit detected, please adjust the CapacityLimiter.\nCurrent CapacityLimiter : {runner.limiters["commits_scrape"]}')
body = BeautifulSoup(req.text, 'html.parser')

commits = body.find_all("li", {"class": "js-commits-list-item"})

payload = _extract_payload(req.text)
if payload is None:
return

async with trio.open_nursery() as nursery:
for commit in commits:
hexsha = commit.find("a", {"class": "js-navigation-open"}).attrs["href"].split("/")[-1]
avatar = commit.find("img", {"class": "avatar-user"})
if not avatar:
for commit in _iter_commits(payload):
hexsha = commit.get('oid')
if not hexsha or hexsha not in emails_index:
continue
target = _target_author(commit)
if target is None:
continue

email = emails_index[hexsha]
avatar_link = avatar.get("src")
username = avatar.get("alt")[1:] # We remove the "@" at the beginning
avatar_link = target.get('avatarUrl')
username = target.get('login')

nursery.start_soon(fetch_avatar, runner, email, avatar_link, username, out, check_only)

async def scrape(runner: GitfiveRunner, repo_name: str, emails_index: Dict[str, str], check_only=False):
out = {}
total = 0
last_hash_trigger = f"/{runner.creds.username}/{repo_name}/tree/"
last_hash = ""

req = await runner.as_client.get(f"https://github.com/{runner.creds.username}/{repo_name}")
body = BeautifulSoup(req.text, 'html.parser')
req = await runner.as_client.get(f"https://github.com/{runner.creds.username}/{repo_name}/commits/mirage")
if req.status_code != 200:
exit(f"Couldn't fetch the commits page (HTTP {req.status_code}).")

if is_repo_empty(body):
exit("Empty repository.")
payload = _extract_payload(req.text)
if payload is None:
body = BeautifulSoup(req.text, 'html.parser')
if is_repo_empty(body):
exit("Empty repository.")
exit("Couldn't parse the commits page payload.")

if last_hash_trigger in req.text:
_, total = await get_commits_count(runner, raw_body=req.text)
last_hash = [x for x in body.select('a') if x.text.lower() == "permalink"][0].attrs['href'].split('/')[-1]
else:
last_hash = (payload.get('currentCommit') or {}).get('oid') \
or (payload.get('refInfo') or {}).get('currentOid')
if not last_hash:
exit("Couldn't fetch the last hash.")

to_request = [0]+list(range(-1, total-1, 35))[1:]
_, total = await get_commits_count(runner, raw_body=req.text)
if not total:
# Fall back to counting whatever the payload already gave us.
total = sum(len(g.get('commits', []) or []) for g in payload.get('commitGroups', []) or [])
if not total:
return out

to_request = [0] + list(range(-1, total-1, 35))[1:]

with alive_bar(total, receipt=False, enrich_print=False, title="Fetching committers...") as bar:
instrument = TrioAliveProgress(fetch_commits, 35, bar)
Expand All @@ -97,4 +153,4 @@ async def scrape(runner: GitfiveRunner, repo_name: str, emails_index: Dict[str,

trio.lowlevel.remove_instrument(instrument)

return out
return out
19 changes: 14 additions & 5 deletions gitfive/lib/domain_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,20 @@ def guess_custom_domain(runner: GitfiveRunner):
except Exception: # https://github.com/mxrch/GitFive/issues/15
runner.rc.print("[!] Google Search failed, are you using a VPN/Proxy ?", style="italic")

# Hunter.io
req = httpx.get(f"https://hunter.io/v2/domains-suggestion?query={company}")
data = json.loads(req.text)
if results := data.get("data", [{}]):
hunter = results[0].get("domain")
# Hunter.io — the public hunter.io/v2 endpoint now 303-redirects to
# api.hunter.io and requires an API key (returns 401). Treat any non-200
# or non-JSON response as "no result" instead of crashing the whole run.
try:
req = httpx.get(
f"https://hunter.io/v2/domains-suggestion?query={company}",
follow_redirects=True,
)
if req.status_code == 200:
data = req.json()
if results := data.get("data", [{}]):
hunter = results[0].get("domain")
except (httpx.HTTPError, json.JSONDecodeError, ValueError):
runner.rc.print("[!] Hunter.io lookup failed.", style="italic")

if hunter and (not google or hunter in google):
runner.rc.print(f'🔍 [Hunter.io] Found possible domain "{hunter}" for company "{company}"', style="light_green")
Expand Down
32 changes: 31 additions & 1 deletion gitfive/lib/repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,41 @@ async def fetch_repos_page(runner: GitfiveRunner, page: int, repos: List[Dict[st
repos.append(details)


def _extract_repo_count(body: "BeautifulSoup") -> int:
    """
    Extract the repository count from the navigation tab on a user's profile.

    GitHub serves two different shells: the legacy anonymous one uses
    `<span class="Counter" title="N">`, the newer Primer-React one (shown to
    authenticated viewers) uses `<span data-component="counter">` with the
    number in nested text.

    :param body: parsed profile page.
    :return: the repository count, or 0 when the tab, the counter or a
        number inside it cannot be found. Abbreviated labels such as "1.2k"
        are expanded (to 1200), so the result is approximate in that case.
    """
    tab = body.find("a", {"data-tab-item": "repositories"})
    if tab is None:
        return 0

    counter = tab.find("span", {"class": "Counter"})
    if counter is not None:
        # The legacy `title` attribute is preferred over the visible text.
        raw = counter.attrs.get("title") or counter.get_text()
    else:
        counter = tab.find("span", {"data-component": "counter"})
        if counter is None:
            return 0
        raw = counter.get_text()

    return _parse_counter_text(raw)


def _parse_counter_text(raw: str) -> int:
    """Turn a counter label such as "42", "1,234" or "1.2k" into an int (0 if no number)."""
    import re  # local import: this helper must not rely on the module's import list

    # An abbreviated label must be scaled: simply joining its digits would
    # turn "1.2k" into 12.
    abbreviated = re.search(r"(\d+(?:\.\d+)?)\s*([km])\b", raw, re.IGNORECASE)
    if abbreviated:
        scale = 1_000 if abbreviated.group(2).lower() == "k" else 1_000_000
        return round(float(abbreviated.group(1)) * scale)

    # `isdecimal` (not `isdigit`) so characters like "²", which int() rejects,
    # are never collected.
    digits = ''.join(ch for ch in raw if ch.isdecimal())
    return int(digits) if digits else 0


async def get_list(runner: GitfiveRunner):
req = await runner.as_client.get(f"https://github.com/{runner.target.username}?tab=repositories")

body = BeautifulSoup(req.text, 'html.parser')
total_repos = int(body.find("a", {"data-tab-item": "repositories"}).find("span", {"class": "Counter"}).attrs["title"])
total_repos = _extract_repo_count(body)

if not total_repos:
runner.target.repos = []
runner.target.languages_stats = {}
return

to_request = range(1, ceil(total_repos/30)+1)

Expand Down
8 changes: 6 additions & 2 deletions gitfive/lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,17 @@ async def get_commits_count(runner: GitfiveRunner, repo_url: str="", raw_body: s
raw_body = req.text
body = BeautifulSoup(raw_body, 'html.parser')
# Slightly modified this line to find the correct <span> containing the commit count
commits_icon_el = body.find("a", {"href": re.compile(r'.*/commits/mirage$')})
commits_icon_el = body.find("a", {"href": re.compile(r'.*/commits/mirage/?$')})
if not commits_icon_el:
return False, 0
nb_commits_el = commits_icon_el.findNext("span")
if not nb_commits_el:
return False, 0
nb_commits_str = nb_commits_el.text.split()[0].replace(",", "")
nb_commits_text = nb_commits_el.text.strip() or commits_icon_el.get_text(strip=True)
parts = nb_commits_text.split()
if not parts:
return False, 0
nb_commits_str = parts[0].replace(",", "")
if nb_commits_str == "∞":
return True, 50000 # Temporary limit, because GitHub hasn't liked my 70k commits
nb_commits = int(nb_commits_str)
Expand Down