Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.10.x,3.11.x]
python-version: [3.10.x, 3.11.x, 3.12.x]

steps:
- uses: actions/checkout@v3
Expand Down
115 changes: 109 additions & 6 deletions fedivertex/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import json
from types import NoneType
from typing import List, Optional
from typing import List, Optional, Tuple

import mlcroissant as mlc
import networkx as nx
import networkx_temporal as tx
from tqdm import tqdm


Expand All @@ -23,10 +25,12 @@ class GraphLoader:
}
UNDIRECTED_GRAPHS = ["federation"]

def __init__(
self,
url="https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download",
):
def __init__(self, light_version=True):
self.light_version = light_version
if self.light_version:
url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download"
else:
url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download"
try:
self.dataset = mlc.Dataset(jsonld=url)
except json.JSONDecodeError as err:
Expand Down Expand Up @@ -56,6 +60,12 @@ def _check_input(self, software: str, graph_type: str) -> NoneType:
f"{graph_type} is not a valid graph type for {software}. Valid types: {self.VALID_GRAPH_TYPES[software]}"
)

if self.light_version and software == "mastodon" and graph_type == "federation":
raise ValueError(
f"The graph {software} {graph_type} is not included in the light version of Fedivertex\n"
"To download the full version, generate the dataset loader as follows: `GraphLoader(light_version=False)`"
)

def _fetch_date_index(self, software: str, graph_type: str, index: int) -> str:
"""Returns the i-th date available for a given graph type.
The dates are sorted increasingly.
Expand Down Expand Up @@ -198,7 +208,7 @@ def get_graph(
graph.nodes[host]["domain"] = host.split("[DOT]")[-1]
for col, val in record.items():
col_name = col.split("/")[-1]
if type(val) == bytes:
if type(val) is bytes:
val = val.decode()
if col_name not in ["host", "Id", "Label"]:
graph.nodes[host][col_name] = val
Expand All @@ -222,3 +232,96 @@ def get_graph(
graph = graph.subgraph(largest_cc).copy()

return graph

def get_temporal_graph(
    self,
    software: str,
    graph_type: str,
    index: Optional[Tuple[int, int]] = None,
    date: Optional[Tuple[str, str]] = None,
    disable_tqdm: bool = False,
) -> "tx.TemporalGraph":
    """Build a temporal graph from the snapshots of a software and graph type.

    By default, every available snapshot is loaded. The selection can be narrowed
    with either an index range or a date range (but not both).

    :param software: name of the software, validated by ``_check_input``
    :type software: str
    :param graph_type: type of graph, validated by ``_check_input``
    :type graph_type: str
    :param index: ``(first, last)`` index range for the snapshots (bounds are
        included), defaults to None
    :type index: Optional[Tuple[int, int]], optional
    :param date: ``(min_date, max_date)`` date range for the snapshots (bounds are
        included); each bound must be convertible to ``int`` (e.g. ``"20250203"``),
        defaults to None
    :type date: Optional[Tuple[str, str]], optional
    :param disable_tqdm: disables the TQDM progress bars, defaults to False
    :type disable_tqdm: bool, optional
    :raises ValueError: if both a date range and an index range are provided, or if
        the provided range is malformed, reversed, or outside the available snapshots.
    :raises KeyboardInterrupt: if the user declines to load the Mastodon federation
        graphs at the interactive prompt.
    :return: a temporal graph built from the selected snapshots (one per date)
    :rtype: tx.TemporalGraph
    """
    self._check_input(software, graph_type)

    if software == "mastodon" and graph_type == "federation":
        # These snapshots are very large: ask for an explicit confirmation first.
        resp = input(
            "Each Mastodon Federation graph is 1GB large.\n\n"
            "Storing the temporal graph might take a lot of space in memory,\n"
            "are you sure you want to load it? [yes or no]"
        )
        if resp.lower() not in ["yes", "y", "yeah"]:
            raise KeyboardInterrupt

    availables_dates = self.list_available_dates(software, graph_type)

    if index is not None and date is not None:
        raise ValueError(
            "You must provide either the date or the index range of the graph, not both."
        )

    if index is not None:
        # Exactly two bounds are required; a shorter tuple used to escape as IndexError.
        if len(index) != 2:
            raise ValueError("Incorrect format for the index range")
        first, last = index
        if first > last:
            raise ValueError("Incorrect index range")
        if first < 0 or last > len(availables_dates) - 1:
            raise ValueError(
                f"Indices are out of the acceptable range (0,{len(availables_dates) - 1})"
            )

        selected_dates = availables_dates[first : last + 1]
    elif date is not None:
        if len(date) != 2:
            raise ValueError("Incorrect format for the date range")

        # Dates are compared numerically, so both bounds must be integer-like.
        try:
            min_date, max_date = (int(bound) for bound in date)
        except (TypeError, ValueError) as err:
            raise ValueError("Invalid date format") from err

        # Mirror the index branch: a reversed range is an error, not an empty result.
        if min_date > max_date:
            raise ValueError("Incorrect date range")

        # Relies on the available dates being sorted increasingly.
        if (
            min_date > int(availables_dates[-1])
            or int(availables_dates[0]) > max_date
        ):
            raise ValueError(
                f"Date range not covering the available dates: ({availables_dates[0]},{availables_dates[-1]})"
            )

        selected_dates = [
            available_date
            for available_date in availables_dates
            if min_date <= int(available_date) <= max_date
        ]
    else:
        # No range provided: fetch all graphs.
        selected_dates = availables_dates

    selected_graphs = [
        self.get_graph(
            software=software,
            graph_type=graph_type,
            date=selected_date,
            disable_tqdm=disable_tqdm,
        )
        for selected_date in selected_dates
    ]

    return tx.from_snapshots(selected_graphs)
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from setuptools import setup, find_packages
from pathlib import Path

from setuptools import find_packages, setup

this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text()

setup(
name="fedivertex",
version="0.9.9",
version="1.0.0",
author="Marc DAMIE",
author_email="marc.damie@inria.fr",
description="Interface to download and interact with Fedivertex, the Fediverse Graph Dataset",
Expand All @@ -19,6 +20,7 @@
"numpy<2.0", # To be compatible with mlcroissant
"mlcroissant",
"networkx",
"networkx-temporal",
"tqdm",
],
extras_require={"test": ["pytest", "pytest-coverage"]},
Expand Down
36 changes: 36 additions & 0 deletions tests/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,39 @@ def test_get_graph():
)
assert bookwyrm_graph.number_of_nodes() == 70
assert bookwyrm_graph.number_of_edges() == 1827


def test_get_temporal_graph():
    """Check that ``get_temporal_graph`` rejects invalid requests and loads valid ranges."""
    loader = GraphLoader()

    # Every (positional, keyword) combination below must be rejected with a ValueError.
    rejected_calls = (
        # Unknown software name.
        (("NON-EXISTING", "federation"), {}),
        # Unknown graph type for an existing software.
        (("peertube", "NON-EXISTING"), {}),
        # A date range and an index range given together.
        (
            ("peertube", "follow"),
            {"date": ("20250203", "20250217"), "index": (3, 7)},
        ),
        # Negative lower index.
        (("peertube", "follow"), {"index": (-1, 7)}),
        # Upper index far beyond the number of snapshots.
        (("peertube", "follow"), {"index": (3, 70000000000)}),
        # Date range presumably outside the collected period.
        (("peertube", "follow"), {"date": ("20210203", "20210217")}),
    )
    for positional, keywords in rejected_calls:
        with pytest.raises(ValueError):
            loader.get_temporal_graph(*positional, **keywords)

    # Selection through an inclusive date range.
    graph_by_date = loader.get_temporal_graph(
        "peertube", "follow", date=("20250203", "20250617")
    )
    assert len(graph_by_date.temporal_nodes()) == 1157
    assert len(graph_by_date.temporal_edges()) == 310695
    assert graph_by_date.number_of_snapshots() == 20

    # Selection through an inclusive index range: indices 0..7 give 8 snapshots.
    graph_by_index = loader.get_temporal_graph("peertube", "follow", index=(0, 7))
    assert len(graph_by_index.temporal_nodes()) == 991
    assert len(graph_by_index.temporal_edges()) == 133852
    assert graph_by_index.number_of_snapshots() == 8