From dc31383d27c246135bd1ba863ccffc66a852e7ab Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Thu, 12 Feb 2026 13:57:55 +0100 Subject: [PATCH 1/4] Implement an API for temporal graphs --- fedivertex/main.py | 99 +++++++++++++++++++++++++++++++++++++++++++- setup.py | 5 ++- tests/test_loader.py | 36 ++++++++++++++++ 3 files changed, 136 insertions(+), 4 deletions(-) diff --git a/fedivertex/main.py b/fedivertex/main.py index 1eb755b..0764257 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -1,8 +1,10 @@ import json from types import NoneType -from typing import List, Optional +from typing import List, Optional, Tuple + import mlcroissant as mlc import networkx as nx +import networkx_temporal as tx from tqdm import tqdm @@ -198,7 +200,7 @@ def get_graph( graph.nodes[host]["domain"] = host.split("[DOT]")[-1] for col, val in record.items(): col_name = col.split("/")[-1] - if type(val) == bytes: + if type(val) is bytes: val = val.decode() if col_name not in ["host", "Id", "Label"]: graph.nodes[host][col_name] = val @@ -222,3 +224,96 @@ def get_graph( graph = graph.subgraph(largest_cc).copy() return graph + + def get_temporal_graph( + self, + software: str, + graph_type: str, + index: Optional[Tuple[int, int]] = None, + date: Optional[Tuple[str, str]] = None, + disable_tqdm: bool = False, + ) -> tx.TemporalGraph: + """Provide a temporal graph (a sequence of graph snapshots) for a given software and graph type. + By default, all the available graphs are included, but a subset can be selected using a date range or an index range. 
+ + :param software: name of the software (e.g., "peertube") + :type software: str + :param graph_type: type of graph (e.g., "follow" or "federation") + :type graph_type: str + :param index: index range for the graphs (bounds are included), defaults to None + :type index: Optional[Tuple[int, int]], optional + :param date: date range for the graphs (bounds are included), defaults to None + :type date: Optional[Tuple[str, str]], optional + :param disable_tqdm: disables the TQDM progress bars, defaults to False + :type disable_tqdm: bool, optional + :raises ValueError: if both a date and an index range are provided, or if the given range is malformed or outside the available graphs. + :return: a temporal graph built from the selected graph snapshots + :rtype: tx.TemporalGraph + """ + self._check_input(software, graph_type) + + if software == "mastodon" and graph_type == "federation": + resp = input( + """Each Mastodon Federation graph is 1GB large.\n + Storing the temporal graph might take a lot of space in memory, + are you sure you want to load it? [yes or no]""" + ) + if resp.lower() not in ["yes", "y", "yeah"]: + raise KeyboardInterrupt + + availables_dates = self.list_available_dates(software, graph_type) + selected_dates = [] + if index is None and date is None: + # Fetch all graphs + selected_dates = availables_dates + elif index is not None and date is not None: + raise ValueError( + "You must provide either the date or the index range of the graph, not both." 
+ ) + elif index is not None: + if len(index) != 2: + raise ValueError("Incorrect format for the index range") + if index[0] > index[1]: + raise ValueError("Incorrect index range") + if index[0] < 0 or index[1] > len(availables_dates) - 1: + raise ValueError( + f"Indices are out of the acceptable range (0,{len(availables_dates) - 1})" + ) + + selected_dates = availables_dates[index[0] : index[1] + 1] + else: # date is not None: + assert date is not None + if len(date) != 2: + raise ValueError("Incorrect format for the date range") + + min_date, max_date = date + try: + min_date = int(min_date) + max_date = int(max_date) + except ValueError as err: + raise ValueError("Invalid date format") from err + + if ( + min_date > int(availables_dates[-1]) + or int(availables_dates[0]) > max_date + ): + raise ValueError( + f"Date range not covering the available dates: ({availables_dates[0]},{availables_dates[-1]})" + ) + + for selected_date in availables_dates: + int_date = int(selected_date) + if min_date <= int_date and int_date <= max_date: + selected_dates.append(selected_date) + + selected_graphs = [] + for selected_date in selected_dates: + graph = self.get_graph( + software=software, + graph_type=graph_type, + date=selected_date, + disable_tqdm=disable_tqdm, + ) + selected_graphs.append(graph) + + return tx.from_snapshots(selected_graphs) diff --git a/setup.py b/setup.py index 3a54212..83ca05d 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,13 @@ -from setuptools import setup, find_packages from pathlib import Path +from setuptools import find_packages, setup + this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() setup( name="fedivertex", - version="0.9.9", + version="1.0.0", author="Marc DAMIE", author_email="marc.damie@inria.fr", description="Interface to download and interact with Fedivertex, the Fediverse Graph Dataset", diff --git a/tests/test_loader.py b/tests/test_loader.py index 4fc4b0a..b4b81a5 100644 --- 
a/tests/test_loader.py +++ b/tests/test_loader.py @@ -137,3 +137,39 @@ def test_get_graph(): ) assert bookwyrm_graph.number_of_nodes() == 70 assert bookwyrm_graph.number_of_edges() == 1827 + + +def test_get_temporal_graph(): + loader = GraphLoader() + + with pytest.raises(ValueError): + loader.get_temporal_graph("NON-EXISTING", "federation") + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "NON-EXISTING") + + with pytest.raises(ValueError): + loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250217"), index=(3, 7) + ) + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "follow", index=(-1, 7)) + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "follow", index=(3, 70000000000)) + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "follow", date=("20210203", "20210217")) + + temporal_graph = loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250617") + ) + assert len(temporal_graph.temporal_nodes()) == 1157 + assert len(temporal_graph.temporal_edges()) == 310695 + assert temporal_graph.number_of_snapshots() == 20 + + temporal_graph = loader.get_temporal_graph("peertube", "follow", index=(0, 7)) + assert len(temporal_graph.temporal_nodes()) == 991 + assert len(temporal_graph.temporal_edges()) == 133852 + assert temporal_graph.number_of_snapshots() == 8 From f85d22b3bc5f1bc9af585669bbf09ae083762a83 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Thu, 12 Feb 2026 14:13:02 +0100 Subject: [PATCH 2/4] Add a missing dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 83ca05d..c26e97d 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ "numpy<2.0", # To be compatible with mlcroissant "mlcroissant", "networkx", + "networkx-temporal", "tqdm", ], extras_require={"test": ["pytest", "pytest-coverage"]}, From dc5ae896df40d653f69e8a8848a958b33110357a Mon Sep 17 00:00:00 2001 From: Marc Damie 
Date: Thu, 12 Feb 2026 16:27:52 +0100 Subject: [PATCH 3/4] Add a light dataset option to reduce disk space usage --- fedivertex/main.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fedivertex/main.py b/fedivertex/main.py index 0764257..a57f7e5 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -25,10 +25,12 @@ class GraphLoader: } UNDIRECTED_GRAPHS = ["federation"] - def __init__( - self, - url="https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download", - ): + def __init__(self, light_version=True): + self.light_version = light_version + if self.light_version: + url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download" + else: + url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download" try: self.dataset = mlc.Dataset(jsonld=url) except json.JSONDecodeError as err: @@ -58,6 +60,12 @@ def _check_input(self, software: str, graph_type: str) -> NoneType: f"{graph_type} is not a valid graph type for {software}. Valid types: {self.VALID_GRAPH_TYPES[software]}" ) + if self.light_version and software == "mastodon" and graph_type == "federation": + raise ValueError( + f"The graph {software} {graph_type} is not included in the light version of Fedivertex\n" + "To download the full version, generate the dataset loader as follows: `GraphLoader(light_version=False)`" + ) + def _fetch_date_index(self, software: str, graph_type: str, index: int) -> str: """Returns the i-th date available for a given graph type. The dates are sorted increasingly. 
From eb0cdc3fa47ab6ca1cf32be67570288c273219bb Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Thu, 12 Feb 2026 16:33:23 +0100 Subject: [PATCH 4/4] Extend the pytest execution to Python 3.12 --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index f8bfd56..662165d 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10.x,3.11.x] + python-version: [3.10.x, 3.11.x, 3.12.x] steps: - uses: actions/checkout@v3