simple-reference-checker/main.py at master · klb2/simple-reference-checker · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# /// script
# requires-python = ">=3.13"
# dependencies = [
#     "requests>=2.33.1",
# ]
# ///

import json
import logging

import requests

from constants import (
    API_DOI,
    FIELD_MATCH_SCORE,
    FIELD_PUBTYPE,
    FIELD_VENUE,
    FIELD_YEAR,
    FIELD_AUTHORS,
    FIELD_TITLE,
    FIELD_JOURNAL,
    FIELD_DOI,
    BASE_URL,
    API_POINT,
    SCORE_CRITICAL,
    SCORE_LIMIT_BAD,
    SCORE_LIMIT_GOOD,
    SCORE_UNKNOWN,
    bcolors,
)
from comparison import compare_references
from scoring import compute_total_score

# Module-level logger; its level is set by the script entry point below.
LOGGER = logging.getLogger(__name__)

# Comma-separated field list sent as the "fields" query parameter of each
# lookup request made by rate_single_reference().
RETURN_FIELDS = ",".join(
    [
        FIELD_TITLE,
        FIELD_AUTHORS,
        FIELD_YEAR,
        FIELD_VENUE,
        FIELD_JOURNAL,
        FIELD_PUBTYPE,
    ]
)


def output_reference_results(
    reference: dict,
    scores: dict,
) -> None:
    """Log the title and DOI of a reference, followed by its score verdict.

    Args:
        reference: Reference dict; read via the ``FIELD_TITLE`` and
            ``FIELD_DOI`` keys (a missing key is logged as ``None``).
        scores: Sub-scores handed on to ``output_score_results``.
    """
    LOGGER.info(f"{bcolors.BOLD}Title:{bcolors.ENDC} {reference.get(FIELD_TITLE)}")
    LOGGER.info(f"{bcolors.BOLD}DOI:{bcolors.ENDC} {reference.get(FIELD_DOI)}")
    output_score_results(scores)


def output_score_results(scores: dict) -> None:
    """Log each sub-score at debug level, then one verdict for the total score.

    The total comes from ``compute_total_score``. The two sentinel values
    (``SCORE_UNKNOWN``, ``SCORE_CRITICAL``) are checked before the numeric
    thresholds, so exactly one verdict line is logged per call.

    Args:
        scores: Mapping of field name to numeric sub-score.
    """
    for name, value in scores.items():
        LOGGER.debug(f"Sub-score {name}: {value:.2f}")

    total = compute_total_score(scores)

    # Sentinel totals first: they carry no meaningful numeric value to print.
    if total == SCORE_UNKNOWN:
        LOGGER.warning(
            f"{bcolors.WARNING}Could not find a title in the reference. Manually check it.{bcolors.ENDC}"
        )
        return
    if total == SCORE_CRITICAL:
        LOGGER.critical(
            f"{bcolors.FAIL}No match found on SemanticScholar! The reference probably does not exist or is not a published paper.{bcolors.ENDC}"
        )
        return

    # Regular totals, from worst to best.
    if total < SCORE_LIMIT_BAD:
        LOGGER.error(f"{bcolors.FAIL}Very low score: {total:.2f}{bcolors.ENDC}")
        return
    if total < SCORE_LIMIT_GOOD:
        LOGGER.warning(f"{bcolors.WARNING}Low score: {total:.2f}{bcolors.ENDC}")
        return
    LOGGER.info(f"Score: {total:.2f}")


# Seconds to wait for the API before giving up; without a timeout a stalled
# connection would block the whole run indefinitely.
_REQUEST_TIMEOUT = 30


def rate_single_reference(reference: dict) -> tuple[dict, dict]:
    """Look up a reference on SemanticScholar and score it against the match.

    The lookup goes by DOI when the reference has one and falls back to a
    title query otherwise.

    Args:
        reference: Reference dict; read via the ``FIELD_DOI`` and
            ``FIELD_TITLE`` keys.

    Returns:
        A ``(scores, match)`` tuple. ``scores`` is the result of
        ``compare_references`` when a match was retrieved, otherwise a dict
        holding a single sentinel value under ``FIELD_TITLE``. ``match`` is
        the record returned by the API, or an empty dict when there is none.
    """
    request_params = {"fields": RETURN_FIELDS}
    match = {}

    doi = reference.get(FIELD_DOI)
    title = reference.get(FIELD_TITLE)

    if doi is not None:
        request_url = f"{BASE_URL}/{API_DOI}{doi}"
    elif title is None:
        LOGGER.error("Could not get a title")
        # NOTE(review): -1 presumably equals SCORE_UNKNOWN; confirm against
        # constants.py before replacing the literal.
        return {FIELD_TITLE: -1}, match
    else:
        request_url = f"{BASE_URL}/{API_POINT}"
        request_params["query"] = f"{title}"

    try:
        response = requests.get(
            url=request_url, params=request_params, timeout=_REQUEST_TIMEOUT
        )
    except requests.RequestException as err:
        # Network failures are reported like any other failed lookup so that
        # the remaining references can still be processed.
        LOGGER.error(f"Error getting response from SemanticScholar: {err}")
        return {FIELD_TITLE: SCORE_UNKNOWN}, match

    # The context manager releases the connection on every exit path,
    # including exceptions raised while decoding or indexing the payload.
    with response:
        if response.status_code == 404:
            # DOI-only references have no title; show the DOI instead.
            label = title if title is not None else doi
            LOGGER.critical(f"Could not find a match for '{label}'")
            return {FIELD_TITLE: SCORE_CRITICAL}, match

        if response.status_code != 200:
            LOGGER.error(
                f"Error getting response from SemanticScholar. Response code {response.status_code:d}."
            )
            return {FIELD_TITLE: SCORE_UNKNOWN}, match

        payload = response.json()

    if doi is None:
        # Title queries wrap their results in a "data" list; take the first.
        candidates = payload.get("data") or []
        if not candidates:
            LOGGER.critical(f"Could not find a match for '{title}'")
            return {FIELD_TITLE: SCORE_CRITICAL}, match
        match = candidates[0]
        match_score = match[FIELD_MATCH_SCORE]
        if match_score < 100:
            LOGGER.error("Ultra low quality match.")
        elif match_score < 175:
            LOGGER.warning("Low quality match")
    else:
        match = payload

    # Compare exactly once, whatever the quality of the match.
    scores = compare_references(reference, match)
    return scores, match


def main(references_json: str) -> None:
    """Rate every reference stored in a JSON file and log the results.

    Args:
        references_json: Path to a JSON file containing an iterable of
            reference dicts.
    """
    # JSON text is UTF-8 by specification; relying on the platform default
    # encoding can mis-decode non-ASCII titles or author names.
    with open(references_json, "r", encoding="utf-8") as json_file:
        references = json.load(json_file)

    for number, reference in enumerate(references, start=1):
        LOGGER.info(f"Reviewing reference [{number:d}]")
        scores, _match = rate_single_reference(reference)
        output_reference_results(reference, scores)


if __name__ == "__main__":
    import argparse

    # Command line: a positional path to the references file plus a
    # repeatable -v flag that lowers the log threshold.
    parser = argparse.ArgumentParser()
    parser.add_argument("references_json")
    parser.add_argument(
        "-v", "--verbosity", action="count", default=0, help="Increase output verbosity"
    )
    args = vars(parser.parse_args())
    # Remove the flag so that only main()'s own arguments remain in args.
    verb = args.pop("verbosity")
    logging.basicConfig(
        format=f"%(asctime)s - [{bcolors.OKBLUE}%(levelname)8s{bcolors.ENDC}]: %(message)s"
    )
    # Clamp at DEBUG: a level of 0 is NOTSET, which makes this logger inherit
    # the root logger's WARNING level, so "-vv" would hide INFO and DEBUG
    # output instead of showing more.
    loglevel = max(logging.DEBUG, logging.INFO - verb * 10)
    LOGGER.setLevel(loglevel)
    main(**args)