-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcomparison.py
More file actions
78 lines (67 loc) · 2.65 KB
/
comparison.py
File metadata and controls
78 lines (67 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import difflib
import itertools
from constants import (
ATTR_PUBTYPE_CONFERENCE,
ATTR_PUBTYPE_JOURNAL,
FIELD_AUTHOR_NAME,
FIELD_AUTHORS,
FIELD_JOURNAL,
FIELD_JOURNAL_NAME,
FIELD_PUBTYPE,
FIELD_TITLE,
FIELD_VENUE,
FIELD_YEAR,
SCORE_CRITICAL,
)
def _get_string_comparison_ratio(orig: str, other: str) -> float:
ratio = difflib.SequenceMatcher(None, orig.lower(), other.lower()).ratio()
return ratio
def _get_author_comparison_ratio(orig: list, other: list) -> list[float]:
scores_authors = []
if len(other) == 0:
return [0.0]
elif other[-1].get(FIELD_AUTHOR_NAME, "") == "et al.":
orig = orig[: len(other)]
# for _auth_orig, _auth_match in zip(authors_orig, authors_match):
for _auth_orig, _auth_match in itertools.zip_longest(
orig, other, fillvalue={"name": ""}
):
_score_author = _get_string_comparison_ratio(
_auth_orig.get(FIELD_AUTHOR_NAME, ""),
_auth_match.get(FIELD_AUTHOR_NAME, ""),
)
scores_authors.append(_score_author)
return scores_authors
def compare_references(original: dict, match: dict) -> dict:
    """Score how closely ``match`` agrees with ``original``, field by field.

    Args:
        original: The reference entry being checked.
        match: The candidate entry it is compared against.

    Returns:
        A dict with one score per field:

        * ``FIELD_TITLE`` and ``FIELD_VENUE``: case-insensitive string
          similarity of the two values.
        * ``FIELD_YEAR``: ``1 / (1 + |year difference|)``, i.e. 1.0 for the
          same year and 0.5 for one year apart.
        * ``FIELD_AUTHORS``: mean of the per-position author similarities.

        The ``FIELD_VENUE`` score is overwritten when ``match`` declares a
        publication type: for conferences it compares the original venue
        with the matched journal name; for journals it averages the
        similarity of every journal sub-field present on both sides, or is
        ``SCORE_CRITICAL`` when no sub-field is shared.

    Raises:
        KeyError: If ``FIELD_YEAR`` is missing from either entry.
        TypeError: If the two year values cannot be subtracted (for
            example when one of them is ``None``).
    """
    scores = {}

    # ``.get(key) or default`` rather than ``.get(key, default)``: a key that
    # is present with a None value must fall back to the default as well.
    for field in (FIELD_TITLE, FIELD_VENUE):
        scores[field] = _get_string_comparison_ratio(
            original.get(field) or "", match.get(field) or ""
        )

    # The year is treated as mandatory (see Raises above).
    scores[FIELD_YEAR] = 1 / (1 + abs(original[FIELD_YEAR] - match[FIELD_YEAR]))

    author_scores = _get_author_comparison_ratio(
        original.get(FIELD_AUTHORS) or [], match.get(FIELD_AUTHORS) or []
    )
    # The helper never returns an empty list, so the division is safe.
    scores[FIELD_AUTHORS] = sum(author_scores) / len(author_scores)

    pub_types = match.get(FIELD_PUBTYPE) or []
    journal_match = match.get(FIELD_JOURNAL) or {}

    if ATTR_PUBTYPE_CONFERENCE in pub_types:
        # For conferences the matched venue text is read from the journal
        # name of ``match``.
        scores[FIELD_VENUE] = _get_string_comparison_ratio(
            original.get(FIELD_VENUE) or "",
            journal_match.get(FIELD_JOURNAL_NAME) or "",
        )
    elif ATTR_PUBTYPE_JOURNAL in pub_types:
        journal_orig = original.get(FIELD_JOURNAL) or {}
        # Only sub-fields known to both entries can be compared; values are
        # stringified because they are not guaranteed to be strings.
        journal_scores = [
            _get_string_comparison_ratio(str(value), str(journal_match[key]))
            for key, value in journal_orig.items()
            if key in journal_match
        ]
        if journal_scores:
            scores[FIELD_VENUE] = sum(journal_scores) / len(journal_scores)
        else:
            scores[FIELD_VENUE] = SCORE_CRITICAL

    return scores