DBProcessingScripts/cleaningdb.py at master · COVID-19-Text-Mining/DBProcessingScripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
"""
Created on Fri Apr  3 13:36:44 2020

@author: elise
"""

import re
import unicodedata


def remove_pre_proof(title):
    clean_title = title.replace('Journal Pre-proofs', ' ')
    clean_title = clean_title.replace('Journal Pre-proof', ' ')
    clean_title = clean_title.strip()
    if len(clean_title) == 0:
        clean_title = None
    return clean_title


def add_pre_proof_and_clean(entry):
    if entry['title'] != None and 'Journal Pre-proof' in entry['title']:
        entry['is_pre_proof'] = True
        entry['title'] = remove_pre_proof(entry['title'])

    else:
        entry['is_pre_proof'] = False


def remove_html(abstract):
    #necessary to check this to avoid removing text between less than and greater than signs
    if abstract is not None and bool(re.search('<.*?>.*?</.*?>', abstract)):
        clean_abstract = re.sub('<.*?>', '', abstract)
        return clean_abstract
    else:
        return abstract


def clean_data(data):
    cleaned_data = data
    for i in cleaned_data:
        add_pre_proof_and_clean(i)
        i['abstract'] = remove_html('abstract')
        if i['journal'] == 'PLoS ONE':
            i['journal'] = 'PLOS ONE'
    return cleaned_data


def clean_title(title):
    """
    @ Yuxing Fei
    This function will:
        1. Replace all the characters to its canonical form
        2. Remove urls
        3. Remove abnormal tokens,
           for example, too many continuous digits or punctuations

    If the input title is None or empty string,
        the function will return the raw title
    """
    if not title:
        return title

    title = unicodedata.normalize('NFKC', title)
    title = re.sub(r"[—–]", "-", title)
    title = re.sub(r"(?<=\s)[^A-z0-9&!@#$%^*,./;=|'+:\"()\-\s](?=\s)|(https?://\s?|www\.)[^\s)]+", " ", title)
    title = re.sub(r"-NC-ND license", "", title)
    title = re.sub(r"\(\s+\)", "", title)
    tokens = title.split()
    new_tokens = []
    noise = []
    for token in tokens:
        if re.fullmatch(r"[\u0370-\u03ffA-z(]?[\w\-,.;:()?/]{2,}", token) is not None:
            if len(noise) <= 5:
                new_tokens.extend(noise)  # actually not noise
                noise = []
            else:
                noise = []  # throw away noise
            if token:
                new_tokens.append(token)

        else:
            noise.append(token)
    if len(noise) <= 2:
        new_tokens.extend(noise)

    return " ".join(new_tokens)