diff --git a/countries/management/commands/get_events_measures_data.py b/countries/management/commands/get_events_measures_data.py
new file mode 100644
index 0000000..1055991
--- /dev/null
+++ b/countries/management/commands/get_events_measures_data.py
@@ -0,0 +1,148 @@
+import itertools
+import os
+import re
+from collections import defaultdict
+from datetime import datetime
+
+import pandas as pd
+import requests
+import spacy
+from bs4 import BeautifulSoup
+from django.core.management.base import BaseCommand, CommandError
+
+from coronavirus import settings
+from countries.forms import MeasureForm
+
+COUNTRIES_FIXTURES = os.path.join(settings.BASE_DIR, 'countries', 'fixtures')
+
+# Wikipedia page and id of the headline where scraping starts, per month.
+WIKI_SOURCES = {
+    'February': (
+        'https://en.wikipedia.org/wiki/'
+        'Timeline_of_the_2019%E2%80%9320_coronavirus_pandemic_in_February_2020',
+        'Reactions_and_measures_outside_mainland_China',
+    ),
+    'March': (
+        'https://en.wikipedia.org/wiki/'
+        'Responses_to_the_2019%E2%80%9320_coronavirus_pandemic_in_March_2020',
+        '1_March',
+    ),
+}
+ALL_MONTHS = 'all'
+# Section headings only carry day and month; both pages cover this year.
+EVENTS_YEAR = '2020'
+REQUEST_TIMEOUT = 30  # seconds
+# Wikipedia citation markers such as "[12]".
+CITATION_RE = re.compile(r'\[\d+\]')
+
+
+def _scrape_month(month):
+    """Return a DataFrame of (section heading, [paragraphs]) for one month.
+
+    Raises CommandError when the page cannot be fetched or the headline
+    that marks the start of the relevant section is missing.
+    """
+    url, anchor_id = WIKI_SOURCES[month]
+    try:
+        res = requests.get(url, timeout=REQUEST_TIMEOUT)
+        res.raise_for_status()
+    except requests.RequestException as exc:
+        raise CommandError('Could not fetch {}: {}'.format(url, exc)) from exc
+    wiki = BeautifulSoup(res.content, "lxml")
+    headline = wiki.find(attrs={'class': "mw-headline", 'id': anchor_id})
+    if headline is None:
+        raise CommandError('Headline "{}" not found in {}'.format(anchor_id, url))
+
+    paragraphs = defaultdict(list)
+    name = headline.text.strip()
+    for sib in headline.find_all_next():
+        if sib.name == 'p':
+            paragraphs[name].append(sib.text.strip())
+        elif sib.name == 'h3':
+            # A new sub-section starts; its heading labels what follows.
+            name = sib.text.strip()
+        elif sib.name == 'h2':
+            # Next top-level section: nothing relevant after this point.
+            break
+    return pd.DataFrame(list(paragraphs.items()), columns=['date', 'measures'])
+
+
+def _explode_measures(frames):
+    """Flatten the scraped frames into one row per (heading, paragraph)."""
+    rows = [(tup.date, measure)
+            for df in frames
+            for tup in df.itertuples()
+            for measure in tup.measures]
+    # A single frame gets a unique index, unlike concatenating the parts.
+    return pd.DataFrame(rows, columns=['date', 'measures'])
+
+
+def _clean(data):
+    """Parse headings into dates, strip citations and drop undated rows."""
+    headings = data['date'].str.replace('[edit]', '', regex=False).str.strip()
+    data['date'] = pd.to_datetime(headings + ' ' + EVENTS_YEAR, errors='coerce')
+    data['measures'] = data['measures'].str.replace(CITATION_RE, ' ', regex=True)
+    return data.dropna().reset_index(drop=True)
+
+
+def _load_known_countries():
+    """Return the set of country names listed in the population fixture."""
+    path = os.path.join(COUNTRIES_FIXTURES, 'countries_population.csv')
+    return set(pd.read_csv(path, delimiter=';')['name'].values)
+
+
+def _first_known_country(doc, known):
+    """Return the first GPE entity of doc that is a known country, else None."""
+    for ent in doc.ents:
+        if ent.label_ == 'GPE' and ent.text in known:
+            return ent.text
+    return None
+
+
+def get_wiki_events(month):
+    """Scrape, clean and annotate the measures for the requested month.
+
+    month is a WIKI_SOURCES key or 'all'. The result has the columns date,
+    measures and countries, one row per paragraph; countries is None where
+    no known country was recognised in the text.
+    """
+    if month == ALL_MONTHS:
+        months = list(WIKI_SOURCES)
+    elif month in WIKI_SOURCES:
+        months = [month]
+    else:
+        raise CommandError('Unknown month "{}"'.format(month))
+
+    data = _clean(_explode_measures([_scrape_month(m) for m in months]))
+
+    # Tag each row with its own country so texts and countries stay aligned.
+    nlp = spacy.load("en_core_web_sm")
+    known = _load_known_countries()
+    data['countries'] = [_first_known_country(doc, known)
+                         for doc in nlp.pipe(data['measures'])]
+    return data
+
+
+class Command(BaseCommand):
+    help = 'Gets Wikipedia data about Event and Measures by country for a specific month, ' \
+           'February and March are available at present.'
+
+    def add_arguments(self, parser):
+        parser.add_argument('month', type=str, choices=list(WIKI_SOURCES) + [ALL_MONTHS],
+                            help="Month to import, or 'all' for every available month.")
+
+    def handle(self, *args, **options):
+        """Save every scraped measure through MeasureForm and print the tally."""
+        final = get_wiki_events(options['month'])
+        errors = 0
+        rows = 0
+
+        for row in final.itertuples(index=False):
+            rows += 1
+            form = MeasureForm({'name': row.measures, 'start_date': row.date,
+                                'end_date': row.date, 'country': row.countries})
+            if form.is_valid():
+                form.save()
+            else:
+                errors += 1
+        self.stdout.write("Result: {}/{}".format(rows - errors, rows))
diff --git a/countries/migrations/0003_auto_20200410_1409.py b/countries/migrations/0003_auto_20200410_1409.py
new file mode 100644
index 0000000..23bdb92
--- /dev/null
+++ b/countries/migrations/0003_auto_20200410_1409.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.0.4 on 2020-04-10 14:09
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('countries', '0002_auto_20200406_0804'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='measure',
+            name='name',
+            field=models.CharField(max_length=1000),
+        ),
+    ]
diff --git a/countries/models.py b/countries/models.py
index 605f6b0..97c7575 100644
--- a/countries/models.py
+++ b/countries/models.py
@@ -32,7 +32,7 @@ class Meta:
 
 
 class Measure(models.Model):
-    name = models.CharField(max_length=200)
+    name = models.CharField(max_length=1000)
     start_date = models.DateField()
     end_date = models.DateField()
     country = models.ForeignKey('Country', on_delete=models.CASCADE)
diff --git a/requirements.txt b/requirements.txt
index 4a4fb8d..132b596 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ python-dotenv==0.12.0
 tweepy==3.7.0
 django-postgres-extra==1.22
 spacy==2.2.4
+beautifulsoup4==4.8.2
diff --git a/stats/management/commands/get_twitter_data.py b/stats/management/commands/get_twitter_data.py
index d4dc779..c810f23 100644
--- a/stats/management/commands/get_twitter_data.py
+++ b/stats/management/commands/get_twitter_data.py
@@ -1,13 +1,11 @@
 import random
 import pandas as pd
 import tweepy
-import requests
-import json
 from psqlextra.query import ConflictAction
 
 from django.core.management.base import BaseCommand, CommandError
 
-from coronavirus.settings import TWEEPY_TOKENS, USERNAMES
+from coronavirus.local_settings import TWEEPY_TOKENS
 from stats.models import Tweet
 
 
@@ -40,10 +38,10 @@ def stitch(screen_name, tweets_id):
     api = make_tweepy_api()
     new_tweets = tweepy.Cursor(api.user_timeline, user_id=user_id, count=200, tweet_mode="extended",
                                since_id=tweet_id, exclude_replies=True).items()
-    print(new_tweets)
+
     results = [[tweet.user.id, tweet.id_str, tweet.created_at, tweet.full_text,
                 stitch(tweet.user.screen_name, tweet.id_str)] for tweet in new_tweets]
-    print(results)
+
     if not results:
         return
     new_result = [{'user_id': el[0], 'tweet_id': el[1],
@@ -51,9 +49,9 @@ def stitch(screen_name, tweets_id):
     final = pd.DataFrame(new_result)
 
     def dump(x):
-        (Tweets.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE)
-            .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'],
-                        datetime=x['datetime'],
-                        text=x['text'], url=x['url']))
+        (Tweet.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE)
+         .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'],
+                         datetime=x['datetime'],
+                         text=x['text'], url=x['url']))
 
     final.apply(lambda x: dump(x), axis=1)