SystemAgent · SystemAgent · Apr 3, 2020 · Apr 3, 2020 · Apr 7, 2020 · Apr 8, 2020
diff --git a/countries/management/commands/get_events_measures_data.py b/countries/management/commands/get_events_measures_data.py
@@ -0,0 +1,116 @@
+import requests, itertools, re
+from bs4 import BeautifulSoup
+from _collections import defaultdict
+from datetime import datetime
+import pandas as pd
+import spacy
+import os
+from django.core.management.base import BaseCommand, CommandError
+from countries.forms import MeasureForm
+from coronavirus import settings
+
+COUNTRIES_FIXTURES = os.path.join(settings.BASE_DIR, 'countries', 'fixtures')
+
+
+class Command(BaseCommand):
+    """Import dated Wikipedia timeline paragraphs as Measure rows via MeasureForm."""
+
+    help = 'Gets Wikipedia data about Events and Measures by country for a specific month: ' \
+           'February, March or all.'
+
+    # Month argument -> (page URL, id of the headline element where scraping starts).
+    WIKI_PAGES = {
+        'February': (
+            'https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_coronavirus_pandemic_in_February_2020',
+            'Reactions_and_measures_outside_mainland_China'),
+        'March': (
+            'https://en.wikipedia.org/wiki/Responses_to_the_2019%E2%80%9320_coronavirus_pandemic_in_March_2020',
+            '1_March'),
+    }
+
+    def add_arguments(self, parser):
+        """Register the positional ``month`` argument: February, March or all."""
+        parser.add_argument('month', type=str)
+
+    def handle(self, *args, **options):
+        """Scrape the requested month(s), then validate and save one Measure per paragraph.
+
+        Raises:
+            CommandError: if ``month`` is unknown or a page cannot be fetched or parsed.
+        """
+        month = options['month']
+        if month == 'all':
+            months = list(self.WIKI_PAGES)
+        elif month in self.WIKI_PAGES:
+            months = [month]
+        else:
+            raise CommandError("Unknown month '{}'; use February, March or all.".format(month))
+
+        # Every month choice gets the same cleanup, so the columns read below always exist.
+        frames = [self._scrape(*self.WIKI_PAGES[name]) for name in months]
+        final = self._clean(pd.concat(frames, ignore_index=True))
+        final['countries'] = self._countries(final['measures'])
+
+        errors = 0
+        rows = 0
+        for row in final.itertuples(index=False):
+            rows += 1
+            # A row with no recognised country is still submitted; NOTE(review): presumably
+            # MeasureForm rejects it and it is counted as an error - confirm.
+            form = MeasureForm({'name': row.measures, 'start_date': row.date,
+                                'end_date': row.date, 'country': row.countries})
+            if form.is_valid():
+                form.save()
+            else:
+                errors += 1
+        self.stdout.write("Result: {}/{}".format(rows - errors, rows))
+
+    @staticmethod
+    def _scrape(url, headline_id):
+        """Return a ('date', 'measures') frame with one row per <p> after the headline.
+
+        Each paragraph is labelled with the text of the latest <h3> seen (initially the
+        headline itself); scanning stops at the next <h2>.
+        """
+        try:
+            res = requests.get(url, timeout=30)
+            res.raise_for_status()
+        except requests.RequestException as exc:
+            raise CommandError('Could not fetch {}: {}'.format(url, exc)) from exc
+        wiki = BeautifulSoup(res.content, "lxml")
+        start = wiki.find(attrs={'class': "mw-headline", 'id': headline_id})
+        if start is None:
+            raise CommandError("Headline '{}' not found in {}".format(headline_id, url))
+        records = []
+        name = start.text.strip()
+        for sib in start.find_all_next():
+            if sib.name == 'p':
+                records.append((name, sib.text.strip()))
+            elif sib.name == 'h3':
+                name = sib.text.strip()
+            elif sib.name == 'h2':
+                break
+        return pd.DataFrame(records, columns=['date', 'measures'])
+
+    @staticmethod
+    def _clean(data):
+        """Turn heading text into dates, strip citation markers and drop undated rows."""
+        # Headings may carry an "[edit]" suffix and never include the year. Anything that is
+        # not "<day> <month>" fails the strict format, becomes NaT and is dropped below.
+        days = data['date'].str.replace(r'\[edit\]', '', regex=True).str.strip()
+        data['date'] = pd.to_datetime(days + ' 2020', format='%d %B %Y', errors='coerce')
+        # Remove markers such as "[12]"; the previous '[[0-9][0-9]]' was a character class
+        # sequence that matched "12]" and left the opening bracket behind.
+        data['measures'] = data['measures'].str.replace(r'\[\d+\]', '', regex=True).str.strip()
+        return data.dropna().reset_index(drop=True)
+
+    @staticmethod
+    def _countries(measures):
+        """Return a Series aligned with *measures*: first known country per text, else None."""
+        csv = pd.read_csv(os.path.join(COUNTRIES_FIXTURES, 'countries_population.csv'), delimiter=';')
+        known = set(csv['name'].values)
+        sp = spacy.load("en_core_web_sm")
+        # One value per row keeps countries lined up with their text (the old flat list did not).
+        found = [next((e.text for e in sp(text).ents if e.label_ == 'GPE' and e.text in known), None)
+                 for text in measures]
+        return pd.Series(found, index=measures.index, dtype=object)
diff --git a/countries/migrations/0003_auto_20200410_1409.py b/countries/migrations/0003_auto_20200410_1409.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.0.4 on 2020-04-10 14:09
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    """Alter ``Measure.name`` to a ``CharField`` with ``max_length=1000``."""
+
+    # Applied after the previous migration of the ``countries`` app.
+    dependencies = [('countries', '0002_auto_20200406_0804')]
+
+    operations = [
+        migrations.AlterField(
+            'measure',
+            'name',
+            models.CharField(max_length=1000),
+        ),
+    ]
diff --git a/countries/models.py b/countries/models.py
@@ -32,7 +32,7 @@ class Meta:
 
 
 class Measure(models.Model):
-    name = models.CharField(max_length=200)
+    name = models.CharField(max_length=1000)
     start_date = models.DateField()
     end_date = models.DateField()
     country = models.ForeignKey('Country', on_delete=models.CASCADE)

diff --git a/requirements.txt b/requirements.txt
@@ -18,3 +18,4 @@ python-dotenv==0.12.0
 tweepy==3.7.0
 django-postgres-extra==1.22
 spacy==2.2.4
+beautifulsoup4==4.8.2
diff --git a/stats/management/commands/get_twitter_data.py b/stats/management/commands/get_twitter_data.py
@@ -1,13 +1,11 @@
 import random
 import pandas as pd
 import tweepy
-import requests
-import json
 
 from psqlextra.query import ConflictAction
 from django.core.management.base import BaseCommand, CommandError
 
-from coronavirus.settings import TWEEPY_TOKENS, USERNAMES
+from coronavirus.local_settings import TWEEPY_TOKENS
 from stats.models import Tweet
 
 
@@ -40,20 +38,20 @@ def stitch(screen_name, tweets_id):
         api = make_tweepy_api()
         new_tweets = tweepy.Cursor(api.user_timeline, user_id=user_id, count=200, tweet_mode="extended",
                                    since_id=tweet_id, exclude_replies=True).items()
-        print(new_tweets)
+
         results = [[tweet.user.id, tweet.id_str, tweet.created_at, tweet.full_text,
                     stitch(tweet.user.screen_name, tweet.id_str)] for tweet in new_tweets]
-        print(results)
+
         if not results:
             return
         new_result = [{'user_id': el[0], 'tweet_id': el[1],
                       'datetime': el[2], 'text': el[3], 'url': el[4]} for el in results]
         final = pd.DataFrame(new_result)
 
         def dump(x):
-            (Tweets.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE)
-                            .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'],
-                                            datetime=x['datetime'],
-                                            text=x['text'], url=x['url']))
+            (Tweet.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE)
+                          .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'],
+                                          datetime=x['datetime'],
+                                          text=x['text'], url=x['url']))
 
         final.apply(lambda x: dump(x), axis=1)