From c80cc44db6ef8075ef02a8838a65a74f289a5b82 Mon Sep 17 00:00:00 2001
From: "@SystemAgent"
Date: Fri, 3 Apr 2020 17:06:36 +0300
Subject: [PATCH 1/5] initial_command_for_measures
---
.../commands/get_events_measures_data.py | 64 +++++++++++++++++++
stats/management/commands/get_twitter_data.py | 16 ++---
2 files changed, 71 insertions(+), 9 deletions(-)
create mode 100644 stats/management/commands/get_events_measures_data.py
diff --git a/stats/management/commands/get_events_measures_data.py b/stats/management/commands/get_events_measures_data.py
new file mode 100644
index 0000000..d0bb101
--- /dev/null
+++ b/stats/management/commands/get_events_measures_data.py
@@ -0,0 +1,64 @@
+from collections import defaultdict
+import requests, itertools, re
+from bs4 import BeautifulSoup
+import numpy as np
+import pandas as pd
+from _collections import defaultdict
+import pandas as pd
+from django.core.management.base import BaseCommand, CommandError
+from psqlextra.query import ConflictAction
+
+
+class Command(BaseCommand):
+ help = 'Gets wikipedia data about Event and Measures by country for a specific month, ' \
+ """February and March are available at present."""
+
+ def add_arguments(self, parser):
+ parser.add_argument('month', type=str)
+
+ def handle(self, *args, **options):
+ month = options['month']
+
+ def get_wiki_events(month):
+ res = requests.get(
+ 'https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_coronavirus_pandemic_in_February_2020')
+ res1 = requests.get(
+ 'https://en.wikipedia.org/wiki/Responses_to_the_2019%E2%80%9320_coronavirus_pandemic_in_March_2020')
+ if month == 'February':
+ wiki = BeautifulSoup(res.content, "lxml")
+ first_h3 = wiki.find('h3') # Start here
+ uls = defaultdict(list)
+ name = first_h3.text.strip()
+ for sib in first_h3.find_next_siblings():
+ if sib.name == 'ul':
+ for event in sib.findAll("li"):
+ uls[name].append(event.text.strip())
+ elif sib.name == 'h3':
+ name = sib.text.strip()
+ elif sib.name == 'h2':
+ break
+ print(uls)
+ elif month == 'March':
+ wiki1 = BeautifulSoup(res1.content, "lxml")
+ second_h3 = wiki1.find(attrs={'class': "mw-headline", 'id': "1_March"})
+ uls1 = defaultdict(list)
+ name1 = second_h3.text.strip()
+ for sib in second_h3.find_all_next():
+ if sib.name == 'p':
+ uls1[name1].append(sib.text.strip())
+ if sib.name == 'h3':
+ name1 = sib.text.strip()
+ elif sib.name == 'h2':
+ break
+ print(uls1)
+
+ get_wiki_events(month)
+
+
+#TODO Save scraped data to db
+# for date, events in url.items():
+# for event in events:
+# (Total.objects
+# .on_conflict(['date', 'event'], ConflictAction.UPDATE)
+# .insert_and_get(date=date, event=event)
+# )
diff --git a/stats/management/commands/get_twitter_data.py b/stats/management/commands/get_twitter_data.py
index d4dc779..c810f23 100644
--- a/stats/management/commands/get_twitter_data.py
+++ b/stats/management/commands/get_twitter_data.py
@@ -1,13 +1,11 @@
import random
import pandas as pd
import tweepy
-import requests
-import json
from psqlextra.query import ConflictAction
from django.core.management.base import BaseCommand, CommandError
-from coronavirus.settings import TWEEPY_TOKENS, USERNAMES
+from coronavirus.local_settings import TWEEPY_TOKENS
from stats.models import Tweet
@@ -40,10 +38,10 @@ def stitch(screen_name, tweets_id):
api = make_tweepy_api()
new_tweets = tweepy.Cursor(api.user_timeline, user_id=user_id, count=200, tweet_mode="extended",
since_id=tweet_id, exclude_replies=True).items()
- print(new_tweets)
+
results = [[tweet.user.id, tweet.id_str, tweet.created_at, tweet.full_text,
stitch(tweet.user.screen_name, tweet.id_str)] for tweet in new_tweets]
- print(results)
+
if not results:
return
new_result = [{'user_id': el[0], 'tweet_id': el[1],
@@ -51,9 +49,9 @@ def stitch(screen_name, tweets_id):
final = pd.DataFrame(new_result)
def dump(x):
- (Tweets.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE)
- .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'],
- datetime=x['datetime'],
- text=x['text'], url=x['url']))
+ (Tweet.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE)
+ .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'],
+ datetime=x['datetime'],
+ text=x['text'], url=x['url']))
final.apply(lambda x: dump(x), axis=1)
From 326c5bdd0f8939767c9c94858b030971f571f2d2 Mon Sep 17 00:00:00 2001
From: "@SystemAgent"
Date: Fri, 3 Apr 2020 17:11:02 +0300
Subject: [PATCH 2/5] add bs4
---
requirements.txt | 1 +
1 file changed, 1 insertion(+)
diff --git a/requirements.txt b/requirements.txt
index 4a4fb8d..132b596 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,3 +18,4 @@ python-dotenv==0.12.0
tweepy==3.7.0
django-postgres-extra==1.22
spacy==2.2.4
+beautifulsoup4==4.8.2
From 9d6b71d8bd618a58d70718f4c05856d822bcbcc1 Mon Sep 17 00:00:00 2001
From: "@SystemAgent"
Date: Tue, 7 Apr 2020 12:10:15 +0300
Subject: [PATCH 3/5] changes
---
stats/management/commands/get_events_measures_data.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/stats/management/commands/get_events_measures_data.py b/stats/management/commands/get_events_measures_data.py
index d0bb101..bd59975 100644
--- a/stats/management/commands/get_events_measures_data.py
+++ b/stats/management/commands/get_events_measures_data.py
@@ -51,7 +51,6 @@ def get_wiki_events(month):
elif sib.name == 'h2':
break
print(uls1)
-
get_wiki_events(month)
From ee89dcbfd1419a6dfaa2fafe7fd1610298fe2dde Mon Sep 17 00:00:00 2001
From: "@SystemAgent"
Date: Wed, 8 Apr 2020 18:21:11 +0300
Subject: [PATCH 4/5] changes on command
---
.../commands/get_events_measures_data.py | 85 ++++++++++++-------
1 file changed, 55 insertions(+), 30 deletions(-)
diff --git a/stats/management/commands/get_events_measures_data.py b/stats/management/commands/get_events_measures_data.py
index bd59975..03bd1a8 100644
--- a/stats/management/commands/get_events_measures_data.py
+++ b/stats/management/commands/get_events_measures_data.py
@@ -1,16 +1,13 @@
-from collections import defaultdict
import requests, itertools, re
from bs4 import BeautifulSoup
import numpy as np
-import pandas as pd
from _collections import defaultdict
import pandas as pd
from django.core.management.base import BaseCommand, CommandError
-from psqlextra.query import ConflictAction
class Command(BaseCommand):
- help = 'Gets wikipedia data about Event and Measures by country for a specific month, ' \
+ help = 'Gets Wikipedia data about Event and Measures by country for a specific month, ' \
"""February and March are available at present."""
def add_arguments(self, parser):
@@ -19,38 +16,66 @@ def add_arguments(self, parser):
def handle(self, *args, **options):
month = options['month']
- def get_wiki_events(month):
+ def get_february():
res = requests.get(
'https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_coronavirus_pandemic_in_February_2020')
+ wiki = BeautifulSoup(res.content, "lxml")
+ first_h3 = wiki.find(
+ attrs={'class': "mw-headline", 'id': "Reactions_and_measures_outside_mainland_China"})
+ uls = defaultdict(list)
+ name = first_h3.text.strip()
+ for sib in first_h3.find_all_next():
+ if sib.name == 'p':
+ uls[name].append(sib.text.strip())
+ if sib.name == 'h3':
+ name = sib.text.strip()
+ elif sib.name == 'h2':
+ break
+ df1 = pd.DataFrame(list(uls.items()), columns=['date', 'measures'])
+ return df1
+
+ def get_march():
res1 = requests.get(
'https://en.wikipedia.org/wiki/Responses_to_the_2019%E2%80%9320_coronavirus_pandemic_in_March_2020')
+ wiki1 = BeautifulSoup(res1.content, "lxml")
+ second_h3 = wiki1.find(attrs={'class': "mw-headline", 'id': "1_March"})
+ uls1 = defaultdict(list)
+ name1 = second_h3.text.strip()
+ for sib in second_h3.find_all_next():
+ if sib.name == 'p':
+ uls1[name1].append(sib.text.strip())
+ if sib.name == 'h3':
+ name1 = sib.text.strip()
+ elif sib.name == 'h2':
+ break
+ df2 = pd.DataFrame(list(uls1.items()), columns=['date', 'measures'])
+ return df2
+
+ def parse_concat(list_dfs):
+ dfs = []
+ for df in list_dfs:
+ new = pd.DataFrame(df['measures'].str.split("]', ").tolist(), index=df.date).stack()
+ new = new.reset_index([0, 'date'])
+ new.columns = ['date', 'measures']
+ dfs.append(new)
+ print(dfs)
+ df_result = pd.concat(dfs)
+ return df_result
+
+ def get_wiki_events(month):
if month == 'February':
- wiki = BeautifulSoup(res.content, "lxml")
- first_h3 = wiki.find('h3') # Start here
- uls = defaultdict(list)
- name = first_h3.text.strip()
- for sib in first_h3.find_next_siblings():
- if sib.name == 'ul':
- for event in sib.findAll("li"):
- uls[name].append(event.text.strip())
- elif sib.name == 'h3':
- name = sib.text.strip()
- elif sib.name == 'h2':
- break
- print(uls)
+ get_february()
elif month == 'March':
- wiki1 = BeautifulSoup(res1.content, "lxml")
- second_h3 = wiki1.find(attrs={'class': "mw-headline", 'id': "1_March"})
- uls1 = defaultdict(list)
- name1 = second_h3.text.strip()
- for sib in second_h3.find_all_next():
- if sib.name == 'p':
- uls1[name1].append(sib.text.strip())
- if sib.name == 'h3':
- name1 = sib.text.strip()
- elif sib.name == 'h2':
- break
- print(uls1)
+ get_march()
+ elif month == 'all':
+ frames = []
+ febs = get_february()
+ march = get_march()
+ frames.append(febs)
+ frames.append(march)
+ # print(frames[1])
+ print(parse_concat(frames))
+
get_wiki_events(month)
From b0799b999ba105da63f798361732e7106388f717 Mon Sep 17 00:00:00 2001
From: "@SystemAgent"
Date: Fri, 10 Apr 2020 17:38:41 +0300
Subject: [PATCH 5/5] changes on command and save to db
---
.../commands/get_events_measures_data.py | 64 +++++++++++++------
.../migrations/0003_auto_20200410_1409.py | 18 ++++++
countries/models.py | 2 +-
3 files changed, 65 insertions(+), 19 deletions(-)
rename {stats => countries}/management/commands/get_events_measures_data.py (53%)
create mode 100644 countries/migrations/0003_auto_20200410_1409.py
diff --git a/stats/management/commands/get_events_measures_data.py b/countries/management/commands/get_events_measures_data.py
similarity index 53%
rename from stats/management/commands/get_events_measures_data.py
rename to countries/management/commands/get_events_measures_data.py
index 03bd1a8..1055991 100644
--- a/stats/management/commands/get_events_measures_data.py
+++ b/countries/management/commands/get_events_measures_data.py
@@ -1,9 +1,15 @@
import requests, itertools, re
from bs4 import BeautifulSoup
-import numpy as np
from _collections import defaultdict
+from datetime import datetime
import pandas as pd
+import spacy
+import os
from django.core.management.base import BaseCommand, CommandError
+from countries.forms import MeasureForm
+from coronavirus import settings
+
+COUNTRIES_FIXTURES = os.path.join(settings.BASE_DIR, 'countries', 'fixtures')
class Command(BaseCommand):
@@ -54,35 +60,57 @@ def get_march():
def parse_concat(list_dfs):
dfs = []
for df in list_dfs:
- new = pd.DataFrame(df['measures'].str.split("]', ").tolist(), index=df.date).stack()
- new = new.reset_index([0, 'date'])
+ new = pd.DataFrame([(tup.date, measure) for tup in df.itertuples() for measure in tup.measures])
new.columns = ['date', 'measures']
dfs.append(new)
- print(dfs)
df_result = pd.concat(dfs)
return df_result
def get_wiki_events(month):
if month == 'February':
- get_february()
+ return get_february()
elif month == 'March':
- get_march()
+ return get_march()
elif month == 'all':
frames = []
- febs = get_february()
+ feb = get_february()
march = get_march()
- frames.append(febs)
+ frames.append(feb)
frames.append(march)
- # print(frames[1])
- print(parse_concat(frames))
+ # Get final dataframe and clean up dates and measures columns
+ all_data = parse_concat(frames)
+ # NOTE(review): assumes headings look like "1 March" or "1 March[edit]" - confirm on the live page.
+ # Drop the literal "[edit]" suffix and pin the year so parsing never depends on today's date.
+ all_data['date'] = all_data['date'].str.replace('[edit]', '', regex=False).str.strip()
+ all_data['date'] = pd.to_datetime(all_data['date'] + ' 2020', format='%d %B %Y', errors='coerce')
+ # remove numeric citation markers such as "[7]" or "[123]"
+ all_data['measures'] = all_data['measures'].str.replace(r'\[\d+\]', ' ', regex=True)
+ all_data = all_data.dropna()
+
+ # extract entities for country field: exactly one value per row (the first GPE
+ # entity whose text is a known country name, else None) keeps the column aligned
+ sp = spacy.load("en_core_web_sm")
+ countries_population = os.path.join(COUNTRIES_FIXTURES, 'countries_population.csv')
+ csv = pd.read_csv(countries_population, delimiter=';')
+ options = set(csv['name'].values)
+ all_data['countries'] = [
+     next((ent.text for ent in sp(text).ents
+           if ent.label_ == 'GPE' and ent.text in options), None)
+     for text in all_data['measures'].tolist()
+ ]
- get_wiki_events(month)
+ return all_data
+ final = get_wiki_events(month)
+ errors = 0
+ rows = 0
-#TODO Save scraped data to db
-# for date, events in url.items():
-# for event in events:
-# (Total.objects
-# .on_conflict(['date', 'event'], ConflictAction.UPDATE)
-# .insert_and_get(date=date, event=event)
-# )
+ for row in final.itertuples(index=True):
+ rows += 1
+ form = MeasureForm({'name': getattr(row, "measures"), 'start_date': getattr(row, "date"),
+ 'end_date': getattr(row, "date"), 'country': getattr(row, 'countries')})
+ if form.is_valid():
+ form.save()
+ else:
+ errors += 1
+ print("Result: {}/{}".format(rows - errors, rows))
diff --git a/countries/migrations/0003_auto_20200410_1409.py b/countries/migrations/0003_auto_20200410_1409.py
new file mode 100644
index 0000000..23bdb92
--- /dev/null
+++ b/countries/migrations/0003_auto_20200410_1409.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.0.4 on 2020-04-10 14:09
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('countries', '0002_auto_20200406_0804'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='measure',
+ name='name',
+ field=models.CharField(max_length=1000),
+ ),
+ ]
diff --git a/countries/models.py b/countries/models.py
index 605f6b0..97c7575 100644
--- a/countries/models.py
+++ b/countries/models.py
@@ -32,7 +32,7 @@ class Meta:
class Measure(models.Model):
- name = models.CharField(max_length=200)
+ name = models.CharField(max_length=1000)
start_date = models.DateField()
end_date = models.DateField()
country = models.ForeignKey('Country', on_delete=models.CASCADE)