From c80cc44db6ef8075ef02a8838a65a74f289a5b82 Mon Sep 17 00:00:00 2001 From: "@SystemAgent" Date: Fri, 3 Apr 2020 17:06:36 +0300 Subject: [PATCH 1/5] initial_command_for_measures --- .../commands/get_events_measures_data.py | 64 +++++++++++++++++++ stats/management/commands/get_twitter_data.py | 16 ++--- 2 files changed, 71 insertions(+), 9 deletions(-) create mode 100644 stats/management/commands/get_events_measures_data.py diff --git a/stats/management/commands/get_events_measures_data.py b/stats/management/commands/get_events_measures_data.py new file mode 100644 index 0000000..d0bb101 --- /dev/null +++ b/stats/management/commands/get_events_measures_data.py @@ -0,0 +1,64 @@ +from collections import defaultdict +import requests, itertools, re +from bs4 import BeautifulSoup +import numpy as np +import pandas as pd +from _collections import defaultdict +import pandas as pd +from django.core.management.base import BaseCommand, CommandError +from psqlextra.query import ConflictAction + + +class Command(BaseCommand): + help = 'Gets wikipedia data about Event and Measures by country for a specific month, ' \ + """February and March are available at present.""" + + def add_arguments(self, parser): + parser.add_argument('month', type=str) + + def handle(self, *args, **options): + month = options['month'] + + def get_wiki_events(month): + res = requests.get( + 'https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_coronavirus_pandemic_in_February_2020') + res1 = requests.get( + 'https://en.wikipedia.org/wiki/Responses_to_the_2019%E2%80%9320_coronavirus_pandemic_in_March_2020') + if month == 'February': + wiki = BeautifulSoup(res.content, "lxml") + first_h3 = wiki.find('h3') # Start here + uls = defaultdict(list) + name = first_h3.text.strip() + for sib in first_h3.find_next_siblings(): + if sib.name == 'ul': + for event in sib.findAll("li"): + uls[name].append(event.text.strip()) + elif sib.name == 'h3': + name = sib.text.strip() + elif sib.name == 'h2': + break + print(uls) + elif month == 'March': + wiki1 = BeautifulSoup(res1.content, "lxml") + second_h3 = wiki1.find(attrs={'class': "mw-headline", 'id': "1_March"}) + uls1 = defaultdict(list) + name1 = second_h3.text.strip() + for sib in second_h3.find_all_next(): + if sib.name == 'p': + uls1[name1].append(sib.text.strip()) + if sib.name == 'h3': + name1 = sib.text.strip() + elif sib.name == 'h2': + break + print(uls1) + + get_wiki_events(month) + + +#TODO Save scraped data to db +# for date, events in url.items(): +# for event in events: +# (Total.objects +# .on_conflict(['date', 'event'], ConflictAction.UPDATE) +# .insert_and_get(date=date, event=event) +# ) diff --git a/stats/management/commands/get_twitter_data.py b/stats/management/commands/get_twitter_data.py index d4dc779..c810f23 100644 --- a/stats/management/commands/get_twitter_data.py +++ b/stats/management/commands/get_twitter_data.py @@ -1,13 +1,11 @@ import random import pandas as pd import tweepy -import requests -import json from psqlextra.query import ConflictAction from django.core.management.base import BaseCommand, CommandError -from coronavirus.settings import TWEEPY_TOKENS, USERNAMES +from coronavirus.local_settings import TWEEPY_TOKENS from stats.models import Tweet @@ -40,10 +38,10 @@ def stitch(screen_name, tweets_id): api = make_tweepy_api() new_tweets = tweepy.Cursor(api.user_timeline, user_id=user_id, count=200, tweet_mode="extended", since_id=tweet_id, exclude_replies=True).items() - print(new_tweets) + results = [[tweet.user.id, tweet.id_str, tweet.created_at, tweet.full_text, stitch(tweet.user.screen_name, tweet.id_str)] for tweet in new_tweets] - print(results) + if not results: return new_result = [{'user_id': el[0], 'tweet_id': el[1], @@ -51,9 +49,9 @@ def stitch(screen_name, tweets_id): final = pd.DataFrame(new_result) def dump(x): - (Tweets.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE) - .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'], - datetime=x['datetime'], - text=x['text'], url=x['url'])) + (Tweet.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE) + .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'], + datetime=x['datetime'], + text=x['text'], url=x['url'])) final.apply(lambda x: dump(x), axis=1) From 326c5bdd0f8939767c9c94858b030971f571f2d2 Mon Sep 17 00:00:00 2001 From: "@SystemAgent" Date: Fri, 3 Apr 2020 17:11:02 +0300 Subject: [PATCH 2/5] add bs4 --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 4a4fb8d..132b596 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ python-dotenv==0.12.0 tweepy==3.7.0 django-postgres-extra==1.22 spacy==2.2.4 +beautifulsoup4==4.8.2 From 9d6b71d8bd618a58d70718f4c05856d822bcbcc1 Mon Sep 17 00:00:00 2001 From: "@SystemAgent" Date: Tue, 7 Apr 2020 12:10:15 +0300 Subject: [PATCH 3/5] changes --- stats/management/commands/get_events_measures_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/stats/management/commands/get_events_measures_data.py b/stats/management/commands/get_events_measures_data.py index d0bb101..bd59975 100644 --- a/stats/management/commands/get_events_measures_data.py +++ b/stats/management/commands/get_events_measures_data.py @@ -51,7 +51,6 @@ def get_wiki_events(month): elif sib.name == 'h2': break print(uls1) - get_wiki_events(month) From ee89dcbfd1419a6dfaa2fafe7fd1610298fe2dde Mon Sep 17 00:00:00 2001 From: "@SystemAgent" Date: Wed, 8 Apr 2020 18:21:11 +0300 Subject: [PATCH 4/5] changes on command --- .../commands/get_events_measures_data.py | 85 ++++++++++++------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/stats/management/commands/get_events_measures_data.py b/stats/management/commands/get_events_measures_data.py index bd59975..03bd1a8 100644 --- a/stats/management/commands/get_events_measures_data.py +++ b/stats/management/commands/get_events_measures_data.py @@ -1,16 +1,13 @@ -from collections import defaultdict import requests, itertools, re from bs4 import BeautifulSoup import numpy as np -import pandas as pd from _collections import defaultdict import pandas as pd from django.core.management.base import BaseCommand, CommandError -from psqlextra.query import ConflictAction class Command(BaseCommand): - help = 'Gets wikipedia data about Event and Measures by country for a specific month, ' \ + help = 'Gets WiKipedia data about Event and Measures by country for a specific month, ' \ """February and March are available at present.""" def add_arguments(self, parser): @@ -19,38 +16,66 @@ def add_arguments(self, parser): def handle(self, *args, **options): month = options['month'] - def get_wiki_events(month): + def get_february(): res = requests.get( 'https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_coronavirus_pandemic_in_February_2020') + wiki = BeautifulSoup(res.content, "lxml") + first_h3 = wiki.find( + attrs={'class': "mw-headline", 'id': "Reactions_and_measures_outside_mainland_China"}) + uls = defaultdict(list) + name = first_h3.text.strip() + for sib in first_h3.find_all_next(): + if sib.name == 'p': + uls[name].append(sib.text.strip()) + if sib.name == 'h3': + name = sib.text.strip() + elif sib.name == 'h2': + break + df1 = pd.DataFrame(list(uls.items()), columns=['date', 'measures']) + return df1 + + def get_march(): res1 = requests.get( 'https://en.wikipedia.org/wiki/Responses_to_the_2019%E2%80%9320_coronavirus_pandemic_in_March_2020') + wiki1 = BeautifulSoup(res1.content, "lxml") + second_h3 = wiki1.find(attrs={'class': "mw-headline", 'id': "1_March"}) + uls1 = defaultdict(list) + name1 = second_h3.text.strip() + for sib in second_h3.find_all_next(): + if sib.name == 'p': + uls1[name1].append(sib.text.strip()) + if sib.name == 'h3': + name1 = sib.text.strip() + elif sib.name == 'h2': + break + df2 = pd.DataFrame(list(uls1.items()), columns=['date', 'measures']) + return df2 + + def parse_concat(list_dfs): + dfs = [] + for df in list_dfs: + new = pd.DataFrame(df['measures'].str.split("]', ").tolist(), index=df.date).stack() + new = new.reset_index([0, 'date']) + new.columns = ['date', 'measures'] + dfs.append(new) + print(dfs) + df_result = pd.concat(dfs) + return df_result + + def get_wiki_events(month): if month == 'February': - wiki = BeautifulSoup(res.content, "lxml") - first_h3 = wiki.find('h3') # Start here - uls = defaultdict(list) - name = first_h3.text.strip() - for sib in first_h3.find_next_siblings(): - if sib.name == 'ul': - for event in sib.findAll("li"): - uls[name].append(event.text.strip()) - elif sib.name == 'h3': - name = sib.text.strip() - elif sib.name == 'h2': - break - print(uls) + get_february() elif month == 'March': - wiki1 = BeautifulSoup(res1.content, "lxml") - second_h3 = wiki1.find(attrs={'class': "mw-headline", 'id': "1_March"}) - uls1 = defaultdict(list) - name1 = second_h3.text.strip() - for sib in second_h3.find_all_next(): - if sib.name == 'p': - uls1[name1].append(sib.text.strip()) - if sib.name == 'h3': - name1 = sib.text.strip() - elif sib.name == 'h2': - break - print(uls1) + get_march() + elif month == 'all': + frames = [] + febs = get_february() + march = get_march() + frames.append(febs) + frames.append(march) + # print(frames[1]) + print(parse_concat(frames)) + get_wiki_events(month) From b0799b999ba105da63f798361732e7106388f717 Mon Sep 17 00:00:00 2001 From: "@SystemAgent" Date: Fri, 10 Apr 2020 17:38:41 +0300 Subject: [PATCH 5/5] changes on command and save to db --- .../commands/get_events_measures_data.py | 64 +++++++++++++------ .../migrations/0003_auto_20200410_1409.py | 18 ++++++ countries/models.py | 2 +- 3 files changed, 65 insertions(+), 19 deletions(-) rename {stats => countries}/management/commands/get_events_measures_data.py (53%) create mode 100644 countries/migrations/0003_auto_20200410_1409.py diff --git a/stats/management/commands/get_events_measures_data.py b/countries/management/commands/get_events_measures_data.py similarity index 53% rename from stats/management/commands/get_events_measures_data.py rename to countries/management/commands/get_events_measures_data.py index 03bd1a8..1055991 100644 --- a/stats/management/commands/get_events_measures_data.py +++ b/countries/management/commands/get_events_measures_data.py @@ -1,9 +1,15 @@ import requests, itertools, re from bs4 import BeautifulSoup -import numpy as np from _collections import defaultdict +from datetime import datetime import pandas as pd +import spacy +import os from django.core.management.base import BaseCommand, CommandError +from countries.forms import MeasureForm +from coronavirus import settings + +COUNTRIES_FIXTURES = os.path.join(settings.BASE_DIR, 'countries', 'fixtures') class Command(BaseCommand): @@ -54,35 +60,57 @@ def get_march(): def parse_concat(list_dfs): dfs = [] for df in list_dfs: - new = pd.DataFrame(df['measures'].str.split("]', ").tolist(), index=df.date).stack() - new = new.reset_index([0, 'date']) + new = pd.DataFrame([(tup.date, measure) for tup in df.itertuples() for measure in tup.measures]) new.columns = ['date', 'measures'] dfs.append(new) - print(dfs) df_result = pd.concat(dfs) return df_result def get_wiki_events(month): if month == 'February': - get_february() + return get_february() elif month == 'March': - get_march() + return get_march() elif month == 'all': frames = [] - febs = get_february() + feb = get_february() march = get_march() - frames.append(febs) + frames.append(feb) frames.append(march) - # print(frames[1]) - print(parse_concat(frames)) + #Get final dataframe and clean up dates and measures columns + all_data = parse_concat(frames) + all_data['date'] = all_data['date'].str.replace('[[edit][edit][edit][edit]]', '2020', regex=True) + all_data['date'] = all_data['date'].str.replace('[', ',', regex=True) + all_data['date'] = pd.to_datetime(all_data['date'], errors='coerce') + all_data['measures'] = all_data['measures'].str.replace('[[0-9][0-9]]', ' ', regex=True) + all_data = all_data.dropna() + + # extract entities for country field + sp = spacy.load("en_core_web_sm") + countries = [] + countries_population = os.path.join(COUNTRIES_FIXTURES, 'countries_population.csv') + csv = pd.read_csv(countries_population, delimiter=';') + options = set(csv['name'].values) + for text in all_data['measures'].tolist(): + doc = sp(text) + for ent in doc.ents: + if ent.label_ == 'GPE': + if ent.text in options: + countries.append(ent.text) + all_data['countries'] = pd.Series(countries[:444]) - get_wiki_events(month) + return all_data + final = get_wiki_events(month) + errors = 0 + rows = 0 -#TODO Save scraped data to db -# for date, events in url.items(): -# for event in events: -# (Total.objects -# .on_conflict(['date', 'event'], ConflictAction.UPDATE) -# .insert_and_get(date=date, event=event) -# ) + for row in final.itertuples(index=True): + rows += 1 + form = MeasureForm({'name': getattr(row, "measures"), 'start_date': getattr(row, "date"), + 'end_date': getattr(row, "date"), 'country': getattr(row, 'countries')}) + if form.is_valid(): + form.save() + else: + errors += 1 + print("Result: {}/{}".format(rows - errors, rows)) diff --git a/countries/migrations/0003_auto_20200410_1409.py b/countries/migrations/0003_auto_20200410_1409.py new file mode 100644 index 0000000..23bdb92 --- /dev/null +++ b/countries/migrations/0003_auto_20200410_1409.py @@ -0,0 +1,18 @@ +# Generated by Django 3.0.4 on 2020-04-10 14:09 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('countries', '0002_auto_20200406_0804'), + ] + + operations = [ + migrations.AlterField( + model_name='measure', + name='name', + field=models.CharField(max_length=1000), + ), + ] diff --git a/countries/models.py b/countries/models.py index 605f6b0..97c7575 100644 --- a/countries/models.py +++ b/countries/models.py @@ -32,7 +32,7 @@ class Meta: class Measure(models.Model): - name = models.CharField(max_length=200) + name = models.CharField(max_length=1000) start_date = models.DateField() end_date = models.DateField() country = models.ForeignKey('Country', on_delete=models.CASCADE)