Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions countries/management/commands/get_events_measures_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import requests, itertools, re
from bs4 import BeautifulSoup
from _collections import defaultdict
from datetime import datetime
import pandas as pd
import spacy
import os
from django.core.management.base import BaseCommand, CommandError
from countries.forms import MeasureForm
from coronavirus import settings

COUNTRIES_FIXTURES = os.path.join(settings.BASE_DIR, 'countries', 'fixtures')


class Command(BaseCommand):
    """Scrape Wikipedia timeline pages and store the paragraphs as measures.

    For every selected month the command downloads the configured page,
    collects each ``<p>`` found after a starting headline (grouped under the
    most recent ``<h3>`` heading, stopping at the next ``<h2>``), turns the
    heading into a date, looks for a known country name in the paragraph and
    submits the result through ``MeasureForm``.

    Usage: ``manage.py get_events_measures_data <February|March|all>``
    """

    help = ('Gets WiKipedia data about Event and Measures by country for a specific month, '
            'February and March are available at present.')

    # Year appended to the scraped day/month headings before date parsing.
    YEAR = '2020'
    # Seconds to wait for Wikipedia before giving up.
    REQUEST_TIMEOUT = 30
    # (month argument, page URL, id of the "mw-headline" element to start from)
    SOURCES = (
        ('February',
         'https://en.wikipedia.org/wiki/Timeline_of_the_2019%E2%80%9320_coronavirus_pandemic_in_February_2020',
         'Reactions_and_measures_outside_mainland_China'),
        ('March',
         'https://en.wikipedia.org/wiki/Responses_to_the_2019%E2%80%9320_coronavirus_pandemic_in_March_2020',
         '1_March'),
    )

    def add_arguments(self, parser):
        """Register the positional ``month`` argument."""
        parser.add_argument('month', type=str, help="'February', 'March' or 'all'")

    def handle(self, *args, **options):
        """Scrape the requested month(s) and save one measure per paragraph.

        Raises:
            CommandError: if ``month`` is not a supported value, a page cannot
                be downloaded, or the expected headline is missing from it.
        """
        month = options['month']
        selected = [src for src in self.SOURCES if month == 'all' or src[0] == month]
        if not selected:
            supported = ', '.join([src[0] for src in self.SOURCES] + ['all'])
            raise CommandError("Unknown month {!r}; expected one of: {}".format(month, supported))

        # Every month goes through the same pipeline, so single-month runs get
        # the same cleaned 'date', 'measures' and 'countries' columns as 'all'.
        frames = [self._scrape_page(url, anchor_id) for _, url, anchor_id in selected]
        data = self._clean(self._flatten(frames))
        data['countries'] = self._detect_countries(data['measures'])

        total = 0
        saved = 0
        for row in data.itertuples(index=False):
            total += 1
            # Rows where no country was recognised carry None here and are
            # left for the form's own validation to accept or reject.
            form = MeasureForm({'name': row.measures, 'start_date': row.date,
                                'end_date': row.date, 'country': row.countries})
            if form.is_valid():
                form.save()
                saved += 1
        self.stdout.write("Result: {}/{}".format(saved, total))

    def _scrape_page(self, url, anchor_id):
        """Return a DataFrame mapping each heading to its list of paragraphs.

        Columns are ``date`` (heading text) and ``measures`` (list of paragraph
        texts collected under that heading).
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            raise CommandError("Could not download {}: {}".format(url, err)) from err

        page = BeautifulSoup(response.content, "lxml")
        anchor = page.find(attrs={'class': "mw-headline", 'id': anchor_id})
        if anchor is None:
            raise CommandError("Headline {!r} not found in {}".format(anchor_id, url))

        paragraphs = defaultdict(list)
        heading = anchor.text.strip()
        for node in anchor.find_all_next():
            if node.name == 'h2':
                # The next top-level section ends the part of the page we want.
                break
            if node.name == 'h3':
                heading = node.text.strip()
            elif node.name == 'p':
                paragraphs[heading].append(node.text.strip())
        return pd.DataFrame(list(paragraphs.items()), columns=['date', 'measures'])

    @staticmethod
    def _flatten(frames):
        """Expand the per-heading paragraph lists into one row per paragraph.

        Building a single frame from scratch also yields a unique 0..n-1
        index, which later column assignments rely on.
        """
        rows = [
            (record.date, measure)
            for frame in frames
            for record in frame.itertuples(index=False)
            for measure in record.measures
        ]
        return pd.DataFrame(rows, columns=['date', 'measures'])

    def _clean(self, data):
        """Parse the heading column into dates and strip citation markers.

        Rows whose heading cannot be parsed as a date are dropped.
        """
        # Section headings carry a literal "[edit]" link label; remove it and
        # append the year so that e.g. "1 March" becomes "1 March, 2020".
        headings = data['date'].str.replace('[edit]', '', regex=False).str.strip()
        data['date'] = pd.to_datetime(headings + ', ' + self.YEAR, errors='coerce')
        # Replace numeric citation markers such as "[12]" with a space.
        data['measures'] = data['measures'].str.replace(r'\[\d+\]', ' ', regex=True)
        return data.dropna().reset_index(drop=True)

    @staticmethod
    def _detect_countries(measures):
        """Return, for each text, the first GPE entity that is a known country.

        Known names come from the ``name`` column of
        ``countries_population.csv`` in the countries fixtures directory. The
        result has exactly one entry per input text (``None`` when nothing
        matched), so it stays aligned with the rows it was computed from.
        """
        nlp = spacy.load("en_core_web_sm")
        population_csv = os.path.join(COUNTRIES_FIXTURES, 'countries_population.csv')
        known_names = set(pd.read_csv(population_csv, delimiter=';')['name'].values)

        detected = []
        for text in measures:
            candidates = (ent.text for ent in nlp(text).ents
                          if ent.label_ == 'GPE' and ent.text in known_names)
            detected.append(next(candidates, None))
        return detected
18 changes: 18 additions & 0 deletions countries/migrations/0003_auto_20200410_1409.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.0.4 on 2020-04-10 14:09

from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter ``Measure.name`` in the ``countries`` app.

    The field is redefined as a ``CharField`` with ``max_length=1000``.
    """

    # Applied on top of the app's previous migration.
    dependencies = [('countries', '0002_auto_20200406_0804')]

    operations = [
        migrations.AlterField(
            model_name='measure',
            name='name',
            field=models.CharField(max_length=1000),
        ),
    ]
2 changes: 1 addition & 1 deletion countries/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class Meta:


class Measure(models.Model):
name = models.CharField(max_length=200)
name = models.CharField(max_length=1000)
start_date = models.DateField()
end_date = models.DateField()
country = models.ForeignKey('Country', on_delete=models.CASCADE)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ python-dotenv==0.12.0
tweepy==3.7.0
django-postgres-extra==1.22
spacy==2.2.4
beautifulsoup4==4.8.2
16 changes: 7 additions & 9 deletions stats/management/commands/get_twitter_data.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import random
import pandas as pd
import tweepy
import requests
import json

from psqlextra.query import ConflictAction
from django.core.management.base import BaseCommand, CommandError

from coronavirus.settings import TWEEPY_TOKENS, USERNAMES
from coronavirus.local_settings import TWEEPY_TOKENS
from stats.models import Tweet


Expand Down Expand Up @@ -40,20 +38,20 @@ def stitch(screen_name, tweets_id):
api = make_tweepy_api()
new_tweets = tweepy.Cursor(api.user_timeline, user_id=user_id, count=200, tweet_mode="extended",
since_id=tweet_id, exclude_replies=True).items()
print(new_tweets)

results = [[tweet.user.id, tweet.id_str, tweet.created_at, tweet.full_text,
stitch(tweet.user.screen_name, tweet.id_str)] for tweet in new_tweets]
print(results)

if not results:
return
new_result = [{'user_id': el[0], 'tweet_id': el[1],
'datetime': el[2], 'text': el[3], 'url': el[4]} for el in results]
final = pd.DataFrame(new_result)

def dump(x):
    """Insert one tweet row into ``Tweet``, updating it on conflict.

    ``x`` is indexed by the keys 'user_id', 'tweet_id', 'datetime', 'text'
    and 'url'. A conflict on ('tweet_id', 'datetime') updates the existing
    record instead of inserting a duplicate.
    """
    # Only ``Tweet`` is imported from stats.models; the former ``Tweets``
    # reference was an undefined name and raised NameError before this ran.
    (Tweet.objects.on_conflict(['tweet_id', 'datetime'], ConflictAction.UPDATE)
     .insert_and_get(user_id=x['user_id'], tweet_id=x['tweet_id'],
                     datetime=x['datetime'],
                     text=x['text'], url=x['url']))

final.apply(lambda x: dump(x), axis=1)