-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
127 lines (94 loc) · 3.3 KB
/
scrape.py
File metadata and controls
127 lines (94 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
""" Scrape
This file contains functions related to the scraping of play-by-play
data from basketball-reference.com.
This file requires that `requests`, `bs4` and `pandas` be installed
within the Python environment you are running this script in.
"""
import os
import sys

import pandas as pd
import requests
from bs4 import BeautifulSoup

import util
# Root of basketball-reference.com; the functions in this file append
# their request paths to it
url = 'https://www.basketball-reference.com'
def scrape(pid, yid='', export=False):
    """ Scrapes and parses play by play data for player
    associated with player_id and, optionally, by season

    Parameters
    ----------
    pid : str
        The player ID assigned by basketball-reference.com
    yid : str (optional)
        The year ID for a specific season
        Example: yid for 2019/20 season is '2020'
        The default '' requests all seasons
    export : bool (optional)
        If True, the data frame is also written to
        'data/<pid><yid>.csv'; the 'data' directory is created
        when it does not exist yet

    Returns
    -------
    DataFrame
        A data frame containing play by play data for player
        corresponding to pid, with the columns 'home', 'distance',
        'type', 'assisted' and 'result'

    Raises
    ------
    ValueError
        If yid is neither '' nor a string that parses as an integer
    requests.HTTPError
        If the site answers with an error status code
    requests.Timeout
        If the site does not answer within 30 seconds
    """
    # Build the season label first: a malformed yid then fails before any
    # network traffic instead of after the page has been downloaded
    if yid == '':
        season = 'all seasons'
    else:
        season = 'the ' + str(int(yid) - 1) + '/' + yid + ' season'

    # Let requests assemble the query string so pid and yid are URL-encoded
    params = {
        'request': 1,
        'match': 'play',
        'player_id': pid,
        'year_id': yid,
        'order_by': 'date_game',
    }
    html = requests.get(url + '/play-index/shot_finder.cgi', params=params, timeout=30)
    html.raise_for_status()
    soup = BeautifulSoup(html.content, 'html.parser')

    # Print retrieval info. The player name is only used for this message,
    # so fall back to the player ID when the page has no heading rather
    # than crashing on a None lookup.
    info = soup.find('div', id='pi')
    heading = info.find('h1') if info is not None else None
    if heading is not None:
        name = ' '.join(heading.getText().split(' ')[:2])
    else:
        name = pid
    print('Retrieving data for ' + name + " from " + season)

    # Collect tables
    tables = util.collect_tables(url, soup)

    # Create data frame
    df = pd.DataFrame(util.data(tables))
    df.columns = ['home', 'distance', 'type', 'assisted', 'result']

    # Export if needed
    if export:
        # to_csv does not create missing directories
        os.makedirs('data', exist_ok=True)
        df.to_csv(os.path.join('data', pid + yid + '.csv'))
    return df
def _player_id_from_href(href):
    """ Returns the last path segment of href without its extension """
    return href.split("/")[-1].split('.')[0]


def search(query):
    """ Searches for players by query

    Parameters
    ----------
    query : str
        The query entered by the user

    Returns
    -------
    list : (str, str)
        A list of tuples of the form (name, pid),
        name being the name of the player and pid
        being the corresponding player ID assigned by
        basketball-reference.com
        The list is empty when the response contains no player

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status code
    requests.Timeout
        If the site does not answer within 30 seconds
    """
    # Hand the query to requests as a parameter so that spaces and reserved
    # characters such as '&' or '#' are URL-encoded instead of silently
    # changing the request
    html = requests.get(url + '/search/search.fcgi', params={'search': query}, timeout=30)
    html.raise_for_status()
    soup = BeautifulSoup(html.content, 'html.parser')

    if soup.find('strong', string='Search Results') is None:
        # No results listing: the site redirected straight to a single page.
        # Read the player from that page; a page without the expected
        # heading or overview link yields no match.
        heading = soup.find('h1', itemprop="name")
        if heading is None:
            return []
        name = heading.getText()
        overview = soup.find('a', string=name + ' Overview')
        if overview is None or overview.get('href') is None:
            return []
        return [(name, _player_id_from_href(overview.get('href')))]

    # Parse each returned item, keeping only links that point to players
    players = []
    for item in soup.find_all('div', {"class": "search-item"}):
        link = item.find('a')
        if link is None:
            continue
        href = link.get('href')
        if href and 'players' in href:
            players.append((link.getText(), _player_id_from_href(href)))
    return players
def seasons(pid):
    """ Lists all seasons played by a player

    Parameters
    ----------
    pid : str
        The player ID assigned by basketball-reference.com

    Returns
    -------
    list : (str, str)
        A list of tuples of the form (value, label), one per
        <option> of the first <select> element on the shot finder
        page: value is the option's 'value' attribute (presumably
        the year ID -- confirm against the live page) and label is
        the option's displayed text
        The list is empty when the page contains no <select>

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status code
    requests.Timeout
        If the site does not answer within 30 seconds
    """
    # Let requests assemble the query string so pid is URL-encoded
    html = requests.get(url + '/play-index/shot_finder.cgi', params={'player_id': pid},
                        timeout=30)
    html.raise_for_status()
    soup = BeautifulSoup(html.content, 'html.parser')

    selector = soup.find('select')
    if selector is None:
        return []
    return [(option['value'], option.getText()) for option in selector.find_all('option')]
if __name__ == '__main__':
    # Manual smoke test; needs network access. Searches for 'brandon i',
    # then scrapes every season for player ID 'ingrabr01' and exports the
    # result to a CSV file.
    print(search('brandon i'))
    ingram_shots = scrape('ingrabr01', '', export=True)