-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtracklistWebscraper.py
More file actions
121 lines (98 loc) · 4.12 KB
/
tracklistWebscraper.py
File metadata and controls
121 lines (98 loc) · 4.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from bs4 import BeautifulSoup
import requests
from requests_html import HTMLSession
from urllib.parse import urlparse
def scInitScrape(url):
    '''
    primary scrape for soundcloud releases.
    fetches the static page, reads the release's schema.org "itemtype" and
    hands the release on for track scraping:
      - albums/eps and playlists go to albumSessionScrape(), which has to
        scrape dynamically with an html session
      - individual tracks go straight to trackScrape() as track number 1
      - anything else prints an "invalid release type" message

    url: address of the soundcloud release page

    raises requests.RequestException if the page cannot be fetched
    (connection error, timeout, or HTTP error status).
    '''
    #a timeout stops the script from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)
    #fail with a clear HTTP error rather than trying to parse an error page
    response.raise_for_status()
    #makes workable BeautifulSoup object
    soup = BeautifulSoup(response.text, "html.parser")

    #second noscript tag has all useful data on soundcloud
    body = soup.body
    noscriptTags = body.find_all('noscript') if body is not None else []
    nScript = noscriptTags[1].article if len(noscriptTags) > 1 else None
    #soundcloud classifies release types by "schema"
    schema = nScript.get('itemtype') if nScript is not None else None
    if schema is None:
        #page does not have the expected markup (not a release page, or layout changed)
        print("Could not find release data on the page! (Is this a SoundCloud release URL?)")
        return

    #albums/eps and playlists can be scraped the same way
    if schema in ('http://schema.org/MusicAlbum', 'http://schema.org/MusicPlaylist'):
        albumSessionScrape(url)
    #for individual tracks
    elif schema == 'http://schema.org/MusicRecording':
        trackScrape(nScript, 1)
    #invalid release type
    else:
        print("Invalid release type! (Not an album, playlist, or track)")
def albumSessionScrape(url):
    '''
    creates a dynamic session to scrape a multitrack release.
    renders the page (scrolling down so the tracklist loads), collects the
    link of every track item, then fetches each track's own page and sends
    its noscript article to trackScrape() together with the track's
    position in the list.

    url: address of the soundcloud album/ep/playlist page

    track items without a title link are reported and skipped; they still
    count towards the numbering so later tracks keep their list position.
    '''
    #(position, url) for every track that has a usable link
    trackUrls = []
    #context manager closes the session (and the browser render() starts)
    #even if scraping fails part-way through
    with HTMLSession() as session:
        r = session.get(url)
        #scroll down to load (most) tracklists in full
        r.html.render(scrolldown=10, sleep=1, keep_page=True)
        #extract every track item in rendered tracklist
        tracklist = r.html.find('li.trackList__item')
        for trackNum, track in enumerate(tracklist, start=1):
            #hyperlink for the track; absolute_links is a set
            titleLinks = track.find('a.trackItem__trackTitle')
            linkSet = titleLinks[0].absolute_links if titleLinks else set()
            if not linkSet:
                print(f"Track {trackNum} has no link, skipping")
                continue
            #access only element of set (the link necessary to scrape)
            trackUrls.append((trackNum, next(iter(linkSet))))

    #the browser is no longer needed; track pages are plain requests
    for trackNum, trackUrl in trackUrls:
        #gets html code of song from source url (timeout avoids hanging forever)
        source = requests.get(trackUrl, timeout=30).text
        #makes workable BeautifulSoup object from source
        soup = BeautifulSoup(source, "html.parser")
        #represents noscript tag containing all useful data
        nScript = soup.body.find_all('noscript')[1].article
        trackScrape(nScript, trackNum)
def trackScrape(nScript, trackNum):
    '''
    does all the actual scraping for any given release.
    an individual track's noscript article is given and its data is parsed
    manually, then one line is printed in RateYourMusic's recognized
    tracklist format:
        <trackNum><delimiter><track name><delimiter><track length>
    using the module-level delimiter.

    nScript: article element from a track page's noscript tag
             (a BeautifulSoup tag)
    trackNum: number printed in front of the track
    '''
    #track name is the first hyperlink in the article's heading
    links = nScript.header.h1.find_all('a')
    trackName = links[0].text
    #grabs slice of song duration from the first meta tag, dropping the
    #two leading characters
    #NOTE: the fixed slices below assume a two-digit "hhHmmMssS" layout
    duration = nScript.find_all('meta')[0]["content"][2:]
    trackLength = duration[3:5] + ":" + duration[6:8] #takes only min, second values
    if duration[:2] != '00':
        trackLength = duration[:2] + ":" + trackLength #takes hour if >= 1
    print(f"{trackNum}{delimiter}{trackName}{delimiter}{trackLength}")
def ntsSessionScrape(url):
    '''
    specifically for scraping tracklists from NTS Live mixes.
    renders the page in an html session and prints every track in an
    easy-to-paste RateYourMusic format:
        <number><delimiter><artist> - <title>
    using the module-level delimiter.

    url: address of the NTS Live mix page
    '''
    #context manager closes the session (and the browser render() starts)
    #even if scraping fails part-way through
    with HTMLSession() as session:
        r = session.get(url)
        r.html.render(sleep=1, keep_page=True)
        tracks = r.html.find('li.track')
        #number tracks from 1 in page order
        for trackCount, track in enumerate(tracks, start=1):
            artist = track.find('span.track__artist')[0].text
            title = track.find('span.track__title')[0].text
            print(f"{trackCount}{delimiter}{artist} - {title}")
#runtime starts here, takes basic user input in the form of a URL
#separator placed between the fields of every printed tracklist line
delimiter = "|"
link = input("Enter URL:\n").strip()
#urlparse only recognises the host when the URL has a scheme
if "://" not in link:
    link = "https://" + link
#grabs website (host) name; hostname is lower-cased and leaves out any port
domain = urlparse(link).hostname or ""
#treat "www.example.com" and "example.com" as the same site
if domain.startswith("www."):
    domain = domain[len("www."):]
#identify website and use appropriate scraping
if domain == 'soundcloud.com':
    scInitScrape(link)
elif domain == 'nts.live':
    ntsSessionScrape(link)
else:
    #say so instead of exiting silently
    print("Unsupported site! (Expected a soundcloud.com or nts.live URL)")