add video platform#36
Conversation
does not actually work well
|
ideas to fix:
|
| from yt_dlp.extractor import gen_extractor_classes, GenericIE | ||
|
|
||
|
|
||
| def valid_video_platform_link(link): |
There was a problem hiding this comment.
We can try something like this
import yt_dlp
# Map IE_NAME -> extractor class for every extractor that is not listed in
# generic_extractors and whose lower-cased name contains none of the
# blocked keywords (plain substring match).
FILTERED_EXTRACTORS = {
    ie.IE_NAME: ie
    for ie in yt_dlp.list_extractor_classes()
    if ie not in generic_extractors
    and not any(
        keyword in ie.IE_NAME.lower()
        for keyword in (
            "porn",
            "adult",
            "xxx",
            "xvideos",
            "xhamster",
            "redtube",
            "xtube",
            "xstream",
            "xfileshare",
            "sex",
        )
    )
}
# print(FILTERED_EXTRACTORS.keys())
# print(len(FILTERED_EXTRACTORS.keys()))
def is_link_valid(link, extractors):
    """Return True if at least one extractor reports the link as suitable.

    Args:
        link: URL string passed to each extractor's ``suitable`` method.
        extractors: Iterable of objects exposing ``suitable(link)``.

    Returns:
        bool: True as soon as one extractor accepts the link, False if none
        does (including when ``extractors`` is empty).
    """
    # A generator (not a list) lets any() stop at the first match instead of
    # evaluating every remaining extractor.
    return any(ie.suitable(link) for ie in extractors)
def valid_video_platform_link(link):
    """Check if link is a valid video platform link.

    Args:
        link: URL string to test; may be empty or None.

    Returns:
        bool: False for an empty/None link, otherwise whether any extractor
        in FILTERED_EXTRACTORS accepts the link.
    """
    # Return a real boolean for empty input instead of echoing back the
    # falsy value itself (None or "").
    if not link:
        return False
    return is_link_valid(link, FILTERED_EXTRACTORS.values())
# Sample links used as a quick smoke check of the validator.
YT_URL = "https://www.youtube.com/watch?v=jLX0D8qQUBM"
DM_URL = "https://www.dailymotion.com/video/x29ryo7"

for sample_url in (YT_URL, DM_URL):
    print(valid_video_platform_link(sample_url))
There was a problem hiding this comment.
# Extractor classes to leave out of the filtered set: the GenericIE class
# as exposed by both the regular and the lazy extractor modules.
generic_extractors = [
    yt_dlp.extractor.generic.GenericIE,
    yt_dlp.extractor.lazy_extractors.GenericIE,
]
There was a problem hiding this comment.
tried at #49
one first problem: running these thousands of regexes is actually quite slow. I guess we need to limit the list or find a way to merge them into one to speed things up (I think that should help?)
There was a problem hiding this comment.
also let's try the age_limit property in yt-dlp
There was a problem hiding this comment.
and we can try hyperscan https://geekmonkey.org/regular-expression-matching-at-scale-with-hyperscan/ for speed
There was a problem hiding this comment.
Thanks! Another option is to do it in 2 steps
- first check that the domain is among ~2000 selected domains (taken from the yt_dlp extractors' _TESTS)
- then check whether the URL is a valid video URL (with the regexes from yt_dlp)
This version is more than 100x faster (but is less exhaustive)
There was a problem hiding this comment.
import yt_dlp
from urllib.parse import urlparse
# Extractor classes to leave out of the filtered set: the GenericIE class
# as exposed by both the regular and the lazy extractor modules.
generic_extractors = [
    yt_dlp.extractor.generic.GenericIE,
    yt_dlp.extractor.lazy_extractors.GenericIE,
]

# Map IE_NAME -> extractor class for every extractor that is not listed in
# generic_extractors and whose lower-cased name contains none of the
# blocked keywords (plain substring match).
FILTERED_EXTRACTORS = {
    ie.IE_NAME: ie
    for ie in yt_dlp.list_extractor_classes()
    if ie not in generic_extractors
    and not any(
        keyword in ie.IE_NAME.lower()
        for keyword in (
            "porn",
            "adult",
            "xxx",
            "xvideos",
            "xhamster",
            "redtube",
            "xtube",
            "xstream",
            "xfileshare",
            "sex",
        )
    )
}
def extract_test(extractor):
    """Collect the test URLs declared on an extractor.

    Reads the extractor's ``_TEST`` attribute (a single test case) if it is
    set to a non-empty value; otherwise falls back to ``_TESTS`` (a sequence
    of test cases).

    Args:
        extractor: Object that may define ``_TEST`` and/or ``_TESTS``.

    Returns:
        list: The ``"url"`` value of each test case, in declaration order.
        Empty when the extractor declares no tests.
    """
    single_case = getattr(extractor, "_TEST", None)
    if single_case:
        cases = [single_case]
    else:
        # A missing, None or empty _TEST falls through to _TESTS rather
        # than raising.
        cases = getattr(extractor, "_TESTS", None) or []
    # Skip test cases that carry no URL instead of failing with KeyError.
    return [case["url"] for case in cases if "url" in case]
def normalize_domain(domain):
    """Return the domain lower-cased and without a leading ``www.``.

    Args:
        domain: Host name string, e.g. ``"WWW.YouTube.com"``.

    Returns:
        str: The normalized host name, e.g. ``"youtube.com"``.
    """
    lowered = domain.lower()
    # Only a single leading "www." label is dropped; other subdomains stay.
    return lowered[4:] if lowered.startswith("www.") else lowered
def extract_domain(url):
    """Return the normalized host name of a URL.

    Args:
        url: URL string.

    Returns:
        str: Result of ``normalize_domain`` applied to the URL's host, or
        the normalization of ``""`` when the URL has no host part.
    """
    # Use .hostname rather than .netloc so that an explicit port or
    # "user:password@" prefix does not end up in the domain key.
    host = urlparse(url).hostname or ""
    return normalize_domain(host)
# Index: normalized domain -> list of extractors that declare at least one
# test URL on that domain.
DOMAIN_DICT = {}
for extractor in FILTERED_EXTRACTORS.values():
    for url in extract_test(extractor):
        domain = extract_domain(url)
        candidates = DOMAIN_DICT.setdefault(domain, [])
        # An extractor often has several test URLs on the same domain;
        # register it once so lookups do not re-run the same check.
        if extractor not in candidates:
            candidates.append(extractor)
def is_link_suitable(link, extractors):
    """Return True if at least one of the given extractors accepts the link.

    Args:
        link: URL string passed to each extractor's ``suitable`` method.
        extractors: Iterable of objects exposing ``suitable(link)``.

    Returns:
        bool: True as soon as one extractor accepts the link, False if none
        does (including when ``extractors`` is empty).
    """
    # A generator (not a list) lets any() stop at the first match instead of
    # evaluating every remaining extractor.
    return any(ie.suitable(link) for ie in extractors)
def is_link_valid(link, domain_dict):
    """Check a link against the extractors registered for its domain.

    Args:
        link: URL string to test.
        domain_dict: Mapping of normalized domain -> list of extractors.

    Returns:
        False when the link's domain is not a key of ``domain_dict``;
        otherwise the result of ``is_link_suitable`` for that domain's
        extractors.
    """
    domain = extract_domain(link)
    # Cheap domain lookup first: unknown domains never reach the extractors.
    if domain not in domain_dict:
        return False
    return is_link_suitable(link, domain_dict[domain])
def valid_video_platform_link(link):
    """Check if link is a valid video platform link.

    Args:
        link: URL string to test; may be empty or None.

    Returns:
        bool: False for an empty/None link, otherwise the result of
        ``is_link_valid`` against DOMAIN_DICT.
    """
    # Return a real boolean for empty input instead of echoing back the
    # falsy value itself (None or "").
    if not link:
        return False
    return is_link_valid(link, DOMAIN_DICT)
# Sample links used as a quick smoke check of the validator.
YT_URL = "https://www.youtube.com/watch?v=jLX0D8qQUBM"
DM_URL = "https://www.dailymotion.com/video/x29ryo7"
DM_URL2 = "https://geo.dailymotion.com/player.html?video=x89eyek&mute=true"
print(valid_video_platform_link(YT_URL))
print(valid_video_platform_link(DM_URL))
print(valid_video_platform_link(DM_URL2))

# Rough benchmark: time 1000 validations of the same link.
import time

# perf_counter() is monotonic and high-resolution, unlike the wall clock
# returned by time.time().
start_time = time.perf_counter()
for _ in range(1000):
    valid_video_platform_link(DM_URL)
print("--- %s seconds ---" % (time.perf_counter() - start_time))
There was a problem hiding this comment.
ok I put the code there #49
it's faster indeed!
does not actually work well