diff --git a/.gitignore b/.gitignore
index 287b0b569..39fa8d9d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,3 +52,7 @@ docs/_build/
 
 # PyBuilder
 target/
+
+
+# IDE files
+.idea
diff --git a/crawlfrontier/contrib/backends/memory.py b/crawlfrontier/contrib/backends/memory.py
index 4f96460e9..97e540cd1 100644
--- a/crawlfrontier/contrib/backends/memory.py
+++ b/crawlfrontier/contrib/backends/memory.py
@@ -30,10 +30,12 @@ def frontier_stop(self):
     def add_seeds(self, seeds):
         for seed in seeds:
             request, _ = self._get_or_create_request(seed)
+            request.isSeed = True
             self.heap.push(request)
 
     def get_next_requests(self, max_next_requests):
-        return self.heap.pop(max_next_requests)
+        things = self.heap.pop(max_next_requests)
+        return things
 
     def page_crawled(self, response, links):
         for link in links:
@@ -42,17 +44,20 @@ def page_crawled(self, response, links):
             request.meta['depth'] = response.request.meta['depth']+1
             self.heap.push(request)
 
+
     def request_error(self, request, error):
         pass
 
     def _get_or_create_request(self, request):
         fingerprint = request.meta['fingerprint']
         if fingerprint not in self.requests:
+
             new_request = request.copy()
             new_request.meta['created_at'] = datetime.datetime.utcnow()
             new_request.meta['depth'] = 0
             self.requests[fingerprint] = new_request
             self.manager.logger.backend.debug('Creating request %s' % new_request)
+
             return new_request, True
         else:
             page = self.requests[fingerprint]
diff --git a/crawlfrontier/contrib/backends/mongo.py b/crawlfrontier/contrib/backends/mongo.py
new file mode 100644
index 000000000..6472292be
--- /dev/null
+++ b/crawlfrontier/contrib/backends/mongo.py
@@ -0,0 +1,203 @@
+import datetime
+import random
+import copy
+from collections import OrderedDict
+import string
+from bson import json_util
+from pymongo import MongoClient
+from crawlfrontier import Backend, Request
+import json
+from crawlfrontier.exceptions import NotConfigured
+
+
+class MongodbBackend(Backend):
+    name = 'Mongodb Backend'
+
+    class State:
+        NOT_CRAWLED = 'NOT CRAWLED'
+        QUEUED = 'QUEUED'
+        CRAWLED = 'CRAWLED'
+        ERROR = 'ERROR'
+
+    def __init__(self, manager):
+        settings = manager.settings
+        mongo_ip = settings.get('BACKEND_MONGO_IP', None)
+        mongo_port = settings.get('BACKEND_MONGO_PORT', None)
+        if mongo_ip is None or mongo_port is None:
+            raise NotConfigured
+        self.client = MongoClient(mongo_ip, mongo_port)
+        self.database_name = ''.join(
+            random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(10))
+        mongo_db = settings.get('BACKEND_MONGO_DB_NAME', self.database_name)
+        mongo_collection = settings.get('BACKEND_MONGO_COLLECTION_NAME', 'links')
+        self.db = self.client[mongo_db]
+        self.collection = self.db[mongo_collection]
+        self.manager = manager
+
+    @classmethod
+    def from_manager(cls, manager):
+        return cls(manager)
+
+    def add_seeds(self, seeds):
+        # Log
+        self.manager.logger.backend.debug('ADD_SEEDS n_links=%s' % len(seeds))
+
+        for seed in seeds:
+            # Get or create page from link
+            request, _ = self._get_or_create_request(seed)
+
+    def page_crawled(self, response, links):
+        # Log
+        self.manager.logger.backend.debug('PAGE_CRAWLED page=%s status=%s links=%s' %
+                                          (response, response.status_code, len(links)))
+
+        # process page crawled
+        backend_page = self._page_crawled(response)
+
+        # Update crawled fields
+        backend_page.state = self.State.CRAWLED
+        self.collection.update({'_meta.fingerprint': backend_page._meta['fingerprint']}, {
+            "$set": self._request_to_mongo(backend_page)}, upsert=False)
+
+        # Create links
+        for link in links:
+            self.manager.logger.backend.debug('ADD_LINK link=%s' % link)
+            link_page, link_created = self._get_or_create_request(link)
+            if link_created:
+                link_page._meta['depth'] = response.meta['depth'] + 1
+                self.collection.update({'_meta.fingerprint': link_page._meta['fingerprint']}, {
+                    "$set": self._request_to_mongo(link_page)}, upsert=False)
+
+    def request_error(self, page, error):
+        # Log
+        self.manager.logger.backend.debug('PAGE_CRAWLED_ERROR page=%s error=%s' % (page, error))
+
+        # process page crawled
+        backend_page = self._page_crawled(page)
+
+        # Update error fields
+        backend_page.state = self.State.ERROR
+        self.collection.update({'_meta.fingerprint': backend_page._meta['fingerprint']},
+                               {"$set": self._request_to_mongo(backend_page)}, upsert=False)
+
+        # Return updated page
+        return backend_page
+
+    def get_next_requests(self, max_next_pages):
+        # Log
+        self.manager.logger.backend.debug('GET_NEXT_PAGES max_next_pages=%s' % max_next_pages)
+        now = datetime.datetime.utcnow()
+        mongo_pages = self._get_sorted_pages(max_next_pages)
+        requests = []
+        for p in mongo_pages:
+            req = self._request_from_mongo_dict(p)
+            requests.append(req)
+
+        if max_next_pages:
+            requests = requests[0:max_next_pages]
+        for req in requests:
+            req.state = self.State.QUEUED
+            req.last_update = now
+            self.collection.update({'_meta.fingerprint': req._meta['fingerprint']}, {
+                "$set": self._request_to_mongo(req)}, upsert=False)
+        remaining = self.collection.find_one({'state': {'$ne': self.State.CRAWLED}})
+        if remaining is None:
+            self.manager._finished = True
+        return requests
+
+    def _page_crawled(self, response):
+        # Get timestamp
+        now = datetime.datetime.utcnow()
+
+        # Get or create page from incoming page
+        backend_page, created = self._get_or_create_request(response)
+
+        # Update creation fields
+        if created:
+            backend_page.created_at = now
+
+        # Update fields
+        backend_page.last_update = now
+        backend_page.status = response.status_code
+        return backend_page
+
+    def _request_to_mongo(self, request):
+        return json.loads(json.dumps(vars(request), sort_keys=True, indent=4, default=json_util.default),
+                          object_hook=json_util.object_hook)
+
+    def _request_from_mongo_dict(self, mongo_dict):
+        url = mongo_dict['_url']
+        method = mongo_dict['_method']
+        headers = mongo_dict['_headers']
+        cookies = mongo_dict['_cookies']
+        meta = mongo_dict['_meta']
+        request = Request(url, method, headers, cookies, meta)
+        return request
+
+    def _get_or_create_request(self, request):
+        fingerprint = request.meta['fingerprint']
+        existing_request = self.collection.find_one({'_meta.fingerprint': fingerprint})
+        if existing_request is None:
+            new_request = request.copy()
+            new_request.meta['created_at'] = datetime.datetime.utcnow()
+            new_request.meta['depth'] = 0
+            new_request.state = self.State.NOT_CRAWLED
+            self.collection.insert(
+                json.loads(json.dumps(vars(new_request), sort_keys=True, indent=4, default=json_util.default),
+                           object_hook=json_util.object_hook))
+            self.manager.logger.backend.debug('Creating request %s' % new_request)
+            return new_request, True
+        else:
+            request = self._request_from_mongo_dict(existing_request)
+            self.manager.logger.backend.debug('Request exists %s' % request)
+            return request, False
+
+    def _get_sorted_pages(self, max_pages):
+        raise NotImplementedError
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        if self.manager.settings.get('BACKEND_MONGO_PERSIST_INFO', True) is False:
+            self.client.drop_database(self.database_name)
+        self.client.close()
+
+
+class MongodbFIFOBackend(MongodbBackend):
+    name = 'FIFO Mongodb Backend'
+
+    def _get_sorted_pages(self, max_pages):
+        mongo_requests = self.collection.find({'state': self.State.NOT_CRAWLED}).limit(max_pages)
+        return mongo_requests
+
+
+class MongodbLIFOBackend(MongodbBackend):
+    name = 'LIFO Mongodb Backend'
+
+    def _get_sorted_pages(self, max_pages):
+        mongo_requests = self.collection.find({'state': self.State.NOT_CRAWLED}).sort('_id', -1).limit(max_pages)
+        return mongo_requests
+
+
+class MongodbDFSBackend(MongodbBackend):
+    name = 'DFS Mongodb Backend'
+
+    def _get_sorted_pages(self, max_pages):
+        mongo_requests = self.collection.find({'state': self.State.NOT_CRAWLED}).sort('depth', -1).limit(max_pages)
+        return mongo_requests
+
+
+class MongodbBFSBackend(MongodbBackend):
+    name = 'BFS Mongodb Backend'
+
+    def _get_sorted_pages(self, max_pages):
+        mongo_requests = self.collection.find({'state': self.State.NOT_CRAWLED}).sort('depth', 1).limit(max_pages)
+        return mongo_requests
+
+
+BASE = MongodbBackend
+FIFO = MongodbFIFOBackend
+LIFO = MongodbLIFOBackend
+DFS = MongodbDFSBackend
+BFS = MongodbBFSBackend
diff --git a/crawlfrontier/contrib/middlewares/AluanaFilterOffsite.py b/crawlfrontier/contrib/middlewares/AluanaFilterOffsite.py
new file mode 100644
index 000000000..34d3cb1f2
--- /dev/null
+++ b/crawlfrontier/contrib/middlewares/AluanaFilterOffsite.py
@@ -0,0 +1,75 @@
+"""
+Offsite Spider Middleware
+
+See documentation in docs/topics/spider-middleware.rst
+"""
+
+import re
+
+from scrapy import signals
+from scrapy.http import Request
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy import log
+from crawlfrontier import Middleware
+import tldextract
+
+
+class OffsiteMiddleware(Middleware):
+
+    def __init__(self, manager):
+        self.manager = manager
+        self.allowed_domains = ['amazonaws.com']
+        self.host_regex = self.get_host_regex()
+        self.domains_seen = set()
+
+    @classmethod
+    def from_manager(cls, manager):
+        return cls(manager)
+
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        pass
+
+    def add_seeds(self, seeds):
+        return seeds
+
+    def request_error(self, request, error):
+        pass
+
+
+    def page_crawled(self, response, links):
+        try:
+            if response.request.isSeed == True:
+                self.allowed_domains.append(response.request.meta['domain']['netloc'])
+                self.host_regex = self.get_host_regex()
+        except Exception as e:
+            pass
+        for link in links:
+            fingerprint = link.meta['fingerprint']
+            if fingerprint not in self.manager.backend.requests:
+                should = self.should_follow(link)
+                if not should:
+                    self.manager.backend.requests[link.meta['fingerprint']] = link
+                    # log.msg(message='Filtered offsite '+str(link.url),
+                    #         level=log.DEBUG, domain=str(response.meta['domain']['netloc']), url=str(link.url),
+                    #         module='offsite_filter', filtered=True)
+        return response
+
+
+    def should_follow(self, request):
+        regex = self.host_regex
+        # hostname can be None for wrong urls (like javascript links)
+        host = urlparse_cached(request).hostname or ''
+        return bool(regex.search(host))
+
+    def get_host_regex(self):
+        """Override this method to implement a different offsite policy"""
+        allowed_domains = self.allowed_domains
+        if not allowed_domains:
+            return re.compile('')  # allow all by default
+        regex = r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains if d is not None)
+        return re.compile(regex)
+
diff --git a/crawlfrontier/contrib/middlewares/FilterHardStopwordsMiddleware.py b/crawlfrontier/contrib/middlewares/FilterHardStopwordsMiddleware.py
new file mode 100644
index 000000000..2564d18e9
--- /dev/null
+++ b/crawlfrontier/contrib/middlewares/FilterHardStopwordsMiddleware.py
@@ -0,0 +1,62 @@
+import re
+from scrapy import signals
+from scrapy.http import Request
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy import log
+from aluanabot.settings import *
+from aluanabot.classes.Log.DomainLog import DomainLog
+from aluanabot.classes.StopwordsFilter.HardStopwordsFilter import HardStopwordsFilter
+from crawlfrontier import Middleware
+from crawlfrontier.exceptions import NotConfigured
+from crawlfrontier.utils.misc import load_object
+from crawlfrontier.utils.url import canonicalize_url
+
+class FilterHardStopwordsMiddleware(Middleware):
+
+    HARD_STOP_LIST = []
+    fingerprint_function_name = ''
+
+    def __init__(self, manager):
+        self.manager = manager
+        self.stop_words_filter = HardStopwordsFilter(HARD_STOPWORDS_FILTER)
+
+
+    @classmethod
+    def from_manager(cls, manager):
+        return cls(manager)
+
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        pass
+
+    def add_seeds(self, seeds):
+        return seeds
+
+    def request_error(self, request, error):
+        pass
+
+
+    def page_crawled(self, response, links):
+        for link in links:
+            fingerprint = link.meta['fingerprint']
+            if fingerprint not in self.manager.backend.requests:
+                should, token, term, u = self.should_follow(link)
+                if not should:
+                    self.manager.backend.requests[link.meta['fingerprint']] = link
+                    log.msg(message='Filtered hard stopword request to '+str(link.url),
+                            level=log.DEBUG, domain=str(response.meta['domain']['netloc']), url=str(link.url),
+                            module='hard_stopwords', token=token, term=term, filter_url=u, filtered=True)
+        return response
+
+
+
+    def should_follow(self, request):
+        result, token, term, u = self.stop_words_filter.url_matched(request)
+        if result == True:
+            return False, token, term, u
+        else:
+            return True, token, term, u
+
diff --git a/crawlfrontier/contrib/middlewares/FilterSoftStopwordsMiddleware.py b/crawlfrontier/contrib/middlewares/FilterSoftStopwordsMiddleware.py
new file mode 100644
index 000000000..6444ee5a9
--- /dev/null
+++ b/crawlfrontier/contrib/middlewares/FilterSoftStopwordsMiddleware.py
@@ -0,0 +1,57 @@
+import re
+from scrapy import signals
+from scrapy.http import Request
+from scrapy.utils.httpobj import urlparse_cached
+from scrapy import log
+from aluanabot.classes.Log.DomainLog import DomainLog
+from aluanabot.classes.StopwordsFilter.SoftStopwordsFilter import SoftStopwordsFilter
+from aluanabot.settings import *
+from crawlfrontier import Middleware
+
+
+class FilterSoftStopwordsMiddleware(Middleware):
+
+    SOFT_WHITE_STOP_LIST = []
+    SOFT_BLACK_STOP_LIST = []
+
+    def __init__(self, manager):
+        self.manager = manager
+        self.stop_words_filter = SoftStopwordsFilter(soft_white=SOFT_STOPWORDS_FILTER_WHITE, soft_black=SOFT_STOPWORDS_FILTER_BLACK)
+
+    @classmethod
+    def from_manager(cls, manager):
+        return cls(manager)
+
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        pass
+
+    def add_seeds(self, seeds):
+        return seeds
+
+    def request_error(self, request, error):
+        pass
+
+    def page_crawled(self, response, links):
+        for link in links:
+            fingerprint = link.meta['fingerprint']
+            if fingerprint not in self.manager.backend.requests:
+                should = self.should_follow(link)
+                if not should:
+                    self.manager.backend.requests[link.meta['fingerprint']] = link
+                    log.msg(message='Filtered soft stopword request to '+str(link.url),
+                            level=log.DEBUG, domain=str(response.meta['domain']['netloc']), url=str(link.url),
+                            module='soft_stopwords', filtered=True)
+
+        return response
+
+    def should_follow(self, request):
+        result = self.stop_words_filter.url_matched(request.url)
+        if result:
+            return False
+        else:
+            return True
+
diff --git a/crawlfrontier/contrib/middlewares/referer.py b/crawlfrontier/contrib/middlewares/referer.py
new file mode 100644
index 000000000..df6faf4c1
--- /dev/null
+++ b/crawlfrontier/contrib/middlewares/referer.py
@@ -0,0 +1,35 @@
+"""
+RefererMiddleware: populates Request referer field, based on the Response which
+originated it.
+"""
+
+from scrapy.http import Request
+from scrapy.exceptions import NotConfigured
+from crawlfrontier import Middleware
+
+
+class RefererMiddleware(Middleware):
+
+    def __init__(self, manager):
+        self.manager = manager
+
+    @classmethod
+    def from_manager(cls, manager):
+        return cls(manager)
+
+    def frontier_start(self):
+        pass
+
+    def frontier_stop(self):
+        pass
+
+    def add_seeds(self, seeds):
+        return seeds
+
+    def page_crawled(self, response, links):
+        for link in links:
+            link.headers.setdefault('Referer', response.url)
+        return response
+
+    def request_error(self, request, error):
+        return request
\ No newline at end of file
diff --git a/crawlfrontier/contrib/middlewares/upload_graph.py b/crawlfrontier/contrib/middlewares/upload_graph.py
new file mode 100644
index 000000000..4adb3c8ee
--- /dev/null
+++ b/crawlfrontier/contrib/middlewares/upload_graph.py
@@ -0,0 +1,46 @@
+import re
+
+from crawlfrontier.core.components import Middleware
+from crawlfrontier import graphs
+
+class UploadGraphMiddleware(Middleware):
+
+    component_name = 'Upload Graph Middleware'
+
+    def __init__(self, manager):
+        self.manager = manager
+
+    @classmethod
+    def from_manager(cls, manager):
+        return cls(manager)
+
+
+    def frontier_start(self):
+        pass
+
+
+    def frontier_stop(self):
+        print '**************************************************************'
+        print '**************************************************************'
+        print '**************************************************************'
+        print '**************************************************************'
+        print '**************************************************************'
+        print '**************************************************************'
+        g = graphs.Manager(engine='sqlite:///my_record.db')
+        g.render(filename='graph.png', label='A simple Graph', fontsize=15, node_fontsize=8, node_height=2, node_width=2, node_fixedsize=False)
+        pass
+
+
+
+    def add_seeds(self, seeds):
+        print 1
+        return seeds
+
+    def page_crawled(self, response, links):
+        print 2
+        return response
+
+    def request_error(self, request, error):
+        print 3
+        return request
+
diff --git a/crawlfrontier/contrib/scrapy/middlewares/frontier.py b/crawlfrontier/contrib/scrapy/middlewares/frontier.py
index fad71d20a..7a608becd 100644
--- a/crawlfrontier/contrib/scrapy/middlewares/frontier.py
+++ b/crawlfrontier/contrib/scrapy/middlewares/frontier.py
@@ -1,6 +1,6 @@
 from twisted.internet.error import DNSLookupError, TimeoutError
 from twisted.internet.task import LoopingCall
-from scrapy.exceptions import NotConfigured, DontCloseSpider
+from scrapy.exceptions import NotConfigured, DontCloseSpider, CloseSpider
 from scrapy.http import Request
 from scrapy import signals
 
@@ -18,6 +18,7 @@ class CrawlFrontierSpiderMiddleware(object):
 
     def __init__(self, crawler, stats):
+        self.idle_counter = 0
         self.crawler = crawler
         self.stats = stats
 
@@ -75,6 +76,7 @@ def process_spider_output(self, response, result, spider):
         self.frontier.page_crawled(scrapy_response=response,
                                    scrapy_links=links)
         self._remove_queued_request(response.request)
+        self.idle_counter = 0
 
     def download_error(self, request, exception, spider):
         # TODO: Add more errors...
@@ -87,14 +89,19 @@ def download_error(self, request, exception, spider):
         self._remove_queued_request(request)
 
     def spider_idle(self, spider):
-        if not self.frontier.manager.finished:
+        if self.idle_counter > 3:
+            raise CloseSpider()
+        elif not self.frontier.manager.finished:
+            self.idle_counter += 1
             raise DontCloseSpider()
 
+
     def _schedule_next_requests(self, spider):
         n_scheduled = len(self.queued_requests)
         if not self.frontier.manager.finished and n_scheduled < self.scheduler_concurrent_requests:
             n_requests_gap = self.scheduler_concurrent_requests - n_scheduled
             next_pages = self._get_next_requests(n_requests_gap)
+
             for request in next_pages:
                 self.crawler.engine.crawl(request, spider)
diff --git a/crawlfrontier/contrib/scrapy/middlewares/recording.py b/crawlfrontier/contrib/scrapy/middlewares/recording.py
index 33c110696..761e0801f 100644
--- a/crawlfrontier/contrib/scrapy/middlewares/recording.py
+++ b/crawlfrontier/contrib/scrapy/middlewares/recording.py
@@ -54,6 +54,7 @@ def process_spider_output(self, response, result, spider):
                 link = self.graph.add_link(page=page, url=request.url)
                 request.meta['page'] = link
                 request.meta['referer'] = page
+                request.meta['anchor'] = ''
             yield request
 
     def spider_closed(self, spider, reason):
@@ -63,6 +64,9 @@ def spider_closed(self, spider, reason):
         self.graph.session.query(graphs.Page).filter_by(status=None).delete()
         self.graph.save()
 
+        # g = graphs.Manager(engine='sqlite:///my_record.db')
+        # g.render(filename='/information-retrieval-bot/aluanabot/graphs/'+spider.domain.replace(':', '_').replace('.', '_').replace('/', '_')+'.png', label=spider.domain, fontsize=15, node_fontsize=8, node_height=2, node_width=2, node_fixedsize=False)
+
     def _get_url(self, response):
         return response.meta['redirect_urls'][0] if 'redirect_urls' in response.meta else response.request.url
diff --git a/crawlfrontier/core/manager.py b/crawlfrontier/core/manager.py
index 134d977a2..e22b95bb0 100644
--- a/crawlfrontier/core/manager.py
+++ b/crawlfrontier/core/manager.py
@@ -123,6 +123,7 @@ def from_settings(cls, settings=None):
         :class:`Settings <crawlfrontier.settings.Settings>` object instance. If no settings is given,
         :ref:`frontier default settings <frontier-default-settings>` are used.
         """
+
         manager_settings = Settings(settings)
         return FrontierManager(request_model=manager_settings.REQUEST_MODEL,
                                response_model=manager_settings.RESPONSE_MODEL,
@@ -351,6 +352,7 @@ def page_crawled(self, response, links=None):
 
         :return: None.
         """
+
         self._check_startstop()
         self.logger.manager.debug(self._msg('PAGE_CRAWLED url=%s status=%s links=%s' %
                                             (response.url, response.status_code, len(links) if links else 0)))
diff --git a/requirements.txt b/requirements.txt
index e99566148..1fde6f079 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,24 @@
+#-------------------------------
+# crawl_frontier
+#-------------------------------
+SQLAlchemy==0.9.8
 six>=1.8.0
 w3lib>=1.10.0
 tldextract>=1.5.1
+colorlog==2.4.0
+
+#-------------------------------
+# graph diagrams
+#-------------------------------
+pyparsing==1.5.7
+
+--allow-external pydot
+--allow-unverified pydot
+pydot==1.0.28
+
+
+#-------------------------------
+# mongodb backend
+#-------------------------------
+pymongo==2.7.2
+
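
A minimal usage sketch for the new MongoDB backend, for reference. The BACKEND_MONGO_* names are exactly the settings read in MongodbBackend.__init__; the BACKEND dotted path and the Settings/FrontierManager.from_settings wiring are assumed to follow the usual crawl-frontier conventions and are not part of this patch.

    # Hypothetical sketch (not part of the diff above): enable the FIFO MongoDB backend.
    from crawlfrontier import FrontierManager, Settings

    settings = Settings()
    settings.BACKEND = 'crawlfrontier.contrib.backends.mongo.FIFO'    # assumed path; FIFO aliases MongodbFIFOBackend
    settings.BACKEND_MONGO_IP = 'localhost'           # required; NotConfigured is raised if missing
    settings.BACKEND_MONGO_PORT = 27017               # required; NotConfigured is raised if missing
    settings.BACKEND_MONGO_DB_NAME = 'frontier'       # optional; defaults to a random 10-character name
    settings.BACKEND_MONGO_COLLECTION_NAME = 'links'  # optional; defaults to 'links'
    settings.BACKEND_MONGO_PERSIST_INFO = False       # optional; when False, frontier_stop() drops the random DB

    frontier = FrontierManager.from_settings(settings)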
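
The offsite check in OffsiteMiddleware reduces to the single anchored regex built by get_host_regex. Below is a small self-contained illustration of what that pattern accepts; the hostnames are invented for the example, and the hard-coded 'amazonaws.com' entry mirrors the allowed_domains default above.

    import re

    # Same pattern as OffsiteMiddleware.get_host_regex(): a host is on-site if it
    # equals an allowed domain or ends with '.' plus an allowed domain.
    allowed_domains = ['amazonaws.com']
    host_regex = re.compile(r'^(.*\.)?(%s)$' % '|'.join(re.escape(d) for d in allowed_domains if d is not None))

    assert host_regex.search('amazonaws.com')               # exact domain allowed
    assert host_regex.search('bucket.s3.amazonaws.com')     # any subdomain allowed
    assert not host_regex.search('evil-amazonaws.com')      # lookalike hosts rejected
    assert not host_regex.search('amazonaws.com.evil.net')  # suffix tricks rejected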