scrapy.crawler.CrawlerProcess - Python examples

Here are examples of the Python API scrapy.crawler.CrawlerProcess taken from open source projects.

34 Examples
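For orientation, the pattern shared by the examples below is: build a CrawlerProcess from a Settings object, queue one or more spiders with crawl(), then call start() to run the Twisted reactor until crawling finishes. A minimal sketch, assuming a Scrapy project is on the path and the spider name 'example' is a placeholder:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the surrounding Scrapy project's settings, queue a spider by name,
# and block until the crawl has finished.
process = CrawlerProcess(get_project_settings())
process.crawl('example')  # 'example' is a placeholder spider name
process.start()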

Example: scrapyctl.py
Copyright GNU General Public License v3.0
Author : aplanas
    def __init__(self, accounts, loglevel, remote=False):
        self.accounts = settings.SCRAPY_ACCOUNTS
        if accounts:
            self.accounts.update(accounts)
        self.loglevel = loglevel
        self.settings = self._get_settings()
        # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
        self.settings.set('LOG_LEVEL', loglevel)
        if remote:
            # Configure remote logging and disable the scrapy logging.
            self.settings.set('LOG_ENABLED', False)
            logger = logging.getLogger()
            handler = ScrapySocketHandler(
                'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
            handler.setLevel(loglevel)
            logger.addHandler(handler)

        self.process = CrawlerProcess(self.settings)

Example: run_spider.py
Copyright MIT License
Author : awolfly9
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)

Example: single_crawler.py
Copyright Apache License 2.0
Author : fhamborg
    def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: to be able to ignore urls that match this
                                   regex code
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)

Example: scrapyscript.py
Copyright MIT License
Author : jschnurr
    def _crawl(self, requests):
        '''
        Parameters:
            requests (Request) - One or more Jobs. All will
                                 be loaded into a single invocation of the reactor.
        '''
        self.crawler = CrawlerProcess(self.settings)

        # crawl can be called multiple times to queue several requests
        for req in requests:
            self.crawler.crawl(req.spider, *req.args, **req.kwargs)

        self.crawler.start()
        self.crawler.stop()
        self.results.put(self.items)

Example: crawler.py
Copyright GNU General Public License v3.0
Author : lavalamp-
    def __crawl(self, spider_kwargs=None, settings=None):
        """
        Perform a crawl based on the contents of self._crawling_config.
        :param spider_kwargs: Keyword arguments to use to create a spider class.
        :param settings: Scrapy settings to use to crawl the remote endpoint.
        :return: None
        """
        print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
        config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
        spider = self.get_spider_class_for_domain(**spider_kwargs)
        process = CrawlerProcess(settings)
        process.crawl(spider)
        process.start()

Example: collector.py
Copyright MIT License
Author : opentrials
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn,
        http_user=conf['ICTRP_USER'],
        http_pass=conf['ICTRP_PASS'])
    process.start()

Example: cli.py
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
@cli.command()
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings["LOG_ENABLED"] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)

Example: parser.py
Copyright GNU General Public License v2.0
Author : tadata-ru
def download_regions():
    if os.path.exists("data/regions.json"):
        os.remove("data/regions.json")

    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)

    process.crawl(RegionSpider)
    process.start()

Example: parser.py
Copyright GNU General Public License v2.0
Author : tadata-ru
def download_dtp():
    if os.path.exists("data/dtp.json"):
        os.remove("data/dtp.json")
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)

    process.crawl(DtpSpider)
    process.start()

Example: cli.py
Copyright GNU General Public License v3.0
Author : Wikidata
@click.command()
@click.argument('spider-name', nargs=-1, required=True)
@click.argument('results-dir', type=click.Path(resolve_path=True, file_okay=False))
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()