scrapy.crawler.CrawlerRunner - Python examples

Here are examples of the Python API scrapy.crawler.CrawlerRunner taken from open source projects.
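Before the project examples, here is a minimal, self-contained sketch of the canonical CrawlerRunner pattern as described in the Scrapy documentation: create a runner, schedule a crawl, and stop the Twisted reactor when the returned Deferred fires. QuotesSpider is a stand-in spider for illustration, not taken from any of the projects below.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import Spider
from scrapy.utils.log import configure_logging

class QuotesSpider(Spider):
    # stand-in spider for illustration
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com']

    def parse(self, response):
        for text in response.css('span.text::text').getall():
            yield {'text': text}

configure_logging()
runner = CrawlerRunner()
d = runner.crawl(QuotesSpider)
d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl ends
reactor.run()  # blocks until the crawl is finished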

14 Examples

Example 1: full_analysis.py (License: GNU Lesser General Public License v3.0, Author: awolfly9)
    def runspider(self):
        configure_logging(install_root_handler=False)
        s = get_project_settings()
        runner = CrawlerRunner(settings=s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(JDItemInfoSpider, **spargs)
            yield runner.crawl(JDCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

Example 2: parser.py (License: MIT, Author: Gerapy)
    def __init__(self, settings, spider, args):
        """
        init parser
        :param settings:
        :param spider:
        :param args:
        """
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider)
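This example shows that a CrawlerRunner built from project settings also exposes a spider_loader, which resolves spider classes by name. A minimal sketch of the same lookup outside the class ('quotes' is a hypothetical spider name):

from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

runner = CrawlerRunner(get_project_settings())
spidercls = runner.spider_loader.load('quotes')  # raises KeyError for unknown names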

Example 3: crawler.py (License: MIT, Author: Karmenzind)
def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s', CRAWLER_RUNNER)
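The snippet above only initializes the runner; it does not show how a crawl is triggered afterwards. A minimal sketch of one way to do that with crochet's documented API, assuming the module-level CRAWLER_RUNNER above and a hypothetical QuoteSpider: run_in_reactor schedules the crawl on the reactor thread that crochet.setup() started and hands back an EventualResult that ordinary blocking code can wait on.

import crochet

@crochet.run_in_reactor
def schedule_crawl():
    # runs in the reactor thread; returns the crawl's Deferred
    return CRAWLER_RUNNER.crawl(QuoteSpider)

eventual = schedule_crawl()  # returns immediately with an EventualResult
eventual.wait(timeout=60)    # block up to 60 seconds for the crawl to finish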

Example 4: run.py (License: MIT, Author: matejbasic)
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last crawl finishes

Example 5: test.py (License: MIT, Author: wistbean)
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)
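A usage sketch, assuming the helper above (MySpider is any spider class): the returned Crawler is configured but not started, so tests can inspect it before crawling.

from scrapy.spiders import Spider

class MySpider(Spider):
    name = 'my_spider'

crawler = get_crawler(MySpider, {'LOG_LEVEL': 'WARNING'})
assert crawler.spidercls is MySpider
assert crawler.settings.get('LOG_LEVEL') == 'WARNING'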

Example 6: attendance_spider.py (License: MIT, Author: ArionMiles)
def scrape_attendance(username, password, chatID):
    """Run the spider in its own forked process so repeated calls do not hit the ``ReactorNotRestartable`` exception.

    :param username: student's PID (format: yyyNameyyyX)
                     where   X - integers
    :type username: str
    :param password: student's password for the student portal
    :type password: str
    :param chatID: 9-digit unique user ID
    :type chatID: str
    """
    def f(q):
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {'scraper.pipelines.LecturePipeline': 300,
                                   'scraper.pipelines.PracticalPipeline': 400,
                                   'scraper.pipelines.AttendanceScreenshotPipeline': 500},

                'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                           'scrapy_splash.SplashMiddleware': 725,
                                           'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810},

                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(AttendanceSpider, username=username, password=password, chatID=chatID)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result
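The next three examples repeat this pattern almost verbatim. A hedged generalization (names are illustrative, not from the project): fork a child process per crawl so each call gets a fresh Twisted reactor, and funnel any exception back to the parent through a Queue.

from multiprocessing import Process, Queue
from twisted.internet import reactor
from scrapy import crawler

def run_spider_in_process(spider_cls, settings, **spider_kwargs):
    def target(q):
        try:
            runner = crawler.CrawlerRunner(settings)
            d = runner.crawl(spider_cls, **spider_kwargs)
            d.addBoth(lambda _: reactor.stop())  # stop this process's reactor
            reactor.run()
            q.put(None)  # signal success
        except Exception as e:
            q.put(e)     # ship the exception to the parent

    q = Queue()
    p = Process(target=target, args=(q,))
    p.start()
    error = q.get()
    p.join()
    if error is not None:
        raise error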

Example 7: itinerary_spider.py (License: MIT, Author: ArionMiles)
def scrape_itinerary(username, dob, chatID, uncropped=False):
    """Run the spider in its own forked process so repeated calls do not hit the ``ReactorNotRestartable`` exception.

    :param username: student's PID (format: yyyNameyyyX)
                     where   X - integers
    :type username: str
    :param dob: user's date of birth
    :type dob: str
    :param chatID: 9-digit unique user ID
    :type chatID: str
    :param uncropped: whether the user wants the full report or only the last 7-8 days
    :type uncropped: bool
    """
    def f(q):
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {'scraper.pipelines.ItineraryScreenshotPipeline': 300},

                'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                           'scrapy_splash.SplashMiddleware': 725,
                                           'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810},

                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(ItinerarySpider, username=username, dob=dob, chatID=chatID, uncropped=uncropped)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

Example 8: profile_spider.py (License: MIT, Author: ArionMiles)
def scrape_profile(username, password, chatID):
    """Run the spider in its own forked process so repeated calls do not hit the ``ReactorNotRestartable`` exception.

    :param username: student's PID (format: yyyNameyyyX)
                     where   X - integers
    :type username: str
    :param password: student's password for the student portal
    :type password: str
    :param chatID: 9-digit unique user ID
    :type chatID: str
    """
    def f(q):
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {'scraper.pipelines.ProfileScreenshotPipeline': 300},

                'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                           'scrapy_splash.SplashMiddleware': 725,
                                           'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810},

                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(ProfileSpider, username=username, password=password, chatID=chatID)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

Example 9: results_spider.py (License: MIT, Author: ArionMiles)
def scrape_results(username, password, chatID):
    """Run the spider in its own forked process so repeated calls do not hit the ``ReactorNotRestartable`` exception.

    :param username: student's PID (format: yyyNameyyyX)
                     where   X - integers
    :type username: str
    :param password: student's password for the student portal
    :type password: str
    :param chatID: 9-digit unique user ID
    :type chatID: str
    """
    def f(q):
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {'scraper.pipelines.ResultsScreenshotPipeline': 300},

                'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                           'scrapy_splash.SplashMiddleware': 725,
                                           'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810},

                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(ResultsSpider, username=username, password=password, chatID=chatID)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)
        except Exception as e:
            q.put(e)

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()

    if result is not None:
        raise result

Example 10: test_middlewares.py (License: MIT, Author: clemfromspace)
    def setUp(self):
        """Store the Scrapy runner to use in the tests"""

        self.runner = CrawlerRunner()
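A minimal sketch of how such a runner is typically exercised under twisted.trial, which drives the reactor for the test: runner.crawl() returns a Deferred that an inlineCallbacks test method can yield. The no-op spider is illustrative, not from the project.

from twisted.internet import defer
from twisted.trial.unittest import TestCase
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import Spider

class NoRequestsSpider(Spider):
    # finishes immediately: no start URLs, so no requests are scheduled
    name = 'no_requests'
    start_urls = []

class CrawlTestCase(TestCase):
    def setUp(self):
        self.runner = CrawlerRunner()

    @defer.inlineCallbacks
    def test_crawl_completes(self):
        yield self.runner.crawl(NoRequestsSpider)  # Deferred fires when the crawl ends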