Here are the examples of the python api scrapy.crawler.CrawlerRunner taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
14 Examples
3
View Complete Implementation : full_analysis.py
Copyright GNU Lesser General Public License v3.0
Author : awolfly9
def runspider(self):
    """Crawl the JD item-info spider and then the JD comment spider, one
    after the other, blocking until both finish."""
    configure_logging(install_root_handler=False)
    project_settings = get_project_settings()
    runner = CrawlerRunner(settings=project_settings)

    @defer.inlineCallbacks
    def _crawl_sequence(**spargs):
        # Sequential crawls: the second spider starts only after the first
        # deferred fires; stop the reactor once both are done.
        yield runner.crawl(JDItemInfoSpider, **spargs)
        yield runner.crawl(JDCommentSpider, **spargs)
        reactor.stop()

    _crawl_sequence(**self.spargs)
    # The script blocks here until the last crawl call is finished.
    reactor.run()
3
View Complete Implementation : parser.py
Copyright MIT License
Author : Gerapy
def __init__(self, settings, spider, args):
    """Initialize the parser.

    :param settings: Scrapy settings passed to the ``CrawlerRunner``
    :param spider: name of the spider to resolve via the spider loader
    :param args: extra arguments stored for later use
    """
    self.spider = spider
    self.args = args
    runner = CrawlerRunner(settings)
    self.crawler_process = runner
    loader = runner.spider_loader
    self.spider_loader = loader
    self.spidercls = loader.load(spider)
3
View Complete Implementation : crawler.py
Copyright MIT License
Author : Karmenzind
def init_crawler_runner():
    """Set up crochet and the scrapy environment, then populate the
    module-level ``CRAWLER_RUNNER`` with a runner built from project settings."""
    global CRAWLER_RUNNER
    crochet.setup()
    init_scrapy_env()
    project_settings = get_project_settings()
    CRAWLER_RUNNER = CrawlerRunner(project_settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)
3
View Complete Implementation : run.py
Copyright MIT License
Author : matejbasic
def run():
    """Run the IP tester and UA tester spiders sequentially (non-distributed)
    and block until both complete."""
    configure_logging()
    # Project settings are imported mainly so the middlewares are applied.
    runner = CrawlerRunner(get_project_settings())

    @defer.inlineCallbacks
    def _sequence():
        # One spider at a time; stop the reactor after the second finishes.
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    _sequence()
    reactor.run()  # block until the last call
3
View Complete Implementation : test.py
Copyright MIT License
Author : wistbean
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    # Fall back to the base Spider class when no spider class was supplied.
    return CrawlerRunner(settings_dict).create_crawler(spidercls or Spider)
0
View Complete Implementation : attendance_spider.py
Copyright MIT License
Author : ArionMiles
def scrape_attendance(username, password, chatID):
    """Run the spider multiple times, without hitting ``ReactorNotRestartable``
    exception. Forks own process.

    NOTE(review): identifiers in the scraped source were mangled by a word
    filter ("password" -> "pastword", "DUPEFILTER_CLASS" -> "DUPEFILTER_CLast");
    both are restored here since Scrapy only recognizes ``DUPEFILTER_CLASS``.

    :param username: student's PID (format: yyyNameyyyX)
        where X - integers
    :type username: str
    :param password: student's password for student portal
    :type password: str
    :param chatID: 9-Digit unique user ID
    :type chatID: str
    """
    def f(q):
        # Runs inside a forked process so the Twisted reactor can be started
        # freshly each time; the queue carries either None (success) or the
        # raised exception back to the parent.
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {
                    'scraper.pipelines.LecturePipeline': 300,
                    'scraper.pipelines.PracticalPipeline': 400,
                    'scraper.pipelines.AttendanceScreenshotPipeline': 500,
                },
                'DOWNLOADER_MIDDLEWARES': {
                    'scrapy_splash.SplashCookiesMiddleware': 723,
                    'scrapy_splash.SplashMiddleware': 725,
                    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                },
                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                # Corrupted key 'DUPEFILTER_CLast' fixed: Scrapy reads
                # 'DUPEFILTER_CLASS' for the dupe-filter implementation.
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(AttendanceSpider, username=username, password=password, chatID=chatID)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)  # signal clean completion
        except Exception as e:
            q.put(e)  # surface the failure to the parent process

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        # Re-raise the child's exception in the parent.
        raise result
0
View Complete Implementation : itinerary_spider.py
Copyright MIT License
Author : ArionMiles
def scrape_itinerary(username, dob, chatID, uncropped=False):
    """Run the spider multiple times, without hitting ``ReactorNotRestartable``
    exception. Forks own process.

    :param username: student's PID (format: yyyNameyyyX)
        where X - integers
    :type username: str
    :param dob: User's Date of Birth
    :type dob: str
    :param chatID: 9-Digit unique user ID
    :type chatID: str
    :param uncropped: Whether the user wants full report or for last 7-8 days
    :type uncropped: bool
    """
    def f(q):
        # Runs inside a forked process so the Twisted reactor can be started
        # freshly each time; the queue carries either None (success) or the
        # raised exception back to the parent.
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {'scraper.pipelines.ItineraryScreenshotPipeline': 300},
                'DOWNLOADER_MIDDLEWARES': {
                    'scrapy_splash.SplashCookiesMiddleware': 723,
                    'scrapy_splash.SplashMiddleware': 725,
                    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                },
                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                # Corrupted key 'DUPEFILTER_CLast' fixed: Scrapy reads
                # 'DUPEFILTER_CLASS' for the dupe-filter implementation.
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(ItinerarySpider, username=username, dob=dob, chatID=chatID, uncropped=uncropped)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)  # signal clean completion
        except Exception as e:
            q.put(e)  # surface the failure to the parent process

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        # Re-raise the child's exception in the parent.
        raise result
0
View Complete Implementation : profile_spider.py
Copyright MIT License
Author : ArionMiles
def scrape_profile(username, password, chatID):
    """Run the spider multiple times, without hitting ``ReactorNotRestartable``
    exception. Forks own process.

    NOTE(review): identifiers in the scraped source were mangled by a word
    filter ("password" -> "pastword", "DUPEFILTER_CLASS" -> "DUPEFILTER_CLast");
    both are restored here since Scrapy only recognizes ``DUPEFILTER_CLASS``.

    :param username: student's PID (format: yyyNameyyyX)
        where X - integers
    :type username: str
    :param password: student's password for student portal
    :type password: str
    :param chatID: 9-Digit unique user ID
    :type chatID: str
    """
    def f(q):
        # Runs inside a forked process so the Twisted reactor can be started
        # freshly each time; the queue carries either None (success) or the
        # raised exception back to the parent.
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {'scraper.pipelines.ProfileScreenshotPipeline': 300},
                'DOWNLOADER_MIDDLEWARES': {
                    'scrapy_splash.SplashCookiesMiddleware': 723,
                    'scrapy_splash.SplashMiddleware': 725,
                    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                },
                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                # Corrupted key 'DUPEFILTER_CLast' fixed: Scrapy reads
                # 'DUPEFILTER_CLASS' for the dupe-filter implementation.
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(ProfileSpider, username=username, password=password, chatID=chatID)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)  # signal clean completion
        except Exception as e:
            q.put(e)  # surface the failure to the parent process

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        # Re-raise the child's exception in the parent.
        raise result
0
View Complete Implementation : results_spider.py
Copyright MIT License
Author : ArionMiles
def scrape_results(username, password, chatID):
    """Run the spider multiple times, without hitting ``ReactorNotRestartable``
    exception. Forks own process.

    NOTE(review): identifiers in the scraped source were mangled by a word
    filter ("password" -> "pastword", "DUPEFILTER_CLASS" -> "DUPEFILTER_CLast");
    both are restored here since Scrapy only recognizes ``DUPEFILTER_CLASS``.

    :param username: student's PID (format: yyyNameyyyX)
        where X - integers
    :type username: str
    :param password: student's password for student portal
    :type password: str
    :param chatID: 9-Digit unique user ID
    :type chatID: str
    """
    def f(q):
        # Runs inside a forked process so the Twisted reactor can be started
        # freshly each time; the queue carries either None (success) or the
        # raised exception back to the parent.
        try:
            runner = crawler.CrawlerRunner({
                'ITEM_PIPELINES': {'scraper.pipelines.ResultsScreenshotPipeline': 300},
                'DOWNLOADER_MIDDLEWARES': {
                    'scrapy_splash.SplashCookiesMiddleware': 723,
                    'scrapy_splash.SplashMiddleware': 725,
                    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                },
                'SPLASH_URL': environ['SPLASH_INSTANCE'],
                'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
                # Corrupted key 'DUPEFILTER_CLast' fixed: Scrapy reads
                # 'DUPEFILTER_CLASS' for the dupe-filter implementation.
                'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
                'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            })
            deferred = runner.crawl(ResultsSpider, username=username, password=password, chatID=chatID)
            deferred.addBoth(lambda _: reactor.stop())
            reactor.run()
            q.put(None)  # signal clean completion
        except Exception as e:
            q.put(e)  # surface the failure to the parent process

    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    result = q.get()
    p.join()
    if result is not None:
        # Re-raise the child's exception in the parent.
        raise result
0
View Complete Implementation : test_middlewares.py
Copyright MIT License
Author : clemfromspace
def setUp(self):
    """Create a fresh Scrapy ``CrawlerRunner`` for each test to use."""
    runner = CrawlerRunner()
    self.runner = runner