Here are examples of the Python API scrapy.crawler.CrawlerProcess taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
34 Examples
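Before the project excerpts below, here is a minimal sketch of the usual pattern: build a CrawlerProcess (typically from the project settings), queue one or more spiders with crawl(), and call start(), which runs the reactor and blocks until all crawls finish. The spider class MySpider and its start URL are placeholders for illustration only, not taken from any of the projects listed.

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class MySpider(scrapy.Spider):
    # Placeholder spider used only for illustration.
    name = 'my_spider'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}

process = CrawlerProcess(get_project_settings())
process.crawl(MySpider)  # crawl() only schedules the spider
process.start()          # start() blocks until crawling has finished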
View Complete Implementation : scrapyctl.py
Copyright GNU General Public License v3.0
Author : aplanas
def __init__(self, accounts, loglevel, remote=False):
    self.accounts = settings.SCRAPY_ACCOUNTS
    if accounts:
        self.accounts.update(accounts)
    self.loglevel = loglevel
    self.settings = self._get_settings()
    # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
    self.settings.set('LOG_LEVEL', loglevel)
    if remote:
        # Configure remote logging and disable the scrapy logging.
        self.settings.set('LOG_ENABLED', False)
        logger = logging.getLogger()
        handler = ScrapySocketHandler(
            'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
        handler.setLevel(loglevel)
        logger.addHandler(handler)
    self.process = CrawlerProcess(self.settings)
View Complete Implementation : run_spider.py
Copyright MIT License
Author : awolfly9
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
View Complete Implementation : single_crawler.py
Copyright Apache License 2.0
Author : fhamborg
def load_crawler(self, crawler, url, ignore_regex):
    """
    Loads the given crawler with the given url.
    :param class crawler: class of the crawler to load
    :param str url: url to start the crawler with
    :param regex ignore_regex: to be able to ignore urls that match this
                               regex code
    """
    self.process = CrawlerProcess(self.cfg.get_scrapy_options())
    self.process.crawl(
        crawler,
        self.helper,
        url=url,
        config=self.cfg,
        ignore_regex=ignore_regex)
View Complete Implementation : scrapyscript.py
Copyright MIT License
Author : jschnurr
def _crawl(self, requests):
    '''
    Parameters:
        requests (Request) - One or more Jobs. All will
                             be loaded into a single invocation of the reactor.
    '''
    self.crawler = CrawlerProcess(self.settings)

    # crawl can be called multiple times to queue several requests
    for req in requests:
        self.crawler.crawl(req.spider, *req.args, **req.kwargs)

    self.crawler.start()
    self.crawler.stop()
    self.results.put(self.items)
View Complete Implementation : crawler.py
Copyright GNU General Public License v3.0
Author : lavalamp-
def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
View Complete Implementation : collector.py
Copyright MIT License
Author : opentrials
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn,
                  http_user=conf['ICTRP_USER'],
                  http_pass=conf['ICTRP_PASS'])
    process.start()
View Complete Implementation : cli.py
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
@cli.command()
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings["LOG_ENABLED"] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
View Complete Implementation : parser.py
Copyright GNU General Public License v2.0
Author : tadata-ru
def download_regions():
    if os.path.exists("data/regions.json"):
        os.remove("data/regions.json")
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)
    process.crawl(RegionSpider)
    process.start()
View Complete Implementation : parser.py
Copyright GNU General Public License v2.0
Author : tadata-ru
def download_dtp():
    if os.path.exists("data/dtp.json"):
        os.remove("data/dtp.json")
    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')
    process = CrawlerProcess(settings)
    process.crawl(DtpSpider)
    process.start()
View Complete Implementation : cli.py
Copyright GNU General Public License v3.0
Author : Wikidata
@click.command()
@click.argument('spider-name', nargs=-1, required=True)
@click.argument('results-dir', type=click.Path(resolve_path=True, file_okay=False))
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)
    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()