Here are examples of the Python API scrapy.utils.project.get_project_settings, taken from open source projects.
27 Examples
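Before the project examples, a minimal sketch of the call itself (assuming it runs inside a Scrapy project, i.e. somewhere below a scrapy.cfg, or with SCRAPY_SETTINGS_MODULE pointing at a settings module): get_project_settings() returns a populated scrapy.settings.Settings object with dict-style and typed accessors.

from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# Settings supports dict-style lookups and typed accessors.
print(settings['BOT_NAME'])
print(settings.getbool('ROBOTSTXT_OBEY'))
print(settings.getint('CONCURRENT_REQUESTS'))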
View Complete Implementation: scrapyctl.py
Copyright GNU General Public License v3.0
Author: aplanas
def _get_settings(self):
    """Return the current scrapy settings."""
    if 'SCRAPY_SETTINGS_MODULE' not in os.environ:
        _s = settings.SCRAPY_SETTINGS_MODULE
        os.environ['SCRAPY_SETTINGS_MODULE'] = _s
    return get_project_settings()
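get_project_settings() honours the SCRAPY_SETTINGS_MODULE environment variable, which is why the snippet above exports it before delegating to Scrapy; when the variable is unset, Scrapy locates the settings module through the nearest scrapy.cfg instead.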
View Complete Implementation: run_spider.py
Copyright MIT License
Author: awolfly9
import logging

from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,  # assumes a log/ directory exists
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))
    logging.debug('finish this spider:%s\n\n' % name)
View Complete Implementation: full_analysis.py
Copyright GNU Lesser General Public License v3.0
Author: awolfly9
def runspider(self):
    configure_logging(install_root_handler=False)
    s = get_project_settings()
    runner = CrawlerRunner(settings=s)

    @defer.inlineCallbacks
    def crawl(**spargs):
        yield runner.crawl(JDItemInfoSpider, **spargs)
        yield runner.crawl(JDCommentSpider, **spargs)
        reactor.stop()

    crawl(**self.spargs)
    reactor.run()  # the script will block here until the last crawl call is finished
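Unlike CrawlerProcess in the previous example, CrawlerRunner does not install or run the Twisted reactor itself: the caller starts it with reactor.run() and stops it once the last crawl's Deferred fires, which is what makes chaining several spiders sequentially with defer.inlineCallbacks possible.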
View Complete Implementation: aiqiyi_spider.py
Copyright GNU General Public License v3.0
Author: czs0x55aa
def __init__(self):
    scrapy.spiders.Spider.__init__(self)
    self.global_settings = get_project_settings()
    # Use a bundled PhantomJS binary on win/mac, the one on PATH on linux.
    if self.global_settings['PLATFORM'] in ['win', 'mac']:
        self.driver = webdriver.PhantomJS(executable_path=self.global_settings['PHANTOMJS_PATH'])
    elif self.global_settings['PLATFORM'] in ['linux']:
        self.driver = webdriver.PhantomJS()
    self.driver.set_page_load_timeout(30)
    self.driver.implicitly_wait(10)
    self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
    self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
    self.url_template = self.global_settings['CRAWLER']['url_template']
View Complete Implementation: parser.py
Copyright MIT License
Author: Gerapy
def get_follow_requests_and_items(project_path, spider_name, args):
    """
    Run the spider parser and collect its follow requests and items.
    :param project_path: path of the Scrapy project
    :param spider_name: name of the spider to parse with
    :param args: extra arguments passed to the parser
    :return: parse results
    """
    work_cwd = os.getcwd()
    try:
        # get_project_settings() resolves the settings module relative to
        # the cwd (via scrapy.cfg), so temporarily switch into the project.
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd)
View Complete Implementation: crawler.py
Copyright MIT License
Author: Karmenzind
def init_crawler_runner():
    # Crochet runs the Twisted reactor in a background thread.
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)
View Complete Implementation: proxy.py
Copyright MIT License
Author: matejbasic
def import_settings(self):
    settings = get_project_settings()
    self.password = settings['AUTH_PASSWORD']
    self.http_proxy = settings['HTTP_PROXY']
    self.control_port = settings['CONTROL_PORT']
    self.max_req_per_ip = settings['MAX_REQ_PER_IP']
    self.exit_nodes = settings['EXIT_NODES']
    if self.exit_nodes:
        # Tell the local Tor daemon which exit nodes to use.
        with Controller.from_port(port=self.control_port) as controller:
            controller.authenticate(self.password)
            controller.set_conf('ExitNodes', self.exit_nodes)
            controller.close()
View Complete Implementation: run.py
Copyright MIT License
Author: matejbasic
from twisted.internet import defer, reactor

from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


def run():
    configure_logging()
    # import the project settings, mainly for the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # run the spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last crawl call is finished
View Complete Implementation: cli.py
Copyright GNU Affero General Public License v3.0
Author: PyFeeds
@cli.command()
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings["LOG_ENABLED"] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
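Setting LOG_ENABLED to False before constructing the CrawlerProcess keeps Scrapy's log output from interleaving with the listing; process.spider_loader.list() returns the names of every spider the project's SPIDER_MODULES expose.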
View Complete Implementation: launcher.py
Copyright MIT License
Author: Tiago-Lira
def _set_timeout(self, process):
    with runner.project_environment(process.project):
        loader = _get_spider_loader(get_project_settings())
        spider = loader.load(process.spider)
        timeout = getattr(spider, 'timeout', None)
        if timeout:
            timeout = int(spider.timeout)
            log.msg('Spider has timeout of {} min'.format(timeout))
            reactor.callLater(
                timeout * 60, self.terminate_process, process)