Here are examples of the Python API scrapy.linkextractors.LinkExtractor taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
21 Examples
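Before the project examples, a minimal sketch of the pattern they all share: construct a LinkExtractor, call its extract_links() method on a response, and yield a new Request for each extracted link. The spider name, start URL, and selector values below are placeholders for illustration and are not taken from any of the listed projects.

import scrapy
from scrapy.linkextractors import LinkExtractor

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # restrict_css and allow are optional filters; the values here are hypothetical
        extractor = LinkExtractor(restrict_css='ul.nav', allow=r'/articles/')
        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)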
3
View Complete Implementation : ip66.py
Copyright Apache License 2.0
Author : aox-lei
def parse(self, response):
    link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
    links = link.extract_links(response)
    for _link in links:
        # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
        yield scrapy.Request(_link.url, callback=self.parse_list)
3
View Complete Implementation : single.py
Copyright MIT License
Author : invanalabs
def generate_spider_kwargs(self):
    extractor = LinkExtractor()
    rules = [
        Rule(extractor, follow=True)  # TODO - add regex types if needed.
    ]
    print(self.manifest)
    spider_kwargs = {
        "start_urls": self.spider_config['start_urls'],
        "allowed_domains": [],
        "rules": rules,
        "spider_config": self.spider_config,
        "manifest": self.manifest,
        "context": self.context,
        # "default_storage":
    }
    spider_kwargs.update(self.extra_arguments)
    return spider_kwargs
3
View Complete Implementation : web.py
Copyright MIT License
Author : ONSBigData
def __init__(self, *args, **kwargs):
    super(WebSpider, self).__init__(*args, **kwargs)
    self.file_path = "seeds.txt"
    self.whitelist = ['csr', 'environment', 'sustainab', 'responsib', 'footprint']
    self.blacklist = ['document', 'blog', 'product', 'news', 'press', 'archive', 'search', 'login']
    self.extractor = LinkExtractor()
3
View Complete Implementation : followall.py
Copyright MIT License
Author : scrapy
def __init__(self, book_url=None, **kw):
    super(FollowAllSpider, self).__init__(**kw)
    url = book_url
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    self.url = url
    self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()
3
View Complete Implementation : broadspider.py
Copyright MIT License
Author : scrapy
def __init__(self, **kw):
    super(BroadBenchSpider, self).__init__(**kw)
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()
    self.start_urls = [
        'http://domain{}:{}/index.html'.format(i, self.port) for i in range(1, self.n_domains + 1)]
0
View Complete Implementation : samakal.py
Copyright MIT License
Author : banglakit
def request_index(self, response):
    categories = list(set(response.css('#topMenuItem a::attr("href")').re('/([^\/]+)/$')))
    if self.category is not None:
        if self.category in categories:
            categories = [self.category]
        else:
            raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))
    date_processing = self.start_date
    while date_processing <= self.end_date:
        for category in categories:
            # redefining the rule again according to the specific date url
            SamakalSpider.rules = (Rule(LinkExtractor(allow=('/' + date_processing.strftime('%Y/%m/%d') + '/\d+$',),
                                                      restrict_xpaths=('//div[@class="main-body"]')),
                                        callback="parse_content", follow=True),)
            super(SamakalSpider, self)._compile_rules()
            # http://bangla.samakal.net/-education/2016/06/01
            url = 'http://bangla.samakal.net/{0}/{1}'.format(
                category,
                date_processing.strftime('%Y/%m/%d')
            )
            yield self.make_requests_from_url(url)
        date_processing += datetime.timedelta(days=1)
0
View Complete Implementation : publications.py
Copyright Mozilla Public License 2.0
Author : code4romania
def parse(self, response):
    for li_item in response.css('#content div.entry-content ul.lcp_catlist li'):
        title = li_item.css('h3.lcp_post a::text').extract_first().strip()
        text_date = li_item.css('::text').extract_first().strip()
        try:
            date_obj = datetime.datetime.strptime(text_date, '%d %B %Y')
            date = date_obj.date().isoformat()
        except ValueError:
            date = None
        paragraphs = li_item.xpath('p').xpath("string()").extract()
        description = '\n'.join(paragraphs)
        feedback_days = None
        feedback_date = self.get_feedback_date(description)
        if feedback_date:
            days_diff = feedback_date - date_obj
            feedback_days = days_diff.days
        links = li_item.css('a')
        documents = self.get_documents_from_links(links)
        item = JustPublication(
            title=title,
            type=self.get_type(title),
            identifier=self.slugify(title)[0:127],
            date=date,
            institution='justitie',
            description=description,
            documents=documents,
            contact=self.get_contacts(description),
            feedback_days=feedback_days
        )
        yield item
    paginationLinkEx = LinkExtractor(restrict_css='ul.lcp_paginator')
    pages = paginationLinkEx.extract_links(response)
    for page in pages:
        yield scrapy.Request(page.url, callback=self.parse)
    pass
0
View Complete Implementation : tineret.py
Copyright Mozilla Public License 2.0
Author : code4romania
def parse(self, response):
    articleLinks = LinkExtractor(restrict_css='div.main > div.article')
    pages = articleLinks.extract_links(response)
    for page in pages:
        yield scrapy.Request(page.url, callback=self.parse_article)
0
View Complete Implementation : crawlpy_spider.py
Copyright MIT License
Author : cytopia
def __init__(self, *args, **kwargs):
    """Constructor: overwrite parent __init__ function"""

    # Call parent init
    super(CrawlpySpider, self).__init__(*args, **kwargs)

    # Get command line arg provided configuration param
    config_file = kwargs.get('config')

    # Validate configuration file parameter
    if not config_file:
        logging.error('Missing argument "-a config"')
        logging.error('Usage: scrapy crawl crawlpy -a config=/path/to/config.json')
        self.abort = True

    # Check if it is actually a file
    elif not os.path.isfile(config_file):
        logging.error('Specified config file does not exist')
        logging.error('Not found in: "' + config_file + '"')
        self.abort = True

    # All good, read config
    else:
        # Load json config
        fpointer = open(config_file)
        data = fpointer.read()
        fpointer.close()

        # convert JSON to dict
        config = json.loads(data)

        # fill in default values for missing values
        self.config = dict()
        self.config['proto'] = str(config.get('proto', self.config_defaults['proto']))
        self.config['domain'] = str(config.get('domain', self.config_defaults['domain']))
        self.config['depth'] = int(config.get('depth', self.config_defaults['depth']))
        self.config['ignores'] = config.get('ignores', self.config_defaults['ignores'])
        self.config['httpstatus_list'] = config.get('httpstatus_list', self.config_defaults['httpstatus_list'])
        self.config['login'] = dict()
        self.config['login']['enabled'] = bool(config.get('login', dict()).get('enabled', self.config_defaults['login']['enabled']))
        self.config['login']['method'] = str(config.get('login', dict()).get('method', self.config_defaults['login']['method']))
        self.config['login']['action'] = str(config.get('login', dict()).get('action', self.config_defaults['login']['action']))
        self.config['login']['failure'] = str(config.get('login', dict()).get('failure', self.config_defaults['login']['failure']))
        self.config['login']['fields'] = config.get('login', dict()).get('fields', self.config_defaults['login']['fields'])
        self.config['login']['csrf'] = dict()
        self.config['login']['csrf']['enabled'] = bool(config.get('login', dict()).get('csrf', dict()).get('enabled', self.config_defaults['login']['csrf']['enabled']))
        self.config['login']['csrf']['field'] = str(config.get('login', dict()).get('csrf', dict()).get('field', self.config_defaults['login']['csrf']['field']))
        self.config['store'] = dict()
        self.config['store']['enabled'] = bool(config.get('store', dict()).get('enabled', self.config_defaults['store']['enabled']))
        self.config['store']['path'] = str(config.get('store', dict()).get('path', self.config_defaults['store']['path']))

        logging.info('Merged configuration:')
        logging.info(self.config)

        # Set scrapy globals
        self.allowed_domains = [self.config['domain']]
        self.start_urls = [self.config['proto'] + '://' + self.config['domain'] + '/']
        self.rules = (
            Rule(
                LinkExtractor(
                    allow_domains=(self.allowed_domains),
                    unique=True,
                    deny=tuple(self.config['ignores']),
                ),
                callback='parse',
                follow=True
            ),
        )

        # Handle more status codes
        self.handle_httpstatus_list = self.config['httpstatus_list']

        # Overwrite built-in crawling depth via own config file.
        # Make sure to add +1 if we do a login (which counts as 1 level).
        # The value is handled by a custom middleware (MyDepthMiddleware)
        # and passed on to the normal middleware (DepthMiddleware).
        if self.config['login']['enabled'] and self.config['depth'] != 0:
            self.max_depth = self.config['depth'] + 1
        else:
            self.max_depth = self.config['depth']

        # Set misc globals
        self.base_url = self.config['proto'] + '://' + self.config['domain']
        self.login_url = self.config['proto'] + '://' + self.config['domain'] + \
            self.config['login']['action']
0
View Complete Implementation : url.py
Copyright GNU General Public License v3.0
Author : IUNetSci
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for PageSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of URLs of the site.
    href_xpaths : list
        A list of XPath expressions indicating the ancestors of the `<a>`
        elements.
    url_regex : string
        URL pattern regular expression.

    If you use this spider to store items into a database, additional
    keywords are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of a SQLAlchemy session.
    """
    self.session = kwargs.pop('session', None)
    self.platform_id = kwargs.pop('platform_id', None)
    self.href_xpaths = kwargs.pop('href_xpaths', ())
    self.url_regex = kwargs.pop('url_regex', None)
    self.start_urls = urls
    self.allowed_domains = domains
    self.link_extractor = LinkExtractor(
        allow_domains=self.allowed_domains,
        restrict_xpaths=self.href_xpaths,
        unique=True)
    super(PageSpider, self).__init__(*args, **kwargs)
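A hedged usage sketch for the constructor above (not part of the project's own code): Scrapy forwards extra keyword arguments of CrawlerProcess.crawl() to the spider constructor, so a spider like this PageSpider could be started roughly as follows. The domain, URL, and XPath values are placeholder assumptions, as is the import location of PageSpider.

from scrapy.crawler import CrawlerProcess
# from url import PageSpider  # assumed module path; adjust to the project layout

process = CrawlerProcess()
process.crawl(
    PageSpider,
    domains=['example.com'],                   # hypothetical allowed domain
    urls=['http://example.com/'],              # hypothetical start URL
    href_xpaths=['//div[@class="content"]'],   # only extract links under these ancestors
)
process.start()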