scrapy.linkextractors.LinkExtractor - python examples

Here are examples of the Python API scrapy.linkextractors.LinkExtractor taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
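
For orientation, here is a minimal, self-contained sketch of the API these examples revolve around: a LinkExtractor is instantiated (optionally with filters such as allow, deny or restrict_css) and its extract_links() method is called on a response to obtain Link objects. The spider name and start URL below are placeholders, not taken from any of the projects listed here.

import scrapy
from scrapy.linkextractors import LinkExtractor


class MinimalSpider(scrapy.Spider):
    # Placeholder name and start URL, for illustration only.
    name = 'minimal'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # Extract every link from the page and schedule it for crawling.
        extractor = LinkExtractor()
        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)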

21 Examples

Example : ip66.py (3 votes)
Copyright Apache License 2.0
Author : aox-lei
    def parse(self, response):
        link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
        links = link.extract_links(response)
        for _link in links:
            # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
            yield scrapy.Request(_link.url, callback=self.parse_list)

Example : single.py (3 votes)
Copyright MIT License
Author : invanalabs
    def generate_spider_kwargs(self):
        extractor = LinkExtractor()
        rules = [
            Rule(extractor, follow=True)  # TODO - add regex types if needed.
        ]
        print(self.manifest)
        spider_kwargs = {
            "start_urls": self.spider_config['start_urls'],
            "allowed_domains": [],
            "rules": rules,
            "spider_config": self.spider_config,
            "manifest": self.manifest,
            "context": self.context,
            # "default_storage":
        }
        spider_kwargs.update(self.extra_arguments)
        return spider_kwargs
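
The rules list built above follows the CrawlSpider convention, where each Rule pairs a LinkExtractor with follow/callback behaviour. As a point of reference, a minimal static version of the same idea might look like the sketch below; the class name, spider name and URL are illustrative assumptions, not part of the project above.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class ExampleCrawlSpider(CrawlSpider):
    # Illustrative names and URL only.
    name = 'example_crawl'
    start_urls = ['http://example.com/']
    rules = (
        # Follow every extracted link and pass each response to parse_item.
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {'url': response.url}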

Example : web.py (3 votes)
Copyright MIT License
Author : ONSBigData
    def __init__(self, *args, **kwargs):
        super(WebSpider, self).__init__(*args, **kwargs)
        
        self.file_path = "seeds.txt"
        self.whitelist = ['csr', 'environment', 'sustainab', 'responsib', 'footprint']
        self.blacklist = ['document', 'blog', 'product', 'news', 'press', 'archive', 'search', 'login']
        self.extractor = LinkExtractor()

Example : followall.py (3 votes)
Copyright MIT License
Author : scrapy
    def __init__(self, book_url=None, **kw):
        super(FollowAllSpider, self).__init__(**kw)

        url = book_url
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow()

Example : broadspider.py (3 votes)
Copyright MIT License
Author : scrapy
    def __init__(self, **kw):
        super(BroadBenchSpider, self).__init__(**kw)

        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow()
        self.start_urls = [
            'http://domain{}:{}/index.html'.format(i, self.port) for i in range(1, self.n_domains + 1)]

Example : samakal.py (0 votes)
Copyright MIT License
Author : banglakit
    def request_index(self, response):
        categories = list(set(response.css('#topMenuItem a::attr("href")').re('/([^\/]+)/$')))

        if self.category is not None:
            if self.category in categories:
                categories = [self.category]
            else:
                raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))

        date_processing = self.start_date
        while date_processing <= self.end_date:
            for category in categories:
                # redefining the rule again according to the specific date URL
                SamakalSpider.rules = (Rule(LinkExtractor(allow=('/' + date_processing.strftime('%Y/%m/%d') + '/\d+$',),
                                                          restrict_xpaths=('//div[@class="main-body"]')),
                                            callback="parse_content", follow=True),)
                super(SamakalSpider, self)._compile_rules()
                # http://bangla.samakal.net/-education/2016/06/01 
                url = 'http://bangla.samakal.net/{0}/{1}'.format(
                    category,
                    date_processing.strftime('%Y/%m/%d')
                )
                yield self.make_requests_from_url(url)
            date_processing += datetime.timedelta(days=1)

Example : publications.py (0 votes)
Copyright Mozilla Public License 2.0
Author : code4romania
    def parse(self, response):
        for li_item in response.css('#content div.entry-content ul.lcp_catlist li'):
            title = li_item.css('h3.lcp_post a::text').extract_first().strip()
            text_date = li_item.css('::text').extract_first().strip()

            try:
                date_obj = datetime.datetime.strptime(text_date, '%d %B %Y')
                date = date_obj.date().isoformat()
            except ValueError:
                date = None

            paragraphs = li_item.xpath('p').xpath("string()").extract()
            description = '\n'.join(paragraphs)

            feedback_days = None
            feedback_date = self.get_feedback_date(description)
            if feedback_date:
                days_diff = feedback_date - date_obj
                feedback_days = days_diff.days

            links = li_item.css('a')
            documents = self.get_documents_from_links(links)

            item = JustPublication(
                title=title,
                type=self.get_type(title),
                identifier=self.slugify(title)[0:127],
                date=date,
                institution='justitie',
                description=description,
                documents=documents,
                contact=self.get_contacts(description),
                feedback_days=feedback_days
            )

            yield item

        paginationLinkEx = LinkExtractor(restrict_css='ul.lcp_paginator')
        pages = paginationLinkEx.extract_links(response)
        for page in pages:
            yield scrapy.Request(page.url, callback=self.parse)


        pass

Example : tineret.py (0 votes)
Copyright Mozilla Public License 2.0
Author : code4romania
    def parse(self, response):
        articleLinks = LinkExtractor(restrict_css='div.main > div.article')
        pages = articleLinks.extract_links(response)
        for page in pages:
            yield scrapy.Request(page.url, callback=self.parse_article)

Example : crawlpy_spider.py (0 votes)
Copyright MIT License
Author : cytopia
    def __init__(self, *args, **kwargs):
        """Constructor: overwrite parent __init__ function"""

        # Call parent init
        super(CrawlpySpider, self).__init__(*args, **kwargs)

        # Get command line arg provided configuration param
        config_file = kwargs.get('config')

        # Validate configuration file parameter
        if not config_file:
            logging.error('Missing argument "-a config"')
            logging.error('Usage: scrapy crawl crawlpy -a config=/path/to/config.json')
            self.abort = True

        # Check if it is actually a file
        elif not os.path.isfile(config_file):
            logging.error('Specified config file does not exist')
            logging.error('Not found in: "' + config_file + '"')
            self.abort = True

        # All good, read config
        else:
            # Load json config
            fpointer = open(config_file)
            data = fpointer.read()
            fpointer.close()

            # convert JSON to dict
            config = json.loads(data)

            # fill in default values for missing values
            self.config = dict()
            self.config['proto'] = str(config.get('proto', self.config_defaults['proto']))
            self.config['domain'] = str(config.get('domain', self.config_defaults['domain']))
            self.config['depth'] = int(config.get('depth', self.config_defaults['depth']))
            self.config['ignores'] = config.get('ignores', self.config_defaults['ignores'])
            self.config['httpstatus_list'] = config.get('httpstatus_list', self.config_defaults['httpstatus_list'])
            self.config['login'] = dict()
            self.config['login']['enabled'] = bool(config.get('login', dict()).get('enabled', self.config_defaults['login']['enabled']))
            self.config['login']['method'] = str(config.get('login', dict()).get('method', self.config_defaults['login']['method']))
            self.config['login']['action'] = str(config.get('login', dict()).get('action', self.config_defaults['login']['action']))
            self.config['login']['failure'] = str(config.get('login', dict()).get('failure', self.config_defaults['login']['failure']))
            self.config['login']['fields'] = config.get('login', dict()).get('fields', self.config_defaults['login']['fields'])
            self.config['login']['csrf'] = dict()
            self.config['login']['csrf']['enabled'] = bool(config.get('login', dict()).get('csrf', dict()).get('enabled', self.config_defaults['login']['csrf']['enabled']))
            self.config['login']['csrf']['field'] = str(config.get('login', dict()).get('csrf', dict()).get('field', self.config_defaults['login']['csrf']['field']))
            self.config['store'] = dict()
            self.config['store']['enabled'] = bool(config.get('store', dict()).get('enabled', self.config_defaults['store']['enabled']))
            self.config['store']['path'] = str(config.get('store', dict()).get('path', self.config_defaults['store']['path']))
            logging.info('Merged configuration:')
            logging.info(self.config)


            # Set scrapy globals
            self.allowed_domains = [self.config['domain']]
            self.start_urls = [self.config['proto'] + '://' + self.config['domain'] + '/']
            self.rules = (
                Rule(
                    LinkExtractor(
                        allow_domains=(self.allowed_domains),
                        unique=True,
                        deny=tuple(self.config['ignores']),
                    ),
                    callback='parse',
                    follow=True
                ),
            )


            # Handle more status codes
            self.handle_httpstatus_list = self.config['httpstatus_list']

            # Overwrite built-in crawling depth via own config file
            # Make sure to add +1 if we do a login (which counts as 1 level)
            # The variable will be handled by a custom middleware (MyDepthMiddleware)
            # and passed on to the normal middleware (DepthMiddleware)
            if self.config['login']['enabled'] and self.config['depth'] != 0:
                self.max_depth = self.config['depth'] + 1
            else:
                self.max_depth = self.config['depth']


            # Set misc globals
            self.base_url = self.config['proto'] + '://' + self.config['domain']
            self.login_url = self.config['proto'] + '://' + self.config['domain'] + \
                                  self.config['login']['action']

Example : url.py (0 votes)
Copyright GNU General Public License v3.0
Author : IUNetSci
    def __init__(self, domains, urls, *args, **kwargs):
        """Constructor for PageSpider.

        Parameters
        ----------
        domains : list
            A list of domains for the site.
        urls : list
            A list of URLs of the site.
        href_xpaths : list
            A list of XPath expressions indicating the ancestors of the
            `<a>` elements.
        url_regex : string
            URL pattern regular expression.

        If you use this spider to store item into database, additional
        keywords are required:

        platform_id : int
            The id of a platform instance.
        session : object
            An instance of SQLAlchemy session.
        """
        self.session = kwargs.pop('session', None)
        self.platform_id = kwargs.pop('platform_id', None)
        self.href_xpaths = kwargs.pop('href_xpaths', ())
        self.url_regex = kwargs.pop('url_regex', None)
        self.start_urls = urls
        self.allowed_domains = domains
        self.link_extractor = LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths=self.href_xpaths,
            unique=True)
        super(PageSpider, self).__init__(*args, **kwargs)
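
The excerpt above only shows the constructor. A parse callback that consumes a link_extractor configured this way might look roughly like the sketch below; this is an assumption about usage, not the project's actual code, and it presumes the module imports re and scrapy.

    def parse(self, response):
        # Extract links restricted to allowed_domains and href_xpaths
        # (configured in __init__), optionally filter them by url_regex,
        # and schedule each one for crawling.
        for link in self.link_extractor.extract_links(response):
            if self.url_regex is None or re.search(self.url_regex, link.url):
                yield scrapy.Request(link.url, callback=self.parse)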