scraper.items.Manga - python examples

Here are the examples of the python api scraper.items.Manga taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

25 Examples

3 View Complete Implementation : mangahere.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_catalog(self, response):
        """Generate the catalog (list of mangas) of the site.

        Yields one request per manga link; `parse_collection` fills in
        the details for each manga.

        @url http://www.mangahere.cc/mangalist/
        @returns items 0
        @returns request 18000 22000
        """

        # FIX: the pasted source garbled the HTML attribute name as
        # "clast"; the XPath must query the real "class" attribute or
        # it matches nothing.
        xp = '//a[@class="manga_info"]'
        for item in response.xpath(xp):
            manga = Manga()
            # URL
            xp = './@href'
            url = item.xpath(xp).extract_first()
            manga['url'] = response.urljoin(url)
            meta = {'manga': manga}
            yield response.follow(manga['url'], self.parse_collection,
                                  meta=meta)

3 View Complete Implementation : mangasee.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_catalog(self, response):
        """Generate the catalog (list of manga) of the site.

        Yields one request per manga URL found in the directory;
        `parse_collection` fills in the details for each manga.

        @url http://mangaseeonline.us/directory/
        @returns items 0
        @returns request 3500-4500
        """

        # FIX: "clast" is a scrape-time garble of the HTML attribute
        # name "class" — the selector would otherwise match nothing.
        xp = '//a[@class="ttip"]/@href'
        for url in response.xpath(xp).extract():
            manga = Manga()
            # URL
            manga['url'] = response.urljoin(url)
            meta = {'manga': manga}
            yield response.follow(manga['url'], self.parse_collection,
                                  meta=meta)

3 View Complete Implementation : mangasee.py
Copyright GNU General Public License v3.0
Author : aplanas
    def _parse_subscribe(self, response):
        """Fill in the subscriber count (used as rank) for a manga.

        Takes the Manga instance from `response.meta` when present;
        otherwise builds a bare one from the response URL so that the
        contract tests can still exercise this method.
        """
        manga = (response.meta['manga'] if 'manga' in response.meta
                 else Manga(url=response.url))

        rank_xp = '//span[@id="numSubscribe"]/@alt'
        manga['rank'] = response.xpath(rank_xp).extract_first()
        return manga

3 View Complete Implementation : mangasee.py
Copyright GNU General Public License v3.0
Author : aplanas
    def _parse_latest(self, response):
        """Follow the most recently updated manga link.

        Returns a request that delegates to `parse_collection` with a
        fresh Manga carried in the request meta.
        """
        # FIX: "clast" is a scrape-time garble of the HTML attribute
        # name "class" — the selector would otherwise match nothing.
        xp = '//a[@class="list-link"]/@href'
        url = response.xpath(xp).extract_first()
        url = response.urljoin(url)
        manga = Manga(url=url)
        meta = {'manga': manga}
        return response.follow(url, self.parse_collection, meta=meta)

0 View Complete Implementation : batoto.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_catalog(self, response):
        """Generate the catalog (list of mangas) of the site.

        Yields one request per manga in the current browse page, then a
        request for the next page (pagination is driven by a page number
        scraped from an inline `onClickPage` handler).

        @url https://bato.to/browse?page=2
        @returns items 0
        @returns request 50 70
        """

        # FIX: "clast" is a scrape-time garble of the HTML attribute
        # name "class" — the selector would otherwise match nothing.
        xp = '//div[@id="series-list"]//div[@class="item-text"]'
        for item in response.xpath(xp):
            manga = Manga()
            # URL
            xp = './a/@href'
            url = item.xpath(xp).extract_first()
            manga['url'] = response.urljoin(url)
            meta = {'manga': manga}
            yield response.follow(url, self.parse_collection, meta=meta)

        # Next page: the last onClickPage(...) argument in the page body
        # is the next page number.  IndexError here would mean the page
        # layout changed — presumably acceptable as a loud failure.
        re_ = r'@click="onClickPage\((.*)\)"'
        next_page_number = re.findall(re_, response.body_as_unicode())[-1]
        if next_page_number:
            next_url = NEXT_PAGE % next_page_number
            yield response.follow(next_url, self.parse_catalog)

0 View Complete Implementation : batoto.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        Populates (or completes) a Manga item with name, authors,
        status, genres, rank, description, cover image and the full
        issue list scraped from a bato.to series page.

        @url https://bato.to/series/68329
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # NOTE: throughout this method the pasted source garbled the
        # HTML attribute name "class" as "clast" and the word "title"
        # as "satle"; both are restored below so the selectors match.
        # URL
        manga['url'] = response.url
        # Name
        xp = '//h3[@class="item-title"]/a/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name.  Guard against a missing node: extract_first()
        # returns None, which would crash .split().
        xp = '//div[@class="pb-2 alias-set hairlines-fade-bottom"]/text()'
        alt_name = response.xpath(xp).extract_first() or ''
        manga['alt_name'] = alt_name.split('/')
        # Author
        xp = '//div[@class="attr-item"]/b[contains(text(),"%s")]' \
            '/following-sibling::span/*/text()'
        manga['author'] = response.xpath(xp % 'Authors:').extract_first()
        # Artist: the site lists artists after the author under the same
        # "Authors:" label, so take all but the first entry.
        manga['artist'] = response.xpath(xp % 'Authors:').extract()[1:]
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Status
        xp = '//div[@class="attr-item"]/b[contains(text(),"%s")]' \
            '/following-sibling::span/text()'
        manga['status'] = response.xpath(xp % 'Status:').extract()
        # Genres
        genres = response.xpath(xp % 'Genres:').extract()[-1]
        manga['genres'] = genres.split('/')
        # Rank (drop the thousands part after the comma)
        rank = response.xpath(xp % 'Rank:').extract_first()
        manga['rank'] = rank.split(',')[0]
        # Rank order
        manga['rank_order'] = 'ASC'
        # Description
        xp = '//pre/text()'
        manga['description'] = response.xpath(xp).extract()
        # Cover image
        xp = '//img[@class="shadow-6"]/@src'
        url = response.xpath(xp).extract_first()
        manga['image_urls'] = [response.urljoin(url)]

        # Get language from the title flag (the flag's CSS class ends
        # with the language code)
        xp = '//div[@class="mt-4 title-set"]/span/@class'
        language = response.xpath(xp).extract_first()
        language = language.split()[-1]

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//div[@class="main"]/div'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language=language)
            # Name
            xp = './a//text()'
            issue['name'] = line.xpath(xp).extract()
            # Number (FIX: escape the dot so "Ch.12" matches but
            # arbitrary characters after "Ch" do not)
            xp = './a/b/text()'
            issue['number'] = line.xpath(xp).re(r'Ch\.(\d+)')
            # Order: newest issue first on the page, so count down
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './/i/text()'
            issue['release'] = line.xpath(xp).extract()
            # URL
            xp = './a/@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)
        return manga

0 View Complete Implementation : batoto.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        @url https://bato.to/latest
        @returns items 0
        @returns request 60
        """

        # NOTE(review): `until` is resolved here but not used in this
        # visible snippet — presumably the date filter lives in
        # `parse_collection` or a later page-following branch; confirm
        # against the full spider.
        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # Get all manga's URL from the same page and update it via
        # `parse_collection`.
        # FIX: "clast"/"satle" are scrape-time garbles of "class" and
        # "title"; restored so the selector matches.
        xp = '//a[@class="item-title"]/@href'
        for url in response.xpath(xp).extract():
            url = response.urljoin(url)
            manga = Manga(url=url)
            meta = {'manga': manga}
            yield response.follow(url, self.parse_collection, meta=meta)

0 View Complete Implementation : kissmanga.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_catalog(self, response):
        """Generate the catalog (list of mangas) of the site.

        Yields one request per manga in the listing table, then follows
        the "Next" pager link until the catalog is exhausted.

        @url http://kissmanga.com/MangaList?page=200
        @returns items 0
        @returns request 25 60
        """

        # FIX: "clast" is a scrape-time garble of the HTML attribute
        # name "class" — the selectors would otherwise match nothing.
        xp = '//table[@class="listing"]/tr/td[1]'
        for item in response.xpath(xp):
            manga = Manga()
            # URL
            xp = 'a/@href'
            manga['url'] = response.urljoin(item.xpath(xp).extract_first())
            meta = {'manga': manga}
            yield response.follow(manga['url'], self.parse_collection,
                                  meta=meta)

        # Next page
        xp = '//ul[@class="pager"]/li/a[contains(., "Next")]/@href'
        next_url = response.xpath(xp).extract_first()
        if next_url:
            yield response.follow(next_url, self.parse_catalog)

0 View Complete Implementation : kissmanga.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_collection(self, response, manga=None):
        """Generate the list of issues for a manga

        Populates (or completes) a Manga item with name, authors,
        status, genres, rank, description, cover image and the full
        issue list scraped from a kissmanga series page.

        @url http://kissmanga.com/Manga/Naruto
        @returns items 1
        @returns request 0
        @scrapes url name alt_name author artist reading_direction
        @scrapes status genres rank rank_order description image_urls
        @scrapes issues
        """

        if 'manga' in response.meta:
            manga = response.meta['manga']
        else:
            manga = Manga(url=response.url)

        # NOTE: throughout this method the pasted source garbled the
        # HTML attribute name "class" as "clast"; restored below so the
        # selectors match.
        # URL
        manga['url'] = response.url
        # Name
        xp = '//div[@class="barContent"]//a[@class="bigChar"]/text()'
        manga['name'] = response.xpath(xp).extract()
        # Alternate name
        xp = '//span[@class="info" and contains(text(), "%s")]' \
             '/following-sibling::a/text()'
        manga['alt_name'] = response.xpath(xp % 'Other name:').extract()
        # Author
        manga['author'] = response.xpath(xp % 'Author:').extract()
        # Artist: the site does not list artists separately, so reuse
        # the author list.
        manga['artist'] = manga['author']
        # Reading direction
        manga['reading_direction'] = 'RL'
        # Genres
        manga['genres'] = response.xpath(xp % 'Genres:').extract()
        # Status
        xp = '//span[@class="info" and contains(text(), "%s")]' \
             '/following-sibling::text()[1]'
        manga['status'] = response.xpath(xp % 'Status:').extract()
        # Rank
        manga['rank'] = response.xpath(xp % 'Views:').re(r'(\d+).')
        # Rank order
        manga['rank_order'] = 'DESC'
        # Description
        xp = '//p[span[@class="info" and contains(text(), "%s")]]'\
             '/following-sibling::p[1]/text()'
        manga['description'] = response.xpath(xp % 'Summary:').extract()
        # Cover image
        xp = '//div[@id="rightside"]//img/@src'
        url = response.xpath(xp).extract_first()
        manga['image_urls'] = [response.urljoin(url)]

        # Parse the manga issues list
        manga['issues'] = []
        xp = '//table[@class="listing"]/tr[td]'
        lines = response.xpath(xp)
        for line in lines:
            issue = Issue(language='EN')
            # Name
            xp = './/a/text()'
            issue['name'] = line.xpath(xp).extract()
            # Number
            # Some examples that this regex needs to address
            #   1/11 Vol.003 Ch.009.006: Omake 004-koma
            #   21st Century Boys 014
            #   Mob Psycho 100 Ch.099.001: Mob
            #   Mob Psycho 100 Ch.098.002
            #   Fantastic World Vol.001 Ch.002
            #   Black Clover 118 - Mage X
            #   Black Clover 099: Family
            # NOTE(review): "Chaper" below looks like a typo for
            # "Chapter", but it may deliberately match a site-side typo
            # — confirm against real chapter titles before changing.
            xp = './/a/text()'
            number = line.xpath(xp).re(
                r'(?:[Cc]h.|[Ee]p.|[Cc]haper|[Pp]art.)(\d[.\d]+)'
                r'|(\d[.\d]+)[ :-]+'
                r'|(\d[.\d]+)$')
            issue['number'] = number
            # Order: newest issue first on the page, so count down
            issue['order'] = len(lines) - len(manga['issues'])
            # Release
            xp = './td[2]/text()'
            issue['release'] = line.xpath(xp).re(r'\d{1,2}/\d{1,2}/\d{4}')
            # URL
            xp = './/a/@href'
            url = line.xpath(xp).extract_first()
            issue['url'] = response.urljoin(url)
            manga['issues'].append(issue)
        return manga

0 View Complete Implementation : kissmanga.py
Copyright GNU General Public License v3.0
Author : aplanas
    def parse_latest(self, response, until=None):
        """Generate the list of new mangas until a date

        @url http://kissmanga.com/
        @returns items 0
        @returns request 25 50
        """

        # NOTE(review): `until` is resolved here but not used in this
        # visible snippet — presumably the date filter lives elsewhere;
        # confirm against the full spider.
        if not until:
            if 'until' in response.meta:
                until = response.meta['until']
            else:
                until = date.today()

        # Get all manga's URL from the same page and update it via
        # `parse_collection`.
        # FIX: "clast" is a scrape-time garble of the HTML attribute
        # name "class" — the selector would otherwise match nothing.
        xp = '//div[@class="items"]//a/@href'
        for url in response.xpath(xp).extract():
            url = response.urljoin(url)
            manga = Manga(url=url)
            meta = {'manga': manga}
            yield response.follow(url, self.parse_collection, meta=meta)