scrapy.selector.Selector - Python examples

Here are examples of the Python API scrapy.selector.Selector, taken from open source projects. Each snippet shows a real spider or helper constructing a Selector and querying it with XPath or CSS.

49 Examples
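Before the project snippets, here is a minimal self-contained sketch of the API itself (the HTML string and variable names are invented for illustration). A Selector can be built from a response object or from raw markup via text=, and queried with either XPath or CSS:

from scrapy.selector import Selector

html = "<html><body><div class='item'><a href='/a'>Link</a></div></body></html>"
sel = Selector(text=html)
print(sel.xpath("//div[@class='item']/a/@href").extract_first())  # '/a'
print(sel.css("div.item a::text").extract_first())                # 'Link'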

Example : huxiu_new_article_spider.py
License : MIT License
Author : huangtao1208
    def parse(self, response):
        sel = Selector(response)
        content_list = sel.xpath('//div[@class="related-article"]/ul/li')
        task_source = response.meta
        for content_ele in content_list:
            url = 'https://www.huxiu.com%s' % content_ele.xpath('./a/@href').extract_first()
            title = content_ele.xpath('./a/text()').extract_first()
            day = content_ele.xpath('./span/text()').extract_first()
            info_time = parser.parse(day)  # dateutil's parser, imported at module level

            print(title)
            print(url)
            print(info_time)

Example : xzl.py
License : MIT License
Author : iizvv
def get_xs_detail(href, title, path):
    url = xzl + href
    print('Start scraping details for ' + title + ', chapter URL: ' + url + '\n')
    text_maker = ht.HTML2Text()
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    html = selector.css(u'.cata-book-content').extract_first()
    file_name = title
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        if not xs_pdf:
            # Declare the charset in the HTML, otherwise Chinese text is garbled in the PDF
            html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
            pdfkit.from_string(html, path + file_name + '.pdf')
        else:
            return html

Example : xzl.py
License : MIT License
Author : iizvv
def get_zl_detail(url, path, name):
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    text_maker = ht.HTML2Text()
    create_time = selector.css(u'.time abbr::attr(title)').extract_first()
    html = selector.css(u'.xzl-topic-body-content').extract_first()
    file_name = name
    if hasTime:
        file_name = create_time + ' ' + name
    if markdown:
        md = text_maker.handle(html)
        with open(path + file_name + '.md', 'w') as f:
            f.write(md)
    else:
        # Declare the charset in the HTML, otherwise Chinese text is garbled in the PDF
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, path + file_name + '.pdf')
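The two xzl.py helpers above use Scrapy's Selector standalone, outside any spider: they fetch the page with an ordinary HTTP session and pass the body in via text=. A minimal sketch of that pattern, with requests standing in for the project's close_session() helper:

import requests
from scrapy.selector import Selector

resp = requests.get('https://example.com')   # any HTTP client will do
sel = Selector(text=resp.text)               # Selector only needs the markup
print(sel.css('title::text').extract_first())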

Example : spider.py
License : MIT License
Author : kingjh
    def parse_page(self, response):
        """Crawl the pagination of movies under a given tag."""
        print("parse_page")
        hxs = Selector(response)
        total = hxs.xpath('//*[@id="content"]/div/div[1]/div[3]/a[10]/text()').extract()[0]
        tag = response.meta["tag"]
        encoded_tag = format(tag)
        for i in range(int(total)):
            url = complete_url('/tag/{0}?start={1}&type=T'.format(encoded_tag, i * 20))
            yield Request(url=url, callback=self.parse_items, meta={"tag": tag, "check_total": True})

Example : pornHubSpider.py
License : MIT License
Author : levphon
    def parse_ph_key(self, response):
        selector = Selector(response)
        logging.debug('request url:------>' + response.url)
        divs = selector.xpath('//div[@class="phimage"]')
        for div in divs:
            viewkey = re.findall('viewkey=(.*?)"', div.extract())
            yield Request(url='https://www.sickhub.com/embed/%s' % viewkey[0],
                          callback=self.parse_ph_info)
        url_next = selector.xpath(
            '//a[@class="orangeButton" and text()="Next "]/@href').extract()
        logging.debug(url_next)
        if url_next:
            logging.debug(' next page:---------->' + self.host + url_next[0])
            yield Request(url=self.host + url_next[0], callback=self.parse_ph_key)

Example : ro_porto_velho.py
License : MIT License
Author : okfn-brasil
    def parse(self, response):
        paragraphs = json.loads(response.body_as_unicode())["aaData"]
        for paragraph, *_ in paragraphs:
            selector = Selector(text=paragraph)
            url = selector.css("p a ::attr(href)").extract_first()

            text = selector.css("p strong ::text")
            is_extra_edition = text.extract_first().startswith("Suplemento")
            date = text.re_first(r"\d{1,2} de \w+ de \d{4}")
            date = parse(date, languages=["pt"]).date()

            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=is_extra_edition,
                territory_id=self.TERRITORY_ID,
                power="executive_legislature",
                scraped_at=dt.datetime.utcnow(),
            )
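The same text= constructor also handles HTML fragments embedded in a JSON payload, as the gazette spider above does. Sketched in isolation (the JSON body here is invented):

import json
from scrapy.selector import Selector

body = '{"aaData": [["<p><a href=\'/doc.pdf\'>x</a></p>"]]}'
for fragment, *_ in json.loads(body)["aaData"]:
    sel = Selector(text=fragment)
    print(sel.css("p a::attr(href)").extract_first())  # '/doc.pdf'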

Example : pactpub.py
License : MIT License
Author : PacktPublishing
    def parse(self, response):
        res = Selector(response)
        items = []
        for sel in res.xpath('//div[@class="book-block"]'):
            item = TestspiderItem()

            # Relative XPath (.//) so each item gets its own title rather than every title on the page
            item['book'] = sel.xpath('.//div[@class="book-block-title"]/text()').extract()
            items.append(item)
        return items

Example : newsspider_3.py
License : MIT License
Author : PacktPublishing
    def parse_news_item(self, response):
        sel = Selector(response)
        item = NewsItem()
        item['title'] = sel.xpath('//title/text()').extract()
        item['topic'] = sel.xpath('//div[@class="topic"]').extract()
        item['desc'] = sel.xpath('//td//text()').extract()
        return item

Example : orf_at.py
License : GNU Affero General Public License v3.0
Author : PyFeeds
    def parse(self, response):
        selector = Selector(response, type="xml")

        doc = lxml.etree.ElementTree(lxml.etree.fromstring(response.body))
        # Register the feed's namespaces with the selector only if the document declares any
        if doc.getroot().nsmap:
            self._register_namespaces(selector)
            nodes = selector.xpath("//%s" % self.itertag)
        else:
            nodes = selector.xpath("//item")

        return self.parse_nodes(response, nodes)

Example : text.py
License : MIT License
Author : wistbean
    @property
    def selector(self):
        # Imported inside the method (avoids a circular import); the Selector is cached per response
        from scrapy.selector import Selector
        if self._cached_selector is None:
            self._cached_selector = Selector(self)
        return self._cached_selector
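This cached property is what makes the response.xpath(...) and response.css(...) shorthands work in spider callbacks; the two lines below are equivalent (assuming response is a TextResponse):

titles = response.selector.xpath('//title/text()').extract()
titles = response.xpath('//title/text()').extract()  # shorthand for the line above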