scrapy.http.HtmlResponse - python examples

Here are examples of the Python API scrapy.http.HtmlResponse taken from open source projects. By voting up, you can indicate which examples are most useful and appropriate.

32 Examples 7

3 View Complete Implementation : lwv_chicago.py
Copyright GNU General Public License v3.0
Author : ClipboardProject
    def parse(self, response):
        """Yield one event dict for every entry of the feed linked from *response*.

        The feed URL is the ``href`` of the first ``a.feed-icon`` element; an
        ``IndexError`` is raised when the page has no such link. Each entry's
        ``summary`` is wrapped in an ``HtmlResponse`` so that it can be
        queried with CSS selectors.

        Yields dicts with the keys ``address``, ``url``, ``title``,
        ``event_time`` (``date`` / ``time_range``) and ``description``.
        ``address`` and ``description`` fall back to ``''`` when the selector
        matches nothing; a missing timestamp still raises ``IndexError``.
        """
        feed_url = response.css('a.feed-icon::attr(href)').extract()[0]
        feed = feedparser.parse(feed_url)

        for entry in feed['entries']:
            # The URL is only a placeholder; just the body is queried below.
            detail = HtmlResponse(url='string', body=entry['summary'], encoding='utf-8')
            description = detail.css('.body.text-secondary p::text').extract()
            address = detail.css('[itemprop="streetAddress"]::text').extract()

            # Run the timestamp selector once and split it on 'T' into the
            # part before and the part after the separator.
            stamp_parts = detail.css(
                'span.date-display-single::attr("content")'
            ).extract()[0].split('T')

            yield {
                'address': address[0] if address else '',
                'url': entry.link,
                'title': entry.title,
                'event_time': {
                    'date': stamp_parts[0],
                    'time_range': stamp_parts[1]
                },
                'description': description[0] if description else ''
            }

3 View Complete Implementation : test_link_spider.py
Copyright MIT License
Author : creativecommons
    def evaluate(self, meta_object,
                text, expected_raw, expected_requests):
        """Run ``self.spider.parse`` on a synthetic response and check its output counts.

        :param meta_object: value passed as ``meta`` of the originating Request
        :param text: body used for the HtmlResponse
        :param expected_raw: expected number of ``RawResponseItem`` results
        :param expected_requests: expected number of ``Request`` results
        """
        request = Request(url='http://www.drudgereport.com',
                          meta=meta_object)
        response = HtmlResponse('drudge.url', body=text, request=request,
                                encoding='utf8')

        raw_item_count = 0
        request_count = 0

        # Tally the two result kinds; anything else the spider yields is ignored.
        for x in self.spider.parse(response):
            if isinstance(x, RawResponseItem):
                raw_item_count += 1
            elif isinstance(x, Request):
                request_count += 1

        self.assertEqual(raw_item_count, expected_raw)
        self.assertEqual(request_count, expected_requests)

3 View Complete Implementation : middlewares.py
Copyright GNU General Public License v3.0
Author : eracle
    def process_request(self, request, spider):
        """Load ``request.url`` in the spider's driver and return the page as an HtmlResponse.

        The response is built from the driver's current URL and page source
        after the two XPath lookups below have been performed.
        """
        browser = spider.driver

        print('SeleniumMiddleware - getting the page')
        browser.get(request.url)

        # request.meta['driver'] = self.driver  # to access driver from response

        print('waiting for page loading')
        # NOTE(review): presumably get_by_xpath waits for the element to
        # appear; confirm against the helper's definition.
        settings_trigger_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
        get_by_xpath(browser, settings_trigger_xpath)

        # Wait until links to other users are shown so the crawl can continue.
        other_users_xpath = '//*/span/span/span[1]'
        get_by_xpath_or_none(browser, other_users_xpath, wait_timeout=3)

        print('SeleniumMiddleware - retrieving body')
        # The response body must be of type bytes.
        page_bytes = to_bytes(browser.page_source)

        rendered = HtmlResponse(
            browser.current_url,
            body=page_bytes,
            encoding='utf-8',
            request=request,
        )
        return rendered

3 View Complete Implementation : pyppeteer.py
Copyright MIT License
Author : Gerapy
    def process_request(self, request, spider):
        """
        Render the request via ``self.render`` when ``request.meta['render']`` is truthy.

        :param request: request object
        :param spider: spider object
        :return: HtmlResponse built from the rendered HTML and status, or
            None when rendering was not requested or the websocket
            connection was closed during rendering
        """
        if not request.meta.get('render'):
            return None

        try:
            # Only the render call is guarded; it is the websocket-backed step.
            html, result, status = self.render(request.url, **self.args)
        except websockets.exceptions.ConnectionClosed:
            # Deliberate best effort: a dropped connection yields no response.
            return None

        return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8',
                            status=status)

3 View Complete Implementation : middlewares.py
Copyright MIT License
Author : kingname
    def process_request(self, request, spider):
        """Fetch the page with ``self.driver`` for the spider named 'seleniumSpider'.

        For any other spider the method returns None. Otherwise it loads
        ``request.url`` in the driver, sleeps for two seconds, and returns an
        HtmlResponse built from the driver's current URL and page source.
        """
        if spider.name != 'seleniumSpider':
            return None

        self.driver.get(request.url)
        # Fixed pause before the page source is read.
        time.sleep(2)
        page_source = self.driver.page_source

        return HtmlResponse(
            self.driver.current_url,
            body=page_source,
            encoding='utf-8',
            request=request,
        )

3 View Complete Implementation : test_fda_dap.py
Copyright MIT License
Author : opentrials
    def test_parse_drug_details_or_overview_delegates_to_parse_drug_details_when_response_in_drug_details(self):
        """A ``Search.DrugDetails`` URL is forwarded to ``parse_drug_details``."""
        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.DrugDetails'
        mock_response = HtmlResponse(url=url)
        expected_result = 'expected_result'

        # Patch the delegate on the class so the instance created below uses it.
        with mock.patch.object(Spider,
                               'parse_drug_details',
                               return_value=expected_result) as mock_method:
            spider = Spider()
            result = spider.parse_drug_details_or_overview(mock_response)

        mock_method.assert_called_once_with(mock_response)
        assert result == expected_result

3 View Complete Implementation : test_fda_dap.py
Copyright MIT License
Author : opentrials
    def test_parse_drug_details_or_overview_delegates_to_parse_drug_details_when_response_in_drug_overview(self):
        """A ``Search.Overview`` URL is forwarded to ``parse_drug_overview``.

        NOTE(review): the test name mentions ``parse_drug_details`` but the
        patched delegate is ``parse_drug_overview``; the name is kept as is.
        """
        url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Overview&DrugName=E-BASE'
        mock_response = HtmlResponse(url=url)
        expected_result = 'expected_result'

        # Patch the delegate on the class so the instance created below uses it.
        with mock.patch.object(Spider,
                               'parse_drug_overview',
                               return_value=expected_result) as mock_method:
            spider = Spider()
            result = spider.parse_drug_details_or_overview(mock_response)

        mock_method.assert_called_once_with(mock_response)
        assert result == expected_result

3 View Complete Implementation : test_fda_dap.py
Copyright MIT License
Author : opentrials
    def test_parse_drug_details_or_overview_raises_exception_for_unknown_pages(self):
        """A URL that is neither a details nor an overview page makes the dispatcher raise."""
        url = 'http://www.accessdata.fda.gov/'
        mock_response = HtmlResponse(url=url)
        # Build the spider outside the ``raises`` block so that a failing
        # constructor cannot make this test pass by accident.
        spider = Spider()

        with pytest.raises(Exception):
            spider.parse_drug_details_or_overview(mock_response)

3 View Complete Implementation : wienerlinien_at.py
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
    def parse(self, response):
        """Yield one follow-up request per ``.block-news-item`` element on the page.

        Each request targets the item's link, uses ``self.parse_item`` as its
        callback, and carries a pre-filled ``FeedEntryItemLoader`` (link,
        title, updated) in ``meta["il"]``.
        """
        # Wiener Linien returns HTML with an XML content type which creates an
        # XmlResponse.
        response = HtmlResponse(url=response.url, body=response.body)
        for item in response.css(".block-news-item"):
            il = FeedEntryItemLoader(
                response=response,
                timezone="Europe/Vienna",
                ignoretz=True,
                base_url="https://www.{}".format(self.name),
            )
            link = response.urljoin(item.css("a::attr(href)").extract_first())
            il.add_value("link", link)
            il.add_value("title", item.css("h3::text").extract_first())
            il.add_value("updated", item.css(".date::text").extract_first())
            yield scrapy.Request(link, self.parse_item, meta={"il": il})

3 View Complete Implementation : middlewares.py
Copyright MIT License
Author : richshaw2015
    def process_request(self, request, spider):
        """Serve the request from ``self.browser`` when the spider has ``browser`` set.

        Called for each request that goes through the downloader middleware.
        It must either:
        - return None: continue processing this request
        - or return a Response object
        - or return a Request object
        - or raise IgnoreRequest: process_exception() methods of
          installed downloader middleware will be called
        """
        if not spider.browser:
            return None

        # Expose the browser through the request meta so it can be reached
        # from the response later on.
        request.meta['browser'] = self.browser
        self.browser.get(request.url)

        # Give the page's JavaScript time to evaluate.
        time.sleep(15)

        # The response body must be of type bytes.
        page_bytes = to_bytes(self.browser.page_source)
        return HtmlResponse(
            self.browser.current_url,
            body=page_bytes,
            encoding='utf-8',
            request=request,
        )