Here are examples of the Python API scrapy.http.HtmlResponse taken from open-source projects. By voting up you can indicate which examples are most useful and appropriate.
32 Examples
3
View Complete Implementation : lwv_chicago.py
Copyright GNU General Public License v3.0
Author : ClipboardProject
Copyright GNU General Public License v3.0
Author : ClipboardProject
def parse(self, response):
    """Parse the events listing page by following its RSS feed link.

    :param response: page response containing an ``a.feed-icon`` feed link
    :yield: one event dict per feed entry with address, url, title,
        event_time and description
    """
    # The page itself only links to the RSS feed; the real data lives there.
    feed_url = response.css('a.feed-icon::attr(href)').extract()[0]
    feed = feedparser.parse(feed_url)
    for entry in feed['entries']:
        # Wrap the entry's HTML summary in an HtmlResponse so Scrapy's CSS
        # selectors can be reused on it; the URL is a dummy placeholder.
        detail = HtmlResponse(url='string', body=entry['summary'], encoding='utf-8')
        description = detail.css('.body.text-secondary p::text').extract()
        address = detail.css('[itemprop="streetAddress"]::text').extract()
        # Timestamp looks like "<date>T<time range>"; extract once and split
        # instead of running the same selector twice.
        timestamp = detail.css('span.date-display-single::attr("content")').extract()[0]
        date_part, _, time_part = timestamp.partition('T')
        yield {
            'address': address[0] if address else '',
            'url': entry.link,
            # "satle" in the scraped original is a corruption of "title".
            'title': entry.title,
            'event_time': {
                'date': date_part,
                'time_range': time_part
            },
            'description': description[0] if description else ''
        }
3
View Complete Implementation : test_link_spider.py
Copyright MIT License
Author : creativecommons
Copyright MIT License
Author : creativecommons
def evaluate(self, meta_object,
             text, expected_raw, expected_requests):
    """Feed a canned response through the spider and verify the counts of
    raw items versus follow-up requests it yields.

    :param meta_object: meta dict attached to the originating Request
    :param text: HTML body for the fake response
    :param expected_raw: expected number of RawResponseItem results
    :param expected_requests: expected number of Request results
    """
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request,
                            encoding='utf8')
    raw_item_count = 0
    request_count = 0
    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count += 1
        elif isinstance(x, Request):
            request_count += 1
    # "astertEqual" in the scraped original is a corruption of assertEqual.
    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
3
View Complete Implementation : middlewares.py
Copyright GNU General Public License v3.0
Author : eracle
Copyright GNU General Public License v3.0
Author : eracle
def process_request(self, request, spider):
    """Load the requested URL with the spider's Selenium driver and return
    the fully rendered page as an HtmlResponse."""
    browser = spider.driver
    print('SeleniumMiddleware - getting the page')
    browser.get(request.url)
    # request.meta['driver'] = self.driver # to access driver from response
    print('waiting for page loading')
    profile_xpath = "//*[@id='nav-settings__dropdown-trigger']/img"
    get_by_xpath(browser, profile_xpath)
    # waiting links to other users are shown so the crawl can continue
    get_by_xpath_or_none(browser, '//*/span/span/span[1]', wait_timeout=3)
    print('SeleniumMiddleware - retrieving body')
    page_bytes = to_bytes(browser.page_source)  # body must be of type bytes
    return HtmlResponse(browser.current_url, body=page_bytes,
                        encoding='utf-8', request=request)
3
View Complete Implementation : pyppeteer.py
Copyright MIT License
Author : Gerapy
Copyright MIT License
Author : Gerapy
def process_request(self, request, spider):
    """
    Render the page with Pyppeteer when the request asks for it.

    :param request: request object
    :param spider: spider object
    :return: HtmlResponse for rendered requests, None otherwise (so the
        request falls through to the normal downloader)
    """
    if request.meta.get('render'):
        try:
            html, result, status = self.render(request.url, **self.args)
            return HtmlResponse(url=request.url, body=html, request=request,
                                encoding='utf-8', status=status)
        except websockets.exceptions.ConnectionClosed:
            # "past" in the scraped original is a corruption of "pass":
            # best-effort rendering — on a dropped websocket, fall through
            # and let the default downloader handle the request.
            pass
3
View Complete Implementation : middlewares.py
Copyright MIT License
Author : kingname
Copyright MIT License
Author : kingname
def process_request(self, request, spider):
    """For the selenium spider, load the URL in the shared driver and hand
    back the rendered HTML; other spiders fall through to normal download."""
    if spider.name != 'seleniumSpider':
        return None
    self.driver.get(request.url)
    time.sleep(2)  # crude fixed wait for JS rendering to finish
    rendered_html = self.driver.page_source
    return HtmlResponse(self.driver.current_url,
                        body=rendered_html,
                        encoding='utf-8',
                        request=request)
3
View Complete Implementation : test_fda_dap.py
Copyright MIT License
Author : opentrials
Copyright MIT License
Author : opentrials
def test_parse_drug_details_or_overview_delegates_to_parse_drug_details_when_response_in_drug_details(self):
    """A DrugDetails URL must be routed to parse_drug_details and its
    result returned unchanged."""
    url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.DrugDetails'
    mock_response = HtmlResponse(url=url)
    expected_result = 'expected_result'
    with mock.patch.object(Spider,
                           'parse_drug_details',
                           return_value=expected_result) as mock_method:
        spider = Spider()
        result = spider.parse_drug_details_or_overview(mock_response)
        # "astert" in the scraped original is a corruption of "assert".
        mock_method.assert_called_once_with(mock_response)
        assert result == expected_result
3
View Complete Implementation : test_fda_dap.py
Copyright MIT License
Author : opentrials
Copyright MIT License
Author : opentrials
def test_parse_drug_details_or_overview_delegates_to_parse_drug_details_when_response_in_drug_overview(self):
    """An Overview URL must be routed to parse_drug_overview and its
    result returned unchanged."""
    url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Overview&DrugName=E-BASE'
    mock_response = HtmlResponse(url=url)
    expected_result = 'expected_result'
    with mock.patch.object(Spider,
                           'parse_drug_overview',
                           return_value=expected_result) as mock_method:
        spider = Spider()
        result = spider.parse_drug_details_or_overview(mock_response)
        # "astert" in the scraped original is a corruption of "assert".
        mock_method.assert_called_once_with(mock_response)
        assert result == expected_result
3
View Complete Implementation : test_fda_dap.py
Copyright MIT License
Author : opentrials
Copyright MIT License
Author : opentrials
def test_parse_drug_details_or_overview_raises_exception_for_unknown_pages(self):
    """A URL matching neither the details nor the overview page must raise."""
    unknown_url = 'http://www.accessdata.fda.gov/'
    response = HtmlResponse(url=unknown_url)
    with pytest.raises(Exception):
        Spider().parse_drug_details_or_overview(response)
3
View Complete Implementation : wienerlinien_at.py
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
def parse(self, response):
    """Parse the Wiener Linien news listing into feed entries.

    Wiener Linien returns HTML with an XML content type, which makes Scrapy
    build an XmlResponse — re-wrap the body as an HtmlResponse so CSS
    selectors behave as expected.
    """
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        # "satle" in the scraped original is a corruption of "title".
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        # Hand the loader to the detail-page callback via meta.
        yield scrapy.Request(link, self.parse_item, meta={"il": il})
3
View Complete Implementation : middlewares.py
Copyright MIT License
Author : richshaw2015
Copyright MIT License
Author : richshaw2015
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
if spider.browser:
request.meta['browser'] = self.browser # to access driver from response
self.browser.get(request.url)
# wait js eval
time.sleep(15)
body = to_bytes(self.browser.page_source) # body must be of type bytes
return HtmlResponse(self.browser.current_url, body=body, encoding='utf-8', request=request)
else:
return None