Here are examples of the Python API scrapy.FormRequest.from_response taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
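Before the examples, here is a minimal sketch of the typical call pattern, assuming a plain login form; the URL, field names, and callback names are placeholders and not taken from any of the projects below. from_response() parses a form out of the given response, merges formdata into the form's existing (including hidden) fields, and returns a request that submits the form.

import scrapy


class ExampleLoginSpider(scrapy.Spider):
    name = "example_login"
    start_urls = ["https://example.com/login"]  # placeholder URL

    def parse(self, response):
        # Build a POST request from the form found in the page; fields not
        # listed in formdata keep the values pre-filled by the site.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"username": "user", "password": "secret"},  # placeholder credentials
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info("Fetched %s after submitting the form", response.url)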
10 Examples
3
View Complete Implementation : renrenpro.py
Copyright MIT License
Author : HaoZhang95
def parse(self, response):
    post_data = {
        'email': '18949599846',
        'password': 'shengjun'
    }
    # Send the POST request.
    # from_response() checks whether the requested page contains a form and,
    # if it does, fills that form in automatically using post_data.
    yield scrapy.FormRequest.from_response(
        response=response,
        formdata=post_data,
        callback=self.parse_login
    )
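The parse_login callback referenced above is not part of this excerpt. A minimal sketch of what it might do, assuming the site exposes some marker of a successful login (the marker and the follow-up URL below are assumptions, not code from renrenpro.py):

def parse_login(self, response):
    # Hypothetical check: assume a logout link only appears when logged in.
    if "logout" in response.text.lower():
        self.logger.info("Login succeeded")
        # Placeholder follow-up request for the authenticated area.
        yield scrapy.Request(response.urljoin("/profile"), callback=self.parse_profile)
    else:
        self.logger.error("Login failed")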
3
View Complete Implementation : pr_curitiba.py
Copyright MIT License
Author : okfn-brasil
def parse_year(self, response):
    for month in range(12):
        if date(response.meta["year"], month + 1, 1) <= date.today():
            formdata = {
                "__EVENTTARGET": "ctl00$cphMasterPrincipal$TabContainer1",
                "__EVENTARGUMENT": f"activeTabChanged:{month}",
                "ctl00_cphMasterPrincipal_TabContainer1_ClientState": '{{"ActiveTabIndex":{},"TabState":[true,true,true,true,true,true,true,true,true,true,true,true]}}',
            }
            yield scrapy.FormRequest.from_response(
                response,
                formdata=formdata,
                meta={"month": month},
                callback=self.parse_month,
            )
3
View Complete Implementation : konsument_at.py
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
def parse(self, response):
    user = self.settings.get("FEEDS_SPIDER_KONSUMENT_AT_USERNAME")
    pwd = self.settings.get("FEEDS_SPIDER_KONSUMENT_AT_PASSWORD")
    if user and pwd:
        return scrapy.FormRequest.from_response(
            response,
            formcss="#login form",
            formdata=OrderedDict([("user", user), ("pwd", pwd)]),
            callback=self._after_login,
            meta={"dont_cache": True},
        )
    else:
        # Username, password or section not found in feeds.cfg.
        self.logger.info("Login failed: No username or password given")
        # We can still try to scrape the free articles.
        return self._after_login(response)
0
View Complete Implementation : spydan_spider.py
Copyright MIT License
Author : adanvillarreal
def login(self, response):
    return [scrapy.FormRequest.from_response(
        response,
        formid='login-form',
        formdata={'username': self.username, 'password': self.password},
        callback=self.after_login)]
0
View Complete Implementation : middlewares.py
Copyright GNU General Public License v3.0
Author : aplanas
def _cloudflare(self, request, response, spider):
    """Resolve the CloudFlare challenge."""
    # Extract the URL from the form
    xp = '//form/@action'
    url = response.xpath(xp).extract_first()
    url = response.urljoin(url)
    domain = spider.allowed_domains[0]
    # Extract the parameters from the form
    xp = '//form/input[@name="jschl_vc"]/@value'
    jschl_vc = response.xpath(xp).extract_first()
    xp = '//form/input[@name="pass"]/@value'
    pass_ = response.xpath(xp).extract_first()
    if jschl_vc and pass_:
        # Extract the JavaScript snippets that can be evaluated
        xp = '//script/text()'
        init = response.xpath(xp).re_first(r'var s,t,o,p.*')
        challenge = response.xpath(xp).re_first(r'(.*;)a.value')
        variable = response.xpath(xp).re_first(r'\s+;(\w+\.\w+).=')
        result = 'print((%s+%s).toFixed(10))' % (variable, len(domain))
        code = (init, challenge)
        proc = Spidermonkey(early_script_file='-', code=code)
        stdout, stderr = proc.communicate(result)
        jschl_answer = stdout.strip()
        logger.debug('Challenge response: %s', jschl_answer)
        # Generate the new request
        formdata = {
            'jschl_vc': jschl_vc,
            'pass': pass_,
            'jschl_answer': jschl_answer,
        }
        original_url = request.url
        request = scrapy.FormRequest.from_response(
            response, formdata=formdata)
        request.headers['Referer'] = original_url
        # TODO - Is there a way to delay this single request?
        time.sleep(4)
        return request
    else:
        # The challenge changed and the code is outdated
        logger.error('CloudFlare challenge changed. Please update')
        return response
0
View Complete Implementation : mangaspider.py
Copyright GNU General Public License v3.0
Author : aplanas
def parse_login(self, response):
    self._check_login_params()
    self._login = False
    form_data = {
        self.username_field: self.username,
        self.password_field: self.password
    }
    if hasattr(self, 'form_xpath'):
        return scrapy.FormRequest.from_response(
            response,
            formxpath=self.form_xpath,
            formdata=form_data,
            callback=self.parse_after_login
        )
    elif hasattr(self, 'form_url'):
        return scrapy.FormRequest(
            self.form_url,
            formdata=form_data,
            callback=self.parse_after_login
        )
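A note on the design choice in this example: from_response() can only be used when the login form is present in the current response, because it reads the form's action URL and hidden fields out of the HTML; when the spider is instead configured with a known form_url, there is no form to parse, so a plain FormRequest posting just form_data is built directly.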
0
View Complete Implementation : pr_curitiba.py
Copyright MIT License
Author : okfn-brasil
def parse_month(self, response):
    page_count = len(response.css(".grid_Pager:nth-child(1) table td").extract())
    month = response.meta["month"]
    # The first page of pagination cannot be accessed by page number
    yield scrapy.FormRequest.from_response(
        response,
        formdata={
            "__EVENTTARGET": "ctl00$cphMasterPrincipal$TabContainer1",
            "ctl00_cphMasterPrincipal_TabContainer1_ClientState": '{{"ActiveTabIndex":{},"TabState":[true,true,true,true,true,true,true,true,true,true,true,true]}}',
            "__EVENTARGUMENT": f"activeTabChanged:{month}",
        },
        callback=self.parse_page,
    )
    for page_number in range(2, page_count + 1):
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                "__EVENTARGUMENT": f"Page${page_number}",
                "__EVENTTARGET": "ctl00$cphMasterPrincipal$gdvGrid2",
            },
            callback=self.parse_page,
        )
0
View Complete Implementation : pr_curitiba.py
Copyright MIT License
Author : okfn-brasil
def parse_page(self, response):
    for idx, row in enumerate(response.css(".grid_Row")):
        pdf_date = row.css("td:nth-child(2) span ::text").extract_first()
        gazette_id = row.css("td:nth-child(3) a ::attr(data-teste)").extract_first()
        parsed_date = parse(f"{pdf_date}", languages=["pt"]).date()
        if gazette_id == "0":
            starting_offset = 3
            formdata = {
                "__LASTFOCUS": "",
                "__EVENTTARGET": f"ctl00$cphMasterPrincipal$gdvGrid2$ctl{idx + starting_offset:02d}$lnkVisualizar",
                "__EVENTARGUMENT": "",
                "__ASYNCPOST": "true",
            }
            yield scrapy.FormRequest.from_response(
                response,
                formdata=formdata,
                callback=self.parse_regular_edition,
                meta={"parsed_date": parsed_date},
            )
        else:
            yield Gazette(
                date=parsed_date,
                file_urls=[
                    f"http://legisladocexterno.curitiba.pr.gov.br/DiarioSuplementoConsultaExterna_Download.aspx?id={gazette_id}"
                ],
                is_extra_edition=True,
                territory_id=self.TERRITORY_ID,
                power="executive_legislature",
                scraped_at=datetime.utcnow(),
            )
0
View Complete Implementation : sp_sao_jose_dos_campos.py
Copyright MIT License
Author : okfn-brasil
def parse(self, response):
    for element in response.css("#corpo table tr"):
        if element.css("th").extract():
            continue
        date = element.css(self.GAZETTE_DATE_CSS).extract_first()
        date = dateparser.parse(date, languages=["pt"]).date()
        url = element.css(self.GAZETTE_URL_CSS).extract_first()
        gazette_title = element.css(self.GAZETTE_NAME_CSS).extract_first()
        is_extra = "Extra" in gazette_title
        yield Gazette(
            date=date,
            file_urls=[url],
            is_extra_edition=is_extra,
            territory_id=self.TERRITORY_ID,
            power="executive_legislature",
            scraped_at=datetime.utcnow(),
        )
    for element in response.css(self.NEXT_PAGE_LINK_CSS):
        if not element.css("a::text").extract_first() == "Próxima":
            continue
        event_target = element.css("a::attr(href)")
        event_target = event_target.re(self.JAVASCRIPT_POSTBACK_REGEX).pop()
        yield FormRequest.from_response(
            response,
            callback=self.parse,
            formname="aspnetForm",
            formxpath="//form[@id='aspnetForm']",
            formdata={"__EVENTARGUMENT": "", "__EVENTTARGET": event_target},
            dont_click=True,
            dont_filter=True,
            method="POST",
        )
0
View Complete Implementation : lwn_net.py
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
def _parse_article(self, response):
    remove_elems = [
        ".FeatureByline",
        ".GAByline",
        ".Form",
        "form",
        ".MakeALink",
        "br",
    ]
    change_tags = {"div.BigQuote": "blockquote"}
    il = FeedEntryItemLoader(
        response=response,
        parent=response.meta["il"],
        remove_elems=remove_elems,
        change_tags=change_tags,
        base_url="https://{}".format(self.name),
    )
    text = response.css(".ArticleText").extract_first()
    # Remove 'Log in to post comments'.
    text = re.sub(
        r'<hr width="60%" align="left">.*to post comments\)', "", text, flags=re.S
    )
    il.add_css("title", "h1::text")
    il.add_value("content_html", text)
    il.add_css("author_name", ".FeatureByline b ::text")
    il.add_css("author_name", ".GAByline a ::text")
    il.add_css(
        "author_name",
        ".GAByline p ::text",
        re="This article was contributed by (.*)",
    )
    il.add_xpath(
        "updated",
        '//div[@class="FeatureByline"]/text()[preceding-sibling::br]',
        TakeFirst(),
    )
    il.add_xpath("updated", '//div[@class="GAByline"]/p[1]/text()')
    # Last resort if date cannot be extracted and it's a weekly edition.
    if "updated" in response.meta:
        il.add_value("updated", response.meta["updated"])
    if response.css(".MakeALink"):
        # Get subscriber link for paywalled content.
        return scrapy.FormRequest.from_response(
            response,
            formcss=".MakeALink form",
            callback=self._subscriber_link,
            meta={"il": il},
        )
    else:
        il.add_value("link", response.url)
        return il.load_item()