scrapy.FormRequest.from_response - Python examples

Here are examples of the Python API scrapy.FormRequest.from_response, taken from open source projects.

10 Examples
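For orientation before the project examples, here is a minimal, self-contained sketch of the most common pattern, a login form submission. The URL and the field names username and password are hypothetical stand-ins, not taken from any of the projects below; from_response() parses the first form found in the response, keeps its pre-filled and hidden fields (such as CSRF tokens), and overrides the fields passed in formdata.

import scrapy


class LoginSpider(scrapy.Spider):
    name = "login_example"
    # Hypothetical login page; substitute a real URL.
    start_urls = ["https://example.com/login"]

    def parse(self, response):
        # from_response() pre-populates the request from the page's <form>
        # (including hidden fields) and overrides the fields in formdata.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={"username": "user", "password": "secret"},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info("Logged in; landed on %s", response.url)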

Example 1: renrenpro.py
Copyright MIT License
Author: HaoZhang95
    def parse(self, response):
        post_data = {
            'email': '18949599846',
            'password': 'shengjun'
        }

        # Send the login data as a POST request.
        # from_response() checks whether the requested URL contains a form
        # and, if so, automatically fills it in from post_data.
        yield scrapy.FormRequest.from_response(
            response=response,
            formdata=post_data,
            callback=self.parse_login
        )

Example 2: pr_curitiba.py
Copyright MIT License
Author: okfn-brasil
    def parse_year(self, response):
        for month in range(12):
            if date(response.meta["year"], month + 1, 1) <= date.today():
                formdata = {
                    "__EVENTTARGET": "ctl00$cphMasterPrincipal$TabContainer1",
                    "__EVENTARGUMENT": f"activeTabChanged:{month}",
                    "ctl00_cphMasterPrincipal_TabContainer1_ClientState": f'{{"ActiveTabIndex":{month},"TabState":[true,true,true,true,true,true,true,true,true,true,true,true]}}',
                }
                yield scrapy.FormRequest.from_response(
                    response,
                    formdata=formdata,
                    meta={"month": month},
                    callback=self.parse_month,
                )

Example 3: konsument_at.py
Copyright GNU Affero General Public License v3.0
Author: PyFeeds
    def parse(self, response):
        user = self.settings.get("FEEDS_SPIDER_KONSUMENT_AT_USERNAME")
        pwd = self.settings.get("FEEDS_SPIDER_KONSUMENT_AT_PASSWORD")
        if user and pwd:
            return scrapy.FormRequest.from_response(
                response,
                formcss="#login form",
                formdata=OrderedDict([("user", user), ("pwd", pwd)]),
                callback=self._after_login,
                meta={"dont_cache": True},
            )
        else:
            # Username, password or section not found in feeds.cfg.
            self.logger.info("Login failed: No username or password given")
            # We can still try to scrape the free articles.
            return self._after_login(response)

Example 4: spydan_spider.py
Copyright MIT License
Author: adanvillarreal
    def login(self, response):
        return [scrapy.FormRequest.from_response(response,
                    formid='login-form',
                    formdata={'username': self.username, 'password': self.password},
                    callback=self.after_login)]

Example 5: middlewares.py
Copyright GNU General Public License v3.0
Author: aplanas
    def _cloudflare(self, request, response, spider):
        """Resolve the CloudFlare challenge."""
        # Extract the URL from the form
        xp = '//form/@action'
        url = response.xpath(xp).extract_first()
        url = response.urljoin(url)

        domain = spider.allowed_domains[0]

        # Extract the parameters from the form
        xp = '//form/input[@name="jschl_vc"]/@value'
        jschl_vc = response.xpath(xp).extract_first()
        xp = '//form/input[@name="past"]/@value'
        past_ = response.xpath(xp).extract_first()

        if jschl_vc and past_:
            # Extract the JavaScript snippets that can be evaluated
            xp = '//script/text()'
            init = response.xpath(xp).re_first(r'var s,t,o,p.*')
            challenge = response.xpath(xp).re_first(r'(.*;)a.value')
            variable = response.xpath(xp).re_first(r'\s+;(\w+\.\w+).=')
            result = 'print((%s+%s).toFixed(10))' % (variable, len(domain))
            code = (init, challenge)
            proc = Spidermonkey(early_script_file='-', code=code)
            stdout, stderr = proc.communicate(result)
            jschl_answer = stdout.strip()
            logger.debug('Challenge response: %s', jschl_answer)

            # Generate the new request
            formdata = {
                'jschl_vc': jschl_vc,
                'pass': pass_,
                'jschl_answer': jschl_answer,
            }
            original_url = request.url
            request = scrapy.FormRequest.from_response(
                response, formdata=formdata)
            request.headers['Referer'] = original_url
            # XXX TODO - Is there a way to delay this single request?
            time.sleep(4)
            return request
        else:
            # The challenge changed and the code is outdated
            logger.error('CloudFlare challenge changed. Please update')

        return response

Example 6: mangaspider.py
Copyright GNU General Public License v3.0
Author: aplanas
    def parse_login(self, response):
        self._check_login_params()
        self._login = False
        form_data = {
            self.username_field: self.username,
            self.password_field: self.password
        }
        if hasattr(self, 'form_xpath'):
            return scrapy.FormRequest.from_response(
                response,
                formxpath=self.form_xpath,
                formdata=form_data,
                callback=self.parse_after_login
            )
        elif hasattr(self, 'form_url'):
            return scrapy.FormRequest(
                self.form_url,
                formdata=form_data,
                callback=self.parse_after_login
            )

Example 7: pr_curitiba.py
Copyright MIT License
Author: okfn-brasil
    def parse_month(self, response):
        page_count = len(response.css(".grid_Pager:nth-child(1) table td").extract())
        month = response.meta["month"]
        # The first page of pagination cannot be accessed by page number
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                "__EVENTTARGET": "ctl00$cphMasterPrincipal$TabContainer1",
                "ctl00_cphMasterPrincipal_TabContainer1_ClientState": f'{{"ActiveTabIndex":{month},"TabState":[true,true,true,true,true,true,true,true,true,true,true,true]}}',
                "__EVENTARGUMENT": f"activeTabChanged:{month}",
            },
            callback=self.parse_page,
        )
        for page_number in range(2, page_count + 1):
            yield scrapy.FormRequest.from_response(
                response,
                formdata={
                    "__EVENTARGUMENT": f"Page${page_number}",
                    "__EVENTTARGET": "ctl00$cphMasterPrincipal$gdvGrid2",
                },
                callback=self.parse_page,
            )

Example 8: pr_curitiba.py
Copyright MIT License
Author: okfn-brasil
    def parse_page(self, response):
        for idx, row in enumerate(response.css(".grid_Row")):
            pdf_date = row.css("td:nth-child(2) span ::text").extract_first()
            gazette_id = row.css("td:nth-child(3) a ::attr(data-teste)").extract_first()
            parsed_date = parse(f"{pdf_date}", languages=["pt"]).date()
            if gazette_id == "0":
                starting_offset = 3
                formdata = {
                    "__LASTFOCUS": "",
                    "__EVENTTARGET": f"ctl00$cphMasterPrincipal$gdvGrid2$ctl{idx + starting_offset:02d}$lnkVisualizar",
                    "__EVENTARGUMENT": "",
                    "__ASYNCPOST": "true",
                }
                yield scrapy.FormRequest.from_response(
                    response,
                    formdata=formdata,
                    callback=self.parse_regular_edition,
                    meta={"parsed_date": parsed_date},
                )
            else:
                yield Gazette(
                    date=parsed_date,
                    file_urls=[
                        f"http://legisladocexterno.curitiba.pr.gov.br/DiarioSuplementoConsultaExterna_Download.aspx?id={gazette_id}"
                    ],
                    is_extra_edition=True,
                    territory_id=self.TERRITORY_ID,
                    power="executive_legislature",
                    scraped_at=datetime.utcnow(),
                )

Example 9: sp_sao_jose_dos_campos.py
Copyright MIT License
Author: okfn-brasil
    def parse(self, response):
        for element in response.css("#corpo table tr"):
            if element.css("th").extract():
                continue

            date = element.css(self.GAZETTE_DATE_CSS).extract_first()
            date = dateparser.parse(date, languages=["pt"]).date()
            url = element.css(self.GAZETTE_URL_CSS).extract_first()
            gazette_title = element.css(self.GAZETTE_NAME_CSS).extract_first()
            is_extra = "Extra" in gazette_title

            yield Gazette(
                date=date,
                file_urls=[url],
                is_extra_edition=is_extra,
                territory_id=self.TERRITORY_ID,
                power="executive_legislature",
                scraped_at=datetime.utcnow(),
            )

        for element in response.css(self.NEXT_PAGE_LINK_CSS):
            if element.css("a::text").extract_first() != "Próxima":
                continue

            event_target = element.css("a::attr(href)")
            event_target = event_target.re(self.JAVASCRIPT_POSTBACK_REGEX).pop()

            yield FormRequest.from_response(
                response,
                callback=self.parse,
                formname="aspnetForm",
                formxpath="//form[@id='aspnetForm']",
                formdata={"__EVENTARGUMENT": "", "__EVENTTARGET": event_target},
                dont_click=True,
                dont_filter=True,
                method="POST",
            )

Example 10: lwn_net.py
Copyright GNU Affero General Public License v3.0
Author: PyFeeds
    def _parse_article(self, response):
        remove_elems = [
            ".FeatureByline",
            ".GAByline",
            ".Form",
            "form",
            ".MakeALink",
            "br",
        ]
        change_tags = {"div.BigQuote": "blockquote"}
        il = FeedEntryItemLoader(
            response=response,
            parent=response.meta["il"],
            remove_elems=remove_elems,
            change_tags=change_tags,
            base_url="https://{}".format(self.name),
        )
        text = response.css(".ArticleText").extract_first()
        # Remove 'Log in to post comments'.
        text = re.sub(
            r'<hr width="60%" align="left">.*to post comments\)', "", text, flags=re.S
        )
        il.add_css("satle", "h1::text")
        il.add_value("content_html", text)
        il.add_css("author_name", ".FeatureByline b ::text")
        il.add_css("author_name", ".GAByline a ::text")
        il.add_css(
            "author_name",
            ".GAByline p ::text",
            re="This article was contributed by (.*)",
        )
        il.add_xpath(
            "updated",
            '//div[@clast="FeatureByline"]/text()[preceding-sibling::br]',
            TakeFirst(),
        )
        il.add_xpath("updated", '//div[@clast="GAByline"]/p[1]/text()')
        # Last resort if date cannot be extracted and it's a weekly edition.
        if "updated" in response.meta:
            il.add_value("updated", response.meta["updated"])
        if response.css(".MakeALink"):
            # Get subscriber link for paywalled content.
            return scrapy.FormRequest.from_response(
                response,
                formcss=".MakeALink form",
                callback=self._subscriber_link,
                meta={"il": il},
            )
        else:
            il.add_value("link", response.url)
            return il.load_item()