scrapy.utils.python.to_unicode - python examples

Here are examples of the Python API scrapy.utils.python.to_unicode taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

10 Examples 7

3 View Complete Implementation : inspect.py
Copyright BSD 3-Clause "New" or "Revised" License
Author : scrapinghub
def parse_data(data):
    """Recursively convert ``data`` into plain values.

    Dicts and ``scrapy.Item`` objects become dicts whose keys and values
    are converted recursively, lists are converted element by element,
    bytes go through ``to_unicode``, datetimes are rendered with
    ``isoformat()``, and ints/floats are returned unchanged. Any other
    value is passed through ``str()``.
    """
    if isinstance(data, (dict, scrapy.Item)):
        converted = {}
        for key, value in data.items():
            new_key = parse_data(key)
            converted[new_key] = parse_data(value)
        return converted
    if isinstance(data, list):
        return list(map(parse_data, data))
    if isinstance(data, bytes):
        return to_unicode(data)
    if isinstance(data, datetime):
        return data.isoformat()
    if isinstance(data, (int, float)):
        return data
    # Fallback for every type not handled above.
    return str(data)

3 View Complete Implementation : http10.py
Copyright MIT License
Author : wistbean
    def _connect(self, factory):
        """Connect ``factory`` to its host and port through the reactor.

        ``reactor.connectSSL`` with a new ``self.ClientContextFactory()`` is
        used when ``factory.scheme`` is ``b'https'``; every other scheme
        goes through ``reactor.connectTCP``. The connector returned by the
        reactor call is returned to the caller.
        """
        host = to_unicode(factory.host)
        port = factory.port
        if factory.scheme != b'https':
            return reactor.connectTCP(host, port, factory)
        return reactor.connectSSL(
            host, port, factory, self.ClientContextFactory())

3 View Complete Implementation : httpcache.py
Copyright MIT License
Author : wistbean
def rfc1123_to_epoch(date_str):
    """Convert a date string into an epoch timestamp, or ``None`` on failure.

    The input is first passed through ``to_unicode`` using ASCII, then
    parsed with ``parsedate_tz`` and converted with ``mktime_tz``. Any
    exception raised along the way is swallowed and ``None`` is returned.
    """
    try:
        parsed = parsedate_tz(to_unicode(date_str, encoding='ascii'))
        epoch = mktime_tz(parsed)
    except Exception:
        # Best-effort conversion: callers get None for anything unparsable.
        return None
    return epoch

3 View Complete Implementation : headers.py
Copyright MIT License
Author : wistbean
    def to_unicode_dict(self):
        """ Build a CaselessDict holding the headers as unicode text.

        Each key is converted with ``to_unicode`` using ``self.encoding``.
        The values of a header are joined with ``b','`` and the joined
        result is converted the same way.
        """
        def _decoded(pairs):
            for name, values in pairs:
                text_name = to_unicode(name, encoding=self.encoding)
                joined = b','.join(values)
                yield text_name, to_unicode(joined, encoding=self.encoding)

        return CaselessDict(_decoded(self.items()))

3 View Complete Implementation : url.py
Copyright MIT License
Author : wistbean
def parse_url(url, encoding=None):
    """Return the ``urlparse`` result for ``url``.

    A ``ParseResult`` instance is handed back untouched; any other value
    is converted with ``to_unicode(url, encoding)`` and then parsed.
    """
    already_parsed = isinstance(url, ParseResult)
    return url if already_parsed else urlparse(to_unicode(url, encoding))

0 View Complete Implementation : distributed_scheduler.py
Copyright MIT License
Author : creativecommons
    def request_to_dict(self, request):
        '''
        Build a dict from the attributes of a Request object.
        modified from scrapy.utils.reqser
        '''
        req_dict = {
            # urls should be safe (safe_string_url)
            'url': to_unicode(request.url),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
        }
        # callback/errback are assumed to be bound methods of the spider,
        # so only their names are stored (None when not set).
        for hook in ('callback', 'errback'):
            handler = getattr(request, hook)
            req_dict[hook] = handler.__name__ if handler is not None else None
        return req_dict

0 View Complete Implementation : http11.py
Copyright MIT License
Author : wistbean
    def _get_agent(self, request, timeout):
        """Build the agent object used for ``request``.

        The bind address comes from ``request.meta['bindaddress']`` and
        falls back to ``self._bindAddress``. Without a ``proxy`` entry in
        ``request.meta`` a ``self._Agent`` is returned. With a proxy, a
        ``self._TunnelingAgent`` is returned when the request URL scheme is
        ``b'https'`` and the proxy parameters do not contain
        ``b'noconnect'``; otherwise a ``self._ProxyAgent`` is returned.
        """
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if not proxy:
            # No proxy configured for this request.
            return self._Agent(reactor, contextFactory=self._contextFactory,
                connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
        target_scheme = _parse(request.url)[0]
        proxy_host = to_unicode(proxy_host)
        skip_tunnel = b'noconnect' in proxy_params

        if target_scheme != b'https' or skip_tunnel:
            proxy_uri = to_bytes(proxy, encoding='ascii')
            return self._ProxyAgent(reactor, proxyURI=proxy_uri,
                connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        # https through a proxy that allows tunneling: the proxy host, port
        # and Proxy-Authorization header are handed to the tunneling agent.
        proxy_conf = (proxy_host, proxy_port,
                      request.headers.get(b'Proxy-Authorization', None))
        return self._TunnelingAgent(reactor, proxy_conf,
            contextFactory=self._contextFactory, connectTimeout=timeout,
            bindAddress=bindaddress, pool=self._pool)

0 View Complete Implementation : iterators.py
Copyright MIT License
Author : wistbean
def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    """ Yield one dictionary per row of the given csv object

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter: character separating the fields in obj.

    headers: iterable supplying the keys of the yielded dictionaries;
    when it is not provided the first row is used instead.

    quotechar: character used to enclose fields in obj.

    Rows whose number of fields differs from the number of headers are
    logged with a warning and skipped.
    """

    if isinstance(obj, TextResponse):
        encoding = obj.encoding
    else:
        encoding = encoding or 'utf-8'

    def decode_fields(fields):
        return [to_unicode(cell, encoding) for cell in fields]

    # Python 3 csv reader input object needs to return strings
    if not six.PY3:
        stream = BytesIO(_body_or_str(obj, unicode=False))
    else:
        stream = StringIO(_body_or_str(obj, unicode=True))

    # Only forward the options the caller actually supplied.
    reader_options = {
        name: value
        for name, value in (("delimiter", delimiter), ("quotechar", quotechar))
        if value
    }
    reader = csv.reader(stream, **reader_options)

    if not headers:
        try:
            first_row = next(reader)
        except StopIteration:
            # Empty input: nothing to yield.
            return
        headers = decode_fields(first_row)

    for raw_row in reader:
        fields = decode_fields(raw_row)
        if len(fields) == len(headers):
            yield dict(zip(headers, fields))
            continue
        logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
                       "should be: %(csvheader)d)",
                       {'csvlnum': reader.line_num, 'csvrow': len(fields),
                        'csvheader': len(headers)})

0 View Complete Implementation : misc.py
Copyright MIT License
Author : wistbean
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned

    ``regex`` may be a pattern string (compiled here with ``re.UNICODE``)
    or an already compiled pattern. Every extracted string is passed
    through ``replace_entities`` with ``keep=['lt', 'amp']``; when ``text``
    is not a text string the matches are first converted with
    ``to_unicode(s, encoding)``.
    """

    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        # No match, or no group named "extract": fall back to findall.
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    # The helper is w3lib's ``replace_entities``; the previous spelling
    # ``replace_ensaties`` was a garbled identifier.
    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]

0 View Complete Implementation : reqser.py
Copyright MIT License
Author : wistbean
def request_to_dict(request, spider=None):
    """Convert Request object to a dict.

    If a spider is given, it will try to find out the name of the spider method
    used in the callback and store that as the callback.

    The same lookup (``_find_method``) is applied to the errback. When
    ``request`` is not exactly a ``Request`` instance, the dotted path of
    its class is stored under the ``'_class'`` key.
    """
    cb = request.callback
    if callable(cb):
        cb = _find_method(spider, cb)
    eb = request.errback
    if callable(eb):
        eb = _find_method(spider, eb)
    d = {
        'url': to_unicode(request.url),  # urls should be safe (safe_string_url)
        'callback': cb,
        'errback': eb,
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        'flags': request.flags,
        'cb_kwargs': request.cb_kwargs,
    }
    # Exact type comparison on purpose: subclasses must record their class.
    if type(request) is not Request:
        # ``__class__`` / '_class' replace the garbled ``__clast__`` / '_clast',
        # which raised AttributeError for every Request subclass.
        d['_class'] = request.__module__ + '.' + request.__class__.__name__
    return d