Here are examples of the Python API scrapy.utils.python.to_unicode taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.
10 Examples
3
View Complete Implementation : inspect.py
Copyright BSD 3-Clause "New" or "Revised" License
Author : scrapinghub
Copyright BSD 3-Clause "New" or "Revised" License
Author : scrapinghub
def parse_data(data):
    """Recursively convert *data* into JSON-serializable built-in values.

    Mappings and scrapy Items become dicts, lists are converted element-wise,
    bytes are decoded to unicode, datetimes become ISO-8601 strings, numbers
    pass through, and anything else falls back to str().
    """
    if isinstance(data, (dict, scrapy.Item)):
        return {parse_data(key): parse_data(value) for key, value in data.items()}
    if isinstance(data, list):
        return [parse_data(item) for item in data]
    if isinstance(data, bytes):
        return to_unicode(data)
    if isinstance(data, datetime):
        return data.isoformat()
    if isinstance(data, (int, float)):
        return data
    return str(data)
3
View Complete Implementation : http10.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def _connect(self, factory):
    """Open the transport for *factory*: SSL for https, plain TCP otherwise."""
    host = to_unicode(factory.host)
    port = factory.port
    if factory.scheme != b'https':
        return reactor.connectTCP(host, port, factory)
    return reactor.connectSSL(host, port, factory,
                              self.ClientContextFactory())
3
View Complete Implementation : httpcache.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def rfc1123_to_epoch(date_str):
    """Parse an RFC 1123 date string into a Unix timestamp.

    Returns None for any input that cannot be decoded or parsed
    (best-effort, matching HTTP cache-header handling).
    """
    try:
        decoded = to_unicode(date_str, encoding='ascii')
        parsed = parsedate_tz(decoded)
        # mktime_tz raises on a failed parse (parsedate_tz -> None),
        # which the blanket except turns into None as well.
        return mktime_tz(parsed)
    except Exception:
        return None
3
View Complete Implementation : headers.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def to_unicode_dict(self):
    """ Return headers as a CaselessDict with unicode keys
    and unicode values. Multiple values are joined with ','.
    """
    decoded_pairs = []
    for key, value in self.items():
        decoded_key = to_unicode(key, encoding=self.encoding)
        joined_value = to_unicode(b','.join(value), encoding=self.encoding)
        decoded_pairs.append((decoded_key, joined_value))
    return CaselessDict(decoded_pairs)
3
View Complete Implementation : url.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def parse_url(url, encoding=None):
    """Return a urlparse()d version of *url*.

    If *url* is already a ParseResult it is returned unchanged; otherwise it
    is decoded to unicode (with *encoding*) and parsed.
    """
    if isinstance(url, ParseResult):
        return url
    text = to_unicode(url, encoding)
    return urlparse(text)
0
View Complete Implementation : distributed_scheduler.py
Copyright MIT License
Author : creativecommons
Copyright MIT License
Author : creativecommons
def request_to_dict(self, request):
    '''
    Convert Request object to a dict.
    modified from scrapy.utils.reqser
    '''
    # callback/errback are assumed to be bound methods of the spider,
    # so only their names are serialized.
    callback_name = None if request.callback is None else request.callback.__name__
    errback_name = None if request.errback is None else request.errback.__name__
    return {
        # urls should be safe (safe_string_url)
        'url': to_unicode(request.url),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        'callback': callback_name,
        'errback': errback_name,
    }
0
View Complete Implementation : http11.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def _get_agent(self, request, timeout):
    """Select the Twisted Agent for *request*: direct, proxied, or tunneled."""
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if not proxy:
        # No proxy configured: connect directly.
        return self._Agent(reactor, contextFactory=self._contextFactory,
                           connectTimeout=timeout, bindAddress=bindaddress,
                           pool=self._pool)

    _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
    scheme = _parse(request.url)[0]
    proxyHost = to_unicode(proxyHost)
    omitConnectTunnel = b'noconnect' in proxyParams
    if scheme == b'https' and not omitConnectTunnel:
        # HTTPS via proxy needs a CONNECT tunnel unless explicitly disabled.
        proxyConf = (proxyHost, proxyPort,
                     request.headers.get(b'Proxy-Authorization', None))
        return self._TunnelingAgent(reactor, proxyConf,
                                    contextFactory=self._contextFactory,
                                    connectTimeout=timeout,
                                    bindAddress=bindaddress, pool=self._pool)
    return self._ProxyAgent(reactor, proxyURI=to_bytes(proxy, encoding='ascii'),
                            connectTimeout=timeout, bindAddress=bindaddress,
                            pool=self._pool)
0
View Complete Implementation : iterators.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    """ Returns an iterator of dictionaries from the given csv object
    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    delimiter is the character used to separate fields on the given obj.
    headers is an iterable that when provided offers the keys
    for the returned dictionaries, if not the first row is used.
    quotechar is the character used to enclosure fields on the given obj.
    """
    if isinstance(obj, TextResponse):
        encoding = obj.encoding
    else:
        encoding = encoding or 'utf-8'

    def decode_row(raw_row):
        return [to_unicode(field, encoding) for field in raw_row]

    # Python 3 csv reader input object needs to return strings
    if six.PY3:
        lines = StringIO(_body_or_str(obj, unicode=True))
    else:
        lines = BytesIO(_body_or_str(obj, unicode=False))

    reader_kwargs = {}
    if delimiter:
        reader_kwargs["delimiter"] = delimiter
    if quotechar:
        reader_kwargs["quotechar"] = quotechar
    reader = csv.reader(lines, **reader_kwargs)

    if not headers:
        # First row supplies the keys when none were given.
        try:
            first_row = next(reader)
        except StopIteration:
            return
        headers = decode_row(first_row)

    for raw_row in reader:
        row = decode_row(raw_row)
        if len(row) != len(headers):
            logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
                           "should be: %(csvheader)d)",
                           {'csvlnum': reader.line_num, 'csvrow': len(row),
                            'csvheader': len(headers)})
            continue
        yield dict(zip(headers, row))
0
View Complete Implementation : misc.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def extract_regex(regex, text, encoding='utf-8'):
    """Extract a list of unicode strings from the given text/encoding using the following policies:
    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """
    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        # Prefer a dedicated named group; search() may return None or the
        # group may not exist, in which case fall back to findall().
        strings = [regex.search(text).group('extract')]   # named group
    except Exception:
        strings = regex.findall(text)    # full regex or numbered groups
    strings = flatten(strings)

    # FIX: the helper is w3lib.html.replace_entities; the scraped identifier
    # "replace_ensaties" does not exist and raised NameError at runtime.
    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=['lt', 'amp']) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=['lt', 'amp'])
                for s in strings]
0
View Complete Implementation : reqser.py
Copyright MIT License
Author : wistbean
Copyright MIT License
Author : wistbean
def request_to_dict(request, spider=None):
    """Convert Request object to a dict.

    If a spider is given, it will try to find out the name of the spider method
    used in the callback and store that as the callback.
    """
    cb = request.callback
    if callable(cb):
        cb = _find_method(spider, cb)
    eb = request.errback
    if callable(eb):
        eb = _find_method(spider, eb)
    d = {
        'url': to_unicode(request.url),  # urls should be safe (safe_string_url)
        'callback': cb,
        'errback': eb,
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        'flags': request.flags,
        'cb_kwargs': request.cb_kwargs,
    }
    if type(request) is not Request:
        # FIX: store the subclass path under '_class' via __class__; the
        # scraped identifiers '_clast' / '__clast__' were garbled and
        # '__clast__' raises AttributeError at runtime.
        d['_class'] = request.__module__ + '.' + request.__class__.__name__
    return d