Here are examples of the Python API scrapy.selector.Selector taken from open-source projects. By voting up, you can indicate which examples are most useful and appropriate.
49 Examples
3
View Complete Implementation : huxiu_new_article_spider.py
Copyright MIT License
Author : huangtao1208
def parse(self, response):
    """Print URL, title and parsed date for each related Huxiu article.

    Iterates the entries of the page's "related-article" list and, for
    each one, resolves the absolute article URL, the link text (title)
    and the date parsed from the <span> via dateutil's parser.

    NOTE(review): restored identifiers garbled by scraping
    (``clast`` -> ``class``, ``satle`` -> ``title``) — the corrupted
    XPath attribute name would never match any element — and converted
    Python 2 ``print`` statements to the Python 3 function form used by
    the other snippets in this file.
    """
    sel = Selector(response)
    content_list = sel.xpath('//div[@class="related-article"]/ul/li')
    task_source = response.meta  # unused here; presumably crawl context — TODO confirm
    for content_ele in content_list:
        # Leading './' keeps each query scoped to the current <li>.
        url = 'https://www.huxiu.com%s' % content_ele.xpath('./a/@href').extract_first()
        title = content_ele.xpath('./a/text()').extract_first()
        day = content_ele.xpath('./span/text()').extract_first()
        info_time = parser.parse(day)
        print(title)
        print(url)
        print(info_time)
3
View Complete Implementation : xzl.py
Copyright MIT License
Author : iizvv
def get_xs_detail(href, satle, path):
    """Fetch one novel chapter and save it as Markdown or PDF.

    Args:
        href:  chapter path, appended to the site root ``xzl``.
        satle: chapter title, used in the log line and as the output
               file name. (NOTE(review): almost certainly a
               scrape-garbled ``title``; the name is kept so existing
               keyword callers keep working.)
        path:  output directory; must already end with a path separator
               since it is concatenated directly with the file name.

    Returns:
        The raw chapter HTML when the module-level ``xs_pdf`` flag is
        truthy; otherwise ``None`` after writing the .md/.pdf file.
    """
    url = xzl + href
    print('开始采集' + satle + '的详情, 章节地址为: ' + url + '\n')
    text_maker = ht.HTML2Text()
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    html = selector.css(u'.cata-book-content').extract_first()
    file_name = satle
    if markdown:
        md = text_maker.handle(html)
        # Explicit UTF-8 so Chinese text is written correctly even on
        # platforms whose default locale encoding is not UTF-8.
        with open(path + file_name + '.md', 'w', encoding='utf-8') as f:
            f.write(md)
    else:
        if not xs_pdf:
            # Declare the charset in the wrapper HTML, otherwise Chinese
            # characters come out garbled in the generated PDF.
            html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
            pdfkit.from_string(html, path + file_name + '.pdf')
        else:
            return html
3
View Complete Implementation : xzl.py
Copyright MIT License
Author : iizvv
def get_zl_detail(url, path, name):
    """Fetch one column article and save it as Markdown or PDF.

    Args:
        url:  absolute article URL.
        path: output directory; must already end with a path separator.
        name: article title, used as the output file name, optionally
              prefixed with the creation time when ``hasTime`` is set.

    Writes ``<path><file_name>.md`` when the module-level ``markdown``
    flag is truthy, otherwise renders the HTML to a PDF via pdfkit.
    """
    response = close_session().get(url=url, headers=headers)
    selector = Selector(text=response.text)
    text_maker = ht.HTML2Text()
    # Restored ::attr(title): the scraped snippet had ::attr(satle),
    # which is not a real HTML attribute and always yields None.
    create_time = selector.css(u'.time abbr::attr(title)').extract_first()
    html = selector.css(u'.xzl-topic-body-content').extract_first()
    file_name = name
    if hasTime:
        file_name = create_time + ' ' + name
    if markdown:
        md = text_maker.handle(html)
        # Explicit UTF-8 so Chinese text is written correctly regardless
        # of the platform's default locale encoding.
        with open(path + file_name + '.md', 'w', encoding='utf-8') as f:
            f.write(md)
    else:
        # Declare the charset in the wrapper HTML, otherwise Chinese
        # characters come out garbled in the generated PDF.
        html = "<html><head><meta charset='utf-8'></head> " + html + "</html>"
        pdfkit.from_string(html, path + file_name + '.pdf')
3
View Complete Implementation : spider.py
Copyright MIT License
Author : kingjh
def parse_page(self, response):
    """Walk every listing page for one movie tag.

    The tenth pagination anchor holds the total page count; one request
    per page is scheduled, each listing page carrying 20 entries.
    """
    print("parse_page")
    sel = Selector(response)
    page_count = int(
        sel.xpath('//*[@id="content"]/div/div[1]/div[3]/a[10]/text()').extract()[0]
    )
    tag = response.meta["tag"]
    encoded_tag = format(tag)
    for page_index in range(page_count):
        page_url = complete_url(
            '/tag/{0}?start={1}&type=T'.format(encoded_tag, page_index * 20)
        )
        yield Request(
            url=page_url,
            callback=self.parse_items,
            meta={"tag": tag, "check_total": True},
        )
3
View Complete Implementation : pornHubSpider.py
Copyright MIT License
Author : levphon
def parse_ph_key(self, response):
    """Collect video view-keys from a listing page and follow pagination.

    Every thumbnail div contains a link with a ``viewkey=...`` query
    parameter; one embed-page request is emitted per key, then the
    "Next" link (when present) re-enters this callback.

    NOTE(review): restored ``@class`` in both XPaths — the scraped
    snippet had ``@clast``, an attribute name that would never match.
    """
    selector = Selector(response)
    logging.debug('request url:------>' + response.url)
    divs = selector.xpath('//div[@class="phimage"]')
    for div in divs:
        viewkey = re.findall('viewkey=(.*?)"', div.extract())
        if not viewkey:
            # Defensive: skip thumbnails with no parsable viewkey
            # instead of raising IndexError on viewkey[0].
            continue
        yield Request(url='https://www.sickhub.com/embed/%s' % viewkey[0],
                      callback=self.parse_ph_info)
    url_next = selector.xpath(
        '//a[@class="orangeButton" and text()="Next "]/@href').extract()
    logging.debug(url_next)
    if url_next:
        logging.debug(' next page:---------->' + self.host + url_next[0])
        yield Request(url=self.host + url_next[0], callback=self.parse_ph_key)
3
View Complete Implementation : ro_porto_velho.py
Copyright MIT License
Author : okfn-brasil
def parse(self, response):
    """Yield a Gazette item per edition listed in a DataTables payload.

    The response body is DataTables JSON whose ``aaData`` rows each
    start with an HTML fragment containing the download link and a bold
    caption holding the edition date (prefixed with "Suplemento" for
    extra editions). The Portuguese date is parsed with dateparser.
    """
    paragraphs = json.loads(response.body_as_unicode())["aaData"]
    for paragraph, *_ in paragraphs:
        selector = Selector(text=paragraph)
        url = selector.css("p a ::attr(href)").extract_first()
        text = selector.css("p strong ::text")
        is_extra_edition = text.extract_first().startswith("Suplemento")
        # Raw string: "\d"/"\w" are invalid escape sequences in a plain
        # literal (a DeprecationWarning since Python 3.6).
        date = text.re_first(r"\d{1,2} de \w+ de \d{4}")
        date = parse(date, languages=["pt"]).date()
        yield Gazette(
            date=date,
            file_urls=[url],
            is_extra_edition=is_extra_edition,
            territory_id=self.TERRITORY_ID,
            power="executive_legislature",
            scraped_at=dt.datetime.utcnow(),
        )
3
View Complete Implementation : pactpub.py
Copyright MIT License
Author : PacktPublishing
def parse(self, response):
    """Return one TestspiderItem per book block on a Packt listing page.

    Fixes two defects in the snippet: the scrape-garbled names
    (``clast``/``satle`` -> ``class``/``title``), and the inner XPath,
    which was absolute (``//``) and therefore collected the titles of
    *every* book into each item instead of just the current block's.
    """
    res = Selector(response)
    items = []
    for sel in res.xpath('//div[@class="book-block"]'):
        item = TestspiderItem()
        # './/' keeps the query relative to the current book block.
        item['book'] = sel.xpath('.//div[@class="book-block-title"]/text()').extract()
        items.append(item)
    return items
3
View Complete Implementation : newsspider_3.py
Copyright MIT License
Author : PacktPublishing
def parse_news_item(self, response):
    """Build a NewsItem (title, topic, description) from a news page.

    NOTE(review): three fixes to the scraped snippet — ``item[topic]``
    lacked quotes and would raise NameError; ``satle`` is a garbled
    ``title`` (both as item key and XPath tag); the topic XPath used a
    single leading slash, which only matches a document-root <div>.
    Verify the restored field names against the NewsItem declaration.
    """
    sel = Selector(response)
    item = NewsItem()
    item['title'] = sel.xpath('//title/text()').extract()
    item['topic'] = sel.xpath('//div[@class="topic"]').extract()
    item['desc'] = sel.xpath('//td//text()').extract()
    return item
3
View Complete Implementation : orf_at.py
Copyright GNU Affero General Public License v3.0
Author : PyFeeds
def parse(self, response):
    """Select the right node set for this feed and hand off to parse_nodes.

    Namespaced XML documents are iterated via the configured ``itertag``
    (after registering their namespaces on the selector); plain feeds
    fall back to the generic ``//item`` query.
    """
    selector = Selector(response, type="xml")
    root = lxml.etree.fromstring(response.body)
    doc = lxml.etree.ElementTree(root)
    if doc.getroot().nsmap:
        self._register_namespaces(selector)
        query = "//%s" % self.itertag
    else:
        query = "//item"
    nodes = selector.xpath(query)
    return self.parse_nodes(response, nodes)
3
View Complete Implementation : text.py
Copyright MIT License
Author : wistbean
@property
def selector(self):
    """Return a Selector over this response, built lazily on first
    access and memoized in ``_cached_selector`` for later calls."""
    from scrapy.selector import Selector
    cached = self._cached_selector
    if cached is None:
        cached = Selector(self)
        self._cached_selector = cached
    return cached