Here are examples of using the Python API `scrapy.selector.Selector.xpath(...).extract()`, taken from open-source projects. Vote up the examples you find most useful and appropriate.
4 Examples
0
View Complete Implementation : __main__.py
Copyright MIT License
Author : bobleer
def page_get(output, sort, FX_or, erectDate, nothing, FX, i, page, end):
    """Fetch one result page of Bank of China FX quotes and append parsed rows.

    Posts the search form for currency ``FX[i]`` / page ``page``, then walks
    table rows 2..end-1, extracting the four quote columns and the quote time.
    One entry per row is appended to *output*, built by evaluating the *sort*
    expression against the extracted fields (``SE_B``, ``BN_B``, ``SE_A``,
    ``BN_A``, ``time``).  The HTTP POST is retried up to 3 times on failure,
    after which the process exits.
    """
    payload = {'erectDate': erectDate, 'nothing': nothing,
               'pjname': str(FX[i]), 'page': str(page)}
    r = None
    # One initial attempt plus 3 retries.  The original retry loop never
    # incremented its counter (infinite loop) and its sleep crashed with
    # UnboundLocalError because a later assignment shadowed the time module.
    for _attempt in range(4):
        try:
            r = requests.post(
                'https://srh.bankofchina.com/search/whpj/search_cn.jsp',
                data=payload)
            break
        except requests.exceptions.RequestException:
            print("Internet Error, waiting 2s.\n")
            time.sleep(2)
    else:
        print("Retry 3 times, break!")
        exit()
    html = r.text
    sel = Selector(text=html)  # build the selector once, not once per field
    for row in range(2, end):
        try:
            SE_B = sel.xpath('//tr[%i]/td[2]/text()' % row).extract()[0]
            BN_B = sel.xpath('//tr[%i]/td[3]/text()' % row).extract()[0]
            SE_A = sel.xpath('//tr[%i]/td[4]/text()' % row).extract()[0]
            BN_A = sel.xpath('//tr[%i]/td[5]/text()' % row).extract()[0]
            quote_time = sel.xpath('//tr[%i]/td[7]/text()' % row).extract()[0].replace('.', '-')
            # The original bound the quote time to a local literally named
            # `time` (shadowing the module) so that eval(sort) could see it.
            # Supply an explicit namespace instead, so the expression still
            # resolves `time` without shadowing the module.
            # NOTE(review): eval() on a caller-supplied string is unsafe on
            # untrusted input -- confirm `sort` is always program-defined.
            scope = {'SE_B': SE_B, 'BN_B': BN_B, 'SE_A': SE_A, 'BN_A': BN_A,
                     'time': quote_time, 'FX': FX, 'FX_or': FX_or, 'i': i,
                     'page': page, 'end': end, 'erectDate': erectDate,
                     'nothing': nothing}
            output.append(eval(sort, scope))
        except IndexError:
            # Ran past the last populated table row on this page.
            break
0
View Complete Implementation : boc_realtime.py
Copyright MIT License
Author : bobleer
def getrate(last_time, last_rate, error_times, sleeptime, fex_sl, fex_name):
    """Poll the Bank of China realtime FX quote page forever and print changes.

    Whenever the quote timestamp differs from the previous poll, prints a line
    with a direction marker: "▼" falling (with a terminal bell), "▲" rising,
    "○" unchanged.  Sleeps *sleeptime* seconds between polls; connection
    errors are counted and retried after a 10 s pause.  Never returns.
    """
    while True:
        try:
            r = requests.post(
                'http://srh.bankofchina.com/search/whpj/search.jsp',
                data={'erectDate': '', 'nothing': '', 'pjname': fex_sl})
        except requests.exceptions.ConnectionError:
            error_times += 1
            print("网络错误, 第%i次.\n" % (error_times))
            time.sleep(10)
            continue
        body = r.text
        sel = Selector(text=body)  # build the selector once per poll
        rate_output = sel.xpath('//tr[2]/td[4]/text()').extract()
        time_output = sel.xpath('//tr[2]/td[7]/text()').extract()
        # Guard: an unexpected page layout yields empty extracts; the
        # original crashed with IndexError here.  Skip this poll instead.
        if not rate_output or not time_output:
            time.sleep(sleeptime)
            continue
        if last_time != time_output[0]:
            if float(rate_output[0]) < float(last_rate):
                print('[' + fex_name + ']', time_output[0], "-", rate_output[0], "▼", "\a\n")
            elif float(rate_output[0]) > float(last_rate):
                print('[' + fex_name + ']', time_output[0], "-", rate_output[0], "▲", "\n")
            else:
                print('[' + fex_name + ']', time_output[0], "-", rate_output[0], "○", "\n")
        last_time = time_output[0]
        last_rate = rate_output[0]
        time.sleep(sleeptime)
0
View Complete Implementation : boc_spider.py
Copyright MIT License
Author : bobleer
def spider(erectDate, nothing, fex_sl, page):
    """Scrape one page of historical BOC FX quotes into the global `out` file.

    Posts the search form, then walks table rows 2..21 writing
    "time,rate" lines to the module-level file object `out`, skipping rows
    whose timestamp equals the module-level `lasttime` (duplicates).
    Updates module-level counters `nums`, `lasttime` and `error_times`;
    reads module-level `pages` for the progress message.

    Network failures are retried every 5 seconds in a loop (the original
    recursed on failure, which could exhaust the recursion limit on a long
    outage).
    """
    global error_times, lasttime, nums
    while True:
        try:
            # Renamed from `input`, which shadowed the builtin.
            resp = requests.post(
                'http://srh.bankofchina.com/search/whpj/search.jsp',
                data={'erectDate': erectDate, 'nothing': nothing,
                      'pjname': fex_sl, 'page': page})
            break
        except requests.exceptions.RequestException:
            error_times += 1
            print("出现错误, 5秒后重试(第%i次)" % (error_times))
            time.sleep(5)
    body = resp.text
    sel = Selector(text=body)  # build the selector once, not twice per row
    for row in range(2, 22):
        rate_output = sel.xpath('//tr[%i]/td[4]/text()' % (row)).extract()
        time_output = sel.xpath('//tr[%i]/td[7]/text()' % (row)).extract()
        try:
            if time_output[0] == lasttime:
                # Duplicate row (same timestamp as the last one written).
                continue
            out.write(time_output[0] + "," + rate_output[0] + "\n")
            nums += 1
            lasttime = time_output[0]
        except IndexError:
            # Ran past the last populated table row on this page.
            break
    print("已抓取至第%i/%i页, %i条数据" % (page, pages, nums))
0
View Complete Implementation : inventus.py
Copyright MIT License
Author : nmalcolm
def parse_item(self, response):
    """Extract subdomain items and follow-up requests from a crawled page.

    Yields an InventusSpiderItem for each newly seen subdomain of
    ``self.domain`` and a Request for every discovered link, then raises
    CloseSpider once ``self.subdomain_limit`` subdomains have been found.
    """
    item = InventusSpiderItem()
    for url in Selector(text=response.body).xpath('//a/@href').extract():
        # Bug fix: the original condition was
        #   not url.startswith('http://') or url.startswith('https://')
        # whose precedence made every https:// URL get wrongly prefixed
        # with base_url.  Prefix only genuinely relative URLs.
        if not url.startswith(('http://', 'https://')):
            url = self.base_url + url
        try:
            parsed_uri = urlparse(url)
        except ValueError:
            # If the URL is invalid we can ignore it.
            continue
        if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
            if parsed_uri.netloc not in self.subdomains:
                self.subdomains.append(parsed_uri.netloc)
                item['subdomain'] = parsed_uri.netloc
                yield item
                if len(self.subdomains) > int(self.subdomain_limit):
                    break
        yield Request(url, callback=self.parse)
    if len(self.subdomains) >= int(self.subdomain_limit):
        raise CloseSpider('subdomain limit reached')