scrapy.selector.Selector.xpath.extract - Python examples

Here are examples of the Python API scrapy.selector.Selector.xpath.extract taken from open source projects. By voting up you can indicate which examples are most useful and appropriate.

4 Examples
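Before the project examples, here is a minimal self-contained sketch of the call chain itself: Selector(text=...) parses a raw HTML string, .xpath(...) selects nodes, and .extract() returns every match as a plain list of strings. The HTML snippet below is invented purely for illustration.

from scrapy.selector import Selector

html = '<table><tr><td>USD</td><td>7.09</td></tr></table>'
cells = Selector(text=html).xpath('//td/text()').extract()
print(cells)  # ['USD', '7.09']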

View Complete Implementation : __main__.py
Copyright MIT License
Author : bobleer
def page_get(output, sort, FX_or, erectDate, nothing, FX, i, page, end):
    # Requires: import requests, time; from scrapy.selector import Selector
    payload = {'erectDate': erectDate, 'nothing': nothing, 'pjname': str(FX[i]), 'page': str(page)}
    error_times = 0
    while True:
        try:
            r = requests.post('https://srh.bankofchina.com/search/whpj/search_cn.jsp', data=payload)
            break
        except requests.RequestException:
            error_times += 1
            if error_times > 3:
                print("Retried 3 times, giving up!")
                exit()
            print("Internet error, waiting 2s.\n")
            time.sleep(2)

    html = r.text
    for row in range(2, end):
        try:
            SE_B = Selector(text=html).xpath('//tr[%i]/td[2]/text()' % row).extract()[0]
            BN_B = Selector(text=html).xpath('//tr[%i]/td[3]/text()' % row).extract()[0]
            SE_A = Selector(text=html).xpath('//tr[%i]/td[4]/text()' % row).extract()[0]
            BN_A = Selector(text=html).xpath('//tr[%i]/td[5]/text()' % row).extract()[0]
            # `row_time` (rather than `time`) avoids shadowing the time module used in
            # the retry path; the eval'd `sort` expression must reference `row_time`.
            row_time = Selector(text=html).xpath('//tr[%i]/td[7]/text()' % row).extract()[0].replace('.', '-')
            output.append(eval(sort))

        except IndexError:
            # Fewer than `end` data rows on this page: stop.
            break
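A hedged companion sketch (the helper name cell_text is not from the project): extract_first() with a default avoids the .extract()[0] plus IndexError pattern used above when only a single cell is wanted.

from scrapy.selector import Selector

def cell_text(html, row, col, default=''):
    # Returns the cell's text, or `default` when the row or column is missing.
    return Selector(text=html).xpath(
        '//tr[%i]/td[%i]/text()' % (row, col)
    ).extract_first(default=default)

# e.g. SE_B = cell_text(html, row, 2)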

View Complete Implementation : boc_realtime.py
Copyright MIT License
Author : bobleer
def getrate(last_time, last_rate, error_times, sleeptime, fex_sl, fex_name):
    # Requires: import requests, time; from scrapy.selector import Selector
    while True:
        try:
            r = requests.post('http://srh.bankofchina.com/search/whpj/search.jsp',
                              data={'erectDate': '', 'nothing': '', 'pjname': fex_sl})
        except requests.exceptions.ConnectionError:
            error_times += 1
            print("Network error, attempt %i.\n" % error_times)
            time.sleep(10)
            continue

        body = r.text

        # Selling rate (column 4) and quote time (column 7) of the first data row.
        rate_output = Selector(text=body).xpath('//tr[2]/td[4]/text()').extract()
        time_output = Selector(text=body).xpath('//tr[2]/td[7]/text()').extract()

        # Only report when the quote time has changed since the last poll.
        if last_time != time_output[0]:
            if float(rate_output[0]) < float(last_rate):
                print('[' + fex_name + ']', time_output[0], "-", rate_output[0], "▼", "\a\n")
            elif float(rate_output[0]) > float(last_rate):
                print('[' + fex_name + ']', time_output[0], "-", rate_output[0], "▲", "\n")
            else:
                print('[' + fex_name + ']', time_output[0], "-", rate_output[0], "○", "\n")

            last_time = time_output[0]
            last_rate = rate_output[0]

        time.sleep(sleeptime)
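A hedged alternative for the polling body above (the function name latest_quote is assumed, not from the project): parsing the body once and reading both cells relative to the same row selector avoids building two full Selector objects per poll, and extract_first() returns None instead of raising when the table has no data row.

from scrapy.selector import Selector

def latest_quote(body):
    row = Selector(text=body).xpath('//tr[2]')
    rate = row.xpath('td[4]/text()').extract_first()
    quote_time = row.xpath('td[7]/text()').extract_first()
    return quote_time, rate  # either may be None on an empty table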

View Complete Implementation : boc_spider.py
Copyright MIT License
Author : bobleer
def spider(erectDate, nothing, fex_sl, page):
    # Requires: import requests, time; from scrapy.selector import Selector
    # `error_times`, `lasttime`, `nums`, `out` and `pages` are module-level globals.
    global error_times, lasttime, nums
    try:
        r = requests.post('http://srh.bankofchina.com/search/whpj/search.jsp',
                          data={'erectDate': erectDate, 'nothing': nothing, 'pjname': fex_sl, 'page': page})
    except requests.RequestException:
        error_times += 1
        print("Request failed, retrying in 5s (attempt %i)" % error_times)
        time.sleep(5)
        spider(erectDate, nothing, fex_sl, page)

    else:
        body = r.text
        # Row 1 is the table header; rows 2..21 hold the data.
        for row in range(2, 22):
            rate_output = Selector(text=body).xpath('//tr[%i]/td[4]/text()' % row).extract()
            time_output = Selector(text=body).xpath('//tr[%i]/td[7]/text()' % row).extract()
            try:
                if time_output[0] == lasttime:
                    # Duplicate quote already written: skip it.
                    continue
                out.write(time_output[0] + "," + rate_output[0] + "\n")
                nums += 1
                lasttime = time_output[0]
            except IndexError:
                break
        print("Scraped up to page %i/%i, %i records so far" % (page, pages, nums))

View Complete Implementation : inventus.py
Copyright MIT License
Author : nmalcolm
    def parse_item(self, response):
        item = InventusSpiderItem()
        # response.text is the decoded page body; Selector(text=...) expects a str.
        for url in Selector(text=response.text).xpath('//a/@href').extract():
            # Prefix the base URL onto relative links only; checking both schemes
            # together avoids `not a or b` also matching absolute https:// links.
            if not url.startswith(('http://', 'https://')):
                url = self.base_url + url
            try:
                parsed_uri = urlparse(url)
            except ValueError:
                # If the URL is invalid we can ignore it.
                continue
            if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
                if parsed_uri.netloc not in self.subdomains:
                    self.subdomains.append(parsed_uri.netloc)
                    item['subdomain'] = parsed_uri.netloc
                    yield item

                    if len(self.subdomains) > int(self.subdomain_limit):
                        break

                yield Request(url, callback=self.parse)

        if len(self.subdomains) >= int(self.subdomain_limit):
            raise CloseSpider('subdomain limit reached')
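A hedged companion to the crawl logic above (collect_hosts and its parameters are assumptions, not part of the spider): urllib.parse.urljoin resolves both relative and absolute hrefs pulled out by .extract(), which is what the startswith() prefixing approximates.

from urllib.parse import urljoin, urlparse
from scrapy.selector import Selector

def collect_hosts(body, base_url):
    hosts = set()
    for href in Selector(text=body).xpath('//a/@href').extract():
        absolute = urljoin(base_url, href)  # relative links resolved against the page URL
        parsed = urlparse(absolute)
        if parsed.scheme in ('http', 'https') and parsed.netloc:
            hosts.add(parsed.netloc)
    return hosts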