openscrapers.modules.client.agent - Python examples

Here are examples of the Python API openscrapers.modules.client.agent, taken from open source projects.

9 Examples
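
A minimal usage sketch of the pattern shared by the examples below: client.agent() supplies a browser User-Agent string and client.request() performs the HTTP request. The import path and search URL here are illustrative assumptions, not taken from any single example.

    # Minimal sketch: assumed import path and hypothetical URL.
    # client.agent() returns a User-Agent string; client.request()
    # fetches the page, as in the examples below.
    from openscrapers.modules import client

    headers = {'User-Agent': client.agent()}
    html = client.request('https://example.com/search?q=example+title', headers=headers)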

Example 1 : rapidmoviez.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
    def search(self, title, year):
        try:
            url = urlparse.urljoin(self.base_link, self.search_link % (urllib.quote_plus(title)))
            headers = {'User-Agent': client.agent()}
            r = self.scraper.get(url, headers=headers).content

            # switch to client.parseDOM() to drop the dom_parser import
            r = dom_parser.parse_dom(r, 'div', {'class': 'list_items'})[0]
            r = dom_parser.parse_dom(r.content, 'li')
            r = [(dom_parser.parse_dom(i, 'a', {'class': 'title'})) for i in r]
            r = [(i[0].attrs['href'], i[0].content) for i in r]
            r = [(urlparse.urljoin(self.base_link, i[0])) for i in r if cleantitle.get(title) in cleantitle.get(i[1]) and year in i[1]]

            if r:
                return r[0]
            else:
                return
        except:
            return

Example 2 : directstream.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
def google(url):
	try:
		if any(x in url for x in ['youtube.', 'docid=']): url = 'https://drive.google.com/file/d/%s/view' % \
		                                                        re.compile('docid=([\w-]+)').findall(url)[0]
		netloc = urlparse.urlparse(url.strip().lower()).netloc
		netloc = netloc.split('.google')[0]
		if netloc == 'docs' or netloc == 'drive':
			url = url.split('/preview', 1)[0]
			url = url.replace('drive.google.com', 'docs.google.com')
		headers = {'User-Agent': client.agent()}
		result = client.request(url, output='extended', headers=headers)
		try:
			headers['Cookie'] = result[2]['Set-Cookie']
		except:
			pass
		result = result[0]
		if netloc == 'docs' or netloc == 'drive':
			result = re.compile('"fmt_stream_map",(".+?")').findall(result)[0]
			result = json.loads(result)
			result = [i.split('|')[-1] for i in result.split(',')]
			result = sum([googletag(i, append_height=True) for i in result], [])
		elif netloc == 'photos':
			result = result.replace('\r', '').replace('\n', '').replace('\t', '')
			result = re.compile('"\d*/\d*x\d*.+?","(.+?)"').findall(result)[0]
			result = result.replace('\\u003d', '=').replace('\\u0026', '&')
			result = re.compile('url=(.+?)&').findall(result)
			result = [urllib.unquote(i) for i in result]
			result = sum([googletag(i, append_height=True) for i in result], [])
		elif netloc == 'picasaweb':
			id = re.compile('#(\d*)').findall(url)[0]
			result = re.search('feedPreload:\s*(.*}]}})},', result, re.DOTALL).group(1)
			result = json.loads(result)['feed']['entry']
			if len(result) > 1:
				result = [i for i in result if str(id) in i['link'][0]['href']][0]
			elif len(result) == 1:
				result = result[0]
			result = result['media']['content']
			result = [i['url'] for i in result if 'video' in i['type']]
			result = sum([googletag(i, append_height=True) for i in result], [])
		elif netloc == 'plus':
			id = urlparse.urlparse(url).path.split('/')[-1]
			result = result.replace('\r', '').replace('\n', '').replace('\t', '')
			result = result.split('"%s"' % id)[-1].split(']]')[0]
			result = result.replace('\\u003d', '=').replace('\\u0026', '&')
			result = re.compile('url=(.+?)&').findall(result)
			result = [urllib.unquote(i) for i in result]
			result = sum([googletag(i, append_height=True) for i in result], [])
		result = sorted(result, key=lambda i: i.get('height', 0), reverse=True)
		url = []
		for q in ['4K', '1440p', '1080p', 'HD', 'SD']:
			try:
				url += [[i for i in result if i.get('quality') == q][0]]
			except:
				pass
		for i in url:
			i.pop('height', None)
			i.update({'url': i['url'] + '|%s' % urllib.urlencode(headers)})
		if not url: return
		return url
	except:
		return

Example 3 : rapidmoviez.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
    def sources(self, url, hostDict, hostprDict):
        try:
            self.sources = []

            if url is None:
                return self.sources

            if debrid.status() is False:
                raise Exception()

            self.hostDict = hostDict + hostprDict

            data = urlparse.parse_qs(url)
            data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

            title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
            # title = title.replace('&', 'and').replace('Special Victims Unit', 'SVU')

            hdlr = data['year']
            hdlr2 = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else ''
            imdb = data['imdb']

            url = self.search(title, hdlr)
            headers = {'User-Agent': client.agent()}
            r = self.scraper.get(url, headers=headers).content

            if hdlr2 == '':
                r = dom_parser.parse_dom(r, 'ul', {'id': 'releases'})[0]
            else:
                r = dom_parser.parse_dom(r, 'ul', {'id': 'episodes'})[0]

            r = dom_parser.parse_dom(r.content, 'a', req=['href'])
            r = [(i.content, urlparse.urljoin(self.base_link, i.attrs['href'])) for i in r if i and i.content != 'Watch']

            if hdlr2 != '':
                r = [(i[0], i[1]) for i in r if hdlr2.lower() in i[0].lower()]

            threads = []
            for i in r:
                threads.append(workers.Thread(self._get_sources, i[0], i[1]))
            [i.start() for i in threads]
            # [i.join() for i in threads]

            alive = [x for x in threads if x.is_alive() is True]
            while alive:
                alive = [x for x in threads if x.is_alive() is True]
                time.sleep(0.1)
            return self.sources
        except:
            source_utils.scraper_error('RAPIDMOVIEZ')
            return self.sources

Example 4 : rapidmoviez.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
    def _get_sources(self, name, url):
        try:
            headers = {'User-Agent': client.agent()}
            r = self.scraper.get(url, headers=headers).content

            name = client.replaceHTMLCodes(name)
            l = dom_parser.parse_dom(r, 'div', {'class': 'ppu2h'})
            s = ''

            for i in l:
                s += i.content

            # scan the concatenated block content and drop archive/subtitle links
            urls = re.findall(r'''((?:http|ftp|https)://[\w_-]+(?:(?:\.[\w_-]+)+)[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])''', s, flags=re.MULTILINE|re.DOTALL)
            urls = [i for i in urls if '.rar' not in i and '.zip' not in i and '.iso' not in i and '.idx' not in i and '.sub' not in i]

            for url in urls:
                if url in str(self.sources):
                    continue

                valid, host = source_utils.is_host_valid(url, self.hostDict)
                if not valid:
                    continue
                host = client.replaceHTMLCodes(host)
                host = host.encode('utf-8')

                quality, info = source_utils.get_release_quality(name, url)

                try:
                    size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+)\s*(?:GiB|MiB|GB|MB))', name)[0]
                    div = 1 if size.endswith(('GB', 'GiB')) else 1024
                    size = float(re.sub('[^0-9|/.|/,]', '', size)) / div
                    size = '%.2f GB' % size
                    info.append(size)
                except:
                    pass

                info = ' | '.join(info)

                self.sources.append({'source': host, 'quality': quality, 'language': 'en', 'url': url, 'info': info, 'direct': False, 'debridonly': True})
        except:
            source_utils.scraper_error('RAPIDMOVIEZ')
            pass

Example 5 : 1337x.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
	def _get_items(self, url):
		try:
			headers = {'User-Agent': client.agent()}
			r = client.request(url, headers=headers)
			if '<tbody' not in r:
				return self.items

			posts = client.parseDOM(r, 'tbody')[0]
			posts = client.parseDOM(posts, 'tr')

			for post in posts:
				data = client.parseDOM(post, 'a', ret='href')[1]
				link = urlparse.urljoin(self.base_link, data)

				name = client.parseDOM(post, 'a')[1]

				t = name.split(self.hdlr)[0].replace(self.year, '').replace('(', '').replace(')', '').replace('&', 'and')
				if cleantitle.get(t) != cleantitle.get(self.title):
					continue

				if self.hdlr not in name:
					raise Exception()

				try:
					size = re.findall('((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GiB|MiB|GB|MB))', post)[0]
					div = 1 if size.endswith(('GB', 'GiB')) else 1024
					size = float(re.sub('[^0-9|/.|/,]', '', size.replace(',', '.'))) / div
					size = '%.2f GB' % size
				except:
					size = '0'
					pass

				self.items.append((name, link, size))

			return self.items

		except:
			source_utils.scraper_error('1337X')
			return self.items

Example 6 : glodls.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
	def _get_items(self, url):
		items = []
		try:
			headers = {'User-Agent': client.agent()}
			r = client.request(url, headers=headers)
			posts = client.parseDOM(r, 'tr', attrs={'class': 't-row'})
			posts = [i for i in posts if 'racker:' not in i]

			for post in posts:
				ref = client.parseDOM(post, 'a', ret='href')
				url = [i for i in ref if 'magnet:' in i][0]

				if any(x in url.lower() for x in ['french', 'italian', 'spanish', 'truefrench', 'dublado', 'dubbed']):
					continue

				name = client.parseDOM(post, 'a', ret='title')[0]

				t = name.split(self.hdlr)[0].replace(self.year, '').replace('(', '').replace(')', '').replace('&', 'and')
				if cleantitle.get(t) != cleantitle.get(self.title):
					continue

				if self.hdlr not in name:
					continue

				try:
					size = re.findall('((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GiB|MiB|GB|MB))', post)[0]
					div = 1 if size.endswith(('GB', 'GiB')) else 1024
					size = float(re.sub('[^0-9|/.|/,]', '', size.replace(',', '.'))) / div
					size = '%.2f GB' % size
				except:
					size = '0'
					pass

				items.append((name, url, size))

			return items

		except:
			source_utils.scraper_error('GLODLS')
			return items

Example 7 : kickass2.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
	def _get_items(self, url):
		try:
			headers = {'User-Agent': client.agent()}
			r = client.request(url, headers=headers)
			posts = client.parseDOM(r, 'tr', attrs={'id': 'torrent_latest_torrents'})

			for post in posts:
				ref = client.parseDOM(post, 'a', attrs={'title': 'Torrent magnet link'}, ret='href')[0]
				link = urllib.unquote(ref).decode('utf8').replace('https://mylink.me.uk/?url=', '').replace('https://mylink.cx/?url=', '')

				name = urllib.unquote_plus(re.search('dn=([^&]+)', link).groups()[0])

				t = name.split(self.hdlr)[0].replace(self.year, '').replace('(', '').replace(')', '').replace('&', 'and')
				if cleantitle.get(t) != cleantitle.get(self.title):
					continue

				if self.hdlr not in name:
					continue

				try:
					size = re.findall('((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GiB|MiB|GB|MB))', post)[0]
					div = 1 if size.endswith(('GB', 'GiB')) else 1024
					size = float(re.sub('[^0-9|/.|/,]', '', size.replace(',', '.'))) / div
					size = '%.2f GB' % size
				except:
					size = '0'
					pass

				self.items.append((name, link, size))

			return self.items

		except:
			source_utils.scraper_error('KICKASS2')
			return self.items

Example 8 : limetorrents.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
	def _get_items(self, url):
		try:
			headers = {'User-Agent': client.agent()}
			r = client.request(url, headers=headers)

			posts = client.parseDOM(r, 'table', attrs={'class': 'table2'})[0]
			posts = client.parseDOM(posts, 'tr')

			for post in posts:
				data = client.parseDOM(post, 'a', ret='href')[1]
				if '/search/' in data:
					continue

				# Remove non-ASCII characters...freakin limetorrents
				try:
					data = data.encode('ascii', 'ignore')
				except:
					pass

				# some broken links contain whitespace
				data = re.sub('\s', '', data).strip()

				link = urlparse.urljoin(self.base_link, data)

				name = client.parseDOM(post, 'a')[1]

				t = name.split(self.hdlr)[0].replace(self.year, '').replace('(', '').replace(')', '').replace('&', 'and')
				if cleantitle.get(t) != cleantitle.get(self.title):
					continue

				if self.hdlr not in name:
					continue

				try:
					size = re.findall('((?:\d+\,\d+\.\d+|\d+\.\d+|\d+\,\d+|\d+)\s*(?:GiB|MiB|GB|MB))', post)[0]
					div = 1 if size.endswith(('GB', 'GiB')) else 1024
					size = float(re.sub('[^0-9|/.|/,]', '', size.replace(',', '.'))) / div
					size = '%.2f GB' % size
				except:
					size = '0'
					pass

				self.items.append((name, link, size))

			return self.items

		except:
			source_utils.scraper_error('LIMETORRENTS')
			return self.items

Example 9 : torrentdownloads.py
Copyright GNU General Public License v3.0
Author : a4k-openproject
	def sources(self, url, hostDict, hostprDict):
		try:
			self._sources = []

			if url is None:
				return self._sources

			if debrid.status() is False:
				return self._sources

			data = urlparse.parse_qs(url)
			data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

			self.title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
			self.title = self.title.replace('&', 'and').replace('Special Victims Unit', 'SVU')

			self.hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']
			self.year = data['year']

			query = '%s %s' % (self.title, self.hdlr)
			query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', '', query)

			if 'tvshowtitle' in data:
				url = self.search.format('8', urllib.quote(query))
			else:
				url = self.search.format('4', urllib.quote(query))
			# log_utils.log('url = %s' % url, log_utils.LOGDEBUG)

			headers = {'User-Agent': client.agent()}

			_html = client.request(url, headers=headers)

			threads = []
			for i in re.findall(r'<item>(.+?)</item>', _html, re.DOTALL):
				threads.append(workers.Thread(self._get_items, i))
			[i.start() for i in threads]
			[i.join() for i in threads]
			return self._sources

		except:
			source_utils.scraper_error('TORRENTDOWNLOADS')
			return self._sources