__author__ = "Radical.Utils Development Team (Andre Merzky)"
__copyright__ = "Copyright 2013, RADICAL@Rutgers"
__license__ = "MIT"
# ------------------------------------------------------------------------------
#
'''
Parse (absolute and relative) URLs.
See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
'''
# Names exported by a star-import of this module; every other top-level name
# (scheme tables, cache helpers, result classes) must be imported explicitly.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]
# ------------------------------------------------------------------------------
#
# A classification of schemes ('' means apply by default)
#
# Schemes for which urljoin() resolves a relative reference against the base
# URL; for any other scheme urljoin() returns the second argument unchanged.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', 'sftp', '',
]
# Schemes whose URLs carry a network location (``//host``) component.
#
# NOTE: every entry must be followed by a comma.  Two adjacent string literals
# without a comma are silently concatenated by Python, which previously merged
# 'redis' and 'pbs+gsissh' into the single bogus entry 'redispbs+gsissh'.
uses_netloc = [
    # classic schemes
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https',
    'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp',
    # additional job-manager / service schemes
    'pbs', 'pbs+ssh', 'sge', 'sge+ssh', 'fork',
    'ssh', 'xt5torque', 'ec2', 'euca', 'redis',
    'pbs+gsissh', 'sge+gsissh', 'xt5torque+gsissh',
    'xt5torque+ssh',
]
# Schemes classified as non-hierarchical.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
    'mms', '', 'sftp',
]

# Schemes for which urlsplit() splits off a '?query' component.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', 'sips', 'srm',
    '',
]

# Schemes for which urlsplit() splits off a '#fragment' component.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Schemes for which '#' does not terminate the netloc in urlsplit().
uses_hash_in_hostname = ['go']

# Schemes for which the `hostname` property is not lower-cased.
preserve_case_in_hostname = ['go']

# Characters valid in scheme names
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'    # lower-case letters
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'    # upper-case letters
    '0123456789'                    # digits
    '+-.'                           # punctuation
)
# ------------------------------------------------------------------------------
#
# Number of cached urlsplit() results at which the whole cache is dropped.
MAX_CACHE_SIZE = 20

# Memoized urlsplit() results, keyed on the call arguments and their types.
_parse_cache = {}
# ------------------------------------------------------------------------------
#
def clear_cache():
    '''
    Discard all memoized parse results.

    The module-level cache name is rebound to a fresh, empty dict (the old
    dict object itself is left untouched).
    '''

    global _parse_cache  # pylint: disable=W0603

    _parse_cache = {}
# ------------------------------------------------------------------------------
#
class BaseResult(tuple):
    '''
    Base class for the parsed result objects.

    This provides the attributes shared by the two derived result
    objects as read-only properties.  The derived classes are
    responsible for checking the right number of arguments were
    supplied to the constructor.

    Tuple layout expected by the accessors: index 0 is the scheme, 1 the
    netloc, 2 the path, -2 the query and -1 the fragment.

    All string accessors fall back to ``''`` when the stored component is
    empty or `None`; `port` is the only accessor which returns `None`.

    The netloc is interpreted as ``[user[:password]@]host[:port]``, where
    `host` may be a bracketed IPv6 literal (``[::1]``).  The user-info part
    ends at the *last* ``@``, so that a password containing ``@`` does not
    leak into the host name.
    '''

    __slots__ = ()

    # --------------------------------------------------------------------------
    # Attributes that access the basic components of the URL:
    #
    @property
    def scheme(self):
        return self[0] or ''

    @property
    def netloc(self):
        # same `or ''` fallback as the other accessors, so that the derived
        # properties below never have to deal with `None`
        return self[1] or ''

    @property
    def path(self):
        return self[2] or ''

    @property
    def query(self):
        return self[-2] or ''

    @property
    def fragment(self):
        return self[-1] or ''

    # --------------------------------------------------------------------------
    # Private helpers which take the netloc apart:
    #
    def _userinfo(self):
        '''
        Return the ``user[:password]`` part of the netloc, or ``''`` if the
        netloc contains no ``@``.
        '''
        netloc = self.netloc
        if '@' not in netloc:
            return ''

        # split on the last '@': anything before it is user information
        return netloc.rsplit('@', 1)[0]

    def _hostport(self):
        '''
        Return a ``(host, port)`` tuple of strings for the netloc.  `port` is
        ``''`` if no port is given; brackets around an IPv6 literal are not
        part of the returned host.
        '''
        hostport = self.netloc.rsplit('@', 1)[-1]

        if hostport[:1] == '[':
            close = hostport.find(']')
            if close > 0:
                # bracketed IPv6 literal: the colons inside of the brackets
                # belong to the address, not to the port separator
                tail = hostport[close + 1:]
                port = tail[1:] if tail[:1] == ':' else ''
                return hostport[1:close], port

        host, _, port = hostport.partition(':')
        return host, port

    # --------------------------------------------------------------------------
    # Additional attributes that provide access to parsed-out portions
    # of the netloc:
    #
    @property
    def username(self):
        '''User name from the netloc, ``''`` if there is none.'''
        return self._userinfo().partition(':')[0]

    @property
    def password(self):
        '''Password from the netloc, ``''`` if there is none.'''
        return self._userinfo().partition(':')[2]

    @property
    def hostname(self):
        '''
        Host name from the netloc, ``''`` if there is none.  The name is
        lower-cased unless the scheme is listed in
        `preserve_case_in_hostname`.
        '''
        host = self._hostport()[0]

        if self.scheme in preserve_case_in_hostname:
            return host

        return host.lower()

    @property
    def port(self):
        '''
        Port number from the netloc as `int`, or `None` if no port is given
        (this includes an empty port, as in ``host:``).

        Raises `ValueError` if the port is not a decimal number.
        '''
        port = self._hostport()[1]

        if not port:
            return None

        return int(port, 10)
# ------------------------------------------------------------------------------
#
class SplitResult(BaseResult):
    '''
    Result type of `urlsplit()`: the 5-tuple
    ``(scheme, netloc, path, query, fragment)``.
    '''

    __slots__ = []

    # --------------------------------------------------------------------------
    #
    def __new__(cls, scheme, netloc, path, query, fragment):

        # named arguments enforce that exactly five components are given
        components = (scheme, netloc, path, query, fragment)
        return super(SplitResult, cls).__new__(cls, components)

    # --------------------------------------------------------------------------
    #
    def geturl(self):
        '''
        Reassemble the components into a URL string via `urlunsplit()`.
        '''
        return urlunsplit(self)
# ------------------------------------------------------------------------------
#
class ParseResult(BaseResult):
    '''
    Result type of `urlparse()`: the 6-tuple
    ``(scheme, netloc, path, params, query, fragment)``.
    '''

    __slots__ = []

    # --------------------------------------------------------------------------
    #
    def __new__(cls, scheme, netloc, path, params, query, fragment):

        # named arguments enforce that exactly six components are given
        components = (scheme, netloc, path, params, query, fragment)
        return super(ParseResult, cls).__new__(cls, components)

    # --------------------------------------------------------------------------
    #
    @property
    def params(self):
        # returned as stored, without the `or ''` fallback
        return self[3]

    # --------------------------------------------------------------------------
    #
    def geturl(self):
        '''
        Reassemble the components into a URL string via `urlunparse()`.
        '''
        return urlunparse(self)
# ------------------------------------------------------------------------------
#
# public API (listed in __all__)
def urlparse(url, scheme='', allow_fragments=True):
    '''
    Split a URL into its six components:

        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    and return them as `ParseResult`, i.e. as the 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The components are not broken up any further (the netloc remains a single
    string), and % escapes are not expanded.  `scheme` and `allow_fragments`
    are handed to `urlsplit()` unchanged.
    '''

    scheme, netloc, path, query, fragment = urlsplit(url, scheme,
                                                     allow_fragments)

    # params are only split off for schemes which are known to use them
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)

    return ParseResult(scheme, netloc, path, params, query, fragment)
# ------------------------------------------------------------------------------
#
def _splitparams(url):
if '/' in url:
i = url.find(';', url.rfind('/'))
if i < 0:
return url, ''
else:
i = url.find(';')
return url[:i], url[i + 1:]
# ------------------------------------------------------------------------------
#
def _splitnetloc(url, start=0, allow_hash_in_hostname=False):
delim = len(url) # position of end of domain part of url, default is end
if allow_hash_in_hostname:
separators = '/?'
else:
separators = '/?#'
for c in separators: # look for delimiters; the order is NOT important
wdelim = url.find(c, start) # find first of this delim
if wdelim >= 0: # if found
delim = min(delim, wdelim) # use earliest delim position
return url[start:delim], url[delim:] # return (domain, rest)
# ------------------------------------------------------------------------------
#
# public API (listed in __all__)
def urlsplit(url, scheme='', allow_fragments=True):
    '''
    Split a URL into its five components:

        <scheme>://<netloc>/<path>?<query>#<fragment>

    and return them as `SplitResult`, i.e. as the 5-tuple
    (scheme, netloc, path, query, fragment).

    The components are not broken up any further (the netloc remains a single
    string), and % escapes are not expanded.

    `scheme` is the scheme to assume if the URL does not specify one.  If
    `allow_fragments` is false, a ``#fragment`` is not split off.

    Results are memoized in the module-level parse cache, which is dropped
    completely once it holds `MAX_CACHE_SIZE` entries.
    '''

    allow_fragments = bool(allow_fragments)

    # the types are part of the key, next to the argument values
    cache_key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(cache_key, None)
    if hit:
        return hit

    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')

    if colon > 0:

        prefix = url[:colon]

        if prefix == 'http':
            # fast path for the common case: no table lookups needed
            scheme = prefix.lower()
            url = url[colon + 1:]

            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)

            if allow_fragments and '#' in url:
                url, _, fragment = url.partition('#')

            if '?' in url:
                url, _, query = url.partition('?')

            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[cache_key] = result
            return result

        # the prefix is only a scheme if all its characters are valid scheme
        # characters -- otherwise the `scheme` argument stays in effect
        if all(char in scheme_chars for char in prefix):
            scheme = prefix.lower()
            url = url[colon + 1:]

    if url[:2] == '//':
        hash_in_host = scheme in uses_hash_in_hostname
        netloc, url = _splitnetloc(url, 2, hash_in_host)

    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, _, fragment = url.partition('#')

    if scheme in uses_query and '?' in url:
        url, _, query = url.partition('?')

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[cache_key] = result
    return result
# ------------------------------------------------------------------------------
#
# public API (listed in __all__)
def urlunparse(data):
    '''
    Combine a (scheme, netloc, path, params, query, fragment) 6-tuple back
    into a URL string.

    The result may differ slightly from the originally parsed URL while
    still being equivalent to it, if that URL had redundant delimiters, e.g.
    a ? with an empty query (the draft states that these are equivalent).
    '''

    scheme, netloc, path, params, query, fragment = data

    # non-empty params are re-attached to the path; the rest is up to
    # urlunsplit()
    full_path = "%s;%s" % (path, params) if params else path

    return urlunsplit((scheme, netloc, full_path, query, fragment))
# ------------------------------------------------------------------------------
#
# public API (listed in __all__)
def urlunsplit(data):
    '''
    Combine a (scheme, netloc, path, query, fragment) 5-tuple back into a
    URL string.

    A ``//<netloc>`` part is written if a netloc is given, and also (with an
    empty netloc) if a scheme is given and the path does not itself start
    with ``//``.  In both cases, a non-empty path is made absolute.  Empty
    scheme, query and fragment are left out, along with their delimiters.
    '''

    scheme, netloc, result, query, fragment = data

    if netloc or (scheme and result[:2] != '//'):

        authority = '//' + (netloc or '')

        # a relative path needs a '/' to separate it from the netloc
        if result and result[:1] != '/':
            authority = authority + '/'

        result = authority + result

    if scheme:
        result = scheme + ':' + result

    if query:
        result = result + '?' + query

    if fragment:
        result = result + '#' + fragment

    return result
# ------------------------------------------------------------------------------
#
# public API (listed in __all__)
def urljoin(base, url, allow_fragments=True):
    '''
    Resolve the possibly relative `url` against the URL `base`, and return
    the resulting URL string.

    If either argument is empty, the other one is returned.  `url` is
    returned unchanged if its scheme differs from the scheme of `base`, or
    if that scheme is not listed in `uses_relative`.  `allow_fragments` is
    passed on to `urlparse()` for both URLs.
    '''

    if not base:
        return url

    if not url:
        return base

    bscheme, bnetloc, bpath, bparams, bquery, _ = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url

    # a netloc of its own makes `url` independent of the base
    if netloc:
        return urlunparse((scheme, netloc, path, params, query, fragment))

    # an absolute path only inherits the netloc of the base
    if path[:1] == '/':
        return urlunparse((scheme, bnetloc, path, params, query, fragment))

    # without path, params and query, only the fragment is replaced
    if not (path or params or query):
        return urlunparse((scheme, bnetloc, bpath, bparams, bquery, fragment))

    # relative path: replace the last segment of the base path
    segments = bpath.split('/')[:-1] + path.split('/')

    # FIXME: The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''

    segments = [seg for seg in segments if seg != '.']

    # fold '<name>/..' pairs, one pair per pass, rescanning from the start
    # after each removal; the first and the last segment are never folded
    folded = True
    while folded:
        folded = False
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and \
               segments[idx - 1] not in ('', '..'):
                del segments[idx - 1:idx + 1]
                folded = True
                break

    # a trailing '..' is handled separately
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']

    return urlunparse((scheme, bnetloc, '/'.join(segments),
                       params, query, fragment))
# ------------------------------------------------------------------------------
#
# public API (listed in __all__)
def urldefrag(url):
    '''
    Strip the fragment off a URL.

    Returns the tuple (URL without fragment, fragment).  A URL which contains
    no ``#`` is returned as is, together with an empty string as fragment.
    '''

    if '#' not in url:
        return url, ''

    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))

    return stripped, fragment
# ------------------------------------------------------------------------------