# Source code for radical.utils.contrib.urlparse25


__author__    = "Radical.Utils Development Team (Andre Merzky)"
__copyright__ = "Copyright 2013, RADICAL@Rutgers"
__license__   = "MIT"


# ------------------------------------------------------------------------------
#
'''
Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
'''

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]

# ------------------------------------------------------------------------------
#
# A classification of schemes ('' means apply by default)
#
# Schemes for which urljoin() resolves relative references against a base.
uses_relative    = ['ftp',      'http',      'gopher',   'nntp',    'imap',
                    'wais',     'file',      'https',    'shttp',   'mms',
                    'prospero', 'rtsp',      'rtspu',    'sftp',    '']

# Schemes which carry a network location (``//netloc``) component.
# NOTE: every entry needs its trailing comma -- adjacent string literals are
#       silently concatenated into one (wrong) entry otherwise.
uses_netloc      = ['ftp',      'http',      'gopher',   'nntp',    'telnet',
                    'imap',     'wais',      'file',     'mms',     'https',
                    'shttp',    'snews',     'prospero', 'rtsp',    'rtspu',
                    'rsync',    '',          'svn',      'svn+ssh', 'sftp',
                    'pbs',      'pbs+ssh',   'sge',      'sge+ssh', 'fork',
                    'ssh',      'xt5torque', 'ec2',      'euca',    'redis',
                    'pbs+gsissh',  'sge+gsissh', 'xt5torque+gsissh',
                    'xt5torque+ssh']

non_hierarchical = ['gopher', 'hdl',      'mailto',   'news',
                    'telnet', 'wais',     'imap',     'snews',  'sip',   'sips']

# Schemes for which urlparse() splits ``;params`` off the path.
uses_params      = ['ftp',    'hdl',      'prospero', 'http',   'imap',
                    'https',  'shttp',    'rtsp',     'rtspu',  'sip',   'sips',
                    'mms',    '',         'sftp']

# Schemes for which urlsplit() splits ``?query`` off the path.
uses_query       = ['http',   'wais',     'imap',     'https',  'shttp', 'mms',
                    'gopher', 'rtsp',     'rtspu',    'sip',    'sips',  'srm',
                    '']

# Schemes for which urlsplit() splits ``#fragment`` off the path.
uses_fragment    = ['ftp',    'hdl',      'http',     'gopher', 'news',
                    'nntp',   'wais',     'https',    'shttp',  'snews',
                    'file',   'prospero', '']

# Schemes for which '#' does not terminate the netloc (see urlsplit()), and
# schemes for which the `hostname` property is not lower-cased.
uses_hash_in_hostname     = ['go']
preserve_case_in_hostname = ['go']

# Characters valid in scheme names
scheme_chars              = ('abcdefghijklmnopqrstuvwxyz'
                             'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                             '0123456789'
                             '+-.')


# ------------------------------------------------------------------------------
#
MAX_CACHE_SIZE = 20   # urlsplit() flushes the cache once it holds this many
_parse_cache   = {}   # maps urlsplit() argument keys to their split results


# ------------------------------------------------------------------------------
#
def clear_cache():
    '''
    Forget all memoized parse results.

    The module-level cache is rebound to a fresh, empty dictionary (the old
    dictionary object is left untouched).
    '''

    global _parse_cache                                  # pylint: disable=W0603
    _parse_cache = {}


# ------------------------------------------------------------------------------
#
class BaseResult(tuple):
    '''
    Base class for the parsed result objects.

    This provides the attributes shared by the two derived result
    objects as read-only properties.  The derived classes are
    responsible for checking the right number of arguments were
    supplied to the constructor.

    The netloc derived properties (`username`, `password`, `hostname` and
    `port`) read the netloc as ``[user[:password]@]host[:port]``, where
    `host` may be a bracketed IPv6 literal such as ``[::1]``.
    '''

    __slots__ = list()

    # --------------------------------------------------------------------------
    # Attributes that access the basic components of the URL.  Query and
    # fragment are indexed from the end so that the same code serves both the
    # 5-element and the 6-element result tuples.
    #
    @property
    def scheme(self):
        return self[0] or ''

    @property
    def netloc(self):
        return self[1]

    @property
    def path(self):
        return self[2] or ''

    @property
    def query(self):
        return self[-2] or ''

    @property
    def fragment(self):
        return self[-1] or ''

    # --------------------------------------------------------------------------
    #
    def _userinfo(self):
        '''
        Return the ``user[:password]`` part of the netloc ('' if there is none).
        '''
        netloc = self.netloc or ''
        if '@' not in netloc:
            return ''
        # split on the *last* '@': the userinfo itself may contain '@'
        # characters (e.g. in a password), the host part may not.
        return netloc.rsplit('@', 1)[0]

    # --------------------------------------------------------------------------
    #
    def _hostport(self):
        '''
        Return the tuple ``(host, port)`` of the netloc, both as strings, with
        any userinfo removed.  Missing parts are returned as ''.  Brackets
        around an IPv6 literal are stripped from the host.
        '''
        netloc   = self.netloc or ''
        hostport = netloc.rsplit('@', 1)[-1]

        if hostport.startswith('[') and ']' in hostport:
            # bracketed IPv6 literal: the colons inside the brackets belong to
            # the address, only a colon right after ']' introduces the port
            host, _, rest = hostport[1:].partition(']')
            port = rest[1:] if rest.startswith(':') else ''
        else:
            host, _, port = hostport.partition(':')

        return host, port

    # --------------------------------------------------------------------------
    # Additional attributes that provide access to parsed-out portions
    # of the netloc:
    #
    @property
    def username(self):
        '''
        User name from the netloc, '' if the netloc has no userinfo part.
        '''
        return self._userinfo().partition(':')[0]

    @property
    def password(self):
        '''
        Password from the netloc, '' if the netloc specifies none.
        '''
        return self._userinfo().partition(':')[2]

    @property
    def hostname(self):
        '''
        Host name from the netloc ('' if empty), without userinfo, port and
        IPv6 brackets.  The name is lower-cased unless the scheme is listed in
        `preserve_case_in_hostname`.
        '''
        host = self._hostport()[0]
        if self.scheme in preserve_case_in_hostname:
            return host
        return host.lower()

    @property
    def port(self):
        '''
        Port number from the netloc as `int`, or `None` if the netloc has no
        (or an empty) port part.

        Raises `ValueError` if the port is not a decimal number.
        '''
        port = self._hostport()[1]
        if not port:
            return None
        return int(port, 10)


# ------------------------------------------------------------------------------
#
class SplitResult(BaseResult):
    '''
    Result tuple with the five elements
    ``(scheme, netloc, path, query, fragment)``.
    '''

    __slots__ = list()

    # --------------------------------------------------------------------------
    #
    def __new__(cls, scheme, netloc, path, query, fragment):
        '''
        Create the result from exactly five URL components.
        '''
        fields = (scheme, netloc, path, query, fragment)
        return super(SplitResult, cls).__new__(cls, fields)

    # --------------------------------------------------------------------------
    #
    def geturl(self):
        '''
        Reassemble the components into a URL string via `urlunsplit()`.
        '''
        return urlunsplit(self)


# ------------------------------------------------------------------------------
#
class ParseResult(BaseResult):
    '''
    Result tuple with the six elements
    ``(scheme, netloc, path, params, query, fragment)``.
    '''

    __slots__ = list()

    # --------------------------------------------------------------------------
    #
    def __new__(cls, scheme, netloc, path, params, query, fragment):
        '''
        Create the result from exactly six URL components.
        '''
        fields = (scheme, netloc, path, params, query, fragment)
        return super(ParseResult, cls).__new__(cls, fields)

    # --------------------------------------------------------------------------
    #
    @property
    def params(self):
        '''
        The ``;params`` element of the tuple, returned as stored.
        '''
        return self[3]

    # --------------------------------------------------------------------------
    #
    def geturl(self):
        '''
        Reassemble the components into a URL string via `urlunparse()`.
        '''
        return urlunparse(self)


# ------------------------------------------------------------------------------
#
def urlparse(url, scheme='', allow_fragments=True):
    '''
    Parse a URL into 6 components:

        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).

    Note that we don't break the components up in smaller bits (e.g. netloc
    is a single string) and we don't expand % escapes.
    '''

    scheme, netloc, path, query, fragment = \
        urlsplit(url, scheme, allow_fragments)

    # only some schemes carry ';params' on the path
    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)

    return ParseResult(scheme, netloc, path, params, query, fragment)


# ------------------------------------------------------------------------------
#
def _splitparams(url):
    '''
    Split `url` at a ';' into ``(path, params)``.

    If `url` contains a '/', only a ';' at or after the last '/' is
    considered, and ``(url, '')`` is returned if there is none.  Without any
    '/', the split happens at the first ';' -- the caller is expected to have
    checked that `url` contains one.
    '''

    last_slash = url.rfind('/')

    if last_slash >= 0:
        cut = url.find(';', last_slash)
        if cut < 0:
            return url, ''
    else:
        cut = url.find(';')

    return url[:cut], url[cut + 1:]


# ------------------------------------------------------------------------------
#
def _splitnetloc(url, start=0, allow_hash_in_hostname=False):
    '''
    Split `url` into ``(netloc, rest)``.

    The netloc begins at index `start` and ends before the first '/', '?' or
    '#' found from there on (or at the end of `url` if none is found).  With
    `allow_hash_in_hostname` set, '#' does not terminate the netloc.
    '''

    separators = '/?' if allow_hash_in_hostname else '/?#'

    # the earliest separator wins, irrespective of the order we look for them;
    # the end of the string is the fallback
    candidates = [len(url)]
    for sep in separators:
        pos = url.find(sep, start)
        if pos >= 0:
            candidates.append(pos)
    end = min(candidates)

    return url[start:end], url[end:]


# ------------------------------------------------------------------------------
#
def urlsplit(url, scheme='', allow_fragments=True):
    '''
    Parse a URL into 5 components:

        <scheme>://<netloc>/<path>?<query>#<fragment>

    Return a 5-tuple: (scheme, netloc, path, query, fragment).

    Note that we don't break the components up in smaller bits (e.g. netloc
    is a single string) and we don't expand % escapes.

    Results are memoized in the module-level parse cache, keyed by the
    arguments and their types.
    '''

    allow_fragments = bool(allow_fragments)

    key    = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached

    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # avoid runaway growth
        clear_cache()

    netloc = query = fragment = ''
    colon  = url.find(':')

    if colon > 0:

        prefix = url[:colon]

        if prefix == 'http':
            # optimize the common case: no scheme table lookups needed
            scheme = prefix.lower()
            url    = url[colon + 1:]

            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)

            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = result
            return result

        # the prefix is only a scheme if it consists of valid scheme chars
        if all(char in scheme_chars for char in prefix):
            scheme = prefix.lower()
            url    = url[colon + 1:]

    if url[:2] == '//':
        # for some schemes, a '#' is part of the netloc
        hash_in_host = scheme in uses_hash_in_hostname
        netloc, url  = _splitnetloc(url, 2, hash_in_host)

    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)

    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result


# ------------------------------------------------------------------------------ #
def urlunparse(data):
    '''
    Put a parsed URL back together again.

    `data` is a 6-element sequence (scheme, netloc, path, params, query,
    fragment).  This may result in a slightly different, but equivalent URL,
    if the URL that was parsed originally had redundant delimiters, e.g. a ?
    with an empty query (the draft states that these are equivalent).
    '''

    scheme, netloc, path, params, query, fragment = data

    if params:
        path = "%s;%s" % (path, params)

    return urlunsplit((scheme, netloc, path, query, fragment))


# ------------------------------------------------------------------------------ #
def urlunsplit(data):
    '''
    Combine the 5-element sequence (scheme, netloc, path, query, fragment)
    into a URL string.

    A '//<netloc>' part is written if a netloc is given, and also for any URL
    with a scheme whose path does not already start with '//'.  Empty query
    and fragment components are omitted.
    '''

    scheme, netloc, path, query, fragment = data

    with_authority = netloc or (scheme and path[:2] != '//')

    if with_authority:
        # a non-empty path must be separated from the netloc by a '/'
        if path and not path.startswith('/'):
            path = '/' + path
        path = '//' + (netloc or '') + path

    result = path
    if scheme:
        result = scheme + ':' + result
    if query:
        result += '?' + query
    if fragment:
        result += '#' + fragment

    return result


# ------------------------------------------------------------------------------ #
def urljoin(base, url, allow_fragments=True):
    '''
    Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty `base` yields `url`, an empty `url` yields `base`.  `url` is
    also returned unchanged if its scheme differs from the base scheme, or
    if the scheme is not listed in `uses_relative`.
    '''

    if not base:
        return url
    if not url:
        return base

    # the fragment of the base URL is never inherited
    bscheme, bnetloc, bpath, bparams, bquery, _ = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url

    # NOTE: the netloc handling below is applied to all schemes (a check for
    #       `scheme in uses_netloc` is disabled in this implementation).
    if netloc:
        return urlunparse((scheme, netloc, path, params, query, fragment))
    netloc = bnetloc

    # absolute path: only scheme and netloc come from the base
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path, params, query, fragment))

    # nothing but (possibly) a fragment given: keep the base's location
    if not (path or params or query):
        return urlunparse((scheme, netloc, bpath, bparams, bquery, fragment))

    # relative path: replace the last segment of the base path
    segments = bpath.split('/')[:-1] + path.split('/')

    # FIXME: The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''

    while '.' in segments:
        segments.remove('.')

    # collapse '<name>/..' pairs, one pair per scan, until a scan finds none;
    # the first and the last segment are never tested for '..' here
    while True:
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and \
               segments[idx - 1] not in ('', '..'):
                del segments[idx - 1 : idx + 1]
                break
        else:
            break

    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']

    joined = '/'.join(segments)
    return urlunparse((scheme, netloc, joined, params, query, fragment))


# ------------------------------------------------------------------------------ #
def urldefrag(url):
    '''
    Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If the URL
    contained no fragments, the second element is the empty string.
    '''

    # nothing to strip: hand the URL back untouched
    if '#' not in url:
        return url, ''

    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))

    return stripped, fragment


# ------------------------------------------------------------------------------