Source code for radical.utils.contrib.urlparse25


__author__    = "Radical.Utils Development Team (Andre Merzky)"
__copyright__ = "Copyright 2013, RADICAL@Rutgers"
__license__   = "MIT"


# ------------------------------------------------------------------------------
#
'''
Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
'''

# Public API: the names exported by a star-import of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]

# ------------------------------------------------------------------------------
#
# A classification of schemes ('' means apply by default)
#
# Schemes for which `urljoin()` resolves a relative reference against the
# base URL.  For any scheme not listed here `urljoin()` returns its `url`
# argument unchanged.
uses_relative    = ['ftp',      'http',      'gopher',   'nntp',    'imap',
                    'wais',     'file',      'https',    'shttp',   'mms',
                    'prospero', 'rtsp',      'rtspu',    'sftp',    '']

# Schemes classified as carrying a network location ('//host...') part.
# NOTE(review): no code in this part of the file consults this list -- confirm
#               whether other modules depend on it.
#
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated into one entry (this previously merged 'redis' and
# 'pbs+gsissh' into the single bogus scheme 'redispbs+gsissh').
uses_netloc      = ['ftp',      'http',      'gopher',   'nntp',    'telnet',
                    'imap',     'wais',      'file',     'mms',     'https',
                    'shttp',    'snews',     'prospero', 'rtsp',    'rtspu',
                    'rsync',    '',          'svn',      'svn+ssh', 'sftp',
                    'pbs',      'pbs+ssh',   'sge',      'sge+ssh', 'fork',
                    'ssh',      'xt5torque', 'ec2',      'euca',    'redis',
                    'pbs+gsissh',  'sge+gsissh', 'xt5torque+gsissh',
                    'xt5torque+ssh']

# Schemes classified as non-hierarchical.
# NOTE(review): not consulted by any code in this part of the file.
non_hierarchical = ['gopher', 'hdl',      'mailto',   'news',
                    'telnet', 'wais',     'imap',     'snews',  'sip',   'sips']

# Schemes for which `urlparse()` splits ';params' off the path.
uses_params      = ['ftp',    'hdl',      'prospero', 'http',   'imap',
                    'https',  'shttp',    'rtsp',     'rtspu',  'sip',   'sips',
                    'mms',    '',         'sftp']

# Schemes for which `urlsplit()` splits off a '?query' component.
uses_query       = ['http',   'wais',     'imap',     'https',  'shttp', 'mms',
                    'gopher', 'rtsp',     'rtspu',    'sip',    'sips',  'srm',
                    '']

# Schemes for which `urlsplit()` splits off a '#fragment' component
# (only when fragments are allowed by the caller).
uses_fragment    = ['ftp',    'hdl',      'http',     'gopher', 'news',
                    'nntp',   'wais',     'https',    'shttp',  'snews',
                    'file',   'prospero', '']

# Schemes for which `urlsplit()` does not treat '#' as the end of the netloc.
uses_hash_in_hostname     = ['go']
# Schemes for which the `hostname` property is returned without lower-casing.
preserve_case_in_hostname = ['go']

# Characters valid in scheme names
scheme_chars              = ('abcdefghijklmnopqrstuvwxyz'
                             'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                             '0123456789'
                             '+-.')


# ------------------------------------------------------------------------------
#
# Once the cache holds this many entries, `urlsplit()` empties it completely
# before storing the next result.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments, type(url), type(scheme)) to the
# SplitResult which `urlsplit()` computed for those arguments.
_parse_cache   = dict()


# ------------------------------------------------------------------------------
#
def clear_cache():
    '''
    Forget all cached parse results.

    The module-level cache is rebound to a new, empty dict (the previous dict
    object is not modified in place).
    '''

    global _parse_cache                                  # pylint: disable=W0603

    _parse_cache = {}


# ------------------------------------------------------------------------------
#
class BaseResult(tuple):
    '''
    Base class for the parsed result objects.

    This provides the attributes shared by the two derived result
    objects as read-only properties.  The derived classes are
    responsible for checking the right number of arguments were
    supplied to the constructor.

    The tuple layout is (scheme, netloc, path, ..., query, fragment): the
    first three elements are addressed from the front, query and fragment
    from the back, so the same properties work for 5- and 6-tuples.
    '''

    __slots__ = ()

    # Attributes that access the basic components of the URL:
    @property
    def scheme(self):
        '''URL scheme, or '' if unset.'''
        return self[0] or ''

    @property
    def netloc(self):
        '''Network location exactly as stored in the tuple (not normalized).'''
        return self[1]

    @property
    def path(self):
        '''URL path, or '' if unset.'''
        return self[2] or ''

    @property
    def query(self):
        '''Query string (second to last element), or '' if unset.'''
        return self[-2] or ''

    @property
    def fragment(self):
        '''Fragment (last element), or '' if unset.'''
        return self[-1] or ''

    # --------------------------------------------------------------------------
    #
    def _netloc_parts(self):
        '''
        Split the netloc at the first '@' into `(userinfo, hostport)`.

        `userinfo` is '' when the netloc has no '@'.  A netloc of `None` is
        treated like an empty netloc, so that the derived properties below
        do not raise on results which were built with unset components.
        '''

        netloc = self.netloc or ''

        if "@" in netloc:
            userinfo, hostport = netloc.split("@", 1)
            return userinfo, hostport

        return '', netloc

    # Additional attributes that provide access to parsed-out portions
    # of the netloc:
    @property
    def username(self):
        '''User name from the netloc's userinfo part, or '' if absent.'''
        userinfo, _ = self._netloc_parts()
        return userinfo.split(":", 1)[0]

    @property
    def password(self):
        '''Password from the netloc's userinfo part, or '' if absent.'''
        userinfo, _ = self._netloc_parts()
        if ":" in userinfo:
            return userinfo.split(":", 1)[1]
        return ''

    @property
    def hostname(self):
        '''
        Host part of the netloc (userinfo and port stripped).

        The host is lower-cased unless the scheme is listed in
        `preserve_case_in_hostname`.
        '''
        _, host = self._netloc_parts()
        if ":" in host:
            host = host.split(":", 1)[0]
        if self.scheme in preserve_case_in_hostname:
            return host
        return host.lower()

    @property
    def port(self):
        '''
        Port number as an `int`, or `None` if the netloc has no port.

        An empty port (e.g. 'host:') is reported as `None` instead of
        failing in `int('')`.  A non-numeric port still raises `ValueError`.
        '''
        _, host = self._netloc_parts()
        if ":" in host:
            port = host.split(":", 1)[1]
            if port:
                return int(port, 10)
        return None


# ------------------------------------------------------------------------------
#
class SplitResult(BaseResult):
    '''
    Result type of `urlsplit()`: the 5-tuple
    (scheme, netloc, path, query, fragment).
    '''

    __slots__ = []

    # --------------------------------------------------------------------------
    #
    def __new__(cls, scheme, netloc, path, query, fragment):
        # taking the components as individual arguments enforces their number
        components = (scheme, netloc, path, query, fragment)
        return BaseResult.__new__(cls, components)

    # --------------------------------------------------------------------------
    #
    def geturl(self):
        '''
        Reassemble the components into a URL string via `urlunsplit()`.
        '''
        return urlunsplit(self)


# ------------------------------------------------------------------------------
#
class ParseResult(BaseResult):
    '''
    Result type of `urlparse()`: the 6-tuple
    (scheme, netloc, path, params, query, fragment).
    '''

    __slots__ = []

    # --------------------------------------------------------------------------
    #
    def __new__(cls, scheme, netloc, path, params, query, fragment):
        # taking the components as individual arguments enforces their number
        components = (scheme, netloc, path, params, query, fragment)
        return BaseResult.__new__(cls, components)

    # --------------------------------------------------------------------------
    #
    @property
    def params(self):
        '''
        The params component (element 3 of the tuple), as stored.
        '''
        return self[3]

    # --------------------------------------------------------------------------
    #
    def geturl(self):
        '''
        Reassemble the components into a URL string via `urlunparse()`.
        '''
        return urlunparse(self)


# ------------------------------------------------------------------------------
#

[docs]
def urlparse(url, scheme='', allow_fragments=True):
    '''
    Split a URL into its six components:

        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    The result is a `ParseResult`, i.e. the 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The components are not decomposed any further (the netloc stays one
    string), and % escapes are left untouched.  Params are only separated
    from the path for schemes listed in `uses_params`.
    '''

    scheme, netloc, path, query, fragment = \
            urlsplit(url, scheme, allow_fragments)

    params = ''
    if scheme in uses_params and ';' in path:
        path, params = _splitparams(path)

    return ParseResult(scheme, netloc, path, params, query, fragment)



# ------------------------------------------------------------------------------
#
def _splitparams(url):

    if '/'  in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')

    return url[:i], url[i + 1:]


# ------------------------------------------------------------------------------
#
def _splitnetloc(url, start=0, allow_hash_in_hostname=False):

    delim = len(url)   # position of end of domain part of url, default is end

    if allow_hash_in_hostname:
        separators = '/?'
    else:
        separators = '/?#'

    for c in separators:    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position

    return url[start:delim], url[delim:]   # return (domain, rest)


# ------------------------------------------------------------------------------
#

[docs]
def urlsplit(url, scheme='', allow_fragments=True):
    '''
    Split a URL into its five components:

        <scheme>://<netloc>/<path>?<query>#<fragment>

    The result is a `SplitResult`, i.e. the 5-tuple
    (scheme, netloc, path, query, fragment).

    The components are not decomposed any further (the netloc stays one
    string), and % escapes are left untouched.  `scheme` is the default
    used when the URL does not carry a scheme of its own.

    Results are cached by (url, scheme, allow_fragments) and argument types.
    '''

    allow_fragments = bool(allow_fragments)

    key = (url, scheme, allow_fragments, type(url), type(scheme))
    hit = _parse_cache.get(key)
    if hit:
        return hit

    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    netloc   = ''
    query    = ''
    fragment = ''

    # A literal 'http' prefix is the common case: it is accepted without
    # validating its characters, and it always supports query and fragment
    # (and never a '#' in the host), independent of the scheme tables.
    is_http = False
    colon   = url.find(':')

    if colon > 0:
        head    = url[:colon]
        is_http = (head == 'http')

        # the text in front of the ':' is a scheme only if all of its
        # characters are valid scheme characters
        if is_http or all(ch in scheme_chars for ch in head):
            scheme, url = head.lower(), url[colon + 1:]

    if url[:2] == '//':
        keep_hash   = (not is_http) and scheme in uses_hash_in_hostname
        netloc, url = _splitnetloc(url, 2, keep_hash)

    if allow_fragments and (is_http or scheme in uses_fragment) and '#' in url:
        url, fragment = url.split('#', 1)

    if (is_http or scheme in uses_query) and '?' in url:
        url, query = url.split('?', 1)

    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result

    return result



# ------------------------------------------------------------------------------
#

[docs]
def urlunparse(data):
    '''
    Reassemble a URL from the 6 components
    (scheme, netloc, path, params, query, fragment).

    The result can differ slightly from the URL which was parsed originally,
    while being equivalent to it: redundant delimiters, such as a '?' in
    front of an empty query, are not reproduced (the draft states that these
    are equivalent).
    '''

    scheme, netloc, path, params, query, fragment = data

    # params travel as ';params' suffix of the path
    if params:
        path = "%s;%s" % (path, params)

    parts = (scheme, netloc, path, query, fragment)

    return urlunsplit(parts)



# ------------------------------------------------------------------------------
#

[docs]
def urlunsplit(data):
    '''
    Reassemble a URL from the 5 components
    (scheme, netloc, path, query, fragment).

    Empty or `None` components are omitted from the result; a `None` path
    is handled like an empty path.

    Note that a '//' netloc marker is emitted whenever a netloc or a scheme
    is given, for every scheme (unless the path itself already starts with
    '//'), and that a relative path is made absolute in that case.
    '''

    (scheme, netloc, url, query, fragment) = data

    # netloc, query and fragment may be None (they are only tested for
    # truth) -- accept a None path as well instead of raising a TypeError
    if url is None:
        url = ''

    if netloc or (scheme and url[:2] != '//'):
        # a path which follows a netloc must be absolute
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url

    if scheme:   url = scheme + ':' + url
    if query:    url = url    + '?' + query
    if fragment: url = url    + '#' + fragment

    return url



# ------------------------------------------------------------------------------
#

[docs]
def urljoin(base, url, allow_fragments=True):
    '''
    Interpret `url` relative to `base` and return the combined URL.

    `url` is returned unchanged if `base` is empty, if the schemes of both
    URLs differ, or if the scheme is not listed in `uses_relative`.  `base`
    is returned unchanged if `url` is empty.
    '''

    # trivial cases: nothing to resolve against, or nothing to resolve
    if not base:
        return url

    if not url:
        return base

    bscheme, bnetloc, bpath, bparams, bquery = \
            urlparse(base, '', allow_fragments)[:5]
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)

    if scheme != bscheme or scheme not in uses_relative:
        return url

    # a URL with its own netloc is complete (checked for all schemes)
    if netloc:
        return urlunparse((scheme, netloc, path, params, query, fragment))

    netloc = bnetloc

    # an absolute path replaces the base path entirely
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path, params, query, fragment))

    # only a fragment (or nothing) given: keep all of the base but that
    if not (path or params or query):
        return urlunparse((scheme, netloc, bpath, bparams, bquery, fragment))

    # merge: base path without its last segment, followed by relative path
    segments = bpath.split('/')[:-1] + path.split('/')

    # FIXME: the dot-segment handling below is known to be bogus in
    #        various ways...
    if segments[-1] == '.':
        segments[-1] = ''

    segments = [seg for seg in segments if seg != '.']

    # Repeatedly drop the first '<name>/..' pair until none is left.  The
    # last segment is never inspected here, it is handled separately below.
    collapsed = True
    while collapsed:
        collapsed = False
        for idx in range(1, len(segments) - 1):
            if segments[idx] == '..' and \
               segments[idx - 1] not in ('', '..'):
                del segments[idx - 1:idx + 1]
                collapsed = True
                break

    if segments == ['', '..']:
        segments[-1] = ''

    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']

    joined = '/'.join(segments)

    return urlunparse((scheme, netloc, joined, params, query, fragment))



# ------------------------------------------------------------------------------
#

[docs]
def urldefrag(url):
    '''
    Strip the fragment from a URL.

    The return value is the tuple `(url_without_fragment, fragment)`.  A URL
    which contains no '#' is returned as is, together with an empty string
    as fragment.
    '''

    if '#' not in url:
        return url, ''

    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))

    return stripped, fragment



# ------------------------------------------------------------------------------