# Source code for radical.utils.ru_regex


__author__    = 'Radical.Utils Development Team (Andre Merzky)'
__copyright__ = 'Copyright 2013, RADICAL@Rutgers'
__license__   = 'MIT'


import regex
import collections


# ------------------------------------------------------------------------------
#
# comparison helper: convert non-iterables to list of one element
#
def _cmp_iterable(other):
    '''
    Normalize a comparison operand so that it can be iterated over.

    Returns `other` itself (unchanged, not copied) if it is an instance of
    `collections.abc.Iterable`, otherwise a new one-element list `[other]`.
    Note that strings are iterable and are thus returned as-is.
    '''

    # `import collections` alone does not guarantee that the `collections.abc`
    # submodule has been loaded, so import the ABC explicitly.
    from collections.abc import Iterable

    if isinstance(other, Iterable):
        return other

    return [other]


# ------------------------------------------------------------------------------
#
class ReSult(object):
    '''
    This class is a container around a regular expression match, which provides
    some more convenient access methods, boolean tests, etc.

    The captured groups are available by position (`res[0]` is the *first
    group*), by name (`res['x']`), as attributes (`res.x`) and via iteration.
    Length, truth value, string representation and comparisons are all based
    on the list of groups.  Consequently, an instance without any group is
    falsy - that includes a successful match of a pattern which does not
    define any capture group.

    Instances define `__eq__` but no `__hash__`, and are thus unhashable.

    We only handle base strings, not unicode strings!
    '''

    # -------------------------------------------------------------------------
    #
    def __init__(self, result=None):
        '''
        construct with a `regex.MatchObject` instance.  This ctor should only
        be called from within the `ReString` class.

        A falsy `result` (notably `None`, i.e. 'no match') yields an empty
        instance.  Any other value which is not a `regex` match object raises
        a `TypeError`.
        '''

        self._glist  = list()
        self._gdict  = dict()
        self._result = result

        if result:
            # derive the match type from a trivial match instead of relying
            # on a specific public name for it in the `regex` module
            if not isinstance(result, type(regex.match('', ''))):
                raise TypeError('Need regex.MatchObject, not %s' % type(result))

            self._glist = result.groups()
            self._gdict = result.groupdict()


    # -------------------------------------------------------------------------
    #
    def __str__(self):
        '''
        The string representation is based on the match *list*, as the dict may
        not include all matches...
        '''

        return str(self._glist)


    # -------------------------------------------------------------------------
    #
    def __len__(self):
        '''
        The len representation is based on the match *list*, as the dict may
        not include all matches...
        '''

        return len(self._glist)


    # -------------------------------------------------------------------------
    #
    def get(self, key, default=None):
        '''
        dict-like access to *named* groups: return the value stored for group
        name `key`, or `default` if no group of that name exists.

        Raises a `TypeError` if `key` is not a string.
        '''

        if not isinstance(key, str):
            raise TypeError('key %s needs to be string, not %s'
                          % (key, type(key)))

        return self._gdict.get(key, default)


    # -------------------------------------------------------------------------
    #
    def start(self, idx=0):
        '''
        Return `start(idx)` of the wrapped match object, or `None` if this
        instance does not wrap a match.

        NOTE: `idx` is handed to the match object unchanged.  Assuming the
        usual match object semantics, index 0 then refers to the whole match
        and index 1 to the first group - unlike `self[0]`, which is the first
        group.
        '''

        if not self._result:
            return None

        return self._result.start(idx)


    # -------------------------------------------------------------------------
    #
    def __getitem__(self, idx):
        '''
        getitem is supported for both array type access (using an int index),
        and for dict type access (using a string name).  All other key types
        will cause a `TypeError`.

        Unknown names and out-of-range indexes (positive or negative) yield
        `None` instead of raising.
        '''

        if isinstance(idx, str):
            return self._gdict.get(idx)

        if isinstance(idx, int):
            size = len(self._glist)
            if -size <= idx < size:
                return self._glist[idx]
            return None

        raise TypeError('index %s needs to be integer or string, not %s'
                       % (idx, type(idx)))


    # -------------------------------------------------------------------------
    #
    def __iter__(self):
        '''
        the matches can be iterated over
        '''

        for m in self._glist:
            yield m


    # -------------------------------------------------------------------------
    #
    def __getattr__(self, name):
        '''
        Matches can be accessed as properties.  Names which do not refer to
        a group yield `None`.

        This hook is only invoked when the regular attribute lookup failed.
        For our own internal attributes that means they are not set yet (e.g.
        on an instance created without running `__init__`), and answering
        via `self[name]` would recurse endlessly.  Those names, and dunder
        names probed by protocols such as copy and pickle, raise
        `AttributeError` as usual.
        '''

        if name in ('_glist', '_gdict', '_result'):
            raise AttributeError(name)

        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)

        return self[name]


    # -------------------------------------------------------------------------
    #
    def __bool__(self):
        '''
        Boolean check for 'if / elif / else' constructs: true if at least one
        group is available.
        '''

        return len(self._glist) > 0


    # -------------------------------------------------------------------------
    #
    def __enter__(self):
        '''
        support context manager interface for with-statement based constructs
        '''
        return self


    # -------------------------------------------------------------------------
    #
    def __exit__(self, a, b, c):
        '''
        second part of the context manager interface.  Nothing needs to be
        cleaned up, and exceptions are not suppressed.
        '''
        pass


    # -------------------------------------------------------------------------
    #
    # compare to another ReSult or to a tuple.  As they are both iterable, we
    # compare based on the iterable interface: both sides are converted to
    # tuples and compared as such.  Non-iterable operands are treated as
    # a sequence with one element.
    #
    # NOTE: the comparison must never be applied to `self` itself, as that
    #       would re-enter the very same method.
    #
    def __lt__(self, other):

        return tuple(self._glist) < tuple(_cmp_iterable(other))


    # -------------------------------------------------------------------------
    #
    def __gt__(self, other):

        return tuple(self._glist) > tuple(_cmp_iterable(other))


    # -------------------------------------------------------------------------
    #
    def __le__(self, other):

        return tuple(self._glist) <= tuple(_cmp_iterable(other))


    # -------------------------------------------------------------------------
    #
    def __ge__(self, other):

        return tuple(self._glist) >= tuple(_cmp_iterable(other))


    # -------------------------------------------------------------------------
    #
    def __ne__(self, other):

        return not self.__eq__(other)


    # -------------------------------------------------------------------------
    #
    def __eq__(self, other):
        '''
        True if `other` provides the same number of elements as we have
        groups, and all elements compare equal pairwise.  Always returns
        a `bool`.
        '''

        return tuple(self._glist) == tuple(_cmp_iterable(other))


# ------------------------------------------------------------------------------
#
class ReString(str):
    r'''
    This is a string class which supports simplified regular expression
    matching. It is not thought that the regex language or expressions are
    simplified, but rather that the invocation of the matching is simple, as is
    the handling of the match results::

        txt = ReString('The quick brown fox jumps over the lazy dog')

        # the '//' operator is overloaded to match against a regular
        # expression. The result is a `ReSult` instance, which allows simple
        # access to the matches
        with txt // r'(\s.u)(?P<x>.*?j\S+)' as res:
            if res: print('Matched!')                # boolean check
            print('res     : %s' % res)              # list of results
            print('res[0]  : %s' % res[0])           # index by number ...
            print('res[1]  : %s' % res[1])           # ... for all matches
            print('res["x"]: %s' % res['x'])         # index by match name
            print('res.x   : %s' % res.x)            # ...   as properties
            for i, r in enumerate(res):
                print('res %d   : %s' % (i, r))      # matches as iterable

            assert len(res) == 2                     # number of matches
            assert res == [' qu', 'ick brown fox jumps']  # compare to list

        if txt // r'(rabbit)':                       # simple use in if / elif
            res = txt.get()                          # get ReSult of last match

        elif txt // r'((?:\s).{12,15}?(\S+))':       # full Python regex slang
            res = txt.get()

        else:
            print('no match')
    '''

    # default for instances which did not run through `__new__` below
    _result = None


    # -------------------------------------------------------------------------
    #
    def __new__(cls, *args, **kw):
        '''
        Create the string and initialize the per-instance slot for the result
        of the last match.  All arguments are passed on to `str.__new__`.
        '''

        # the last match result is instance state: set it on the new
        # instance, not on the class
        self = str.__new__(cls, *args, **kw)
        self._result = None
        return self


    # -------------------------------------------------------------------------
    #
    def __floordiv__(self, re):
        '''
        Search this string for `re`, which is either a pattern string (which
        gets compiled via `regex.compile`) or an object which provides
        a `search()` method, such as a compiled pattern.

        Returns a `ReSult` which wraps the outcome of the search.  That
        instance is also stored and can later be obtained via `get()`.

        A falsy `re` (`None`, empty string) returns `None` and leaves
        a previously stored result untouched.  Any other operand without
        a `search()` method is rejected as unsupported operand, which makes
        Python raise a `TypeError`.
        '''

        if not re:
            return None

        if isinstance(re, str):
            compiled_regex = regex.compile(re)

        elif hasattr(re, 'search'):
            # assume we got a compiled regex
            compiled_regex = re

        else:
            return NotImplemented

        self._result = ReSult(compiled_regex.search(self))
        return self._result


    # -------------------------------------------------------------------------
    #
    def get(self, key=None, default=None):
        '''
        Access the result of the last match.

        Without `key`, the stored result itself is returned (`None` if no
        match was attempted, yet).  With `key` (a group index or group name),
        the respective entry of the stored result is returned; `default` is
        returned instead if there is no stored result, if the lookup raises
        `KeyError` or `IndexError`, or if the lookup yields `None`.
        '''

        # compare against `None`: index 0 is a valid key
        if key is None:
            return self._result

        if self._result is None:
            return default

        try:
            value = self._result[key]

        except (KeyError, IndexError):
            return default

        if value is None:
            return default

        return value


# ------------------------------------------------------------------------------
#
def _example_re_string():
    '''
    Usage example for `ReString`: match via the `//` operator, inspect the
    result inside a `with` block, then use matches in an if / elif chain.
    '''

    text = ReString('The quick brown fox jumps over the lazy dog')

    with text // r'(\s.u)(?P<x>.*?j\S+)' as match:

        # boolean check
        if match:
            print('Matched!')

        # the result as a whole
        print('res     : %s' % match)

        # positional access, one entry per group
        print('res[0]  : %s' % match[0])
        print('res[1]  : %s' % match[1])

        # named group: by key and as property
        print('res["x"]: %s' % match['x'])
        print('res.x   : %s' % match.x)

        # the result is iterable
        for pos, group in enumerate(match):
            print('res %d   : %s' % (pos, group))

        # number of groups, and comparison against a plain list
        assert len(match) == 2
        assert match == [' qu', 'ick brown fox jumps']


    # matching directly in a condition; `get()` then hands back the result
    # of the last match
    if text // '(rabbit)':
        match = text.get()

    elif text // r'((?:\s).{12,15}?(\S+))':
        match = text.get()

    else:
        print('no match')


# ------------------------------------------------------------------------------